[med-svn] [spades] 01/01: New upstream version 3.10.0+dfsg
Sascha Steinbiss
satta at debian.org
Sun Jun 18 17:02:46 UTC 2017
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to annotated tag upstream/3.10.0+dfsg
in repository spades.
commit 55e89edffa500c8f0d1fdc423884571ac501e59b
Author: Sascha Steinbiss <satta at debian.org>
Date: Sat Feb 25 14:00:00 2017 +0000
New upstream version 3.10.0+dfsg
---
VERSION | 2 +-
changelog.html | 13 +
configs/debruijn/config.info | 32 +-
configs/debruijn/distance_estimation.info | 12 +-
configs/debruijn/large_genome_mode.info | 22 +
configs/debruijn/log.properties | 6 +
configs/debruijn/mda_mode.info | 11 +-
configs/debruijn/meta_mode.info | 49 +-
configs/debruijn/moleculo_mode.info | 2 +
configs/debruijn/path_extend/pe_params.info | 206 --
configs/debruijn/pe_params.info | 56 +-
configs/debruijn/rna_mode.info | 31 +-
configs/debruijn/simplification.info | 1 +
ext/include/btree/btree.h | 138 +-
ext/include/btree/btree_container.h | 45 +-
ext/include/btree/btree_map.h | 9 +
ext/include/btree/safe_btree.h | 32 +-
ext/include/btree/safe_btree_map.h | 8 +
ext/include/bwa/bntseq.h | 91 +
ext/include/bwa/bwa.h | 62 +
ext/include/bwa/bwamem.h | 184 +
ext/include/bwa/bwt.h | 130 +
ext/include/bwa/utils.h | 111 +
ext/include/cuckoo/LICENSE | 18 +
ext/include/cuckoo/city_hasher.hh | 44 +
ext/include/cuckoo/cuckoohash_config.hh | 36 +
ext/include/cuckoo/cuckoohash_map.hh | 2537 +++++++++++++
ext/include/cuckoo/cuckoohash_util.hh | 136 +
ext/include/cuckoo/libcuckoo_lazy_array.hh | 202 +
ext/include/getopt_pp/getopt_pp.cpp | 206 --
ext/include/llvm/Support/MathExtras.h | 1 +
ext/src/CMakeLists.txt | 4 +-
ext/src/bwa/CMakeLists.txt | 31 +
ext/src/bwa/ChangeLog | 3864 ++++++++++++++++++++
ext/src/bwa/LICENSE.txt | 168 +
ext/src/bwa/Makefile | 88 +
ext/src/bwa/NEWS.md | 1146 ++++++
ext/src/bwa/README-alt.md | 178 +
ext/src/bwa/README.md | 174 +
ext/src/bwa/bamlite.c | 210 ++
ext/src/bwa/bamlite.h | 114 +
ext/src/bwa/bntseq.c | 446 +++
ext/src/bwa/bwa.1 | 825 +++++
ext/src/bwa/bwa.c | 447 +++
ext/src/bwa/bwakit/README.md | 115 +
ext/src/bwa/bwakit/bwa-postalt.js | 524 +++
ext/src/bwa/bwakit/run-HLA | 20 +
ext/src/bwa/bwakit/run-bwamem | 186 +
ext/src/bwa/bwakit/run-gen-ref | 39 +
ext/src/bwa/bwakit/typeHLA-selctg.js | 62 +
ext/src/bwa/bwakit/typeHLA.js | 496 +++
ext/src/bwa/bwakit/typeHLA.sh | 49 +
ext/src/bwa/bwamem.c | 1201 ++++++
ext/src/bwa/bwamem_extra.c | 140 +
ext/src/bwa/bwamem_pair.c | 388 ++
ext/src/bwa/bwape.c | 783 ++++
ext/src/bwa/bwase.c | 602 +++
ext/src/bwa/bwase.h | 29 +
ext/src/bwa/bwaseqio.c | 235 ++
ext/src/bwa/bwashm.c | 213 ++
ext/src/bwa/bwt.c | 469 +++
ext/src/bwa/bwt_lite.c | 98 +
ext/src/bwa/bwt_lite.h | 29 +
ext/src/bwa/bwtaln.c | 320 ++
ext/src/bwa/bwtaln.h | 153 +
ext/src/bwa/bwtgap.c | 264 ++
ext/src/bwa/bwtgap.h | 40 +
ext/src/bwa/bwtindex.c | 304 ++
ext/src/bwa/bwtsw2.h | 69 +
ext/src/bwa/bwtsw2_aux.c | 776 ++++
ext/src/bwa/bwtsw2_chain.c | 112 +
ext/src/bwa/bwtsw2_core.c | 619 ++++
ext/src/bwa/bwtsw2_main.c | 89 +
ext/src/bwa/bwtsw2_pair.c | 268 ++
ext/src/bwa/example.c | 60 +
ext/src/bwa/fastmap.c | 441 +++
ext/src/bwa/is.c | 223 ++
ext/src/bwa/kbtree.h | 388 ++
ext/src/bwa/khash.h | 614 ++++
ext/src/bwa/kopen.c | 374 ++
ext/src/bwa/kseq.h | 239 ++
ext/src/bwa/ksort.h | 273 ++
ext/src/bwa/kstring.c | 20 +
ext/src/bwa/kstring.h | 134 +
ext/src/bwa/ksw.c | 713 ++++
ext/src/bwa/ksw.h | 114 +
ext/src/bwa/kthread.c | 152 +
ext/src/bwa/kvec.h | 94 +
ext/src/bwa/main.c | 101 +
ext/src/bwa/malloc_wrap.c | 57 +
ext/src/bwa/malloc_wrap.h | 47 +
ext/src/bwa/maxk.c | 67 +
ext/src/bwa/pemerge.c | 291 ++
ext/src/bwa/qualfa2fq.pl | 27 +
ext/src/bwa/rle.c | 191 +
ext/src/bwa/rle.h | 77 +
ext/src/bwa/rope.c | 318 ++
ext/src/bwa/rope.h | 58 +
ext/src/bwa/utils.c | 295 ++
ext/src/bwa/xa2multi.pl | 25 +
ext/src/getopt_pp/CMakeLists.txt | 5 +
ext/src/getopt_pp/getopt_pp.cpp | 206 ++
ext/src/llvm/CMakeLists.txt | 3 +
manual.html | 105 +-
metaspades.py | 102 +-
plasmidspades.py | 102 +-
rnaspades.py | 102 +-
rnaspades_manual.html | 21 +-
spades.py | 102 +-
src/CMakeLists.txt | 6 +-
src/cmake/options.cmake | 3 +
src/cmake/pack.cmake | 6 +-
src/common/CMakeLists.txt | 22 +
src/{utils => common}/adt/array_vector.hpp | 0
src/common/adt/bag.hpp | 87 +
src/{utils => common}/adt/bf.hpp | 0
src/{utils => common}/adt/chained_iterator.hpp | 0
src/common/adt/concurrent_dsu.hpp | 297 ++
src/{utils => common}/adt/filter_iterator.hpp | 0
src/{utils => common}/adt/flat_map.hpp | 0
src/{utils => common}/adt/flat_set.hpp | 0
src/{utils => common}/adt/hll.hpp | 0
src/{utils => common}/adt/iterator_range.hpp | 0
src/common/adt/kmer_hash_vector.hpp | 370 ++
src/common/adt/kmer_vector.hpp | 192 +
src/common/adt/loser_tree.hpp | 134 +
src/common/adt/parallel_seq_vector.hpp | 110 +
.../adt/parallel_unordered_map.hpp | 0
src/{utils => common}/adt/pointer_iterator.hpp | 0
src/common/adt/queue_iterator.hpp | 143 +
src/{utils => common}/adt/small_pod_vector.hpp | 0
src/common/assembly_graph/CMakeLists.txt | 12 +
.../components/component_filters.hpp | 0
.../components/connected_component.cpp | 0
.../components/connected_component.hpp | 26 +
.../assembly_graph/components/graph_component.hpp | 226 ++
src/common/assembly_graph/components/splitters.hpp | 882 +++++
src/common/assembly_graph/core/action_handlers.hpp | 347 ++
.../assembly_graph/core/basic_graph_stats.hpp | 53 +
.../assembly_graph/core/construction_helper.hpp | 84 +
src/common/assembly_graph/core/coverage.hpp | 335 ++
src/common/assembly_graph/core/debruijn_data.hpp | 170 +
.../assembly_graph/core}/directions.hpp | 0
.../assembly_graph/core}/graph.hpp | 0
src/common/assembly_graph/core/graph_core.hpp | 620 ++++
src/common/assembly_graph/core/graph_iterators.hpp | 408 +++
.../assembly_graph/core/observable_graph.hpp | 499 +++
src/common/assembly_graph/core/order_and_law.hpp | 644 ++++
.../assembly_graph/dijkstra/dijkstra_algorithm.hpp | 288 ++
.../assembly_graph/dijkstra/dijkstra_helper.hpp | 163 +
.../assembly_graph}/dijkstra/dijkstra_settings.hpp | 0
.../assembly_graph/dijkstra/length_calculator.hpp | 112 +
.../dijkstra/neighbours_iterator.hpp | 0
.../dijkstra/vertex_process_checker.hpp | 0
.../dijkstra/vertex_put_checker.hpp | 0
.../graph_support/basic_edge_conditions.hpp | 151 +
.../graph_support/basic_vertex_conditions.hpp | 66 +
.../assembly_graph/graph_support/chimera_stats.hpp | 0
.../assembly_graph/graph_support/comparators.hpp | 0
.../assembly_graph/graph_support/contig_output.hpp | 602 +++
.../graph_support/coverage_filling.hpp | 80 +
.../graph_support/coverage_uniformity_analyzer.cpp | 70 +
.../graph_support/coverage_uniformity_analyzer.hpp | 23 +
.../graph_support/detail_coverage.hpp | 190 +
.../assembly_graph/graph_support/edge_removal.hpp | 172 +
.../graph_support/genomic_quality.hpp | 555 +++
.../graph_support/graph_processing_algorithm.hpp | 146 +
.../graph_support/marks_and_locks.hpp | 0
.../graph_support/parallel_processing.hpp | 306 ++
.../graph_support/scaff_supplementary.cpp | 261 ++
.../graph_support/scaff_supplementary.hpp | 99 +
.../handlers/edge_labels_handler.hpp | 226 ++
.../handlers/edges_position_handler.hpp | 212 ++
.../assembly_graph/handlers/id_track_handler.hpp | 110 +
.../assembly_graph/paths/bidirectional_path.cpp | 21 +
.../assembly_graph/paths/bidirectional_path.hpp | 1098 ++++++
.../bidirectional_path_output.cpp | 68 +
.../bidirectional_path_output.hpp | 60 +
.../paths/bidirectional_path_io/io_support.cpp | 177 +
.../paths/bidirectional_path_io/io_support.hpp | 190 +
src/common/assembly_graph/paths/mapping_path.hpp | 301 ++
src/common/assembly_graph/paths/path_finders.hpp | 124 +
src/common/assembly_graph/paths/path_processor.hpp | 386 ++
src/common/assembly_graph/paths/path_utils.hpp | 130 +
src/common/assembly_graph/stats/picture_dump.hpp | 455 +++
src/common/assembly_graph/stats/statistics.hpp | 273 ++
src/{modules => common}/empty.cpp | 0
src/common/func/func.hpp | 25 +
src/common/func/function_traits.hpp | 71 +
src/common/func/pred.hpp | 175 +
src/common/io/CMakeLists.txt | 16 +
src/common/io/dataset_support/dataset_readers.hpp | 121 +
src/common/io/dataset_support/read_converter.hpp | 279 ++
src/common/io/kmers/kmer_iterator.hpp | 54 +
src/common/io/kmers/mmapped_reader.hpp | 396 ++
src/common/io/kmers/mmapped_writer.hpp | 191 +
src/common/io/reads/binary_converter.hpp | 262 ++
src/common/io/reads/binary_streams.hpp | 140 +
.../io/reads/careful_filtering_reader_wrapper.hpp | 183 +
.../io/reads}/converting_reader_wrapper.hpp | 0
.../io/reads}/delegating_reader_wrapper.hpp | 0
src/common/io/reads/fasta_fastq_gz_parser.hpp | 165 +
src/common/io/reads/file_reader.hpp | 129 +
.../io/reads}/filtering_reader_wrapper.hpp | 0
src/common/io/reads/io_helper.hpp | 112 +
src/common/io/reads/ireader.hpp | 117 +
src/common/io/reads/ireadstream.hpp | 168 +
src/common/io/reads/modifying_reader_wrapper.hpp | 115 +
.../reads_io => common/io/reads}/mpmc_bounded.hpp | 0
.../io/reads}/multifile_reader.hpp | 0
.../reads_io => common/io/reads}/orientation.hpp | 0
src/common/io/reads/osequencestream.hpp | 381 ++
src/{modules => common}/io/reads/paired_read.hpp | 0
src/common/io/reads/paired_readers.hpp | 252 ++
src/common/io/reads/parser.cpp | 90 +
src/common/io/reads/parser.hpp | 145 +
.../io/reads}/rc_reader_wrapper.hpp | 0
src/common/io/reads/read.hpp | 244 ++
src/common/io/reads/read_processor.hpp | 209 ++
src/common/io/reads/read_stream_vector.hpp | 137 +
src/common/io/reads/sequence_reader.hpp | 77 +
src/common/io/reads/single_read.hpp | 336 ++
src/common/io/reads/splitting_wrapper.hpp | 76 +
src/common/io/reads/vector_reader.hpp | 61 +
src/common/io/reads/wrapper_collection.hpp | 115 +
src/common/io/sam/bam_parser.hpp | 67 +
src/common/io/sam/bam_reader.hpp | 107 +
src/common/io/sam/read.cpp | 42 +
src/{modules/io/sam_io => common/io/sam}/read.hpp | 0
src/common/io/sam/sam_reader.cpp | 73 +
src/common/io/sam/sam_reader.hpp | 49 +
src/common/math/smooth.hpp | 189 +
src/{modules => common}/math/xmath.h | 0
src/common/modules/CMakeLists.txt | 13 +
src/common/modules/alignment/bwa_index.cpp | 327 ++
src/common/modules/alignment/bwa_index.hpp | 44 +
.../modules/alignment/bwa_sequence_mapper.hpp | 35 +
src/common/modules/alignment/edge_index.hpp | 103 +
.../modules/alignment/edge_index_refiller.cpp | 33 +
.../modules/alignment}/edge_index_refiller.hpp | 0
src/common/modules/alignment/kmer_map.hpp | 151 +
src/common/modules/alignment/kmer_mapper.hpp | 219 ++
.../modules/alignment/kmer_mapper_logger.hpp | 45 +
src/common/modules/alignment/long_read_mapper.hpp | 172 +
src/common/modules/alignment/long_read_storage.hpp | 354 ++
src/common/modules/alignment/pacbio/pac_index.hpp | 916 +++++
.../alignment/pacbio/pacbio_read_structures.hpp | 309 ++
src/common/modules/alignment/sequence_mapper.hpp | 405 ++
.../modules/alignment/sequence_mapper_notifier.hpp | 184 +
src/common/modules/alignment/short_read_mapper.hpp | 93 +
src/common/modules/genome_consistance_checker.cpp | 276 ++
src/common/modules/genome_consistance_checker.hpp | 79 +
src/common/modules/graph_construction.hpp | 180 +
src/common/modules/graph_read_correction.hpp | 187 +
src/common/modules/mismatch_shall_not_pass.hpp | 333 ++
src/common/modules/path_extend/CMakeLists.txt | 23 +
.../modules/path_extend/extension_chooser.hpp | 1162 ++++++
.../modules}/path_extend/ideal_pair_info.hpp | 0
src/common/modules/path_extend/loop_traverser.hpp | 228 ++
.../modules/path_extend/overlap_analysis.hpp | 123 +
src/common/modules/path_extend/paired_library.hpp | 186 +
src/common/modules/path_extend/path_extender.hpp | 1576 ++++++++
src/common/modules/path_extend/path_filter.hpp | 186 +
src/common/modules/path_extend/path_visualizer.hpp | 172 +
.../modules/path_extend/pe_config_struct.cpp | 211 ++
.../modules/path_extend/pe_config_struct.hpp | 246 ++
src/common/modules/path_extend/pe_resolver.hpp | 577 +++
src/common/modules/path_extend/pe_utils.hpp | 397 ++
.../path_extend/pipeline/extenders_logic.cpp | 423 +++
.../path_extend/pipeline/extenders_logic.hpp | 118 +
.../path_extend/pipeline/launch_support.cpp | 128 +
.../path_extend/pipeline/launch_support.hpp | 145 +
.../modules/path_extend/pipeline/launcher.cpp | 448 +++
.../modules/path_extend/pipeline/launcher.hpp | 115 +
.../scaffolder2015/connection_condition2015.cpp | 260 ++
.../scaffolder2015/connection_condition2015.hpp | 143 +
.../scaffolder2015/extension_chooser2015.cpp | 93 +
.../scaffolder2015/extension_chooser2015.hpp | 65 +
.../path_extend/scaffolder2015/path_polisher.cpp | 326 ++
.../path_extend/scaffolder2015/path_polisher.hpp | 85 +
.../path_extend/scaffolder2015/scaffold_graph.cpp | 258 ++
.../path_extend/scaffolder2015/scaffold_graph.hpp | 228 ++
.../scaffolder2015/scaffold_graph_constructor.cpp | 75 +
.../scaffolder2015/scaffold_graph_constructor.hpp | 80 +
.../scaffolder2015/scaffold_graph_visualizer.cpp | 69 +
.../scaffolder2015/scaffold_graph_visualizer.hpp | 79 +
.../modules/path_extend/split_graph_pair_info.hpp | 432 +++
src/common/modules/path_extend/weight_counter.hpp | 357 ++
.../modules/simplification/bulge_remover.hpp | 680 ++++
src/common/modules/simplification/cleaner.hpp | 39 +
.../simplification/complex_bulge_remover.hpp | 1215 ++++++
.../modules/simplification/complex_tip_clipper.hpp | 178 +
src/common/modules/simplification/compressor.hpp | 125 +
.../simplification/dominated_set_finder.hpp | 136 +
.../modules/simplification/ec_threshold_finder.hpp | 152 +
.../erroneous_connection_remover.hpp | 659 ++++
.../modules}/simplification/mf_ec_remover.hpp | 0
.../parallel_simplification_algorithms.hpp | 900 +++++
.../simplification/relative_coverage_remover.hpp | 690 ++++
src/common/modules/simplification/tip_clipper.hpp | 248 ++
.../simplification/topological_edge_conditions.hpp | 286 ++
.../paired_info/concurrent_pair_info_buffer.hpp | 120 +
src/common/paired_info/data_divider.hpp | 137 +
src/common/paired_info/distance_estimation.hpp | 300 ++
src/common/paired_info/histogram.hpp | 199 +
src/common/paired_info/histptr.hpp | 156 +
.../paired_info/index_point.hpp | 0
src/common/paired_info/insert_size_refiner.hpp | 165 +
src/common/paired_info/is_counter.hpp | 150 +
src/common/paired_info/pair_info_bounds.hpp | 30 +
src/common/paired_info/pair_info_filler.hpp | 108 +
.../paired_info/pair_info_filters.hpp | 0
src/common/paired_info/pair_info_improver.hpp | 280 ++
src/common/paired_info/paired_info.hpp | 630 ++++
src/common/paired_info/paired_info_buffer.hpp | 227 ++
.../paired_info/paired_info_helpers.hpp | 0
src/common/paired_info/peak_finder.hpp | 385 ++
.../paired_info/smoothing_distance_estimation.hpp | 283 ++
src/common/paired_info/split_path_constructor.hpp | 142 +
.../paired_info/weighted_distance_estimation.hpp | 112 +
src/common/paired_info/weights.hpp | 83 +
src/{modules => common}/pipeline/CMakeLists.txt | 0
src/common/pipeline/config_common.hpp | 140 +
src/common/pipeline/config_singl.hpp | 57 +
src/common/pipeline/config_struct.cpp | 858 +++++
src/common/pipeline/config_struct.hpp | 608 +++
src/{modules => common}/pipeline/genomic_info.hpp | 0
src/common/pipeline/genomic_info_filler.cpp | 149 +
.../pipeline/genomic_info_filler.hpp | 0
src/common/pipeline/graph_pack.hpp | 170 +
src/common/pipeline/graphio.hpp | 1047 ++++++
src/common/pipeline/library.cpp | 139 +
src/common/pipeline/library.hpp | 367 ++
src/{modules => common}/pipeline/library.inl | 0
src/common/pipeline/stage.cpp | 133 +
src/{modules => common}/pipeline/stage.hpp | 0
src/common/sequence/genome_storage.hpp | 55 +
src/common/sequence/nucl.hpp | 123 +
.../sequence/quality.hpp | 0
src/common/sequence/rtseq.hpp | 751 ++++
src/common/sequence/seq.hpp | 529 +++
src/common/sequence/seq_common.hpp | 44 +
src/common/sequence/sequence.hpp | 553 +++
src/common/sequence/sequence_tools.hpp | 159 +
src/common/sequence/simple_seq.hpp | 157 +
src/{modules => common}/stages/CMakeLists.txt | 0
src/common/stages/construction.cpp | 70 +
src/{modules => common}/stages/construction.hpp | 0
src/common/stages/simplification.cpp | 613 ++++
src/{modules => common}/stages/simplification.hpp | 0
.../graph_simplification.hpp | 678 ++++
.../simplification_pipeline/rna_simplification.hpp | 22 +
.../simplification_settings.hpp | 112 +
.../single_cell_simplification.hpp | 142 +
src/common/utils/CMakeLists.txt | 20 +
.../utils}/autocompletion.cpp | 0
.../utils}/autocompletion.hpp | 0
src/common/utils/copy_file.cpp | 158 +
src/common/utils/copy_file.hpp | 18 +
src/common/utils/coverage_model/CMakeLists.txt | 14 +
.../utils/coverage_model/kmer_coverage_model.cpp | 380 ++
.../utils/coverage_model/kmer_coverage_model.hpp | 48 +
.../dev_support => common/utils}/cpp_utils.hpp | 0
.../debruijn_graph/debruijn_graph_constructor.hpp | 558 +++
.../utils/debruijn_graph/early_simplification.hpp | 191 +
src/common/utils/file_limit.hpp | 33 +
src/common/utils/indices/edge_index_builders.hpp | 174 +
src/common/utils/indices/edge_info_updater.hpp | 109 +
src/common/utils/indices/edge_multi_index.hpp | 155 +
src/common/utils/indices/edge_position_index.hpp | 216 ++
src/common/utils/indices/editable_index.hpp | 270 ++
src/common/utils/indices/key_with_hash.hpp | 229 ++
src/common/utils/indices/kmer_extension_index.hpp | 309 ++
.../utils/indices/kmer_extension_index_builder.hpp | 106 +
src/common/utils/indices/kmer_splitters.hpp | 317 ++
src/common/utils/indices/perfect_hash_map.hpp | 339 ++
.../utils/indices/perfect_hash_map_builder.hpp | 102 +
src/common/utils/indices/storing_traits.hpp | 81 +
.../utils}/indices/values.hpp | 0
src/common/utils/levenshtein.hpp | 241 ++
src/{modules/dev_support => common/utils}/log.hpp | 0
src/common/utils/logger/log_writers.hpp | 43 +
src/common/utils/logger/logger.hpp | 149 +
src/common/utils/logger/logger_impl.cpp | 148 +
src/{modules/dev_support => common/utils}/md5.h | 0
.../dev_support => common/utils}/memory.hpp | 0
.../dev_support => common/utils}/memory_limit.hpp | 0
.../utils}/mph_index/CMakeLists.txt | 0
.../utils}/mph_index/base_hash.hpp | 0
.../utils}/mph_index/bitpair_vector.cpp | 0
.../utils}/mph_index/bitpair_vector.hpp | 0
.../utils}/mph_index/common.hpp | 0
.../utils}/mph_index/emphf_config.hpp | 0
.../utils}/mph_index/hypergraph.hpp | 0
.../utils/mph_index/hypergraph_sorter_seq.hpp | 130 +
.../utils}/mph_index/kmer_index.hpp | 0
src/common/utils/mph_index/kmer_index_builder.hpp | 486 +++
src/common/utils/mph_index/kmer_index_traits.hpp | 92 +
src/common/utils/mph_index/mphf.hpp | 136 +
.../utils}/mph_index/ranked_bitpair_vector.hpp | 0
.../dev_support => common/utils}/openmp_wrapper.h | 0
.../utils}/parallel_wrapper.hpp | 0
src/common/utils/path_helper.cpp | 249 ++
src/common/utils/path_helper.hpp | 74 +
.../dev_support => common/utils}/perfcounter.hpp | 0
src/common/utils/range.hpp | 92 +
src/common/utils/segfault_handler.hpp | 58 +
src/common/utils/simple_tools.hpp | 189 +
.../dev_support => common/utils}/stacktrace.hpp | 0
src/common/utils/standard_base.hpp | 140 +
src/common/utils/verify.hpp | 33 +
src/common/visualization/graph_colorer.hpp | 355 ++
src/common/visualization/graph_labeler.hpp | 308 ++
src/common/visualization/graph_print_utils.hpp | 327 ++
src/common/visualization/graph_printer.hpp | 186 +
src/common/visualization/position_filler.hpp | 96 +
.../visualization/printing_parameter_storage.hpp | 88 +
src/common/visualization/vertex_linker.hpp | 46 +
.../visualization/visualization.hpp | 0
src/common/visualization/visualization_utils.hpp | 223 ++
src/common/visualization/visualizers.hpp | 180 +
src/modules/CMakeLists.txt | 24 -
src/modules/algorithms/CMakeLists.txt | 11 -
.../algorithms/dijkstra/dijkstra_algorithm.hpp | 288 --
.../algorithms/dijkstra/dijkstra_helper.hpp | 163 -
.../algorithms/dijkstra/length_calculator.hpp | 112 -
.../algorithms/genome_consistance_checker.cpp | 238 --
.../algorithms/genome_consistance_checker.hpp | 77 -
src/modules/algorithms/graph_construction.hpp | 180 -
src/modules/algorithms/graph_read_correction.hpp | 187 -
src/modules/algorithms/mismatch_shall_not_pass.hpp | 339 --
src/modules/algorithms/path_extend/CMakeLists.txt | 18 -
.../algorithms/path_extend/extension_chooser.hpp | 1555 --------
.../algorithms/path_extend/loop_traverser.hpp | 224 --
.../algorithms/path_extend/next_path_searcher.hpp | 1031 ------
.../algorithms/path_extend/overlap_analysis.hpp | 123 -
.../algorithms/path_extend/paired_library.hpp | 179 -
.../algorithms/path_extend/path_extend_launch.hpp | 1257 -------
.../algorithms/path_extend/path_extender.hpp | 1561 --------
src/modules/algorithms/path_extend/path_filter.hpp | 158 -
.../algorithms/path_extend/path_visualizer.hpp | 172 -
.../algorithms/path_extend/pe_config_struct.cpp | 199 -
.../algorithms/path_extend/pe_config_struct.hpp | 271 --
src/modules/algorithms/path_extend/pe_io.hpp | 290 --
src/modules/algorithms/path_extend/pe_resolver.hpp | 523 ---
src/modules/algorithms/path_extend/pe_utils.hpp | 462 ---
.../scaffolder2015/connection_condition2015.cpp | 144 -
.../scaffolder2015/connection_condition2015.hpp | 90 -
.../scaffolder2015/extension_chooser2015.cpp | 82 -
.../scaffolder2015/extension_chooser2015.hpp | 59 -
.../path_extend/scaffolder2015/scaffold_graph.cpp | 275 --
.../path_extend/scaffolder2015/scaffold_graph.hpp | 234 --
.../scaffolder2015/scaffold_graph_constructor.cpp | 77 -
.../scaffolder2015/scaffold_graph_constructor.hpp | 101 -
.../scaffolder2015/scaffold_graph_visualizer.cpp | 72 -
.../scaffolder2015/scaffold_graph_visualizer.hpp | 73 -
.../path_extend/split_graph_pair_info.hpp | 449 ---
.../algorithms/path_extend/weight_counter.hpp | 544 ---
.../algorithms/simplification/bulge_remover.hpp | 783 ----
src/modules/algorithms/simplification/cleaner.hpp | 43 -
.../simplification/complex_bulge_remover.hpp | 1162 ------
.../simplification/complex_tip_clipper.hpp | 158 -
.../algorithms/simplification/compressor.hpp | 141 -
.../simplification/dominated_set_finder.hpp | 137 -
.../simplification/ec_threshold_finder.hpp | 152 -
.../erroneous_connection_remover.hpp | 690 ----
.../parallel_simplification_algorithms.hpp | 820 -----
.../simplification/relative_coverage_remover.hpp | 674 ----
.../algorithms/simplification/tip_clipper.hpp | 271 --
src/modules/assembly_graph/CMakeLists.txt | 12 -
.../components/connected_component.hpp | 26 -
.../assembly_graph/components/graph_component.hpp | 198 -
.../assembly_graph/components/splitters.hpp | 921 -----
.../assembly_graph/graph_alignment/edge_index.hpp | 103 -
.../graph_alignment/edge_index_refiller.cpp | 33 -
.../assembly_graph/graph_alignment/kmer_map.hpp | 151 -
.../assembly_graph/graph_alignment/kmer_mapper.hpp | 239 --
.../graph_alignment/kmer_mapper_logger.hpp | 45 -
.../graph_alignment/long_read_mapper.hpp | 190 -
.../graph_alignment/long_read_storage.hpp | 376 --
.../graph_alignment/pacbio/pac_index.hpp | 824 -----
.../graph_alignment/pacbio/pacbio_gap_closer.hpp | 396 --
.../pacbio/pacbio_read_structures.hpp | 320 --
.../graph_alignment/sequence_mapper.hpp | 387 --
.../graph_alignment/sequence_mapper_notifier.hpp | 178 -
.../graph_alignment/short_read_mapper.hpp | 98 -
.../assembly_graph/graph_core/action_handlers.hpp | 347 --
.../graph_core/basic_graph_stats.hpp | 53 -
.../graph_core/construction_helper.hpp | 84 -
src/modules/assembly_graph/graph_core/coverage.hpp | 343 --
.../assembly_graph/graph_core/debruijn_data.hpp | 170 -
.../assembly_graph/graph_core/graph_core.hpp | 620 ----
.../assembly_graph/graph_core/graph_iterators.hpp | 408 ---
.../assembly_graph/graph_core/observable_graph.hpp | 499 ---
.../assembly_graph/graph_core/order_and_law.hpp | 644 ----
.../graph_support/basic_edge_conditions.hpp | 273 --
.../graph_support/basic_vertex_conditions.hpp | 52 -
.../assembly_graph/graph_support/contig_output.hpp | 425 ---
.../graph_support/detail_coverage.hpp | 258 --
.../graph_support/genomic_quality.hpp | 554 ---
.../graph_support/graph_processing_algorithm.hpp | 262 --
.../graph_support/parallel_processing.hpp | 290 --
.../graph_support/scaff_supplementary.cpp | 66 -
.../graph_support/scaff_supplementary.hpp | 77 -
.../handlers/edge_labels_handler.hpp | 226 --
.../handlers/edges_position_handler.hpp | 213 --
.../assembly_graph/handlers/id_track_handler.hpp | 110 -
.../assembly_graph/paths/bidirectional_path.cpp | 21 -
.../assembly_graph/paths/bidirectional_path.hpp | 1087 ------
src/modules/assembly_graph/paths/mapping_path.hpp | 232 --
src/modules/assembly_graph/paths/path_finders.hpp | 124 -
.../assembly_graph/paths/path_processor.hpp | 441 ---
src/modules/assembly_graph/paths/path_utils.hpp | 128 -
src/modules/assembly_graph/stats/picture_dump.hpp | 447 ---
src/modules/assembly_graph/stats/statistics.hpp | 273 --
.../debruijn_graph/debruijn_graph_constructor.hpp | 548 ---
.../debruijn_graph/early_simplification.hpp | 192 -
.../indices/edge_index_builders.hpp | 174 -
.../data_structures/indices/edge_info_updater.hpp | 108 -
.../data_structures/indices/edge_multi_index.hpp | 155 -
.../indices/edge_position_index.hpp | 184 -
.../data_structures/indices/editable_index.hpp | 270 --
.../data_structures/indices/key_with_hash.hpp | 227 --
.../indices/kmer_extension_index.hpp | 309 --
.../indices/kmer_extension_index_builder.hpp | 106 -
.../data_structures/indices/kmer_splitters.hpp | 312 --
.../data_structures/indices/perfect_hash_map.hpp | 318 --
.../indices/perfect_hash_map_builder.hpp | 102 -
.../data_structures/indices/storing_traits.hpp | 61 -
.../mph_index/hypergraph_sorter_seq.hpp | 130 -
.../mph_index/kmer_index_builder.hpp | 404 --
.../mph_index/kmer_index_traits.hpp | 87 -
src/modules/data_structures/mph_index/mphf.hpp | 136 -
.../data_structures/sequence/CMakeLists.txt | 10 -
.../data_structures/sequence/genome_storage.cpp | 45 -
.../data_structures/sequence/genome_storage.hpp | 33 -
src/modules/data_structures/sequence/nucl.hpp | 123 -
src/modules/data_structures/sequence/rtseq.hpp | 740 ----
src/modules/data_structures/sequence/runtime_k.hpp | 47 -
src/modules/data_structures/sequence/seq.hpp | 529 ---
.../data_structures/sequence/seq_common.hpp | 20 -
src/modules/data_structures/sequence/sequence.hpp | 553 ---
.../data_structures/sequence/sequence_tools.hpp | 159 -
.../data_structures/sequence/simple_seq.hpp | 157 -
src/modules/dev_support/CMakeLists.txt | 13 -
src/modules/dev_support/copy_file.cpp | 158 -
src/modules/dev_support/copy_file.hpp | 18 -
src/modules/dev_support/file_limit.hpp | 33 -
src/modules/dev_support/func.hpp | 69 -
src/modules/dev_support/logger/log_writers.hpp | 43 -
src/modules/dev_support/logger/logger.hpp | 149 -
src/modules/dev_support/logger/logger_impl.cpp | 148 -
src/modules/dev_support/path_helper.cpp | 249 --
src/modules/dev_support/path_helper.hpp | 74 -
src/modules/dev_support/range.hpp | 92 -
src/modules/dev_support/segfault_handler.hpp | 58 -
src/modules/dev_support/simple_tools.hpp | 184 -
src/modules/dev_support/standard_base.hpp | 140 -
src/modules/dev_support/verify.hpp | 33 -
src/modules/io/CMakeLists.txt | 16 -
src/modules/io/dataset_support/dataset_readers.hpp | 122 -
src/modules/io/dataset_support/read_converter.hpp | 273 --
src/modules/io/graph_io/graph_print_utils.hpp | 328 --
src/modules/io/kmers_io/kmer_iterator.hpp | 54 -
src/modules/io/kmers_io/mmapped_reader.hpp | 396 --
src/modules/io/kmers_io/mmapped_writer.hpp | 191 -
src/modules/io/reads/read.hpp | 244 --
src/modules/io/reads/single_read.hpp | 334 --
src/modules/io/reads_io/binary_converter.hpp | 295 --
src/modules/io/reads_io/binary_streams.hpp | 357 --
.../reads_io/careful_filtering_reader_wrapper.hpp | 183 -
src/modules/io/reads_io/cutting_reader_wrapper.hpp | 135 -
src/modules/io/reads_io/easy_reader.hpp | 122 -
src/modules/io/reads_io/fasta_fastq_gz_parser.hpp | 165 -
src/modules/io/reads_io/file_reader.hpp | 129 -
src/modules/io/reads_io/io_helper.hpp | 118 -
src/modules/io/reads_io/ireader.hpp | 117 -
src/modules/io/reads_io/ireadstream.hpp | 170 -
src/modules/io/reads_io/is_corrupting_wrapper.hpp | 33 -
.../io/reads_io/modifying_reader_wrapper.hpp | 115 -
src/modules/io/reads_io/osequencestream.hpp | 374 --
src/modules/io/reads_io/paired_readers.hpp | 252 --
src/modules/io/reads_io/parser.cpp | 90 -
src/modules/io/reads_io/parser.hpp | 145 -
src/modules/io/reads_io/read_processor.hpp | 209 --
src/modules/io/reads_io/read_stream_vector.hpp | 183 -
src/modules/io/reads_io/sequence_reader.hpp | 77 -
src/modules/io/reads_io/splitting_wrapper.hpp | 76 -
src/modules/io/reads_io/vector_reader.hpp | 61 -
src/modules/io/reads_io/wrapper_collection.hpp | 115 -
src/modules/io/sam_io/bam_parser.hpp | 67 -
src/modules/io/sam_io/bam_reader.hpp | 107 -
src/modules/io/sam_io/read.cpp | 42 -
src/modules/io/sam_io/sam_reader.cpp | 75 -
src/modules/io/sam_io/sam_reader.hpp | 49 -
src/modules/math/CMakeLists.txt | 14 -
src/modules/math/kmer_coverage_model.cpp | 394 --
src/modules/math/kmer_coverage_model.hpp | 50 -
src/modules/math/pred.hpp | 169 -
src/modules/math/smooth.hpp | 195 -
src/modules/paired_info/CMakeLists.txt | 14 -
src/modules/paired_info/bwa_pair_info_filler.cpp | 408 ---
src/modules/paired_info/bwa_pair_info_filler.hpp | 253 --
src/modules/paired_info/data_divider.hpp | 137 -
src/modules/paired_info/distance_estimation.hpp | 309 --
src/modules/paired_info/histogram.hpp | 190 -
src/modules/paired_info/insert_size_refiner.hpp | 165 -
src/modules/paired_info/is_counter.hpp | 167 -
src/modules/paired_info/pair_info_bounds.hpp | 30 -
src/modules/paired_info/pair_info_filler.hpp | 119 -
src/modules/paired_info/pair_info_improver.hpp | 280 --
src/modules/paired_info/paired_info.hpp | 712 ----
src/modules/paired_info/peak_finder.hpp | 385 --
.../paired_info/smoothing_distance_estimation.hpp | 283 --
src/modules/paired_info/split_path_constructor.hpp | 140 -
.../paired_info/weighted_distance_estimation.hpp | 112 -
src/modules/paired_info/weights.hpp | 82 -
src/modules/pipeline/config_common.hpp | 140 -
src/modules/pipeline/config_singl.hpp | 57 -
src/modules/pipeline/config_struct.cpp | 819 -----
src/modules/pipeline/config_struct.hpp | 583 ---
src/modules/pipeline/genomic_info_filler.cpp | 149 -
src/modules/pipeline/graph_pack.hpp | 161 -
src/modules/pipeline/graphio.hpp | 1040 ------
src/modules/pipeline/library.cpp | 137 -
src/modules/pipeline/library.hpp | 366 --
src/modules/pipeline/stage.cpp | 133 -
src/modules/stages/construction.cpp | 70 -
src/modules/stages/simplification.cpp | 574 ---
.../graph_simplification.hpp | 1034 ------
.../simplification_settings.hpp | 112 -
.../single_cell_simplification.hpp | 110 -
src/modules/visualization/graph_colorer.hpp | 340 --
src/modules/visualization/graph_labeler.hpp | 304 --
src/modules/visualization/graph_printer.hpp | 176 -
src/modules/visualization/position_filler.hpp | 93 -
.../visualization/printing_parameter_storage.hpp | 81 -
src/modules/visualization/vertex_linker.hpp | 41 -
src/modules/visualization/visualization_utils.hpp | 210 --
src/modules/visualization/visualizers.hpp | 173 -
src/projects/CMakeLists.txt | 2 +-
src/projects/cap/assembly_compare.hpp | 22 +-
src/projects/cap/assembly_problem_detection.hpp | 8 +-
src/projects/cap/cap_commands.hpp | 4 +-
src/projects/cap/cap_environment.hpp | 4 +-
src/projects/cap/cap_environment_manager.hpp | 2 +-
src/projects/cap/cap_kmer_index.hpp | 8 +-
src/projects/cap/cap_logger.hpp | 2 +-
src/projects/cap/colored_graph_construction.hpp | 10 +-
src/projects/cap/coloring.hpp | 25 +-
src/projects/cap/compare_standard.hpp | 26 +-
src/projects/cap/comparison_utils.hpp | 16 +-
src/projects/cap/coordinates_handler.hpp | 4 +-
src/projects/cap/deprecated/tools_deprecated.cpp | 4 +-
src/projects/cap/diff_masking.hpp | 10 +-
src/projects/cap/gene_analysis.hpp | 4 +-
src/projects/cap/genome_correction.hpp | 8 +-
src/projects/cap/junk_cropping_reader.hpp | 4 +-
src/projects/cap/longseq.hpp | 6 +-
src/projects/cap/main.cpp | 8 +-
src/projects/cap/mosaic.hpp | 20 +-
src/projects/cap/repeat_masking.hpp | 8 +-
src/projects/cap/serialization.hpp | 2 +-
src/projects/cap/simple_inversion_finder.hpp | 8 +-
src/projects/cap/stats.hpp | 38 +-
src/projects/cap/tools.cpp | 2 +-
src/projects/cap/untangling.hpp | 4 +-
src/projects/cap/visualization.hpp | 28 +-
src/projects/cclean/CMakeLists.txt | 30 +
src/projects/cclean/adapter_index.cpp | 50 +
src/projects/cclean/adapter_index.hpp | 61 +
src/projects/cclean/additional.cpp | 69 +
src/projects/cclean/brute_force_clean.cpp | 97 +
src/projects/cclean/brute_force_clean.hpp | 72 +
src/projects/cclean/comparator.hpp | 18 +
src/projects/cclean/config_struct_cclean.cpp | 44 +
src/projects/cclean/config_struct_cclean.hpp | 42 +
src/projects/cclean/job_wrappers.cpp | 97 +
src/projects/cclean/job_wrappers.hpp | 73 +
src/projects/cclean/main.cpp | 86 +
src/projects/cclean/output.cpp | 82 +
src/projects/cclean/output.hpp | 49 +
src/projects/cclean/running_modes.cpp | 268 ++
src/projects/cclean/running_modes.hpp | 93 +
src/projects/cclean/utils.cpp | 136 +
src/projects/cclean/utils.hpp | 58 +
src/projects/cclean/valid_kmer_generator.hpp | 198 +
src/projects/corrector/CMakeLists.txt | 2 +-
src/projects/corrector/config_struct.cpp | 2 +-
src/projects/corrector/contig_processor.cpp | 8 +-
src/projects/corrector/contig_processor.hpp | 6 +-
src/projects/corrector/dataset_processor.cpp | 10 +-
src/projects/corrector/dataset_processor.hpp | 6 +-
.../corrector/interesting_pos_processor.cpp | 2 +-
src/projects/corrector/main.cpp | 4 +-
src/projects/dipspades/CMakeLists.txt | 6 +-
.../consensus_contigs_constructor.hpp | 4 +-
.../contig_correctors/close_gaps_corrector.hpp | 2 +-
src/projects/dipspades/dipspades.hpp | 4 +-
src/projects/dipspades/dipspades_config.cpp | 2 +-
.../conservative_regions_searcher.hpp | 4 +-
.../dipspades/kmer_gluing/equal_sequence_gluer.hpp | 2 +-
src/projects/dipspades/main.cpp | 10 +-
.../bulge_paths_searcher.hpp | 2 +-
.../complex_bulge_remover.hpp | 2 +-
.../polymorphic_bulge_remover.hpp | 8 +-
src/projects/dipspades/utils/edge_gluer.hpp | 2 +-
src/projects/dipspades/utils/path_routines.hpp | 5 +-
src/projects/hammer/CMakeLists.txt | 2 +-
src/projects/hammer/config_struct_hammer.cpp | 2 +-
src/projects/hammer/hamcluster.cpp | 4 +-
src/projects/hammer/hamcluster.hpp | 6 +-
src/projects/hammer/hammer_tools.cpp | 4 +-
src/projects/hammer/hammer_tools.hpp | 6 +-
src/projects/hammer/kmer_cluster.cpp | 4 +-
src/projects/hammer/kmer_data.cpp | 14 +-
src/projects/hammer/kmer_data.hpp | 4 +-
src/projects/hammer/kmer_stat.hpp | 4 +-
src/projects/hammer/main.cpp | 14 +-
src/projects/hammer/parallel_radix_sort.hpp | 2 +-
src/projects/hammer/quake_correct/bithash.cpp | 2 +-
src/projects/hammer/quake_count/quake_count.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_17.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_19.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_21.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_25.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_29.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_33.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_37.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_45.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_55.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_65.cpp | 2 +-
src/projects/hammer/quake_count/quake_count_75.cpp | 2 +-
.../hammer/quake_count/valid_kmer_generator.hpp | 2 +-
src/projects/hammer/quake_enhanced/count.cpp | 2 +-
src/projects/hammer/quake_enhanced/count/count.cpp | 2 +-
.../quake_enhanced/filter_trusted_enh/main.cpp | 2 +-
src/projects/hammer/valid_kmer_generator.hpp | 2 +-
src/projects/ionhammer/CMakeLists.txt | 2 +-
src/projects/ionhammer/HSeq.hpp | 2 +-
src/projects/ionhammer/config_struct.cpp | 2 +-
src/projects/ionhammer/err_helper_table.cpp | 2 +-
src/projects/ionhammer/err_helper_table.hpp | 2 +-
src/projects/ionhammer/expander.cpp | 2 +-
src/projects/ionhammer/hamcluster.cpp | 4 +-
src/projects/ionhammer/hamcluster.hpp | 4 +-
src/projects/ionhammer/kmer_data.cpp | 10 +-
src/projects/ionhammer/kmer_data.hpp | 2 +-
src/projects/ionhammer/main.cpp | 20 +-
src/projects/ionhammer/read_corrector.hpp | 2 +-
src/projects/ionhammer/subcluster.cpp | 2 +-
src/projects/mph_test/CMakeLists.txt | 2 +-
src/projects/mph_test/main.cpp | 27 +-
src/projects/mts/CMakeLists.txt | 57 +
src/projects/mts/Common.snake | 69 +
src/projects/mts/README | 21 +
src/projects/mts/Snakefile | 175 +
src/projects/mts/Stats.snake | 270 ++
src/projects/mts/annotation.hpp | 310 ++
src/projects/mts/config.yaml | 10 +
src/projects/mts/contig_abundance.cpp | 176 +
src/projects/mts/contig_abundance.hpp | 143 +
src/projects/mts/contig_abundance_counter.cpp | 101 +
src/projects/mts/formats.hpp | 29 +
src/projects/mts/kmc_api/kmc_file.cpp | 1093 ++++++
src/projects/mts/kmc_api/kmc_file.h | 141 +
src/projects/mts/kmc_api/kmer_api.cpp | 48 +
src/projects/mts/kmc_api/kmer_api.h | 596 +++
src/projects/mts/kmc_api/kmer_defs.h | 54 +
src/projects/mts/kmc_api/mmer.cpp | 49 +
src/projects/mts/kmc_api/mmer.h | 182 +
src/projects/mts/kmc_api/stdafx.h | 4 +
src/projects/mts/kmer_multiplicity_counter.cpp | 256 ++
src/projects/mts/log.properties | 10 +
src/projects/mts/logger.hpp | 11 +
src/projects/mts/mts.py | 73 +
src/projects/mts/prop_binning.cpp | 128 +
src/projects/mts/propagate.cpp | 331 ++
src/projects/mts/propagate.hpp | 29 +
src/projects/mts/read_binning.cpp | 90 +
src/projects/mts/read_binning.hpp | 92 +
.../empty.cpp => projects/mts/scripts/__init__.py} | 0
src/projects/mts/scripts/calc_kmers_mpl.py | 38 +
src/projects/mts/scripts/canopy_launch.sh | 17 +
src/projects/mts/scripts/choose_samples.py | 61 +
src/projects/mts/scripts/combine_contigs.py | 28 +
src/projects/mts/scripts/common.py | 121 +
src/projects/mts/scripts/filter_nucmer.py | 54 +
src/projects/mts/scripts/gather_stats.py | 28 +
src/projects/mts/scripts/gen_samples.py | 96 +
src/projects/mts/scripts/make_input.py | 53 +
src/projects/mts/scripts/make_points_matrix.py | 35 +
src/projects/mts/scripts/parse_output.py | 58 +
src/projects/mts/scripts/pca.R | 77 +
src/projects/mts/scripts/ref_stats.sh | 63 +
src/projects/mts/scripts/split_bins.py | 30 +
src/projects/mts/stats.cpp | 194 +
src/projects/mts/test.py | 205 ++
src/projects/mts/visualization.hpp | 66 +
src/projects/online_vis/CMakeLists.txt | 6 +-
src/projects/online_vis/debruijn_environment.hpp | 6 +-
.../drawing_commands/draw_contig_command.hpp | 2 +-
.../drawing_commands/draw_missasemblies.hpp | 4 +-
.../drawing_commands/draw_polymorphic_regions.hpp | 17 +-
.../drawing_commands/draw_poorly_assembled.hpp | 6 +-
.../drawing_commands/draw_position_command.hpp | 4 +-
.../drawing_commands/drawing_command.hpp | 8 +-
.../drawing_commands/show_position_command.hpp | 4 +-
src/projects/online_vis/environment.hpp | 6 +-
src/projects/online_vis/main.cpp | 10 +-
src/projects/online_vis/online_visualizer.hpp | 2 +-
.../position_commands/fill_position_command.hpp | 2 +-
src/projects/online_vis/processing_commands.hpp | 4 +-
src/projects/online_vis/standard_vis.hpp | 2 +-
.../junction_sequence_command.hpp | 4 +-
.../statistics_commands/print_contigs_stats.hpp | 2 +-
src/projects/online_vis/vis_logger.hpp | 6 +-
src/projects/scaffold_correction/CMakeLists.txt | 2 +-
src/projects/scaffold_correction/main.cpp | 15 +-
.../scaffold_correction/scaffold_correction.hpp | 17 +-
src/projects/spades/CMakeLists.txt | 10 +-
src/projects/spades/chromosome_removal.cpp | 45 +-
src/projects/spades/chromosome_removal.hpp | 3 +-
src/projects/spades/contig_output_stage.cpp | 55 +
src/projects/spades/contig_output_stage.hpp | 29 +
src/projects/spades/distance_estimation.cpp | 8 +-
src/projects/spades/gap_closer.cpp | 138 +-
src/projects/spades/gap_closing.hpp | 74 +
src/projects/spades/hybrid_aligning.cpp | 462 +++
src/projects/spades/hybrid_aligning.hpp | 23 +
src/projects/spades/hybrid_gap_closer.hpp | 743 ++++
src/projects/spades/launch.hpp | 66 +-
src/projects/spades/main.cpp | 8 +-
src/projects/spades/mismatch_correction.cpp | 4 +-
src/projects/spades/pacbio_aligning.cpp | 185 -
src/projects/spades/pacbio_aligning.hpp | 23 -
src/projects/spades/pair_info_count.cpp | 409 ++-
src/projects/spades/repeat_resolving.cpp | 60 +-
src/projects/spades/repeat_resolving.hpp | 12 -
src/projects/spades/second_phase_setup.cpp | 3 +-
src/projects/spades/series_analysis.hpp | 323 ++
.../truseq_analysis/AlignmentAnalyserNew.cpp | 4 +-
.../truseq_analysis/AlignmentAnalyserNew.hpp | 2 +-
src/projects/truseq_analysis/CMakeLists.txt | 2 +-
.../truseq_analysis/alignment_analyser.cpp | 2 +-
.../truseq_analysis/alignment_analyser.hpp | 4 +-
src/projects/truseq_analysis/analysis_pipeline.cpp | 6 +-
src/projects/truseq_analysis/analysis_pipeline.hpp | 2 +-
.../truseq_analysis/consistent_mapping.cpp | 2 +-
src/projects/truseq_analysis/main.cpp | 8 +-
src/spades_pipeline/hammer_logic.py | 2 +
src/spades_pipeline/options_storage.py | 19 +-
src/spades_pipeline/spades_logic.py | 11 +-
src/spades_pipeline/support.py | 21 +
src/utils/adt/bag.hpp | 87 -
src/utils/adt/concurrent_dsu.hpp | 297 --
src/utils/adt/function_traits.hpp | 76 -
src/utils/adt/kmer_hash_vector.hpp | 370 --
src/utils/adt/kmer_vector.hpp | 179 -
src/utils/adt/parallel_seq_vector.hpp | 110 -
src/utils/adt/queue_iterator.hpp | 143 -
src/utils/levenshtein.hpp | 241 --
test_dataset_plasmid/pl1.fq.gz | Bin 0 -> 68202 bytes
test_dataset_plasmid/pl2.fq.gz | Bin 0 -> 68276 bytes
864 files changed, 92169 insertions(+), 58300 deletions(-)
diff --git a/VERSION b/VERSION
index 6bd1074..30291cb 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.9.1
+3.10.0
diff --git a/changelog.html b/changelog.html
index 6731f56..e0c69c4 100644
--- a/changelog.html
+++ b/changelog.html
@@ -3,6 +3,19 @@
<h2>SPAdes Genome Assembler changelog</h2>
+<h3>SPAdes 3.10.0, 27 January 2017</h3>
+
+<p>NEW: Scaffolding algorithm for mate-pairs and long reads.</p>
+
+<p>NEW: Contigs and graph output in GFA format.</p>
+
+<p>CHANGE: Better running time and RAM consumption for all pipelines.</p>
+
+<p>CHANGE: Improvements in metagenomic pipeline.</p>
+
+<p>CHANGE: Improved isoform detection algorithm in rnaSPAdes.</p>
+
+
<h3>SPAdes 3.9.1, 4 December 2016</h3>
<p>FIX: macOS Sierra crash.</p>
diff --git a/configs/debruijn/config.info b/configs/debruijn/config.info
index 1620f30..df5179a 100644
--- a/configs/debruijn/config.info
+++ b/configs/debruijn/config.info
@@ -95,6 +95,13 @@ use_scaffolder true
avoid_rc_connections true
+contig_output {
+ contigs_name final_contigs
+ scaffolds_name scaffolds
+ ; none --- do not output broken scaffolds | break_gaps --- break only by N stretches | break_all --- break all with overlap < k
+ output_broken_scaffolds break_gaps
+}
+
;position handling
pos
@@ -129,17 +136,19 @@ kmer_coverage_model {
pacbio_processor
{
;align and traverse.
- pacbio_k 13
- additional_debug_info false
- compression_cutoff 0.6
- domination_cutoff 1.5
- path_limit_stretching 1.3
- path_limit_pressing 0.7
- ignore_middle_alignment true
- ;gap_closer
- long_seq_limit 400
- pacbio_min_gap_quantity 2
- contigs_min_gap_quantity 1
+ pacbio_k 13
+ additional_debug_info false
+ compression_cutoff 0.6
+ domination_cutoff 1.5
+ path_limit_stretching 1.3
+ path_limit_pressing 0.7
+ ignore_middle_alignment true
+ max_path_in_dijkstra 15000
+ max_vertex_in_dijkstra 2000
+;gap_closer
+ long_seq_limit 400
+ pacbio_min_gap_quantity 2
+ contigs_min_gap_quantity 1
max_contigs_gap_length 10000
}
@@ -162,3 +171,4 @@ bwa_aligner
;flanking coverage range
flanking_range 55
+series_analysis ""
diff --git a/configs/debruijn/distance_estimation.info b/configs/debruijn/distance_estimation.info
index 3761b05..20954c6 100644
--- a/configs/debruijn/distance_estimation.info
+++ b/configs/debruijn/distance_estimation.info
@@ -2,13 +2,15 @@
de
{
- linkage_distance_coeff 0.0
- max_distance_coeff 2.0
- max_distance_coeff_scaff 2000.0
- filter_threshold 2.0
+ linkage_distance_coeff 0.0
+ max_distance_coeff 2.0
+ max_distance_coeff_scaff 2000.0
+ clustered_filter_threshold 2.0
+ raw_filter_threshold 2
+ rounding_coeff 0.5 ; rounding : min(de_max_distance * rounding_coeff, rounding_thr)
+ rounding_threshold 0
}
-
ade
{
;data dividing
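
The rounding comment in the de block above (it reappears in meta_mode.info further down) describes how the two new parameters combine into a single bound. A minimal sketch of that rule exactly as stated in the comment; the function and variable names here are illustrative, not taken from the SPAdes sources:

    #include <algorithm>

    // Bound described by the comment
    // "rounding : min(de_max_distance * rounding_coeff, rounding_thr)".
    // With rounding_coeff = 0.5 and rounding_threshold = 0 (the values above),
    // the bound collapses to 0, i.e. no rounding slack.
    double rounding_bound(double de_max_distance,
                          double rounding_coeff,
                          double rounding_threshold) {
        return std::min(de_max_distance * rounding_coeff, rounding_threshold);
    }
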
diff --git a/configs/debruijn/large_genome_mode.info b/configs/debruijn/large_genome_mode.info
new file mode 100644
index 0000000..128008e
--- /dev/null
+++ b/configs/debruijn/large_genome_mode.info
@@ -0,0 +1,22 @@
+mode large_genome
+
+
+pe {
+
+debug_output false
+
+params {
+ scaffolding_mode old_pe_2015
+}
+}
+
+
+bwa_aligner
+{
+ bwa_enable true
+ debug false
+ path_to_bwa ./bin/bwa-spades
+ min_contig_len 0
+}
+
+
diff --git a/configs/debruijn/log.properties b/configs/debruijn/log.properties
index cbe4c29..b19eafe 100644
--- a/configs/debruijn/log.properties
+++ b/configs/debruijn/log.properties
@@ -52,3 +52,9 @@ default=INFO
#ScaffoldingPathExtender=DEBUG
#BWAPairInfo=TRACE
+#LongReadMapper=TRACE
+#GapTrackingListener=TRACE
+#MultiGapJoiner=TRACE
+#HybridGapCloser=TRACE
+#GapJoiner=TRACE
+#CountingCallback=TRACE
diff --git a/configs/debruijn/mda_mode.info b/configs/debruijn/mda_mode.info
index c98df33..11c9815 100644
--- a/configs/debruijn/mda_mode.info
+++ b/configs/debruijn/mda_mode.info
@@ -98,10 +98,19 @@ simp
}
}
+de
+{
+ raw_filter_threshold 0
+ rounding_threshold 0
+}
+
+
pe {
params {
normalize_weight true
+ scaffolding_mode old
+
; extension selection
extension_options
{
@@ -114,7 +123,7 @@ params {
long_reads {
pacbio_reads {
- unique_edge_priority 10000.0
+ unique_edge_priority 10.0
}
}
}
diff --git a/configs/debruijn/meta_mode.info b/configs/debruijn/meta_mode.info
index 5462e69..69c7bdc 100644
--- a/configs/debruijn/meta_mode.info
+++ b/configs/debruijn/meta_mode.info
@@ -53,8 +53,6 @@ simp
enabled true
}
-
-
; relative edge disconnector:
relative_ed
{
@@ -100,6 +98,15 @@ simp
max_number_edges 3
}
+ ; hidden ec remover
+ her
+ {
+ enabled true
+ uniqueness_length 1500
+ unreliability_threshold -1.
+ relative_threshold 3.
+ }
+
init_clean
{
early_it_only true
@@ -141,31 +148,57 @@ preliminary_simp
}
+; undo single cell config changes, enforce filtering
+de
+{
+ raw_filter_threshold 1
+ rounding_coeff 0.5 ; rounding : min(de_max_distance * rounding_coeff, rounding_thr)
+ rounding_threshold 0
+}
+
;NB descends from sc_pe
pe {
+
+long_reads {
+ pacbio_reads {
+ filtering 1.9
+ weight_priority 20.0
+ unique_edge_priority 10.0
+ min_significant_overlap 1000
+ }
+}
+
params {
remove_overlaps true
cut_all_overlaps true
- ;TODO proper configuration of different extenders is not supported
- ;TODO most settings ard hardcoded for now
+ scaffolding_mode old_pe_2015
- ;normalize_weight NA
+ normalize_weight true
+
+ ; extension selection
extension_options
{
- ;use_default_single_threshold NA
- ;single_threshold NA
+ use_default_single_threshold true
+ single_threshold 0.3
weight_threshold 0.6
+ priority_coeff 1.5
max_repeat_length 1000000
- }
+ }
use_coordinated_coverage true
}
+
}
prelim_pe {
params {
+ scaffolding_mode old
+
use_coordinated_coverage false
remove_overlaps false
+ scaffolding2015 {
+ min_unique_length 100000000
+ }
}
}
diff --git a/configs/debruijn/moleculo_mode.info b/configs/debruijn/moleculo_mode.info
index 40c2a54..a3ad118 100644
--- a/configs/debruijn/moleculo_mode.info
+++ b/configs/debruijn/moleculo_mode.info
@@ -103,6 +103,8 @@ params {
normalize_weight true
cut_all_overlaps true
+ scaffolding_mode old
+
; extension selection
extension_options
{
diff --git a/configs/debruijn/path_extend/pe_params.info b/configs/debruijn/path_extend/pe_params.info
deleted file mode 100644
index 86f1cd6..0000000
--- a/configs/debruijn/path_extend/pe_params.info
+++ /dev/null
@@ -1,206 +0,0 @@
-default_pe {
-
-; output options
-
-debug_output true
-
-output {
- write_overlaped_paths true
- write_paths true
-}
-
-visualize {
- print_overlaped_paths true
- print_paths true
-}
-
-; none | break_gaps | break_all
-output_broken_scaffolds break_gaps
-
-params {
- multi_path_extend false
- ; old | 2015 | combined | old_pe_2015
- scaffolding_mode old_pe_2015
-
- remove_overlaps true
- cut_all_overlaps false
-
- split_edge_length 99
- normalize_weight false
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 1.75676
- weight_threshold 0.5
- priority_coeff 1.5
- max_repeat_length 8000
- }
-
- mate_pair_options
- {
- use_default_single_threshold true
- single_threshold 30
- weight_threshold 0.5
- priority_coeff 1.5
- max_repeat_length 8000
- }
-
- scaffolder {
- on true
- cutoff 2
- rel_cutoff 0.1
- sum_threshold 3
-
- cluster_info true
- cl_threshold 0
-
- fix_gaps true
- use_la_gap_joiner true
- ;next param should be 0.51 - 1.0 if use_old_score = true and 3.0 otherwise
- min_gap_score 0.7
-
- max_must_overlap -2
- max_can_overlap 0.5
- short_overlap 6
- artificial_gap 10
- use_old_score true
-
- min_overlap_length 10
- flank_addition_coefficient -5.9
- flank_multiplication_coefficient 0.97
- }
-
- loop_removal
- {
- max_loops 10
- mp_max_loops 10
- }
-
- use_coordinated_coverage false
- coordinated_coverage
- {
- max_edge_length_repeat 300
- delta 0.4
- }
-
- scaffolding2015 {
- autodetect true
- min_unique_length 10000
- unique_coverage_variation 0.5
- ; (median * (1+variation) > unique > median * (1 - variation))
- }
-
- scaffold_graph {
- construct true
- output true
- always_add 40 ; connection with read count >= always_add are always added to the graph
- never_add 5 ; connection with read count < never_add are never added to the graph
- relative_threshold 0.25 ; connection with read count >= max_read_count * relative_threshod are added to the graph if satisfy condition above, max_read_count is calculated amond all alternatives
- graph_connectivity false
- max_path_length 10000
- }
-}
-
-
-long_reads {
- pacbio_reads {
- filtering 2.5
- weight_priority 1.2
- unique_edge_priority 5.0
- }
-
- single_reads {
- filtering 1.25
- weight_priority 5.0
- unique_edge_priority 1000.0
- }
-
- contigs {
- filtering 0.0
- weight_priority 1.5
- unique_edge_priority 2.0
- }
-
- meta_untrusted_contigs {
- filtering 0.0
- weight_priority 100.0
- unique_edge_priority 2.0
- }
-
-}
-}
-
-sc_pe {
-params {
- normalize_weight true
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 0.001
- weight_threshold 0.6
- max_repeat_length 8000
- }
-
-}
-}
-
-moleculo_pe {
-params {
- normalize_weight true
- cut_all_overlaps true
-
- ; extension selection
- extension_options
- {
- use_default_single_threshold false
- single_threshold 0.001
- weight_threshold 0.6
- }
-
- scaffolder {
- short_overlap 10
- use_la_gap_joiner false
- }
-}
-}
-
-;NB decsends from sc_pe
-meta_pe {
-params {
- remove_overlaps true
- cut_all_overlaps true
-
- ;TODO proper configuration of different extenders is not supported
- ;TODO most settings ard hardcoded for now
-
- ;normalize_weight NA
- extension_options
- {
- ;use_default_single_threshold NA
- ;single_threshold NA
- weight_threshold 0.6
- max_repeat_length 50000
- }
-
- use_coordinated_coverage true
-}
-}
-
-prelim_pe {
-params {
- use_coordinated_coverage false
- remove_overlaps false
-}
-}
-
-rna_pe {
-
-params {
- multi_path_extend true
- remove_overlaps false
-}
-}
diff --git a/configs/debruijn/pe_params.info b/configs/debruijn/pe_params.info
index 9c838bd..0d7a172 100644
--- a/configs/debruijn/pe_params.info
+++ b/configs/debruijn/pe_params.info
@@ -14,27 +14,25 @@ visualize {
print_paths true
}
-; none | break_gaps | break_all
-output_broken_scaffolds break_gaps
-
params {
multi_path_extend false
; old | 2015 | combined | old_pe_2015
- scaffolding_mode old
+ scaffolding_mode old_pe_2015
remove_overlaps true
cut_all_overlaps false
split_edge_length 99
- normalize_weight false
+ normalize_weight true
; extension selection
extension_options
{
- use_default_single_threshold false
- single_threshold 1.75676
+ use_default_single_threshold true
+ single_threshold 0.1
weight_threshold 0.5
priority_coeff 1.5
+ ;TODO remove from here
max_repeat_length 8000
}
@@ -44,6 +42,7 @@ params {
single_threshold 30
weight_threshold 0.5
priority_coeff 1.5
+ ;TODO remove from here
max_repeat_length 8000
}
@@ -72,8 +71,8 @@ params {
flank_addition_coefficient -5.9
flank_multiplication_coefficient 0.97
- var_coeff 3.0
- basic_overlap_coeff 2.0
+ var_coeff 3.0
+ basic_overlap_coeff 2.0
}
path_cleaning
@@ -81,12 +80,6 @@ params {
enabled false
}
- loop_removal
- {
- max_loops 10
- mp_max_loops 10
- }
-
use_coordinated_coverage false
coordinated_coverage
{
@@ -96,11 +89,14 @@ params {
}
scaffolding2015 {
- autodetect true
- min_unique_length 10000
- unique_coverage_variation 0.5
; (median * (1+variation) > unique > median * (1 - variation))
- relative_weight_cutoff 2.0
+ relative_weight_cutoff 2.0
+
+ unique_length_upper_bound 2000 ; max(unique_length_upper_bound, max_is(all libs))
+ unique_length_lower_bound 500 ; max(unique_length_lower_bound, unique_length_step)
+ unique_length_step 300
+
+ graph_connectivity_max_edges 200000
}
scaffold_graph {
@@ -109,9 +105,29 @@ params {
always_add 40 ; connections with read count >= always_add are always added to the graph
never_add 5 ; connections with read count < never_add are never added to the graph
relative_threshold 0.25 ; connections with read count >= max_read_count * relative_threshold are added to the graph if they satisfy the condition above; max_read_count is calculated among all alternatives
- graph_connectivity false
+ use_graph_connectivity false
max_path_length 10000
}
+
+ genome_consistency_checker {
+ max_gap 1000
+ relative_max_gap 0.2
+ }
+
+ uniqueness_analyser {
+ enabled true
+ unique_coverage_variation 0.5
+
+ nonuniform_coverage_variation 50
+ uniformity_fraction_threshold 0.8
+ }
+
+ loop_traversal
+ {
+ min_edge_length 1000
+ max_component_size 10
+ max_path_length 1000
+ }
}
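
The scaffold_graph comments above spell out a three-way admission rule for candidate connections. A rough standalone sketch of that rule under a simple read-count model; the names and default values mirror the config entries, but this is not the SPAdes implementation:

    #include <cstddef>

    // Admission rule as described in the comments above:
    //  - read count >= always_add: always added;
    //  - read count <  never_add:  never added;
    //  - otherwise added only if it reaches relative_threshold of the best
    //    read count among the alternative connections.
    bool admit_connection(std::size_t read_count, std::size_t max_read_count,
                          std::size_t always_add = 40, std::size_t never_add = 5,
                          double relative_threshold = 0.25) {
        if (read_count >= always_add) return true;
        if (read_count < never_add)   return false;
        return static_cast<double>(read_count) >=
               relative_threshold * static_cast<double>(max_read_count);
    }
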
diff --git a/configs/debruijn/rna_mode.info b/configs/debruijn/rna_mode.info
index aad8fec..aae3d6f 100644
--- a/configs/debruijn/rna_mode.info
+++ b/configs/debruijn/rna_mode.info
@@ -2,6 +2,12 @@ mode rna
preserve_raw_paired_index true
+contig_output {
+ scaffolds_name transcripts
+ ; none --- do not output broken scaffolds | break_gaps --- break only by N stretches | break_all --- break all with overlap < k
+ output_broken_scaffolds none
+}
+
simp
{
; enable advanced ec removal algo
@@ -33,17 +39,18 @@ simp
; ec_lb: max_ec_length = k + ec_lb
; icb: iterative coverage bound
; to_ec_lb: max_ec_length = 2*tip_length(to_ec_lb) - 1
- ; condition "{ ec_lb 9, icb 40.0 }"
+ ; nbr: use the not-bulge erroneous connections remover
+ ; condition "{ ec_lb 9, icb 40.0, nbr }"
condition "{ ec_lb 30, icb 50 }"
}
; relative coverage erroneous connections remover:
rcec
- {
+ {
enabled true
rcec_lb 30
rcec_cb 0.5
- }
+ }
rcc
{
@@ -86,10 +93,24 @@ simp
}
+; disable filtering in rna mode
+de
+{
+ raw_filter_threshold 0
+}
+
pe {
params {
- ;multi_path_extend true
- ;remove_overlaps false
+ multi_path_extend true
+ remove_overlaps false
+
+ scaffolding_mode old
+
+ extension_options
+ {
+ use_default_single_threshold true
+ single_threshold 0.05
+ }
scaffolder {
cutoff 1
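
The condition comments in the rna_mode simplification hunk above define the erroneous-connection length bounds in terms of k. A small illustration of those two formulas; the helper functions are hypothetical, and the tip length used by to_ec_lb is assumed to be computed elsewhere:

    #include <cstddef>

    // "ec_lb: max_ec_length = k + ec_lb", e.g. k = 55, ec_lb = 30 -> 85.
    std::size_t max_ec_length_from_ec_lb(std::size_t k, std::size_t ec_lb) {
        return k + ec_lb;
    }

    // "to_ec_lb: max_ec_length = 2*tip_length(to_ec_lb) - 1",
    // taking the already-computed tip length as input.
    std::size_t max_ec_length_from_tip_length(std::size_t tip_length) {
        return 2 * tip_length - 1;
    }
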
diff --git a/configs/debruijn/simplification.info b/configs/debruijn/simplification.info
index 4351abd..3ee8e02 100644
--- a/configs/debruijn/simplification.info
+++ b/configs/debruijn/simplification.info
@@ -73,6 +73,7 @@ simp
{
enabled false
diff_mult 20.
+ edge_sum 10000
}
; final tip clipper:
diff --git a/ext/include/btree/btree.h b/ext/include/btree/btree.h
index e14afdb..d7a2cb6 100644
--- a/ext/include/btree/btree.h
+++ b/ext/include/btree/btree.h
@@ -663,7 +663,8 @@ class btree_node {
// Inserts the value x at position i, shifting all existing values and
// children at positions >= i to the right by 1.
- void insert_value(int i, const value_type &x);
+ template<typename V>
+ void insert_value(int i, V &&x);
// Removes the value at position i, shifting all existing values and children
// at positions > i to the left by 1.
@@ -724,6 +725,12 @@ class btree_node {
void value_init(int i, const value_type &x) {
new (&fields_.values[i]) mutable_value_type(x);
}
+
+ template<class V>
+ void value_init(int i, V&& x) {
+ new (&fields_.values[i]) mutable_value_type(std::forward<V>(x));
+ }
+
void value_destroy(int i) {
fields_.values[i].~mutable_value_type();
}
@@ -885,11 +892,25 @@ class btree : public Params::key_compare {
// class optimization] for more details.
template <typename Base, typename Data>
struct empty_base_handle : public Base {
- empty_base_handle(const Base &b, const Data &d)
+ empty_base_handle(const Base &b, Data *d)
: Base(b),
data(d) {
}
- Data data;
+
+ empty_base_handle(empty_base_handle &&other) noexcept
+ : Base(std::move(other)) {
+ data = other.data;
+ other.data = nullptr;
+ }
+
+ empty_base_handle& operator=(empty_base_handle &&other) noexcept {
+ Base::operator=(std::move(other));
+ data = other.data;
+ other.data = nullptr;
+ return *this;
+ }
+
+ Data *data;
};
struct node_stats {
@@ -937,6 +958,9 @@ class btree : public Params::key_compare {
// Copy constructor.
btree(const self_type &x);
+ // Move constructor.
+ btree(self_type &&x) noexcept;
+
// Destructor.
~btree() {
clear();
@@ -999,17 +1023,23 @@ class btree : public Params::key_compare {
}
// Inserts a value into the btree only if it does not already exist. The
- // boolean return value indicates whether insertion succeeded or failed. The
- // ValuePointer type is used to avoid instatiating the value unless the key
- // is being inserted. Value is not dereferenced if the key already exists in
- // the btree. See btree_map::operator[].
- template <typename ValuePointer>
- std::pair<iterator,bool> insert_unique(const key_type &key, ValuePointer value);
+ // boolean return value indicates whether insertion succeeded or failed.
+ std::pair<iterator,bool> insert_unique(const key_type &key, value_type&& value);
+
+ // Inserts a value into the btree only if it does not already exist. The
+ // boolean return value indicates whether insertion succeeded or failed.
+ std::pair<iterator,bool> insert_unique(const key_type &key, const value_type& value);
// Inserts a value into the btree only if it does not already exist. The
// boolean return value indicates whether insertion succeeded or failed.
std::pair<iterator,bool> insert_unique(const value_type &v) {
- return insert_unique(params_type::key(v), &v);
+ return insert_unique(params_type::key(v), v);
+ }
+
+ // Inserts a value into the btree only if it does not already exist. The
+ // boolean return value indicates whether insertion succeeded or failed.
+ std::pair<iterator,bool> insert_unique(value_type &&v) {
+ return insert_unique(params_type::key(v), std::move(v));
}
// Insert with hint. Check to see if the value should be placed immediately
@@ -1022,12 +1052,9 @@ class btree : public Params::key_compare {
template <typename InputIterator>
void insert_unique(InputIterator b, InputIterator e);
- // Inserts a value into the btree. The ValuePointer type is used to avoid
- // instatiating the value unless the key is being inserted. Value is not
- // dereferenced if the key already exists in the btree. See
- // btree_map::operator[].
- template <typename ValuePointer>
- iterator insert_multi(const key_type &key, ValuePointer value);
+ // Inserts a value into the btree.
+ iterator insert_multi(const key_type &key, const value_type &value);
+ iterator insert_multi(const key_type &key, value_type &&value);
// Inserts a value into the btree.
iterator insert_multi(const value_type &v) {
@@ -1112,6 +1139,14 @@ class btree : public Params::key_compare {
return *this;
}
+ self_type& operator=(self_type&& x) noexcept {
+ key_compare::operator=(std::move(x.key_comp()));
+ root_ = std::move(x.root_);
+ x.root_.data = nullptr;
+
+ return *this;
+ }
+
key_compare* mutable_key_comp() {
return this;
}
@@ -1305,7 +1340,8 @@ class btree : public Params::key_compare {
// Inserts a value into the btree immediately before iter. Requires that
// key(v) <= iter.key() and (--iter).key() <= key(v).
- iterator internal_insert(iterator iter, const value_type &v);
+ template<class V>
+ iterator internal_insert(iterator iter, V &&v);
// Returns an iterator pointing to the first value >= the value "iter" is
// pointing at. Note that "iter" might be pointing to an invalid location as
@@ -1378,7 +1414,7 @@ class btree : public Params::key_compare {
}
private:
- empty_base_handle<internal_allocator_type, node_type*> root_;
+ empty_base_handle<internal_allocator_type, node_type> root_;
private:
// A never instantiated helper function that returns big_ if we have a
@@ -1419,9 +1455,10 @@ class btree : public Params::key_compare {
////
// btree_node methods
template <typename P>
-inline void btree_node<P>::insert_value(int i, const value_type &x) {
+template <typename V>
+inline void btree_node<P>::insert_value(int i, V &&x) {
assert(i <= count());
- value_init(count(), x);
+ value_init(count(), std::forward<V>(x));
for (int j = count(); j > i; --j) {
value_swap(j, this, j - 1);
}
@@ -1739,9 +1776,16 @@ btree<P>::btree(const self_type &x)
assign(x);
}
-template <typename P> template <typename ValuePointer>
+template <typename P>
+btree<P>::btree(self_type &&x) noexcept
+ : key_compare(std::move(x.key_comp())),
+ root_(std::move(x.root_)) {
+ x.root_.data = nullptr;
+}
+
+template <typename P>
std::pair<typename btree<P>::iterator, bool>
-btree<P>::insert_unique(const key_type &key, ValuePointer value) {
+btree<P>::insert_unique(const key_type &key, value_type&& value) {
if (empty()) {
*mutable_root() = new_leaf_root_node(1);
}
@@ -1759,10 +1803,33 @@ btree<P>::insert_unique(const key_type &key, ValuePointer value) {
}
}
- return std::make_pair(internal_insert(iter, *value), true);
+ return std::make_pair(internal_insert(iter, std::move(value)), true);
}
template <typename P>
+std::pair<typename btree<P>::iterator, bool>
+btree<P>::insert_unique(const key_type &key, const value_type& value) {
+ if (empty()) {
+ *mutable_root() = new_leaf_root_node(1);
+ }
+
+ std::pair<iterator, int> res = internal_locate(key, iterator(root(), 0));
+ iterator &iter = res.first;
+ if (res.second == kExactMatch) {
+ // The key already exists in the tree, do nothing.
+ return std::make_pair(internal_last(iter), false);
+ } else if (!res.second) {
+ iterator last = internal_last(iter);
+ if (last.node && !compare_keys(key, last.key())) {
+ // The key already exists in the tree, do nothing.
+ return std::make_pair(last, false);
+ }
+ }
+
+ return std::make_pair(internal_insert(iter, value), true);
+}
+
+template <typename P>
inline typename btree<P>::iterator
btree<P>::insert_unique(iterator position, const value_type &v) {
if (!empty()) {
@@ -1795,9 +1862,23 @@ void btree<P>::insert_unique(InputIterator b, InputIterator e) {
}
}
-template <typename P> template <typename ValuePointer>
+template <typename P>
+typename btree<P>::iterator
+btree<P>::insert_multi(const key_type &key, value_type &&value) {
+ if (empty()) {
+ *mutable_root() = new_leaf_root_node(1);
+ }
+
+ iterator iter = internal_upper_bound(key, iterator(root(), 0));
+ if (!iter.node) {
+ iter = end();
+ }
+ return internal_insert(iter, std::move(value));
+}
+
+template <typename P>
typename btree<P>::iterator
-btree<P>::insert_multi(const key_type &key, ValuePointer value) {
+btree<P>::insert_multi(const key_type &key, const value_type &value) {
if (empty()) {
*mutable_root() = new_leaf_root_node(1);
}
@@ -1806,7 +1887,7 @@ btree<P>::insert_multi(const key_type &key, ValuePointer value) {
if (!iter.node) {
iter = end();
}
- return internal_insert(iter, *value);
+ return internal_insert(iter, value);
}
template <typename P>
@@ -2197,8 +2278,9 @@ inline IterType btree<P>::internal_last(IterType iter) {
}
template <typename P>
+template <typename V>
inline typename btree<P>::iterator
-btree<P>::internal_insert(iterator iter, const value_type &v) {
+btree<P>::internal_insert(iterator iter, V &&v) {
if (!iter.node->leaf()) {
// We can't insert on an internal node. Instead, we'll insert after the
// previous value which is guaranteed to be on a leaf node.
@@ -2223,7 +2305,7 @@ btree<P>::internal_insert(iterator iter, const value_type &v) {
} else if (!root()->leaf()) {
++*mutable_size();
}
- iter.node->insert_value(iter.position, v);
+ iter.node->insert_value(iter.position, std::forward<V>(v));
return iter;
}
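The ValuePointer indirection is replaced by const-ref/rvalue-ref overloads that forward through internal_insert() and insert_value(), so values can be moved into leaf nodes. A minimal usage sketch of the new move path, assuming the cpp-btree headers under ext/include/btree and the btree namespace (illustrative, not part of the patch):

    #include "btree/btree_map.h"
    #include <string>
    #include <utility>

    int main() {
        btree::btree_map<int, std::string> m;
        std::string payload(1 << 20, 'x');
        // An rvalue pair is routed through the new insert_unique(value_type&&)
        // and internal_insert(V&&), so the large string is moved, not copied.
        m.insert(std::make_pair(1, std::move(payload)));
        // The new move constructor steals the root handle and nulls the source.
        btree::btree_map<int, std::string> m2(std::move(m));
        return m2.size() == 1 ? 0 : 1;
    }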
diff --git a/ext/include/btree/btree_container.h b/ext/include/btree/btree_container.h
index fb617ab..7895e67 100644
--- a/ext/include/btree/btree_container.h
+++ b/ext/include/btree/btree_container.h
@@ -56,6 +56,15 @@ class btree_container {
: tree_(x.tree_) {
}
+ btree_container(self_type &&x)
+ : tree_(std::move(x.tree_)) {
+ }
+
+ self_type& operator=(self_type&& x) noexcept {
+ tree_ = std::move(x.tree_);
+ return *this;
+ }
+
// Iterator routines.
iterator begin() { return tree_.begin(); }
const_iterator begin() const { return tree_.begin(); }
@@ -169,6 +178,14 @@ class btree_unique_container : public btree_container<Tree> {
: super_type(x) {
}
+ btree_unique_container(self_type &&x)
+ : super_type(std::move(x)) {
+ }
+
+ self_type& operator=(self_type&& x) noexcept {
+ return static_cast<self_type&>(super_type::operator=(std::move(x)));
+ }
+
// Range constructor.
template <class InputIterator>
btree_unique_container(InputIterator b, InputIterator e,
@@ -200,6 +217,10 @@ class btree_unique_container : public btree_container<Tree> {
void insert(InputIterator b, InputIterator e) {
this->tree_.insert_unique(b, e);
}
+ template<class P>
+ std::pair<iterator,bool> insert(P&& x) {
+ return this->tree_.insert_unique(std::forward<P>(x));
+ }
// Deletion routines.
int erase(const key_type &key) {
@@ -230,20 +251,6 @@ class btree_map_container : public btree_unique_container<Tree> {
typedef typename Tree::key_compare key_compare;
typedef typename Tree::allocator_type allocator_type;
- private:
- // A pointer-like object which only generates its value when
- // dereferenced. Used by operator[] to avoid constructing an empty data_type
- // if the key already exists in the map.
- struct generate_value {
- generate_value(const key_type &k)
- : key(k) {
- }
- value_type operator*() const {
- return std::make_pair(key, data_type());
- }
- const key_type &key;
- };
-
public:
// Default constructor.
btree_map_container(const key_compare &comp = key_compare(),
@@ -256,6 +263,14 @@ class btree_map_container : public btree_unique_container<Tree> {
: super_type(x) {
}
+ btree_map_container(self_type &&x) noexcept
+ : super_type(std::move(x)) {
+ }
+
+ self_type& operator=(self_type&& x) noexcept {
+ return static_cast<self_type&>(super_type::operator=(std::move(x)));
+ }
+
// Range constructor.
template <class InputIterator>
btree_map_container(InputIterator b, InputIterator e,
@@ -266,7 +281,7 @@ class btree_map_container : public btree_unique_container<Tree> {
// Insertion routines.
data_type& operator[](const key_type &key) {
- return this->tree_.insert_unique(key, generate_value(key)).first->second;
+ return this->tree_.insert_unique(key, std::make_pair(key, data_type())).first->second;
}
};
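With generate_value removed, operator[] now builds the key/default-value pair up front instead of lazily. A small illustrative sketch (not part of the patch):

    #include "btree/btree_map.h"
    #include <string>

    int main() {
        btree::btree_map<std::string, int> counts;
        // operator[] now calls insert_unique(key, std::make_pair(key, int())),
        // so the default-constructed mapped value is created before the lookup
        // decides whether the key already exists.
        ++counts["reads"];
        ++counts["reads"];
        return counts["reads"] == 2 ? 0 : 1;
    }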
diff --git a/ext/include/btree/btree_map.h b/ext/include/btree/btree_map.h
index b83489f..43b25de 100644
--- a/ext/include/btree/btree_map.h
+++ b/ext/include/btree/btree_map.h
@@ -63,6 +63,15 @@ class btree_map : public btree_map_container<
: super_type(x) {
}
+ // Move constructor.
+ btree_map(self_type &&x) noexcept
+ : super_type(std::move(x)) {
+ }
+
+ self_type& operator=(self_type&& x) noexcept {
+ return static_cast<self_type&>(super_type::operator=(std::move(x)));
+ }
+
// Range constructor.
template <class InputIterator>
btree_map(InputIterator b, InputIterator e,
diff --git a/ext/include/btree/safe_btree.h b/ext/include/btree/safe_btree.h
index 2d85c70..d1227da 100644
--- a/ext/include/btree/safe_btree.h
+++ b/ext/include/btree/safe_btree.h
@@ -206,6 +206,11 @@ class safe_btree {
generation_(1) {
}
+ safe_btree(self_type &&x) noexcept
+ : tree_(std::move(x.tree_)),
+ generation_(x.generation_) {
+ }
+
iterator begin() {
return iterator(this, tree_.begin());
}
@@ -274,9 +279,8 @@ class safe_btree {
}
// Insertion routines.
- template <typename ValuePointer>
- std::pair<iterator, bool> insert_unique(const key_type &key, ValuePointer value) {
- std::pair<tree_iterator, bool> p = tree_.insert_unique(key, value);
+ std::pair<iterator, bool> insert_unique(const key_type &key, value_type &&value) {
+ std::pair<tree_iterator, bool> p = tree_.insert_unique(key, std::move(value));
generation_ += p.second;
return std::make_pair(iterator(this, p.first), p.second);
}
@@ -285,6 +289,11 @@ class safe_btree {
generation_ += p.second;
return std::make_pair(iterator(this, p.first), p.second);
}
+ std::pair<iterator, bool> insert_unique(value_type &&v) {
+ std::pair<tree_iterator, bool> p = tree_.insert_unique(std::move(v));
+ generation_ += p.second;
+ return std::make_pair(iterator(this, p.first), p.second);
+ }
iterator insert_unique(iterator position, const value_type &v) {
tree_iterator tree_pos = position.iter();
++generation_;
@@ -300,6 +309,15 @@ class safe_btree {
++generation_;
return iterator(this, tree_.insert_multi(v));
}
+ iterator insert_multi(value_type &&v) {
+ ++generation_;
+ return iterator(this, tree_.insert_multi(std::move(v)));
+ }
+ iterator insert_multi(iterator position, value_type &&v) {
+ tree_iterator tree_pos = position.iter();
+ ++generation_;
+ return iterator(this, tree_.insert_multi(tree_pos, std::move(v)));
+ }
iterator insert_multi(iterator position, const value_type &v) {
tree_iterator tree_pos = position.iter();
++generation_;
@@ -321,6 +339,14 @@ class safe_btree {
return *this;
}
+ self_type& operator=(self_type&& x) noexcept {
+ tree_ = std::move(x.tree_);
+ generation_ = x.generation_;
+ x.generation_ = -1;
+
+ return *this;
+ }
+
// Deletion routines.
void erase(const iterator &begin, const iterator &end) {
tree_.erase(begin.iter(), end.iter());
diff --git a/ext/include/btree/safe_btree_map.h b/ext/include/btree/safe_btree_map.h
index a0668f1..2eac400 100644
--- a/ext/include/btree/safe_btree_map.h
+++ b/ext/include/btree/safe_btree_map.h
@@ -69,6 +69,14 @@ class safe_btree_map : public btree_map_container<
: super_type(x) {
}
+ safe_btree_map(self_type&& x) noexcept
+ : super_type(std::move(x)) {
+ }
+
+ self_type& operator=(safe_btree_map&& x) noexcept {
+ return static_cast<self_type&>(super_type::operator=(std::move(x)));
+ }
+
// Range constructor.
template <class InputIterator>
safe_btree_map(InputIterator b, InputIterator e,
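The safe_btree wrappers gain matching move constructors and move assignment, carrying the generation counter along so iterator validation keeps working. A brief sketch (illustrative only, not part of the patch):

    #include "btree/safe_btree_map.h"
    #include <string>
    #include <utility>

    int main() {
        btree::safe_btree_map<int, std::string> a;
        a.insert(std::make_pair(7, std::string("value")));
        btree::safe_btree_map<int, std::string> b;
        b = std::move(a);   // moves the underlying tree together with its generation counter
        return b.size() == 1 ? 0 : 1;
    }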
diff --git a/ext/include/bwa/bntseq.h b/ext/include/bwa/bntseq.h
new file mode 100644
index 0000000..63ad3c3
--- /dev/null
+++ b/ext/include/bwa/bntseq.h
@@ -0,0 +1,91 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#ifndef BWT_BNTSEQ_H
+#define BWT_BNTSEQ_H
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#ifndef BWA_UBYTE
+#define BWA_UBYTE
+typedef uint8_t ubyte_t;
+#endif
+
+typedef struct {
+ int64_t offset;
+ int32_t len;
+ int32_t n_ambs;
+ uint32_t gi;
+ int32_t is_alt;
+ char *name, *anno;
+} bntann1_t;
+
+typedef struct {
+ int64_t offset;
+ int32_t len;
+ char amb;
+} bntamb1_t;
+
+typedef struct {
+ int64_t l_pac;
+ int32_t n_seqs;
+ uint32_t seed;
+ bntann1_t *anns; // n_seqs elements
+ int32_t n_holes;
+ bntamb1_t *ambs; // n_holes elements
+ FILE *fp_pac;
+} bntseq_t;
+
+extern unsigned char nst_nt4_table[256];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ void bns_dump(const bntseq_t *bns, const char *prefix);
+ bntseq_t *bns_restore(const char *prefix);
+ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
+ void bns_destroy(bntseq_t *bns);
+ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
+ int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
+ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
+ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
+ uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid);
+ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev)
+{
+ return (*is_rev = (pos >= bns->l_pac))? (bns->l_pac<<1) - 1 - pos : pos;
+}
+
+#endif
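bntseq_t holds the reference metadata (the .ann/.amb/.pac files written by the indexer); bns_depos folds a coordinate on the doubled forward+reverse concatenated sequence back onto the forward strand. A small sketch of typical use (the "ref.fa" prefix and the locate name are just examples):

    #include "bwa/bntseq.h"

    // Map a raw coordinate from the 2*l_pac concatenated sequence to a
    // reference sequence id. Purely illustrative.
    int locate(int64_t pos) {
        bntseq_t *bns = bns_restore("ref.fa");        // loads ref.fa.ann/.amb and opens .pac
        if (!bns) return -1;
        int is_rev;
        int64_t fpos = bns_depos(bns, pos, &is_rev);  // fold reverse-strand positions
        int rid = bns_pos2rid(bns, fpos);             // which sequence contains fpos
        bns_destroy(bns);
        return rid;
    }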
diff --git a/ext/include/bwa/bwa.h b/ext/include/bwa/bwa.h
new file mode 100644
index 0000000..8f4e06e
--- /dev/null
+++ b/ext/include/bwa/bwa.h
@@ -0,0 +1,62 @@
+#ifndef BWA_H_
+#define BWA_H_
+
+#include <stdint.h>
+#include "bntseq.h"
+#include "bwt.h"
+
+#define BWA_IDX_BWT 0x1
+#define BWA_IDX_BNS 0x2
+#define BWA_IDX_PAC 0x4
+#define BWA_IDX_ALL 0x7
+
+#define BWA_CTL_SIZE 0x10000
+
+typedef struct bwaidx_s {
+ bwt_t *bwt; // FM-index
+ bntseq_t *bns; // information on the reference sequences
+ uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
+
+ int is_shm;
+ int64_t l_mem;
+ uint8_t *mem;
+} bwaidx_t;
+
+typedef struct {
+ int l_seq, id;
+ char *name, *comment, *seq, *qual, *sam;
+} bseq1_t;
+
+extern int bwa_verbose;
+extern char bwa_rg_id[256];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
+ void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
+
+ void bwa_fill_scmat(int a, int b, int8_t mat[25]);
+ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
+ uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
+
+ char *bwa_idx_infer_prefix(const char *hint);
+ bwt_t *bwa_idx_load_bwt(const char *hint);
+
+ bwaidx_t *bwa_idx_load_from_shm(const char *hint);
+ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
+ bwaidx_t *bwa_idx_load(const char *hint, int which);
+ void bwa_idx_destroy(bwaidx_t *idx);
+ int bwa_idx2mem(bwaidx_t *idx);
+ int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
+
+ void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
+ char *bwa_set_rg(const char *s);
+ char *bwa_insert_header(const char *s, char *hdr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/include/bwa/bwamem.h b/ext/include/bwa/bwamem.h
new file mode 100644
index 0000000..3ca79ce
--- /dev/null
+++ b/ext/include/bwa/bwamem.h
@@ -0,0 +1,184 @@
+#ifndef BWAMEM_H_
+#define BWAMEM_H_
+
+#include "bwt.h"
+#include "bntseq.h"
+#include "bwa.h"
+
+#define MEM_MAPQ_COEF 30.0
+#define MEM_MAPQ_MAX 60
+
+struct __smem_i;
+typedef struct __smem_i smem_i;
+
+#define MEM_F_PE 0x2
+#define MEM_F_NOPAIRING 0x4
+#define MEM_F_ALL 0x8
+#define MEM_F_NO_MULTI 0x10
+#define MEM_F_NO_RESCUE 0x20
+#define MEM_F_REF_HDR 0x100
+#define MEM_F_SOFTCLIP 0x200
+#define MEM_F_SMARTPE 0x400
+
+typedef struct mem_opt_s {
+ int a, b; // match score and mismatch penalty
+ int o_del, e_del;
+ int o_ins, e_ins;
+ int pen_unpaired; // phred-scaled penalty for unpaired reads
+ int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score.
+ int w; // band width
+ int zdrop; // Z-dropoff
+
+ uint64_t max_mem_intv;
+
+ int T; // output score threshold; only affecting output
+ int flag; // see MEM_F_* macros
+ int min_seed_len; // minimum seed length
+ int min_chain_weight;
+ int max_chain_extend;
+ float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor
+ int split_width; // split into a seed if its occurrence is smaller than this value
+ int max_occ; // skip a seed if its occurrence is larger than this value
+ int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed
+ int n_threads; // number of threads
+ int chunk_size; // process chunk_size-bp sequences in a batch
+ float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
+ float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain
+ float XA_drop_ratio; // when counting hits for the XA tag, ignore alignments with score < XA_drop_ratio * max_score; only effective for the XA tag
+ float mask_level_redun;
+ float mapQ_coef_len;
+ int mapQ_coef_fac;
+ int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value
+ int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
+ int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
+ int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
+} mem_opt_t;
+
+typedef struct {
+ int64_t rb, re; // [rb,re): reference sequence in the alignment
+ int qb, qe; // [qb,qe): query sequence in the alignment
+ int rid; // reference seq ID
+ int score; // best local SW score
+ int truesc; // actual score corresponding to the aligned region; possibly smaller than $score
+ int sub; // 2nd best SW score
+ int alt_sc;
+ int csub; // SW score of a tandem hit
+ int sub_n; // approximate number of suboptimal hits
+ int w; // actual band width used in extension
+ int seedcov; // length of regions covered by seeds
+ int secondary; // index of the parent hit shadowing the current hit; <0 if primary
+ int secondary_all;
+ int seedlen0; // length of the starting seed
+ int n_comp:30, is_alt:2; // number of sub-alignments chained together
+ float frac_rep;
+ uint64_t hash;
+} mem_alnreg_t;
+
+typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
+
+typedef struct {
+ int low, high; // lower and upper bounds within which a read pair is considered to be properly paired
+ int failed; // non-zero if the orientation is not supported by sufficient data
+ double avg, std; // mean and stddev of the insert size distribution
+} mem_pestat_t;
+
+typedef struct { // This struct is only used for the convenience of API.
+ int64_t pos; // forward strand 5'-end mapping position
+ int rid; // reference sequence index in bntseq_t; <0 for unmapped
+ int flag; // extra flag
+ uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
+ int n_cigar; // number of CIGAR operations
+ uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
+ char *XA; // alternative mappings
+
+ int score, sub, alt_sc;
+} mem_aln_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ smem_i *smem_itr_init(const bwt_t *bwt);
+ void smem_itr_destroy(smem_i *itr);
+ void smem_set_query(smem_i *itr, int len, const uint8_t *query);
+ void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv);
+ const bwtintv_v *smem_next(smem_i *itr);
+
+ mem_opt_t *mem_opt_init(void);
+ void mem_fill_scmat(int a, int b, int8_t mat[25]);
+
+ /**
+ * Align a batch of sequences and generate the alignments in the SAM format
+ *
+ * This routine requires $seqs[i].{l_seq,seq,name} and writes $seqs[i].sam.
+ * Note that $seqs[i].sam may consist of several SAM lines if the
+ * corresponding sequence has multiple primary hits.
+ *
+ * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
+ * sequences must be interleaved: $n must be an even number and the 2i-th
+ * sequence and the (2i+1)-th sequence constitute a read pair. In this
+ * mode, there should be enough (typically >50) unique pairs for the
+ * routine to infer the orientation and insert size.
+ *
+ * @param opt alignment parameters
+ * @param bwt FM-index of the reference sequence
+ * @param bns Information of the reference
+ * @param pac 2-bit encoded reference
+ * @param n number of query sequences
+ * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call
+ * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements,
+ * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info.
+ */
+ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0);
+
+ /**
+ * Find the aligned regions for one query sequence
+ *
+ * Note that this routine does not generate CIGAR. CIGAR should be
+ * generated later by mem_reg2aln() below.
+ *
+ * @param opt alignment parameters
+ * @param bwt FM-index of the reference sequence
+ * @param bns Information of the reference
+ * @param pac 2-bit encoded reference
+ * @param l_seq length of query sequence
+ * @param seq query sequence
+ *
+ * @return list of aligned regions.
+ */
+ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq);
+
+ /**
+ * Generate CIGAR and forward-strand position from alignment region
+ *
+ * @param opt alignment parameters
+ * @param bns Information of the reference
+ * @param pac 2-bit encoded reference
+ * @param l_seq length of query sequence
+ * @param seq query sequence
+ * @param ar one alignment region
+ *
+ * @return CIGAR, strand, mapping quality and forward-strand position
+ */
+ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar);
+ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name);
+
+ /**
+ * Infer the insert size distribution from interleaved alignment regions
+ *
+ * This function can be called after mem_align1(), as long as paired-end
+ * reads are properly interleaved.
+ *
+ * @param opt alignment parameters
+ * @param l_pac length of concatenated reference sequence
+ * @param n number of query sequences; must be an even number
+ * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
+ * @param pes inferred insert size distribution (output)
+ */
+ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
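The documented mem_opt_init/mem_align1/mem_reg2aln trio is the single-end programmatic path (mem_process_seqs is the batch/SAM path). A condensed sketch in the spirit of bwa's bundled example.c; the "ref.fa" index prefix and the align_one name are illustrative:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include "bwa/bwa.h"
    #include "bwa/bwamem.h"

    int align_one(const char *seq) {
        bwaidx_t *idx = bwa_idx_load("ref.fa", BWA_IDX_ALL);      // bwt + bns + pac
        if (!idx) return 1;
        mem_opt_t *opt = mem_opt_init();                          // default scoring/seeding parameters
        mem_alnreg_v ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac,
                                     (int)std::strlen(seq), seq); // regions only, no CIGAR yet
        for (size_t i = 0; i < ar.n; ++i) {
            if (ar.a[i].secondary >= 0) continue;                 // report primary hits only
            mem_aln_t a = mem_reg2aln(opt, idx->bns, idx->pac,
                                      (int)std::strlen(seq), seq, &ar.a[i]);
            std::printf("%s\t%c\t%lld\tmapq=%d\n", idx->bns->anns[a.rid].name,
                        "+-"[a.is_rev], (long long)a.pos, (int)a.mapq);
            std::free(a.cigar);
        }
        std::free(ar.a);
        std::free(opt);
        bwa_idx_destroy(idx);
        return 0;
    }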
diff --git a/ext/include/bwa/bwt.h b/ext/include/bwa/bwt.h
new file mode 100644
index 0000000..c71d6b5
--- /dev/null
+++ b/ext/include/bwa/bwt.h
@@ -0,0 +1,130 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#ifndef BWA_BWT_H
+#define BWA_BWT_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some parts of the code assume OCC_INTERVAL=0x80
+#define OCC_INTV_SHIFT 7
+#define OCC_INTERVAL (1LL<<OCC_INTV_SHIFT)
+#define OCC_INTV_MASK (OCC_INTERVAL - 1)
+
+#ifndef BWA_UBYTE
+#define BWA_UBYTE
+typedef unsigned char ubyte_t;
+#endif
+
+typedef uint64_t bwtint_t;
+
+typedef struct {
+ bwtint_t primary; // S^{-1}(0), or the primary index of BWT
+ bwtint_t L2[5]; // C(), cumulative count
+ bwtint_t seq_len; // sequence length
+ bwtint_t bwt_size; // size of bwt, about seq_len/4
+ uint32_t *bwt; // BWT
+ // occurrence array, separated into two parts
+ uint32_t cnt_table[256];
+ // suffix array
+ int sa_intv;
+ bwtint_t n_sa;
+ bwtint_t *sa;
+} bwt_t;
+
+typedef struct {
+ bwtint_t x[3], info;
+} bwtintv_t;
+
+typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
+
+/* For general OCC_INTERVAL, the following is correct:
+#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
+#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4)
+*/
+
+// The following two lines are ONLY correct when OCC_INTERVAL==0x80
+#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)])
+#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4))
+
+/* retrieve a character from the $-removed BWT string. Note that
+ * bwt_t::bwt is not exactly the BWT string and therefore this macro is
+ * called bwt_B0 instead of bwt_B */
+#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
+
+#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
+ void bwt_dump_sa(const char *fn, const bwt_t *bwt);
+
+ bwt_t *bwt_restore_bwt(const char *fn);
+ void bwt_restore_sa(const char *fn, bwt_t *bwt);
+
+ void bwt_destroy(bwt_t *bwt);
+
+ void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
+ void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
+ void bwt_cal_sa(bwt_t *bwt, int intv);
+
+ void bwt_bwtupdate_core(bwt_t *bwt);
+
+ bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
+ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]);
+ bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
+
+ // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
+ void bwt_gen_cnt_table(bwt_t *bwt);
+ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
+ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]);
+
+ int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
+ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
+
+ /**
+ * Extend bi-SA-interval _ik_
+ */
+ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back);
+
+ /**
+ * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
+ * Return the end of the longest exact match starting from _x_.
+ */
+ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+ int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+
+ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
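For direct FM-index queries, bwt_match_exact reports the suffix-array interval of a pattern and bwt_sa resolves interval entries to positions. A rough sketch (file names are illustrative; bases are assumed to be plain A/C/G/T, encoded via nst_nt4_table from bntseq.h):

    #include <cstdlib>
    #include <cstring>
    #include "bwa/bwt.h"
    #include "bwa/bntseq.h"   // nst_nt4_table: ASCII base -> 2-bit code

    int count_exact(const char *pattern) {
        bwt_t *bwt = bwt_restore_bwt("ref.fa.bwt");
        bwt_restore_sa("ref.fa.sa", bwt);
        int len = (int)std::strlen(pattern);
        ubyte_t *q = (ubyte_t *)std::malloc(len);
        for (int i = 0; i < len; ++i)
            q[i] = nst_nt4_table[(unsigned char)pattern[i]];
        bwtint_t k, l;
        int n = bwt_match_exact(bwt, len, q, &k, &l);   // occurrence count; [k, l] is the SA interval
        if (n > 0) {
            bwtint_t pos = bwt_sa(bwt, k);   // one hit's position on the concatenated sequence
            (void)pos;
        }
        std::free(q);
        bwt_destroy(bwt);
        return n;
    }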
diff --git a/ext/include/bwa/utils.h b/ext/include/bwa/utils.h
new file mode 100644
index 0000000..11966b8
--- /dev/null
+++ b/ext/include/bwa/utils.h
@@ -0,0 +1,111 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#ifndef LH3_UTILS_H
+#define LH3_UTILS_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#ifdef __GNUC__
+// Tell GCC to validate printf format string and args
+#define ATTRIBUTE(list) __attribute__ (list)
+#else
+#define ATTRIBUTE(list)
+#endif
+
+#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg)
+#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg)
+
+#define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
+#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
+#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
+
+#define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg)
+
+typedef struct {
+ uint64_t x, y;
+} pair64_t;
+
+typedef struct { size_t n, m; uint64_t *a; } uint64_v;
+typedef struct { size_t n, m; pair64_t *a; } pair64_v;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
+ void err_fatal_core(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn));
+ void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn));
+ void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn));
+ FILE *err_xopen_core(const char *func, const char *fn, const char *mode);
+ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp);
+ gzFile err_xzopen_core(const char *func, const char *fn, const char *mode);
+ size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
+ size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream);
+
+ int err_gzread(gzFile file, void *ptr, unsigned int len);
+ int err_fseek(FILE *stream, long offset, int whence);
+#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET)
+ long err_ftell(FILE *stream);
+ int err_fprintf(FILE *stream, const char *format, ...)
+ ATTRIBUTE((format(printf, 2, 3)));
+ int err_printf(const char *format, ...)
+ ATTRIBUTE((format(printf, 1, 2)));
+ int err_fputc(int c, FILE *stream);
+#define err_putchar(C) err_fputc((C), stdout)
+ int err_fputs(const char *s, FILE *stream);
+ int err_puts(const char *s);
+ int err_fflush(FILE *stream);
+ int err_fclose(FILE *stream);
+ int err_gzclose(gzFile file);
+
+ double cputime();
+ double realtime();
+
+ void ks_introsort_64 (size_t n, uint64_t *a);
+ void ks_introsort_128(size_t n, pair64_t *a);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline uint64_t hash_64(uint64_t key)
+{
+ key += ~(key << 32);
+ key ^= (key >> 22);
+ key += ~(key << 13);
+ key ^= (key >> 8);
+ key += (key << 3);
+ key ^= (key >> 15);
+ key += ~(key << 27);
+ key ^= (key >> 31);
+ return key;
+}
+
+#endif
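The err_*/x* wrappers are error-checked stdio/zlib calls that abort with a diagnostic instead of returning error codes. A tiny sketch (the output path and function name are just examples):

    #include "bwa/utils.h"

    void dump_block(const void *buf, size_t n) {
        FILE *fp = xopen("out.bin", "wb");   // expands to err_xopen_core(__func__, ...)
        err_fwrite(buf, 1, n, fp);
        err_fflush(fp);
        err_fclose(fp);
    }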
diff --git a/ext/include/cuckoo/LICENSE b/ext/include/cuckoo/LICENSE
new file mode 100644
index 0000000..9d8b367
--- /dev/null
+++ b/ext/include/cuckoo/LICENSE
@@ -0,0 +1,18 @@
+Copyright (C) 2013, Carnegie Mellon University and Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+---------------------------
+
+The third-party libraries have their own licenses, as detailed in their source
+files.
diff --git a/ext/include/cuckoo/city_hasher.hh b/ext/include/cuckoo/city_hasher.hh
new file mode 100644
index 0000000..a698705
--- /dev/null
+++ b/ext/include/cuckoo/city_hasher.hh
@@ -0,0 +1,44 @@
+/** \file */
+
+#ifndef _CITY_HASHER_HH
+#define _CITY_HASHER_HH
+
+#include <city.h>
+#include <string>
+
+/*! CityHasher is a std::hash-style wrapper around CityHash. We
+ * encourage using CityHasher instead of the default std::hash if
+ * possible. */
+template <class Key>
+class CityHasher {
+public:
+ //! The function call operator for our hash function
+ size_t operator()(const Key& k) const {
+ if (sizeof(size_t) < 8) {
+ return CityHash32((const char*) &k, sizeof(k));
+ }
+ /* Although the following line should be optimized away on 32-bit
+ * builds, the cast is still necessary to stop MSVC emitting a
+ * truncation warning. */
+ return static_cast<size_t>(CityHash64((const char*) &k, sizeof(k)));
+ }
+};
+
+/*! This is a template specialization of CityHasher for
+ * std::string. */
+template <>
+class CityHasher<std::string> {
+public:
+ //! The function call operator for our hash function
+ size_t operator()(const std::string& k) const {
+ if (sizeof(size_t) < 8) {
+ return CityHash32(k.c_str(), k.size());
+ }
+ /* Although the following line should be optimized away on 32-bit
+ * builds, the cast is still necessary to stop MSVC emitting a
+ * truncation warning. */
+ return static_cast<size_t>(CityHash64(k.c_str(), k.size()));
+ }
+};
+
+#endif // _CITY_HASHER_HH
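CityHasher drops into the Hash slot of the cuckoohash_map added below; it needs the CityHash library (<city.h>) at build time. A minimal sketch (the KmerCounts alias is illustrative):

    #include "cuckoo/cuckoohash_map.hh"
    #include "cuckoo/city_hasher.hh"
    #include <string>

    using KmerCounts = cuckoohash_map<std::string, int, CityHasher<std::string>>;

    int main() {
        KmerCounts t;
        t.insert(std::string("ACGTACGT"), 1);
        return t.contains(std::string("ACGTACGT")) ? 0 : 1;
    }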
diff --git a/ext/include/cuckoo/cuckoohash_config.hh b/ext/include/cuckoo/cuckoohash_config.hh
new file mode 100644
index 0000000..e894c9b
--- /dev/null
+++ b/ext/include/cuckoo/cuckoohash_config.hh
@@ -0,0 +1,36 @@
+/** \file */
+
+#ifndef _CUCKOOHASH_CONFIG_HH
+#define _CUCKOOHASH_CONFIG_HH
+
+#include <cstddef>
+
+//! The default maximum number of keys per bucket
+constexpr size_t LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET = 4;
+
+//! The default number of elements in an empty hash table
+constexpr size_t LIBCUCKOO_DEFAULT_SIZE =
+ (1U << 16) * LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET;
+
+//! On a scale of 0 to 16, the memory granularity of the locks array. 0 is the
+//! least granular, meaning the array is a contiguous array and thus offers the
+//! best performance but the greatest memory overhead. 16 is the most granular,
+//! offering the least memory overhead but worse performance.
+constexpr size_t LIBCUCKOO_LOCK_ARRAY_GRANULARITY = 0;
+
+//! The default minimum load factor that the table allows for automatic
+//! expansion. It must be a number between 0.0 and 1.0. The table will throw
+//! libcuckoo_load_factor_too_low if the load factor falls below this value
+//! during an automatic expansion.
+constexpr double LIBCUCKOO_DEFAULT_MINIMUM_LOAD_FACTOR = 0.001;
+
+//! An alias for the value that sets no limit on the maximum hashpower. If this
+//! value is set as the maximum hashpower limit, there will be no limit. Since 0
+//! is the only hashpower that can never occur, it should stay at 0. This is
+//! also the default initial value for the maximum hashpower in a table.
+constexpr size_t LIBCUCKOO_NO_MAXIMUM_HASHPOWER = 0;
+
+//! set LIBCUCKOO_DEBUG to 1 to enable debug output
+#define LIBCUCKOO_DEBUG 0
+
+#endif // _CUCKOOHASH_CONFIG_HH
diff --git a/ext/include/cuckoo/cuckoohash_map.hh b/ext/include/cuckoo/cuckoohash_map.hh
new file mode 100644
index 0000000..08448e8
--- /dev/null
+++ b/ext/include/cuckoo/cuckoohash_map.hh
@@ -0,0 +1,2537 @@
+/** \file */
+
+#ifndef _CUCKOOHASH_MAP_HH
+#define _CUCKOOHASH_MAP_HH
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <thread>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "cuckoohash_config.hh"
+#include "cuckoohash_util.hh"
+#include "libcuckoo_lazy_array.hh"
+
+/**
+ * A concurrent hash table
+ *
+ * @tparam Key type of keys in the table
+ * @tparam T type of values in the table
+ * @tparam Pred type of equality comparison functor
+ * @tparam Alloc type of key-value pair allocator
+ * @tparam SLOT_PER_BUCKET number of slots for each bucket in the table
+ */
+template < class Key,
+ class T,
+ class Hash = std::hash<Key>,
+ class Pred = std::equal_to<Key>,
+ class Alloc = std::allocator<std::pair<const Key, T>>,
+ std::size_t SLOT_PER_BUCKET = LIBCUCKOO_DEFAULT_SLOT_PER_BUCKET
+ >
+class cuckoohash_map {
+public:
+ /** @name Type Declarations */
+ /**@{*/
+
+ using key_type = Key;
+ using mapped_type = T;
+ using value_type = std::pair<const Key, T>;
+ using size_type = std::size_t;
+ using difference_type = std::ptrdiff_t;
+ using hasher = Hash;
+ using key_equal = Pred;
+ using allocator_type = Alloc;
+
+private:
+ using allocator_traits_ = std::allocator_traits<allocator_type>;
+
+public:
+ using reference = value_type&;
+ using const_reference = const value_type&;
+ using pointer = typename allocator_traits_::pointer;
+ using const_pointer = typename allocator_traits_::const_pointer;
+ class locked_table;
+
+ /**@}*/
+
+ /** @name Table Parameters */
+ /**@{*/
+
+ /**
+ * The number of slots per hash bucket
+ */
+ static constexpr size_type slot_per_bucket() {
+ return SLOT_PER_BUCKET;
+ }
+
+ /**@}*/
+
+ /** @name Constructors and Destructors */
+ /**@{*/
+
+ /**
+ * Creates a new cuckoohash_map instance
+ *
+ * @param n the number of elements to reserve space for initially
+ * @param hf hash function instance to use
+ * @param eql equality function instance to use
+ * @param alloc allocator instance to use
+ */
+ cuckoohash_map(size_type n = LIBCUCKOO_DEFAULT_SIZE,
+ const hasher& hf = hasher(),
+ const key_equal& eql = key_equal(),
+ const allocator_type& alloc = allocator_type())
+ : hashpower_(reserve_calc(n)),
+ hash_fn_(hf),
+ eq_fn_(eql),
+ allocator_(alloc),
+ buckets_(hashsize(hashpower()), alloc),
+ locks_(hashsize(hashpower()), alloc),
+ expansion_lock_(),
+ minimum_load_factor_(LIBCUCKOO_DEFAULT_MINIMUM_LOAD_FACTOR),
+ maximum_hashpower_(LIBCUCKOO_NO_MAXIMUM_HASHPOWER) {}
+
+ /**
+ * Destroys the table. The destructors of all elements stored in the table
+ * are destroyed, and then the table storage is deallocated.
+ */
+ ~cuckoohash_map() {
+ cuckoo_clear();
+ }
+
+ /**@}*/
+
+ /** @name Table Details
+ *
+ * Methods for getting information about the table. Methods that query
+ * changing properties of the table are not synchronized with concurrent
+ * operations, and may return out-of-date information if the table is being
+ * concurrently modified.
+ *
+ */
+ /**@{*/
+
+ /**
+ * Returns the function that hashes the keys
+ *
+ * @return the hash function
+ */
+ hasher hash_function() const {
+ return hash_fn_;
+ }
+
+ /**
+ * Returns the function that compares keys for equality
+ *
+ * @return the key comparison function
+ */
+ key_equal key_eq() const {
+ return eq_fn_;
+ }
+
+ /**
+ * Returns the allocator associated with the container
+ *
+ * @return the associated allocator
+ */
+ allocator_type get_allocator() const {
+ return allocator_;
+ }
+
+ /**
+ * Returns the hashpower of the table, which is log<SUB>2</SUB>(@ref
+ * bucket_count()).
+ *
+ * @return the hashpower
+ */
+ size_type hashpower() const {
+ return hashpower_.load(std::memory_order_acquire);
+ }
+
+ /**
+ * Returns the number of buckets in the table.
+ *
+ * @return the bucket count
+ */
+ size_type bucket_count() const {
+ return buckets_.size();
+ }
+
+ /**
+ * Returns whether the table is empty or not.
+ *
+ * @return true if the table is empty, false otherwise
+ */
+ bool empty() const {
+ for (size_type i = 0; i < locks_.size(); ++i) {
+ if (locks_[i].elem_counter() > 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns the number of elements in the table.
+ *
+ * @return number of elements in the table
+ */
+ size_type size() const {
+ size_type s = 0;
+ for (size_type i = 0; i < locks_.size(); ++i) {
+ s += locks_[i].elem_counter();
+ }
+ return s;
+ }
+
+ /** Returns the current capacity of the table, that is, @ref bucket_count()
+ * × @ref slot_per_bucket().
+ *
+ * @return capacity of table
+ */
+ size_type capacity() const {
+ return bucket_count() * slot_per_bucket();
+ }
+
+ /**
+ * Returns the percentage the table is filled, that is, @ref size() ÷
+ * @ref capacity().
+ *
+ * @return load factor of the table
+ */
+ double load_factor() const {
+ return static_cast<double>(size()) / static_cast<double>(capacity());
+ }
+
+ /**
+ * Sets the minimum load factor allowed for automatic expansions. If an
+ * expansion is needed when the load factor of the table is lower than this
+ * threshold, @ref libcuckoo_load_factor_too_low is thrown. It will not be
+ * thrown for an explicitly-triggered expansion.
+ *
+ * @param mlf the load factor to set the minimum to
+ * @throw std::invalid_argument if the given load factor is less than 0.0
+ * or greater than 1.0
+ */
+ void minimum_load_factor(const double mlf) {
+ if (mlf < 0.0) {
+ throw std::invalid_argument(
+ "load factor " + std::to_string(mlf) + " cannot be "
+ "less than 0");
+ } else if (mlf > 1.0) {
+ throw std::invalid_argument(
+ "load factor " + std::to_string(mlf) + " cannot be "
+ "greater than 1");
+ }
+ minimum_load_factor_.store(mlf, std::memory_order_release);
+ }
+
+ /**
+ * Returns the minimum load factor of the table
+ *
+ * @return the minimum load factor
+ */
+ double minimum_load_factor() {
+ return minimum_load_factor_.load(std::memory_order_acquire);
+ }
+
+ /**
+ * Sets the maximum hashpower the table can be. If set to @ref
+ * LIBCUCKOO_NO_MAXIMUM_HASHPOWER, there will be no limit on the hashpower.
+ * Otherwise, the table will not be able to expand beyond the given
+ * hashpower, either by an explicit or an automatic expansion.
+ *
+ * @param mhp the hashpower to set the maximum to
+ * @throw std::invalid_argument if the current hashpower exceeds the limit
+ */
+ void maximum_hashpower(size_type mhp) {
+ if (mhp != LIBCUCKOO_NO_MAXIMUM_HASHPOWER && hashpower() > mhp) {
+ throw std::invalid_argument(
+ "maximum hashpower " + std::to_string(mhp) + " is less than "
+ "current hashpower");
+
+ }
+ maximum_hashpower_.store(mhp, std::memory_order_release);
+ }
+
+ /**
+ * Returns the maximum hashpower of the table
+ *
+ * @return the maximum hashpower
+ */
+ size_type maximum_hashpower() {
+ return maximum_hashpower_.load(std::memory_order_acquire);
+ }
+
+ /**@}*/
+
+ /** @name Table Operations
+ *
+ * These are operations that affect the data in the table. They are safe to
+ * call concurrently with each other.
+ *
+ */
+ /**@{*/
+
+ /**
+ * Searches the table for @p key, and invokes @p fn on the value. @p fn is
+ * not allowed to modify the contents of the value if found.
+ *
+ * @tparam K type of the key. This can be any type comparable with @c key_type
+ * @tparam F type of the functor. It should implement the method
+ * <tt>void operator()(const mapped_type&)</tt>.
+ * @param key the key to search for
+ * @param fn the functor to invoke if the element is found
+ * @return true if the key was found and functor invoked, false otherwise
+ */
+ template <typename K, typename F>
+ bool find_fn(const K& key, F fn) const {
+ const hash_value hv = hashed_key(key);
+ const auto b = snapshot_and_lock_two<locking_active>(hv);
+ const table_position pos = cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ fn(buckets_[pos.index].val(pos.slot));
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Searches the table for @p key, and invokes @p fn on the value. @p fn is
+ * allow to modify the contents of the value if found.
+ *
+ * @tparam K type of the key. This can be any type comparable with @c key_type
+ * @tparam F type of the functor. It should implement the method
+ * <tt>void operator()(mapped_type&)</tt>.
+ * @param key the key to search for
+ * @param fn the functor to invoke if the element is found
+ * @return true if the key was found and functor invoked, false otherwise
+ */
+ template <typename K, typename F>
+ bool update_fn(const K& key, F fn) {
+ const hash_value hv = hashed_key(key);
+ const auto b = snapshot_and_lock_two<locking_active>(hv);
+ const table_position pos = cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ fn(buckets_[pos.index].val(pos.slot));
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Searches for @p key in the table. If the key is not there, it is inserted
+ * with @p val. If the key is there, then @p fn is called on the value. The
+ * key will be immediately constructed as @c key_type(std::forward<K>(key)).
+ * If the insertion succeeds, this constructed key will be moved into the
+ * table and the value constructed from the @p val parameters. If the
+ * insertion fails, the constructed key will be destroyed, and the @p val
+ * parameters will remain valid. If there is no room left in the table, it
+ * will be automatically expanded. Expansion may throw exceptions.
+ *
+ * @tparam K type of the key
+ * @tparam F type of the functor. It should implement the method
+ * <tt>void operator()(mapped_type&)</tt>.
+ * @tparam Args list of types for the value constructor arguments
+ * @param key the key to insert into the table
+ * @param fn the functor to invoke if the element is found
+ * @param val a list of constructor arguments with which to create the value
+ * @return true if a new key was inserted, false if the key was already in
+ * the table
+ */
+ template <typename K, typename F, typename... Args>
+ bool upsert(K&& key, F fn, Args&&... val) {
+ K k(std::forward<K>(key));
+ hash_value hv = hashed_key(k);
+ auto b = snapshot_and_lock_two<locking_active>(hv);
+ table_position pos = cuckoo_insert_loop(hv, b, k);
+ if (pos.status == ok) {
+ add_to_bucket(pos.index, pos.slot, hv.partial, k,
+ std::forward<Args>(val)...);
+ } else {
+ fn(buckets_[pos.index].val(pos.slot));
+ }
+ return pos.status == ok;
+ }
+
+ /**
+ * Searches for @p key in the table, and invokes @p fn on the value if the
+ * key is found. The functor can mutate the value, and should return @c true
+ * in order to erase the element, and @c false otherwise.
+ *
+ * @tparam K type of the key
+ * @tparam F type of the functor. It should implement the method
+ * <tt>bool operator()(mapped_type&)</tt>.
+ * @param key the key to possibly erase from the table
+ * @param fn the functor to invoke if the element is found
+ * @return true if @p key was found and @p fn invoked, false otherwise
+ */
+ template <typename K, typename F>
+ bool erase_fn(const K& key, F fn) {
+ const hash_value hv = hashed_key(key);
+ const auto b = snapshot_and_lock_two<locking_active>(hv);
+ const table_position pos = cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ if (fn(buckets_[pos.index].val(pos.slot))) {
+ del_from_bucket(buckets_[pos.index], pos.index, pos.slot);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Copies the value associated with @p key into @p val. Equivalent to
+ * calling @ref find_fn with a functor that copies the value into @p val. @c
+ * mapped_type must be @c CopyAssignable.
+ */
+ template <typename K>
+ bool find(const K& key, mapped_type& val) const {
+ return find_fn(key, [&val](const mapped_type& v) mutable {
+ val = v;
+ });
+ }
+
+ /** Searches the table for @p key, and returns the associated value it
+ * finds. @c mapped_type must be @c CopyConstructible.
+ *
+ * @tparam K type of the key
+ * @param key the key to search for
+ * @return the value associated with the given key
+ * @throw std::out_of_range if the key is not found
+ */
+ template <typename K>
+ mapped_type find(const K& key) const {
+ const hash_value hv = hashed_key(key);
+ const auto b = snapshot_and_lock_two<locking_active>(hv);
+ const table_position pos = cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ return buckets_[pos.index].val(pos.slot);
+ } else {
+ throw std::out_of_range("key not found in table");
+ }
+ }
+
+ /** Returns whether or not @p key is in the table. Equivalent to @ref
+ * find_fn with a functor that does nothing.
+ */
+ template <typename K>
+ bool contains(const K& key) const {
+ return find_fn(key, [](const mapped_type&) {});
+ }
+
+ /**
+ * Updates the value associated with @p key to @p val. Equivalent to calling
+ * @ref update_fn with a functor that copies @p val into the associated
+ * value. @c mapped_type must be @c MoveAssignable or @c CopyAssignable.
+ */
+ template <typename K, typename V>
+ bool update(const K& key, V&& val) {
+ return update_fn(key, [&val](mapped_type& v) {
+ v = std::forward<V>(val);
+ });
+ }
+
+ /**
+ * Inserts the key-value pair into the table. Equivalent to calling @ref
+ * upsert with a functor that does nothing.
+ */
+ template <typename K, typename... Args>
+ bool insert(K&& key, Args&&... val) {
+ return upsert(std::forward<K>(key), [](mapped_type&) {},
+ std::forward<Args>(val)...);
+ }
+
+ /**
+ * Erases the key from the table. Equivalent to calling @ref erase_fn with a
+ * functor that just returns true.
+ */
+ template <typename K>
+ bool erase(const K& key) {
+ return erase_fn(key, [](mapped_type&) { return true; });
+ }
+
+ /**
+ * Resizes the table to the given hashpower. If this hashpower is not larger
+ * than the current hashpower, then it decreases the hashpower to the
+ * maximum of the specified value and the smallest hashpower that can hold
+ * all the elements currently in the table.
+ *
+ * @param n the hashpower to set for the table
+ * @return true if the table changed size, false otherwise
+ */
+ bool rehash(size_type n) {
+ return cuckoo_rehash<locking_active>(n);
+ }
+
+ /**
+ * Reserve enough space in the table for the given number of elements. If
+ * the table can already hold that many elements, the function will shrink
+ * the table to the smallest hashpower that can hold the maximum of the
+ * specified amount and the current table size.
+ *
+ * @param n the number of elements to reserve space for
+ * @return true if the size of the table changed, false otherwise
+ */
+ bool reserve(size_type n) {
+ return cuckoo_reserve<locking_active>(n);
+ }
+
+ /**
+ * Removes all elements in the table, calling their destructors.
+ */
+ void clear() {
+ auto unlocker = snapshot_and_lock_all<locking_active>();
+ cuckoo_clear();
+ }
+
+ /**
+ * Construct a @ref locked_table object that owns all the locks in the
+ * table.
+ *
+ * @return a \ref locked_table instance
+ */
+ locked_table lock_table() {
+ return locked_table(*this);
+ }
+
+ /**@}*/
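A compact sketch of how the functor-based operations above compose, e.g. for a concurrent counter table (illustrative only, not part of the header):

    #include "cuckoo/cuckoohash_map.hh"
    #include <cstdio>
    #include <string>

    int main() {
        cuckoohash_map<std::string, int> counts;

        // upsert: insert 1 if the key is absent, otherwise bump the stored value.
        counts.upsert(std::string("ACGT"), [](int &v) { ++v; }, 1);
        counts.upsert(std::string("ACGT"), [](int &v) { ++v; }, 1);

        // find_fn: read-only access while the bucket locks are held.
        counts.find_fn(std::string("ACGT"),
                       [](const int &v) { std::printf("count = %d\n", v); });

        // erase_fn: erase only if the functor returns true.
        counts.erase_fn(std::string("ACGT"), [](int &v) { return v >= 2; });

        return counts.contains(std::string("ACGT")) ? 1 : 0;
    }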
+
+private:
+ // Hashing types and functions
+
+ // Type of the partial key
+ using partial_t = uint8_t;
+
+ // true if the key is small and simple, which means using partial keys for
+ // lookup would probably slow us down
+ static constexpr bool is_simple =
+ std::is_pod<key_type>::value && sizeof(key_type) <= 8;
+
+ // Contains a hash and partial for a given key. The partial key is used for
+ // partial-key cuckoohashing, and for finding the alternate bucket that a
+ // key hashes to.
+ struct hash_value {
+ size_type hash;
+ partial_t partial;
+ };
+
+ template <typename K>
+ hash_value hashed_key(const K& key) const {
+ const size_type hash = hash_function()(key);
+ return { hash, partial_key(hash) };
+ }
+
+ template <typename K>
+ size_type hashed_key_only_hash(const K& key) const {
+ return hash_function()(key);
+ }
+
+ // hashsize returns the number of buckets corresponding to a given
+ // hashpower.
+ static inline size_type hashsize(const size_type hp) {
+ return size_type(1) << hp;
+ }
+
+ // hashmask returns the bitmask for the buckets array corresponding to a
+ // given hashpower.
+ static inline size_type hashmask(const size_type hp) {
+ return hashsize(hp) - 1;
+ }
+
+ // The partial key must only depend on the hash value. It cannot change with
+ // the hashpower, because, in order for `cuckoo_fast_double` to work
+ // properly, the alt_index must only grow by one bit at the top each time we
+ // expand the table.
+ static partial_t partial_key(const size_type hash) {
+ const uint64_t hash_64bit = hash;
+ const uint32_t hash_32bit = (
+ static_cast<uint32_t>(hash_64bit) ^
+ static_cast<uint32_t>(hash_64bit >> 32));
+ const uint16_t hash_16bit = (
+ static_cast<uint16_t>(hash_32bit) ^
+ static_cast<uint16_t>(hash_32bit >> 16));
+ const uint16_t hash_8bit = (
+ static_cast<uint8_t>(hash_16bit) ^
+ static_cast<uint8_t>(hash_16bit >> 8));
+ return hash_8bit;
+ }
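+
+ // For illustration: the folds above xor all eight bytes of the
+ // (zero-extended) 64-bit hash together, so e.g. a hash of
+ // 0xDEADBEEF00000001 yields the partial key 0x23.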
+
+ // index_hash returns the first possible bucket that the given hashed key
+ // could be.
+ static inline size_type index_hash(const size_type hp, const size_type hv) {
+ return hv & hashmask(hp);
+ }
+
+ // alt_index returns the other possible bucket that the given hashed key
+ // could be. It takes the first possible bucket as a parameter. Note that
+ // this function will return the first possible bucket if index is the
+ // second possible bucket, so alt_index(ti, partial, alt_index(ti, partial,
+ // index_hash(ti, hv))) == index_hash(ti, hv).
+ static inline size_type alt_index(const size_type hp, const partial_t partial,
+ const size_type index) {
+ // ensure tag is nonzero for the multiply. 0xc6a4a7935bd1e995 is the
+ // hash constant from 64-bit MurmurHash2
+ const size_type nonzero_tag = static_cast<size_type>(partial) + 1;
+ return (index ^ (nonzero_tag * 0xc6a4a7935bd1e995)) & hashmask(hp);
+ }
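+
+ // For illustration: the tag xored in above depends only on the partial
+ // key, so applying alt_index twice cancels the xor and returns the
+ // original bucket index, i.e. for any valid index i,
+ //   alt_index(hp, partial, alt_index(hp, partial, i)) == i
+ // which is the property noted in the comment above.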
+
+ // Locking types and functions
+
+ using locking_active = std::integral_constant<bool, true>;
+ using locking_inactive = std::integral_constant<bool, false>;
+
+ // A fast, lightweight spinlock
+ LIBCUCKOO_SQUELCH_PADDING_WARNING
+ class LIBCUCKOO_ALIGNAS(64) spinlock {
+ public:
+ spinlock() noexcept : elem_counter_(0) {
+ lock_.clear();
+ }
+
+ void lock(locking_active) {
+ while (lock_.test_and_set(std::memory_order_acq_rel));
+ }
+
+ void lock(locking_inactive) {}
+
+ void unlock(locking_active) {
+ lock_.clear(std::memory_order_release);
+ }
+
+ void unlock(locking_inactive) {}
+
+ bool try_lock(locking_active) {
+ return !lock_.test_and_set(std::memory_order_acq_rel);
+ }
+
+ bool try_lock(locking_inactive) {
+ return true;
+ }
+
+ size_type& elem_counter() {
+ return elem_counter_;
+ }
+
+ private:
+ std::atomic_flag lock_;
+ size_type elem_counter_;
+ };
+
+ // The type of the locks container
+ static_assert(LIBCUCKOO_LOCK_ARRAY_GRANULARITY >= 0 &&
+ LIBCUCKOO_LOCK_ARRAY_GRANULARITY <= 16,
+ "LIBCUCKOO_LOCK_ARRAY_GRANULARITY constant must be between "
+ "0 and 16, inclusive");
+ using locks_t = libcuckoo_lazy_array<
+ 16 - LIBCUCKOO_LOCK_ARRAY_GRANULARITY, LIBCUCKOO_LOCK_ARRAY_GRANULARITY,
+ spinlock,
+ typename allocator_traits_::template rebind_alloc<spinlock>
+ >;
+
+ // The type of the expansion lock
+ using expansion_lock_t = std::mutex;
+
+ // Classes for managing locked buckets. By storing and moving around sets of
+ // locked buckets in these classes, we can ensure that they are unlocked
+ // properly.
+
+ template <typename LOCK_T>
+ class OneBucket {
+ public:
+ OneBucket() {}
+ OneBucket(locks_t* locks, size_type i)
+ : locks_(locks, OneUnlocker{i}) {}
+
+ private:
+ struct OneUnlocker {
+ size_type i;
+ void operator()(locks_t* p) const {
+ (*p)[lock_ind(i)].unlock(LOCK_T());
+ }
+ };
+
+ std::unique_ptr<locks_t, OneUnlocker> locks_;
+ };
+
+ template <typename LOCK_T>
+ class TwoBuckets {
+ public:
+ TwoBuckets() {}
+ TwoBuckets(locks_t* locks, size_type i1, size_type i2)
+ : locks_(locks, TwoUnlocker{i1, i2}) {}
+
+ size_type first() const {
+ return locks_.get_deleter().i1;
+ }
+
+ size_type second() const {
+ return locks_.get_deleter().i2;
+ }
+
+ bool is_active() const {
+ return static_cast<bool>(locks_);
+ }
+
+ void unlock() {
+ locks_.reset(nullptr);
+ }
+
+ private:
+ struct TwoUnlocker {
+ size_type i1, i2;
+ void operator()(locks_t* p) const {
+ const size_type l1 = lock_ind(i1);
+ const size_type l2 = lock_ind(i2);
+ (*p)[l1].unlock(LOCK_T());
+ if (l1 != l2) {
+ (*p)[l2].unlock(LOCK_T());
+ }
+ }
+ };
+
+ std::unique_ptr<locks_t, TwoUnlocker> locks_;
+ };
+
+ template <typename LOCK_T>
+ class AllBuckets {
+ public:
+ AllBuckets(locks_t* locks) : locks_(locks) {}
+
+ bool is_active() const {
+ return static_cast<bool>(locks_);
+ }
+
+ void unlock() {
+ locks_.reset(nullptr);
+ }
+
+ void release() {
+ (void)locks_.release();
+ }
+
+ private:
+ struct AllUnlocker {
+ void operator()(locks_t* p) const {
+ for (size_type i = 0; i < p->size(); ++i) {
+ (*p)[i].unlock(LOCK_T());
+ }
+ }
+ };
+
+ std::unique_ptr<locks_t, AllUnlocker> locks_;
+ };
+
+ // This exception is thrown whenever we try to lock a bucket, but the
+ // hashpower is not what was expected
+ class hashpower_changed {};
+
+ // After taking a lock on the table for the given bucket, this function will
+ // check the hashpower to make sure it is the same as what it was before the
+ // lock was taken. If it isn't, it unlocks the bucket and throws a
+ // hashpower_changed exception.
+ template <typename LOCK_T>
+ inline void check_hashpower(const size_type hp, const size_type lock) const {
+ if (hashpower() != hp) {
+ locks_[lock].unlock(LOCK_T());
+ LIBCUCKOO_DBG("%s", "hashpower changed\n");
+ throw hashpower_changed();
+ }
+ }
+
+ // locks the given bucket index.
+ //
+ // throws hashpower_changed if it changed after taking the lock.
+ template <typename LOCK_T>
+ inline OneBucket<LOCK_T> lock_one(const size_type hp, const size_type i) const {
+ const size_type l = lock_ind(i);
+ locks_[l].lock(LOCK_T());
+ check_hashpower<LOCK_T>(hp, l);
+ return OneBucket<LOCK_T>(&locks_, i);
+ }
+
+ // locks the two bucket indexes, always locking the earlier index first to
+ // avoid deadlock. If the two indexes are the same, it just locks one.
+ //
+ // throws hashpower_changed if it changed after taking the lock.
+ template <typename LOCK_T>
+ TwoBuckets<LOCK_T> lock_two(const size_type hp, const size_type i1,
+ const size_type i2) const {
+ size_type l1 = lock_ind(i1);
+ size_type l2 = lock_ind(i2);
+ if (l2 < l1) {
+ std::swap(l1, l2);
+ }
+ locks_[l1].lock(LOCK_T());
+ check_hashpower<LOCK_T>(hp, l1);
+ if (l2 != l1) {
+ locks_[l2].lock(LOCK_T());
+ }
+ return TwoBuckets<LOCK_T>(&locks_, i1, i2);
+ }
+
+ // lock_three locks the three bucket indexes in numerical order, returning
+ // the containers as a two (i1 and i2) and a one (i3). The one will not be
+ // active if i3 shares a lock index with i1 or i2.
+ //
+ // throws hashpower_changed if it changed after taking the lock.
+ template <typename LOCK_T>
+ std::pair<TwoBuckets<LOCK_T>, OneBucket<LOCK_T>>
+ lock_three(const size_type hp, const size_type i1,
+ const size_type i2, const size_type i3) const {
+ std::array<size_type, 3> l{{lock_ind(i1), lock_ind(i2), lock_ind(i3)}};
+ // Lock in order.
+ if (l[2] < l[1]) std::swap(l[2], l[1]);
+ if (l[2] < l[0]) std::swap(l[2], l[0]);
+ if (l[1] < l[0]) std::swap(l[1], l[0]);
+ locks_[l[0]].lock(LOCK_T());
+ check_hashpower<LOCK_T>(hp, l[0]);
+ if (l[1] != l[0]) {
+ locks_[l[1]].lock(LOCK_T());
+ }
+ if (l[2] != l[1]) {
+ locks_[l[2]].lock(LOCK_T());
+ }
+ return std::make_pair(
+ TwoBuckets<LOCK_T>(&locks_, i1, i2),
+ OneBucket<LOCK_T>(
+ (lock_ind(i3) == lock_ind(i1) || lock_ind(i3) == lock_ind(i2)) ?
+ nullptr : &locks_, i3)
+ );
+ }
+
+ // snapshot_and_lock_two locks the two buckets associated with the given
+ // hash value, making sure the hashpower doesn't change before the locks are
+ // taken. Thus it ensures that the buckets and locks corresponding to the
+ // hash value will stay correct as long as the locks are held. It returns
+ // the bucket indices associated with the hash value and the current
+ // hashpower.
+ template <typename LOCK_T>
+ TwoBuckets<LOCK_T> snapshot_and_lock_two(const hash_value& hv) const {
+ while (true) {
+ // Store the current hashpower we're using to compute the buckets
+ const size_type hp = hashpower();
+ const size_type i1 = index_hash(hp, hv.hash);
+ const size_type i2 = alt_index(hp, hv.partial, i1);
+ try {
+ return lock_two<LOCK_T>(hp, i1, i2);
+ } catch (hashpower_changed&) {
+ // The hashpower changed while taking the locks. Try again.
+ continue;
+ }
+ }
+ }
+
+ // snapshot_and_lock_all takes all the locks, and returns a deleter object
+ // that releases the locks upon destruction. Note that after taking all the
+ // locks, it is okay to change the buckets_ vector and the hashpower_, since
+ // no other threads should be accessing the buckets.
+ template <typename LOCK_T>
+ AllBuckets<LOCK_T> snapshot_and_lock_all() const {
+ for (size_type i = 0; i < locks_.size(); ++i) {
+ locks_[i].lock(LOCK_T());
+ }
+ return AllBuckets<LOCK_T>(&locks_);
+ }
+
+ // lock_ind converts an index into buckets to an index into locks.
+ static inline size_type lock_ind(const size_type bucket_ind) {
+ return bucket_ind & (locks_t::max_size() - 1);
+ }
+
+ // Data storage types and functions
+
+ // Value type without const Key, used for storage
+ using storage_value_type = std::pair<key_type, mapped_type>;
+
+ // The Bucket type holds slot_per_bucket() partial keys, key-value pairs,
+ // and an occupied bitset, which indicates whether the slot at the given bit
+ // index is in the table or not. It uses aligned_storage arrays to store the
+ // keys and values to allow constructing and destroying key-value pairs in
+ // place. Internally, the values are stored without the const qualifier in
+ // the key, to enable modifying bucket memory.
+ class Bucket {
+ public:
+ Bucket() noexcept {}
+ // The destructor does nothing to the key-value pairs, since we'd need
+ // an allocator to properly destroy the elements.
+ ~Bucket() noexcept {}
+
+ // No copy or move constructors or assignment operators, since we'd need
+ // an instance of the allocator to do any constructions or destructions
+ Bucket(const Bucket&) = delete;
+ Bucket(Bucket&&) = delete;
+ Bucket& operator=(const Bucket&) = delete;
+ Bucket& operator=(Bucket&&) = delete;
+
+ partial_t partial(size_type ind) const {
+ return partials_[ind];
+ }
+
+ const value_type& kvpair(size_type ind) const {
+ return *static_cast<const value_type*>(
+ static_cast<const void*>(std::addressof(kvpairs_[ind])));
+ }
+
+ value_type& kvpair(size_type ind) {
+ return *static_cast<value_type*>(
+ static_cast<void*>(std::addressof(kvpairs_[ind])));
+ }
+
+ storage_value_type& storage_kvpair(size_type ind) {
+ return *static_cast<storage_value_type*>(
+ static_cast<void*>(std::addressof(kvpairs_[ind])));
+ }
+
+ bool occupied(size_type ind) const {
+ return occupied_[ind];
+ }
+
+ const key_type& key(size_type ind) const {
+ return kvpair(ind).first;
+ }
+
+ const mapped_type& val(size_type ind) const {
+ return kvpair(ind).second;
+ }
+
+ mapped_type& val(size_type ind) {
+ return kvpair(ind).second;
+ }
+
+ template <typename K, typename... Args>
+ void setKV(allocator_type& allocator, size_type ind, partial_t p,
+ K& k, Args&&... args) {
+ partials_[ind] = p;
+ occupied_[ind] = true;
+ allocator_traits_::construct(
+ allocator, &storage_kvpair(ind), std::piecewise_construct,
+ std::forward_as_tuple(std::move(k)),
+ std::forward_as_tuple(std::forward<Args>(args)...));
+ }
+
+ void eraseKV(allocator_type& allocator, size_type ind) {
+ occupied_[ind] = false;
+ allocator_traits_::destroy(
+ allocator, std::addressof(storage_kvpair(ind)));
+ }
+
+ void clear(allocator_type& allocator) {
+ for (size_type i = 0; i < slot_per_bucket(); ++i) {
+ if (occupied(i)) {
+ eraseKV(allocator, i);
+ }
+ }
+ }
+
+ // Moves the item in b1[slot1] into b2[slot2] without copying
+ static void move_to_bucket(allocator_type& allocator,
+ Bucket& b1, size_type slot1,
+ Bucket& b2, size_type slot2) {
+ assert(b1.occupied(slot1));
+ assert(!b2.occupied(slot2));
+ storage_value_type& tomove = b1.storage_kvpair(slot1);
+ b2.setKV(allocator, slot2, b1.partial(slot1),
+ tomove.first, std::move(tomove.second));
+ b1.eraseKV(allocator, slot1);
+ }
+
+ // Moves the contents of b1 to b2
+ static void move_bucket(allocator_type& allocator, Bucket& b1,
+ Bucket& b2) {
+ for (size_type i = 0; i < slot_per_bucket(); ++i) {
+ if (b1.occupied(i)) {
+ move_to_bucket(allocator, b1, i, b2, i);
+ }
+ }
+ }
+
+ private:
+ std::array<partial_t, slot_per_bucket()> partials_;
+ std::bitset<slot_per_bucket()> occupied_;
+ std::array<typename std::aligned_storage<
+ sizeof(storage_value_type),
+ alignof(storage_value_type)>::type,
+ slot_per_bucket()> kvpairs_;
+ };
+
+ class BucketContainer {
+ using traits_ = typename allocator_traits_::
+ template rebind_traits<Bucket>;
+ public:
+ BucketContainer(size_type n, typename traits_::allocator_type alloc)
+ : buckets_(traits_::allocate(allocator_, n)),
+ allocator_(alloc), size_(n) {
+ // The Bucket default constructor is nothrow, so we don't have to
+ // worry about dealing with exceptions when constructing all the
+ // elements.
+ static_assert(
+ std::is_nothrow_constructible<Bucket>::value,
+ "BucketContainer requires Bucket to be nothrow constructible");
+ for (size_type i = 0; i < size_; ++i) {
+ traits_::construct(allocator_, &buckets_[i]);
+ }
+ }
+
+ BucketContainer(const BucketContainer&) = delete;
+ BucketContainer(BucketContainer&&) = delete;
+ BucketContainer& operator=(const BucketContainer&) = delete;
+ BucketContainer& operator=(BucketContainer&&) = delete;
+
+ ~BucketContainer() noexcept {
+ static_assert(
+ std::is_nothrow_destructible<Bucket>::value,
+ "BucketContainer requires Bucket to be nothrow destructible");
+ for (size_type i = 0; i < size_; ++i) {
+ traits_::destroy(allocator_, &buckets_[i]);
+ }
+ traits_::deallocate(allocator_, buckets_, size());
+ }
+
+ size_type size() const {
+ return size_;
+ }
+
+ void swap(BucketContainer& other) noexcept {
+ std::swap(buckets_, other.buckets_);
+ // If propagate_on_container_swap is false, we do nothing if the
+ // allocators are equal. If they're not equal, behavior is
+ // undefined, so we can still do nothing.
+ if (traits_::propagate_on_container_swap::value) {
+ std::swap(allocator_, other.allocator_);
+ }
+ std::swap(size_, other.size_);
+ }
+
+ Bucket& operator[](size_type i) {
+ return buckets_[i];
+ }
+
+ const Bucket& operator[](size_type i) const {
+ return buckets_[i];
+ }
+
+ private:
+ typename traits_::pointer buckets_;
+ typename allocator_traits_::template rebind_alloc<Bucket> allocator_;
+ size_type size_;
+ };
+
+ // The type of the buckets container
+ using buckets_t = BucketContainer;
+
+ // Status codes for internal functions
+
+ enum cuckoo_status {
+ ok,
+ failure,
+ failure_key_not_found,
+ failure_key_duplicated,
+ failure_table_full,
+ failure_under_expansion,
+ };
+
+
+ // A composite type for functions that need to return a table position, and
+ // a status code.
+ struct table_position {
+ size_type index;
+ size_type slot;
+ cuckoo_status status;
+ };
+
+ // Searching types and functions
+
+ // cuckoo_find searches the table for the given key, returning the position
+ // of the element found, or a failure status code if the key wasn't found.
+ // It expects the locks to be taken and released outside the function.
+ template <typename K>
+ table_position cuckoo_find(const K &key, const partial_t partial,
+ const size_type i1, const size_type i2) const {
+ int slot = try_read_from_bucket(buckets_[i1], partial, key);
+ if (slot != -1) {
+ return table_position{i1, static_cast<size_type>(slot), ok};
+ }
+ slot = try_read_from_bucket(buckets_[i2], partial, key);
+ if (slot != -1) {
+ return table_position{i2, static_cast<size_type>(slot), ok};
+ }
+ return table_position{0, 0, failure_key_not_found};
+ }
+
+ // try_read_from_bucket will search the bucket for the given key and return
+ // the index of the slot if found, or -1 if not found.
+ template <typename K>
+ int try_read_from_bucket(const Bucket& b, const partial_t partial,
+ const K &key) const {
+ // Silence a warning from MSVC about partial being unused if is_simple.
+ (void)partial;
+ for (size_type i = 0; i < slot_per_bucket(); ++i) {
+ if (!b.occupied(i) || (!is_simple && partial != b.partial(i))) {
+ continue;
+ } else if (key_eq()(b.key(i), key)) {
+ return i;
+ }
+ }
+ return -1;
+ }
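+
+ // Note: when !is_simple, the 8-bit partial acts as a cheap filter: a
+ // non-matching slot passes the partial check with probability roughly
+ // 1/256, so key_eq() is rarely invoked on keys that cannot match.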
+
+ // Insertion types and function
+
+ /**
+ * Runs cuckoo_insert in a loop until it succeeds. Both insert and upsert
+ * need this retry logic, so it is factored out here to avoid duplication.
+ *
+ * @param hv the hash value of the key
+ * @param b bucket locks
+ * @param key the key to insert
+ * @return table_position of the location to insert the new element, or the
+ * site of the duplicate element with a status code if there was a duplicate.
+ * In either case, the locks will still be held after the function ends.
+ * @throw libcuckoo_load_factor_too_low if expansion is necessary, but the
+ * load factor of the table is below the threshold
+ */
+ template <typename K, typename LOCK_T>
+ table_position cuckoo_insert_loop(hash_value hv, TwoBuckets<LOCK_T>& b,
+ K& key) {
+ table_position pos;
+ while (true) {
+ assert(b.is_active());
+ const size_type hp = hashpower();
+ pos = cuckoo_insert(hv, b, key);
+ switch (pos.status) {
+ case ok:
+ case failure_key_duplicated:
+ return pos;
+ case failure_table_full:
+ // Expand the table, then fall through to re-grab the locks and try again
+ cuckoo_fast_double<LOCK_T, automatic_resize>(hp);
+ case failure_under_expansion:
+ b = snapshot_and_lock_two<LOCK_T>(hv);
+ break;
+ default:
+ assert(false);
+ }
+ }
+ }
+
+ // cuckoo_insert tries to find an empty slot in either of the buckets to
+ // insert the given key into, performing cuckoo hashing if necessary. It
+ // expects the locks to be taken outside the function. Before inserting, it
+ // checks that the key isn't already in the table. Cuckoo hashing presents
+ // multiple concurrency issues, which are explained in the function. The
+ // following return states are possible:
+ //
+ // ok -- Found an empty slot, locks will be held on both buckets after the
+ // function ends, and the position of the empty slot is returned
+ //
+ // failure_key_duplicated -- Found a duplicate key, locks will be held, and
+ // the position of the duplicate key will be returned
+ //
+ // failure_under_expansion -- Failed due to a concurrent expansion
+ // operation. Locks are released. No meaningful position is returned.
+ //
+ // failure_table_full -- Failed to find an empty slot for the table. Locks
+ // are released. No meaningful position is returned.
+ template <typename K, typename LOCK_T>
+ table_position cuckoo_insert(const hash_value hv, TwoBuckets<LOCK_T>& b,
+ K& key) {
+ int res1, res2;
+ Bucket& b1 = buckets_[b.first()];
+ if (!try_find_insert_bucket(b1, res1, hv.partial, key)) {
+ return table_position{b.first(), static_cast<size_type>(res1),
+ failure_key_duplicated};
+ }
+ Bucket& b2 = buckets_[b.second()];
+ if (!try_find_insert_bucket(b2, res2, hv.partial, key)) {
+ return table_position{b.second(), static_cast<size_type>(res2),
+ failure_key_duplicated};
+ }
+ if (res1 != -1) {
+ return table_position{b.first(), static_cast<size_type>(res1), ok};
+ }
+ if (res2 != -1) {
+ return table_position{b.second(), static_cast<size_type>(res2), ok};
+ }
+
+ // We are unlucky, so let's perform cuckoo hashing.
+ size_type insert_bucket = 0;
+ size_type insert_slot = 0;
+ cuckoo_status st = run_cuckoo<LOCK_T>(b, insert_bucket, insert_slot);
+ if (st == failure_under_expansion) {
+ // The run_cuckoo operation operated on an old version of the table,
+ // so we have to try again. We signal to the calling insert method
+ // to try again by returning failure_under_expansion.
+ return table_position{0, 0, failure_under_expansion};
+ } else if (st == ok) {
+ assert(!locks_[lock_ind(b.first())].try_lock(LOCK_T()));
+ assert(!locks_[lock_ind(b.second())].try_lock(LOCK_T()));
+ assert(!buckets_[insert_bucket].occupied(insert_slot));
+ assert(insert_bucket == index_hash(hashpower(), hv.hash) ||
+ insert_bucket == alt_index(
+ hashpower(), hv.partial,
+ index_hash(hashpower(), hv.hash)));
+ // Since we unlocked the buckets during run_cuckoo, another insert
+ // could have inserted the same key into either b.first() or
+ // b.second(), so we check for that before doing the insert.
+ table_position pos = cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ pos.status = failure_key_duplicated;
+ return pos;
+ }
+ return table_position{insert_bucket, insert_slot, ok};
+ }
+ assert(st == failure);
+ LIBCUCKOO_DBG("hash table is full (hashpower = %zu, hash_items = %zu, "
+ "load factor = %.2f), need to increase hashpower\n",
+ hashpower(), size(), load_factor());
+ return table_position{0, 0, failure_table_full};
+ }
+
+ // add_to_bucket will insert the given key-value pair into the slot. The key
+ // and value will be move-constructed into the table, so they are not valid
+ // for use afterwards.
+ template <typename K, typename... Args>
+ void add_to_bucket(const size_type bucket_ind, const size_type slot,
+ const partial_t partial, K& key, Args&&... val) {
+ Bucket& b = buckets_[bucket_ind];
+ assert(!b.occupied(slot));
+ b.setKV(allocator_, slot, partial,
+ key, std::forward<Args>(val)...);
+ ++locks_[lock_ind(bucket_ind)].elem_counter();
+ }
+
+ // try_find_insert_bucket will search the bucket for the given key, and for
+ // an empty slot. If the key is found, we store the slot of the key in
+ // `slot` and return false. If we find an empty slot, we store its position
+ // in `slot` and return true. If no duplicate key is found and no empty slot
+ // is found, we store -1 in `slot` and return true.
+ template <typename K>
+ bool try_find_insert_bucket(const Bucket& b, int& slot,
+ const partial_t partial, const K &key) const {
+ // Silence a warning from MSVC about partial being unused if is_simple.
+ (void)partial;
+ slot = -1;
+ for (size_type i = 0; i < slot_per_bucket(); ++i) {
+ if (b.occupied(i)) {
+ if (!is_simple && partial != b.partial(i)) {
+ continue;
+ }
+ if (key_eq()(b.key(i), key)) {
+ slot = i;
+ return false;
+ }
+ } else {
+ slot = i;
+ }
+ }
+ return true;
+ }
+
+ // CuckooRecord holds one position in a cuckoo path. Since cuckoopath
+ // elements only define a sequence of alternate hashings for different hash
+ // values, we only need to keep track of the hash values being moved, rather
+ // than the keys themselves.
+ typedef struct {
+ size_type bucket;
+ size_type slot;
+ hash_value hv;
+ } CuckooRecord;
+
+ // The maximum number of items in a cuckoo BFS path.
+ static constexpr uint8_t MAX_BFS_PATH_LEN = 5;
+
+ // An array of CuckooRecords
+ using CuckooRecords = std::array<CuckooRecord, MAX_BFS_PATH_LEN>;
+
+ // run_cuckoo performs cuckoo hashing on the table in an attempt to free up
+ // a slot on either of the insert buckets, which are assumed to be locked
+ // before the start. On success, the bucket and slot that was freed up is
+ // stored in insert_bucket and insert_slot. In order to perform the search
+ // and the swaps, it has to release the locks, which can lead to certain
+ // concurrency issues, the details of which are explained in the function.
+ // If run_cuckoo returns ok (success), then `b` will be active, otherwise it
+ // will not.
+ template <typename LOCK_T>
+ cuckoo_status run_cuckoo(TwoBuckets<LOCK_T>& b, size_type &insert_bucket,
+ size_type &insert_slot) {
+ // We must unlock the buckets here, so that cuckoopath_search and
+ // cuckoopath_move can lock buckets as desired without deadlock.
+ // cuckoopath_move has to move something out of one of the original
+ // buckets as its last operation, and it will lock both buckets and
+ // leave them locked after finishing. This way, we know that if
+ // cuckoopath_move succeeds, then the buckets needed for insertion are
+ // still locked. If cuckoopath_move fails, the buckets are unlocked and
+ // we try again. This unlocking does present two problems. The first is
+ // that another insert on the same key runs and, finding that the key
+ // isn't in the table, inserts the key into the table. Then we insert
+ // the key into the table, causing a duplication. To check for this, we
+ // search the buckets for the key we are trying to insert before doing
+ // so (this is done in cuckoo_insert, and requires that both buckets are
+ // locked). Another problem is that an expansion runs and changes the
+ // hashpower, meaning the buckets may not be valid anymore. In this
+ // case, the cuckoopath functions will have thrown a hashpower_changed
+ // exception, which we catch and handle here.
+ size_type hp = hashpower();
+ b.unlock();
+ CuckooRecords cuckoo_path;
+ bool done = false;
+ try {
+ while (!done) {
+ const int depth = cuckoopath_search<LOCK_T>(
+ hp, cuckoo_path, b.first(), b.second());
+ if (depth < 0) {
+ break;
+ }
+
+ if (cuckoopath_move(hp, cuckoo_path, depth, b)) {
+ insert_bucket = cuckoo_path[0].bucket;
+ insert_slot = cuckoo_path[0].slot;
+ assert(insert_bucket == b.first() || insert_bucket == b.second());
+ assert(!locks_[lock_ind(b.first())].try_lock(LOCK_T()));
+ assert(!locks_[lock_ind(b.second())].try_lock(LOCK_T()));
+ assert(!buckets_[insert_bucket].occupied(insert_slot));
+ done = true;
+ break;
+ }
+ }
+ } catch (hashpower_changed&) {
+ // The hashpower changed while we were trying to cuckoo, which means
+ // we want to retry. b.first() and b.second() should not be locked
+ // in this case.
+ return failure_under_expansion;
+ }
+ return done ? ok : failure;
+ }
+
+ // cuckoopath_search finds a cuckoo path from one of the starting buckets to
+ // an empty slot in another bucket. It returns the depth of the discovered
+ // cuckoo path on success, and -1 on failure. Since it doesn't take locks on
+ // the buckets it searches, the data can change between this function and
+ // cuckoopath_move. Thus cuckoopath_move checks that the data matches the
+ // cuckoo path before changing it.
+ //
+ // throws hashpower_changed if it changed during the search.
+ template <typename LOCK_T>
+ int cuckoopath_search(const size_type hp,
+ CuckooRecords& cuckoo_path,
+ const size_type i1, const size_type i2) {
+ b_slot x = slot_search<LOCK_T>(hp, i1, i2);
+ if (x.depth == -1) {
+ return -1;
+ }
+ // Fill in the cuckoo path slots from the end to the beginning.
+ for (int i = x.depth; i >= 0; i--) {
+ cuckoo_path[i].slot = x.pathcode % slot_per_bucket();
+ x.pathcode /= slot_per_bucket();
+ }
+ // Fill in the cuckoo_path buckets and keys from the beginning to the
+ // end, using the final pathcode to figure out which bucket the path
+ // starts on. Since data could have been modified between slot_search
+ // and the computation of the cuckoo path, this could be an invalid
+ // cuckoo_path.
+ CuckooRecord& first = cuckoo_path[0];
+ if (x.pathcode == 0) {
+ first.bucket = i1;
+ } else {
+ assert(x.pathcode == 1);
+ first.bucket = i2;
+ }
+ {
+ const auto ob = lock_one<LOCK_T>(hp, first.bucket);
+ const Bucket& b = buckets_[first.bucket];
+ if (!b.occupied(first.slot)) {
+ // We can terminate here
+ return 0;
+ }
+ first.hv = hashed_key(b.key(first.slot));
+ }
+ for (int i = 1; i <= x.depth; ++i) {
+ CuckooRecord& curr = cuckoo_path[i];
+ const CuckooRecord& prev = cuckoo_path[i-1];
+ assert(prev.bucket == index_hash(hp, prev.hv.hash) ||
+ prev.bucket == alt_index(hp, prev.hv.partial,
+ index_hash(hp, prev.hv.hash)));
+ // We get the bucket that this slot is on by computing the alternate
+ // index of the previous bucket
+ curr.bucket = alt_index(hp, prev.hv.partial, prev.bucket);
+ const auto ob = lock_one<LOCK_T>(hp, curr.bucket);
+ const Bucket& b = buckets_[curr.bucket];
+ if (!b.occupied(curr.slot)) {
+ // We can terminate here
+ return i;
+ }
+ curr.hv = hashed_key(b.key(curr.slot));
+ }
+ return x.depth;
+ }
+
+ // cuckoopath_move moves keys along the given cuckoo path in order to make
+ // an empty slot in one of the buckets in cuckoo_insert. Before the start of
+ // this function, the two insert-locked buckets were unlocked in run_cuckoo.
+ // At the end of the function, if the function returns true (success), then
+ // both insert-locked buckets remain locked. If the function is
+ // unsuccessful, then both insert-locked buckets will be unlocked.
+ //
+ // throws hashpower_changed if it changed during the move.
+ template <typename LOCK_T>
+ bool cuckoopath_move(const size_type hp, CuckooRecords& cuckoo_path,
+ size_type depth, TwoBuckets<LOCK_T>& b) {
+ assert(!b.is_active());
+ if (depth == 0) {
+ // There is a chance that depth == 0, when try_find_insert_bucket sees
+ // both buckets as full and cuckoopath_search finds one empty. In
+ // this case, we lock both buckets. If the slot that
+ // cuckoopath_search found empty isn't empty anymore, we unlock them
+ // and return false. Otherwise, the bucket is empty and insertable,
+ // so we hold the locks and return true.
+ const size_type bucket = cuckoo_path[0].bucket;
+ assert(bucket == b.first() || bucket == b.second());
+ b = lock_two<LOCK_T>(hp, b.first(), b.second());
+ if (!buckets_[bucket].occupied(cuckoo_path[0].slot)) {
+ return true;
+ } else {
+ b.unlock();
+ return false;
+ }
+ }
+
+ while (depth > 0) {
+ CuckooRecord& from = cuckoo_path[depth-1];
+ CuckooRecord& to = cuckoo_path[depth];
+ const size_type fs = from.slot;
+ const size_type ts = to.slot;
+ TwoBuckets<LOCK_T> twob;
+ OneBucket<LOCK_T> extrab;
+ if (depth == 1) {
+ // Even though we are only swapping out of one of the original
+ // buckets, we have to lock both of them along with the slot we
+ // are swapping to, since at the end of this function, they both
+ // must be locked. We store tb inside the extrab container so it
+ // is unlocked at the end of the loop.
+ std::tie(twob, extrab) = lock_three<LOCK_T>(
+ hp, b.first(), b.second(), to.bucket);
+ } else {
+ twob = lock_two<LOCK_T>(hp, from.bucket, to.bucket);
+ }
+
+ Bucket& fb = buckets_[from.bucket];
+ Bucket& tb = buckets_[to.bucket];
+
+ // We plan to kick out fs, but let's check if it is still there;
+ // there's a small chance we've gotten scooped by a later cuckoo. If
+ // that happened, just... try again. Also the slot we are filling in
+ // may have already been filled in by another thread, or the slot we
+ // are moving from may be empty, both of which invalidate the swap.
+ // We only need to check that the hash value is the same, because even
+ // if two different keys happen to share a hash value, the cuckoo path
+ // is still valid.
+ if (hashed_key_only_hash(fb.key(fs)) != from.hv.hash ||
+ tb.occupied(ts) || !fb.occupied(fs)) {
+ return false;
+ }
+
+ Bucket::move_to_bucket(allocator_, fb, fs, tb, ts);
+ if (depth == 1) {
+ // Hold onto the locks contained in twob
+ b = std::move(twob);
+ }
+ depth--;
+ }
+ return true;
+ }
+
+ // A constexpr version of pow that we can use for static_asserts
+ static constexpr size_type const_pow(size_type a, size_type b) {
+ return (b == 0) ? 1 : a * const_pow(a, b - 1);
+ }
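+
+ // For example, with the default of four slots per bucket and
+ // MAX_BFS_PATH_LEN == 5, const_pow(4, 5) == 1024, which is the quantity
+ // the static_assert on b_slot::pathcode below compares against.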
+
+ // b_slot holds the information for a BFS path through the table.
+ #pragma pack(push, 1)
+ struct b_slot {
+ // The bucket of the last item in the path.
+ size_type bucket;
+ // a compressed representation of the slots for each of the buckets in
+ // the path. pathcode is sort of like a base-slot_per_bucket number, and
+ // we need to hold at most MAX_BFS_PATH_LEN slots. Thus we need the
+ // maximum pathcode to be at least slot_per_bucket()^(MAX_BFS_PATH_LEN).
+ size_type pathcode;
+ static_assert(const_pow(slot_per_bucket(), MAX_BFS_PATH_LEN) <
+ std::numeric_limits<decltype(pathcode)>::max(),
+ "pathcode may not be large enough to encode a cuckoo "
+ "path");
+ // The 0-indexed position in the cuckoo path this slot occupies. It must
+ // be less than MAX_BFS_PATH_LEN, and also able to hold negative values.
+ int_fast8_t depth;
+ static_assert(MAX_BFS_PATH_LEN - 1 <=
+ std::numeric_limits<decltype(depth)>::max(),
+ "The depth type must be able to hold a value of"
+ " MAX_BFS_PATH_LEN - 1");
+ static_assert(-1 >= std::numeric_limits<decltype(depth)>::min(),
+ "The depth type must be able to hold a value of -1");
+ b_slot() {}
+ b_slot(const size_type b, const size_type p, const decltype(depth) d)
+ : bucket(b), pathcode(p), depth(d) {
+ assert(d < MAX_BFS_PATH_LEN);
+ }
+ };
+ #pragma pack(pop)
+
+ // b_queue is the queue used to store b_slots for BFS cuckoo hashing.
+ #pragma pack(push, 1)
+ class b_queue {
+ public:
+ b_queue() noexcept : first_(0), last_(0) {}
+
+ void enqueue(b_slot x) {
+ assert(!full());
+ slots_[last_] = x;
+ last_ = increment(last_);
+ }
+
+ b_slot dequeue() {
+ assert(!empty());
+ b_slot& x = slots_[first_];
+ first_ = increment(first_);
+ return x;
+ }
+
+ bool empty() const {
+ return first_ == last_;
+ }
+
+ bool full() const {
+ return increment(last_) == first_;
+ }
+
+ private:
+ // The maximum size of the BFS queue. Note that unless it's less than
+ // slot_per_bucket()^MAX_BFS_PATH_LEN, it won't really mean anything.
+ static constexpr size_type MAX_CUCKOO_COUNT = 256;
+ static_assert((MAX_CUCKOO_COUNT & (MAX_CUCKOO_COUNT - 1)) == 0,
+ "MAX_CUCKOO_COUNT should be a power of 2");
+ // A circular array of b_slots
+ b_slot slots_[MAX_CUCKOO_COUNT];
+ // The index of the head of the queue in the array
+ size_type first_;
+ // One past the index of the last_ item of the queue in the array.
+ size_type last_;
+
+ // returns the index in the queue after ind, wrapping around if
+ // necessary.
+ size_type increment(size_type ind) const {
+ return (ind + 1) & (MAX_CUCKOO_COUNT - 1);
+ }
+ };
+ #pragma pack(pop)
+
+ // slot_search searches for a cuckoo path using breadth-first search. It
+ // starts with the i1 and i2 buckets, and, until it finds a bucket with an
+ // empty slot, adds each slot of the current bucket to the queue as a new
+ // b_slot. If the queue runs out of space, it fails.
+ //
+ // throws hashpower_changed if it changed during the search
+ template <typename LOCK_T>
+ b_slot slot_search(const size_type hp, const size_type i1,
+ const size_type i2) {
+ b_queue q;
+ // The initial pathcode informs cuckoopath_search which bucket the path
+ // starts on
+ q.enqueue(b_slot(i1, 0, 0));
+ q.enqueue(b_slot(i2, 1, 0));
+ while (!q.full() && !q.empty()) {
+ b_slot x = q.dequeue();
+ // Picks a (sort-of) random slot to start from
+ size_type starting_slot = x.pathcode % slot_per_bucket();
+ for (size_type i = 0; i < slot_per_bucket() && !q.full();
+ ++i) {
+ size_type slot = (starting_slot + i) % slot_per_bucket();
+ auto ob = lock_one<LOCK_T>(hp, x.bucket);
+ Bucket& b = buckets_[x.bucket];
+ if (!b.occupied(slot)) {
+ // We can terminate the search here
+ x.pathcode = x.pathcode * slot_per_bucket() + slot;
+ return x;
+ }
+
+ // If x has less than the maximum number of path components,
+ // create a new b_slot item, that represents the bucket we would
+ // have come from if we kicked out the item at this slot.
+ const partial_t partial = b.partial(slot);
+ if (x.depth < MAX_BFS_PATH_LEN - 1) {
+ b_slot y(alt_index(hp, partial, x.bucket),
+ x.pathcode * slot_per_bucket() + slot, x.depth+1);
+ q.enqueue(y);
+ }
+ }
+ }
+ // We didn't find a short-enough cuckoo path, so the queue ran out of
+ // space. Return a failure value.
+ return b_slot(0, 0, -1);
+ }
+
+ // cuckoo_fast_double will double the size of the table by taking advantage
+ // of the properties of index_hash and alt_index. If the key-value pair's
+ // move constructor is not noexcept, we fall back to cuckoo_expand_simple,
+ // since that provides a strong exception guarantee.
+ template <typename LOCK_T, typename AUTO_RESIZE>
+ cuckoo_status cuckoo_fast_double(size_type current_hp) {
+ if (!std::is_nothrow_move_constructible<storage_value_type>::value) {
+ LIBCUCKOO_DBG("%s", "cannot run cuckoo_fast_double because kv-pair "
+ "is not nothrow move constructible");
+ return cuckoo_expand_simple<LOCK_T, AUTO_RESIZE>(current_hp + 1);
+ }
+ const size_type new_hp = current_hp + 1;
+ std::lock_guard<expansion_lock_t> l(expansion_lock_);
+ cuckoo_status st = check_resize_validity<AUTO_RESIZE>(current_hp, new_hp);
+ if (st != ok) {
+ return st;
+ }
+
+ locks_.resize(hashsize(new_hp));
+ auto unlocker = snapshot_and_lock_all<LOCK_T>();
+ // We can't just resize, since the Bucket is non-copyable and
+ // non-movable. Instead, we allocate a new array of buckets, and move
+ // the contents of each bucket manually.
+ {
+ buckets_t new_buckets(buckets_.size() * 2, get_allocator());
+ for (size_type i = 0; i < buckets_.size(); ++i) {
+ Bucket::move_bucket(allocator_, buckets_[i], new_buckets[i]);
+ }
+ buckets_.swap(new_buckets);
+ }
+ set_hashpower(new_hp);
+
+ // We gradually unlock the new table, by processing each of the buckets
+ // corresponding to each lock we took. For each slot in an old bucket,
+ // we either leave it in the old bucket, or move it to the corresponding
+ // new bucket. After we're done with the bucket, we release the lock on
+ // it and the new bucket, letting other threads using the new map
+ // gradually. We only unlock the locks being used by the old table,
+ // because unlocking new locks would enable operations on the table
+ // before we want them. We also re-evaluate the partial key stored at
+ // each slot, since it depends on the hashpower.
+ const size_type locks_to_move = std::min(
+ locks_.size(), hashsize(current_hp));
+ parallel_exec(0, locks_to_move,
+ [this, current_hp, new_hp]
+ (size_type start, size_type end, std::exception_ptr& eptr) {
+ try {
+ move_buckets<LOCK_T>(current_hp, new_hp, start, end);
+ } catch (...) {
+ eptr = std::current_exception();
+ }
+ });
+ parallel_exec(locks_to_move, locks_.size(),
+ [this](size_type i, size_type end, std::exception_ptr&) {
+ for (; i < end; ++i) {
+ locks_[i].unlock(LOCK_T());
+ }
+ });
+ // Since we've unlocked the buckets ourselves, we don't need the
+ // unlocker to do it for us.
+ unlocker.release();
+ return ok;
+ }
+
+ template <typename LOCK_T>
+ void move_buckets(size_type current_hp, size_type new_hp,
+ size_type start_lock_ind, size_type end_lock_ind) {
+ for (; start_lock_ind < end_lock_ind; ++start_lock_ind) {
+ for (size_type bucket_i = start_lock_ind;
+ bucket_i < hashsize(current_hp);
+ bucket_i += locks_t::max_size()) {
+ // By doubling the table size, the index_hash and alt_index of
+ // each key got one bit added to the top, at position
+ // current_hp, which means anything we have to move will either
+ // be at the same bucket position, or exactly
+ // hashsize(current_hp) later than the current bucket
+ Bucket& old_bucket = buckets_[bucket_i];
+ const size_type new_bucket_i = bucket_i + hashsize(current_hp);
+ Bucket& new_bucket = buckets_[new_bucket_i];
+ size_type new_bucket_slot = 0;
+
+ // Move each item from the old bucket that needs moving into the
+ // new bucket
+ for (size_type slot = 0; slot < slot_per_bucket(); ++slot) {
+ if (!old_bucket.occupied(slot)) {
+ continue;
+ }
+ const hash_value hv = hashed_key(old_bucket.key(slot));
+ const size_type old_ihash = index_hash(current_hp, hv.hash);
+ const size_type old_ahash = alt_index(
+ current_hp, hv.partial, old_ihash);
+ const size_type new_ihash = index_hash(new_hp, hv.hash);
+ const size_type new_ahash = alt_index(
+ new_hp, hv.partial, new_ihash);
+ if ((bucket_i == old_ihash && new_ihash == new_bucket_i) ||
+ (bucket_i == old_ahash && new_ahash == new_bucket_i)) {
+ // We're moving the key from the old bucket to the new
+ // one
+ Bucket::move_to_bucket(
+ allocator_,
+ old_bucket, slot, new_bucket, new_bucket_slot++);
+ // Also update the lock counts, in case we're moving to
+ // a different lock.
+ --locks_[lock_ind(bucket_i)].elem_counter();
+ ++locks_[lock_ind(new_bucket_i)].elem_counter();
+ } else {
+ // Check that we don't want to move the new key
+ assert(
+ (bucket_i == old_ihash && new_ihash == old_ihash) ||
+ (bucket_i == old_ahash && new_ahash == old_ahash));
+ }
+ }
+ }
+ // Now we can unlock the lock, because all the buckets corresponding
+ // to it have been processed
+ locks_[start_lock_ind].unlock(LOCK_T());
+ }
+ }
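+
+ // For example, when doubling from hashpower 3 (8 buckets) to hashpower 4
+ // (16 buckets), an item living in bucket 5 can only end up in bucket 5
+ // or bucket 5 + 8 = 13 of the new table, so each old bucket only ever
+ // exchanges items with a single new bucket.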
+
+ using automatic_resize = std::integral_constant<bool, true>;
+ using manual_resize = std::integral_constant<bool, false>;
+
+ // Checks whether the resize is okay to proceed. Returns a status code, or
+ // throws an exception, depending on the error type.
+ template <typename AUTO_RESIZE>
+ cuckoo_status check_resize_validity(const size_type orig_hp,
+ const size_type new_hp) {
+ const size_type mhp = maximum_hashpower();
+ if (mhp != LIBCUCKOO_NO_MAXIMUM_HASHPOWER && new_hp > mhp) {
+ throw libcuckoo_maximum_hashpower_exceeded(new_hp);
+ }
+ if (AUTO_RESIZE::value && load_factor() < minimum_load_factor()) {
+ throw libcuckoo_load_factor_too_low(minimum_load_factor());
+ }
+ if (hashpower() != orig_hp) {
+ // Most likely another expansion ran before this one could grab the
+ // locks
+ LIBCUCKOO_DBG("%s", "another expansion is on-going\n");
+ return failure_under_expansion;
+ }
+ return ok;
+ }
+
+ // cuckoo_expand_simple will resize the table to at least the given
+ // new_hashpower. When we're shrinking the table, if the current table
+ // contains more elements than can be held by new_hashpower, the resulting
+ // hashpower will be greater than new_hashpower. It needs to take all the
+ // bucket locks, since no other operations can change the table during
+ // expansion. Throws libcuckoo_maximum_hashpower_exceeded if we're expanding
+ // beyond the maximum hashpower, and we have an actual limit.
+ template <typename LOCK_T, typename AUTO_RESIZE>
+ cuckoo_status cuckoo_expand_simple(size_type new_hp) {
+ const auto unlocker = snapshot_and_lock_all<LOCK_T>();
+ const size_type hp = hashpower();
+ cuckoo_status st = check_resize_validity<AUTO_RESIZE>(hp, new_hp);
+ if (st != ok) {
+ return st;
+ }
+ // Creates a new hash table with hashpower new_hp and adds all
+ // the elements from the old buckets.
+ cuckoohash_map new_map(
+ hashsize(new_hp) * slot_per_bucket(),
+ hash_function(),
+ key_eq(),
+ get_allocator());
+
+ parallel_exec(
+ 0, hashsize(hp),
+ [this, &new_map]
+ (size_type i, size_type end, std::exception_ptr& eptr) {
+ try {
+ for (; i < end; ++i) {
+ for (size_type j = 0; j < slot_per_bucket(); ++j) {
+ if (buckets_[i].occupied(j)) {
+ storage_value_type& kvpair = (
+ buckets_[i].storage_kvpair(j));
+ new_map.insert(kvpair.first,
+ std::move(kvpair.second));
+ }
+ }
+ }
+ } catch (...) {
+ eptr = std::current_exception();
+ }
+ });
+
+ // Swap the current buckets vector with new_map's and set the hashpower.
+ // This is okay, because we have all the locks, so nobody else should be
+ // reading from the buckets array. Then the old buckets array will be
+ // deleted when new_map is deleted. All the locks should be released by
+ // the unlocker as well.
+ buckets_.swap(new_map.buckets_);
+ set_hashpower(new_map.hashpower_);
+ return ok;
+ }
+
+ // Executes the function over the given range split over num_threads threads
+ template <typename F>
+ static void parallel_exec(size_type start, size_type end, F func) {
+ static const size_type num_threads = (
+ std::thread::hardware_concurrency() == 0 ?
+ 1 : std::thread::hardware_concurrency());
+ size_type work_per_thread = (end - start) / num_threads;
+ std::vector<std::thread, typename allocator_traits_::
+ template rebind_alloc<std::thread> > threads(num_threads);
+ std::vector<std::exception_ptr, typename allocator_traits_::
+ template rebind_alloc<std::exception_ptr>> eptrs(num_threads, nullptr);
+ for (size_type i = 0; i < num_threads - 1; ++i) {
+ threads[i] = std::thread(func, start, start + work_per_thread,
+ std::ref(eptrs[i]));
+ start += work_per_thread;
+ }
+ threads.back() = std::thread(func, start, end, std::ref(eptrs.back()));
+ for (std::thread& t : threads) {
+ t.join();
+ }
+ for (std::exception_ptr& eptr : eptrs) {
+ if (eptr) {
+ std::rethrow_exception(eptr);
+ }
+ }
+ }
+
+ // Deletion functions
+
+ // Removes an item from a bucket, decrementing the associated counter as
+ // well.
+ void del_from_bucket(Bucket& b, const size_type bucket_ind,
+ const size_type slot) {
+ b.eraseKV(allocator_, slot);
+ --locks_[lock_ind(bucket_ind)].elem_counter();
+ }
+
+ // Empties the table, calling the destructors of all the elements it removes
+ // from the table. It assumes the locks are taken as necessary.
+ cuckoo_status cuckoo_clear() {
+ for (size_type i = 0; i < buckets_.size(); ++i) {
+ buckets_[i].clear(allocator_);
+ }
+ for (size_type i = 0; i < locks_.size(); ++i) {
+ locks_[i].elem_counter() = 0;
+ }
+ return ok;
+ }
+
+ // Rehashing functions
+
+ template <typename LOCK_T>
+ bool cuckoo_rehash(size_type n) {
+ const size_type hp = hashpower();
+ if (n == hp) {
+ return false;
+ }
+ return cuckoo_expand_simple<LOCK_T, manual_resize>(n) == ok;
+ }
+
+ template <typename LOCK_T>
+ bool cuckoo_reserve(size_type n) {
+ const size_type hp = hashpower();
+ const size_type new_hp = reserve_calc(n);
+ if (new_hp == hp) {
+ return false;
+ }
+ return cuckoo_expand_simple<LOCK_T, manual_resize>(new_hp) == ok;
+ }
+
+ // Miscellaneous functions
+
+ void set_hashpower(size_type val) {
+ hashpower_.store(val, std::memory_order_release);
+ }
+
+ // reserve_calc takes in a parameter specifying a certain number of slots
+ // for a table and returns the smallest hashpower that will hold n elements.
+ static size_type reserve_calc(const size_type n) {
+ const size_type buckets = (n + slot_per_bucket() - 1) / slot_per_bucket();
+ size_type blog2;
+ for (blog2 = 1; (1UL << blog2) < buckets; ++blog2);
+ assert(n <= hashsize(blog2) * slot_per_bucket());
+ return blog2;
+ }
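+
+ // For example, assuming four slots per bucket, reserve_calc(1000)
+ // computes 250 buckets and returns hashpower 8, since 2^8 = 256 is the
+ // smallest power of two holding that many buckets (256 * 4 = 1024 slots).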
+
+ // This class is a friend for unit testing
+ friend class UnitTestInternalAccess;
+
+ // Member variables
+
+ // 2**hashpower is the number of buckets. This cannot be changed unless all
+ // the locks are taken on the table. Since it is still read and written by
+ // multiple threads not necessarily synchronized by a lock, we keep it
+ // atomic
+ std::atomic<size_type> hashpower_;
+
+ // The hash function
+ hasher hash_fn_;
+
+ // The equality function
+ key_equal eq_fn_;
+
+ // The allocator
+ allocator_type allocator_;
+
+ // container of buckets. The size or memory location of the buckets cannot
+ // be changed unless all the locks are taken on the table. Thus, it is only
+ // safe to access the buckets_ container when you have at least one lock
+ // held.
+ buckets_t buckets_;
+
+ // array of locks. Marked mutable, so that const methods can take locks.
+ // The array may grow during an expansion, but existing locks stay in
+ // place and are never invalidated.
+ mutable locks_t locks_;
+
+ // a lock to synchronize expansions
+ expansion_lock_t expansion_lock_;
+
+ // stores the minimum load factor allowed for automatic expansions. Whenever
+ // an automatic expansion is triggered (during an insertion where cuckoo
+ // hashing fails, for example), we check the load factor against this
+ // double, and throw an exception if it's lower than this value. It can be
+ // used to signal when the hash function is bad or the input adversarial.
+ std::atomic<double> minimum_load_factor_;
+
+ // stores the maximum hashpower allowed for any expansions. If set to
+ // NO_MAXIMUM_HASHPOWER, this limit will be disregarded.
+ std::atomic<size_type> maximum_hashpower_;
+
+public:
+ /**
+ * An ownership wrapper around a @ref cuckoohash_map table instance. When
+ * given a table instance, it takes all the locks on the table, blocking all
+ * outside operations on the table. Because the locked_table has unique
+ * ownership of the table, it can provide a set of operations on the table
+ * that aren't possible in a concurrent context.
+ *
+ * The locked_table interface is very similar to the STL unordered_map
+ * interface, and for functions whose signatures correspond to unordered_map
+ * methods, the behavior should be mostly the same.
+ */
+ class locked_table {
+ public:
+ /** @name Type Declarations */
+ /**@{*/
+
+ using key_type = cuckoohash_map::key_type;
+ using mapped_type = cuckoohash_map::mapped_type;
+ using value_type = cuckoohash_map::value_type;
+ using size_type = cuckoohash_map::size_type;
+ using difference_type = cuckoohash_map::difference_type;
+ using hasher = cuckoohash_map::hasher;
+ using key_equal = cuckoohash_map::key_equal;
+ using allocator_type = cuckoohash_map::allocator_type;
+ using reference = cuckoohash_map::reference;
+ using const_reference = cuckoohash_map::const_reference;
+ using pointer = cuckoohash_map::pointer;
+ using const_pointer = cuckoohash_map::const_pointer;
+
+ /**
+ * A constant iterator over a @ref locked_table, which allows read-only
+ * access to the elements of the table. It fulfills the
+ * BidirectionalIterator concept.
+ */
+ class const_iterator {
+ public:
+ using difference_type = locked_table::difference_type;
+ using value_type = locked_table::value_type;
+ using pointer = locked_table::const_pointer;
+ using reference = locked_table::const_reference;
+ using iterator_category = std::bidirectional_iterator_tag;
+
+ const_iterator() {}
+
+ // Return true if the iterators are from the same locked table and
+ // location, false otherwise.
+ bool operator==(const const_iterator& it) const {
+ return buckets_ == it.buckets_ &&
+ index_ == it.index_ && slot_ == it.slot_;
+ }
+
+ bool operator!=(const const_iterator& it) const {
+ return !(operator==(it));
+ }
+
+ reference operator*() const {
+ return (*buckets_)[index_].kvpair(slot_);
+ }
+
+ pointer operator->() const {
+ return &(*buckets_)[index_].kvpair(slot_);
+ }
+
+ // Advance the iterator to the next item in the table, or to the end
+ // of the table. Returns the iterator at its new position.
+ const_iterator& operator++() {
+ // Move forward until we get to a slot that is occupied, or we
+ // get to the end
+ ++slot_;
+ for (; index_ < buckets_->size(); ++index_) {
+ for (; slot_ < slot_per_bucket(); ++slot_) {
+ if ((*buckets_)[index_].occupied(slot_)) {
+ return *this;
+ }
+ }
+ slot_ = 0;
+ }
+ assert(std::make_pair(index_, slot_) == end_pos(*buckets_));
+ return *this;
+ }
+
+ // Advance the iterator to the next item in the table, or to the end
+ // of the table. Returns the iterator at its old position.
+ const_iterator operator++(int) {
+ const_iterator old(*this);
+ ++(*this);
+ return old;
+ }
+
+ // Move the iterator back to the previous item in the table. Returns
+ // the iterator at its new position.
+ const_iterator& operator--() {
+ // Move backward until we get to the beginning. Behavior is
+ // undefined if we are iterating at the first element, so we can
+ // assume we'll reach an element. This means we'll never reach
+ // index_ == 0 and slot_ == 0.
+ if (slot_ == 0) {
+ --index_;
+ slot_ = slot_per_bucket() - 1;
+ } else {
+ --slot_;
+ }
+ while (!(*buckets_)[index_].occupied(slot_)) {
+ if (slot_ == 0) {
+ --index_;
+ slot_ = slot_per_bucket() - 1;
+ } else {
+ --slot_;
+ }
+ }
+ return *this;
+ }
+
+ //! Move the iterator back to the previous item in the table.
+ //! Returns the iterator at its old position. Behavior is undefined
+ //! if the iterator is at the beginning.
+ const_iterator operator--(int) {
+ const_iterator old(*this);
+ --(*this);
+ return old;
+ }
+
+ protected:
+ // The buckets owned by the locked table being iterated over. Even
+ // though const_iterator cannot modify the buckets, we don't mark
+ // them const so that the mutable iterator can derive from this
+ // class. Also, since iterators should be default constructible,
+ // copyable, and movable, we have to make this a raw pointer type.
+ buckets_t* buckets_;
+
+ // The bucket index of the item being pointed to. For implementation
+ // convenience, we let it take on negative values.
+ size_type index_;
+
+ // The slot in the bucket of the item being pointed to. For
+ // implementation convenience, we let it take on negative values.
+ size_type slot_;
+
+ // Returns the position signifying the end of the table
+ static std::pair<size_type, size_type>
+ end_pos(const buckets_t& buckets) {
+ return std::make_pair(buckets.size(), 0);
+ }
+
+ // The private constructor is used by locked_table to create
+ // iterators from scratch. If the given index_-slot_ pair is at the
+ // end of the table, or the given spot is occupied, stay. Otherwise,
+ // step forward to the next data item, or to the end of the table.
+ const_iterator(buckets_t& buckets, size_type index,
+ size_type slot) noexcept
+ : buckets_(std::addressof(buckets)), index_(index), slot_(slot) {
+ if (std::make_pair(index_, slot_) != end_pos(*buckets_) &&
+ !(*buckets_)[index_].occupied(slot_)) {
+ operator++();
+ }
+ }
+
+ friend class locked_table;
+ };
+
+ /**
+ * An iterator over a @ref locked_table, which allows read-write access
+ * to elements of the table. It fulfills the BidirectionalIterator
+ * concept.
+ */
+ class iterator : public const_iterator {
+ public:
+ using pointer = cuckoohash_map::pointer;
+ using reference = cuckoohash_map::reference;
+
+ iterator() {}
+
+ bool operator==(const iterator& it) const {
+ return const_iterator::operator==(it);
+ }
+
+ bool operator!=(const iterator& it) const {
+ return const_iterator::operator!=(it);
+ }
+
+ using const_iterator::operator*;
+ reference operator*() {
+ return (*const_iterator::buckets_)[
+ const_iterator::index_].kvpair(const_iterator::slot_);
+ }
+
+ using const_iterator::operator->;
+ pointer operator->() {
+ return &(*const_iterator::buckets_)[
+ const_iterator::index_].kvpair(const_iterator::slot_);
+ }
+
+ iterator& operator++() {
+ const_iterator::operator++();
+ return *this;
+ }
+
+ iterator operator++(int) {
+ iterator old(*this);
+ const_iterator::operator++();
+ return old;
+ }
+
+ iterator& operator--() {
+ const_iterator::operator--();
+ return *this;
+ }
+
+ iterator operator--(int) {
+ iterator old(*this);
+ const_iterator::operator--();
+ return old;
+ }
+
+ private:
+ iterator(buckets_t& buckets, size_type index, size_type slot) noexcept
+ : const_iterator(buckets, index, slot) {}
+
+ friend class locked_table;
+ };
+
+ /**@}*/
+
+ /** @name Table Parameters */
+ /**@{*/
+
+ static constexpr size_type slot_per_bucket() {
+ return cuckoohash_map::slot_per_bucket();
+ }
+
+ /**@}*/
+
+ /** @name Constructors, Destructors, and Assignment */
+ /**@{*/
+
+ locked_table() = delete;
+ locked_table(const locked_table&) = delete;
+ locked_table& operator=(const locked_table&) = delete;
+
+ locked_table(locked_table&& lt) noexcept
+ : map_(std::move(lt.map_)),
+ unlocker_(std::move(lt.unlocker_))
+ {}
+
+ locked_table& operator=(locked_table&& lt) noexcept {
+ unlock();
+ map_ = std::move(lt.map_);
+ unlocker_ = std::move(lt.unlocker_);
+ return *this;
+ }
+
+ /**
+ * Unlocks the table, thereby freeing the locks on the table, but also
+ * invalidating all iterators and table operations with this object. It
+ * is idempotent.
+ */
+ void unlock() {
+ unlocker_.unlock();
+ }
+
+ /**@}*/
+
+ /** @name Table Details
+ *
+ * Methods for getting information about the table. Many are identical
+ * to their @ref cuckoohash_map counterparts. Only new functions or
+ * those with different behavior are documented.
+ *
+ */
+ /**@{*/
+
+ /**
+ * Returns whether the locked table has ownership of the table
+ *
+ * @return true if it still has ownership, false otherwise
+ */
+ bool is_active() const {
+ return unlocker_.is_active();
+ }
+
+ hasher hash_function() const {
+ return map_.get().hash_function();
+ }
+
+ key_equal key_eq() const {
+ return map_.get().key_eq();
+ }
+
+ allocator_type get_allocator() const {
+ return map_.get().get_allocator();
+ }
+
+ size_type hashpower() const {
+ return map_.get().hashpower();
+ }
+
+ size_type bucket_count() const {
+ return map_.get().bucket_count();
+ }
+
+ bool empty() const {
+ return map_.get().empty();
+ }
+
+ size_type size() const {
+ return map_.get().size();
+ }
+
+ size_type capacity() const {
+ return map_.get().capacity();
+ }
+
+ double load_factor() const {
+ return map_.get().load_factor();
+ }
+
+ void minimum_load_factor(const double mlf) {
+ map_.get().minimum_load_factor(mlf);
+ }
+
+ double minimum_load_factor() {
+ return map_.get().minimum_load_factor();
+ }
+
+ void maximum_hashpower(size_type mhp) {
+ map_.get().maximum_hashpower(mhp);
+ }
+
+ size_type maximum_hashpower() {
+ return map_.get().maximum_hashpower();
+ }
+
+ /**@}*/
+
+ /** @name Iterators */
+ /**@{*/
+
+ /**@{*/
+ /**
+ * Returns an iterator to the beginning of the table. If the table is
+ * empty, it will point past the end of the table.
+ *
+ * @return an iterator to the beginning of the table
+ */
+
+ iterator begin() {
+ return iterator(map_.get().buckets_, 0, 0);
+ }
+
+ const_iterator begin() const {
+ return const_iterator(map_.get().buckets_, 0, 0);
+ }
+
+ const_iterator cbegin() const {
+ return begin();
+ }
+
+ /**@}*/
+
+ /**@{*/
+ /**
+ * Returns an iterator past the end of the table.
+ *
+ * @return an iterator past the end of the table
+ */
+
+ iterator end() {
+ const auto end_pos = const_iterator::end_pos(map_.get().buckets_);
+ return iterator(map_.get().buckets_,
+ static_cast<size_type>(end_pos.first),
+ static_cast<size_type>(end_pos.second));
+ }
+
+ const_iterator end() const {
+ const auto end_pos = const_iterator::end_pos(map_.get().buckets_);
+ return const_iterator(map_.get().buckets_,
+ static_cast<size_type>(end_pos.first),
+ static_cast<size_type>(end_pos.second));
+ }
+
+ const_iterator cend() const {
+ return end();
+ }
+
+ /**@}*/
+
+ /**@}*/
+
+ /** @name Modifiers */
+ /**@{*/
+
+ void clear() {
+ map_.get().cuckoo_clear();
+ }
+
+ /**
+ * This behaves like the @c unordered_map::try_emplace method, with the
+ * same argument lifetime properties as @ref cuckoohash_map::insert. It
+ * always invalidates all iterators, since an insertion may displace
+ * existing elements (cuckoo hashing) or trigger a table expansion.
+ */
+ template <typename K, typename... Args>
+ std::pair<iterator, bool> insert(K&& key, Args&&... val) {
+ K k(std::forward<K>(key));
+ hash_value hv = map_.get().hashed_key(k);
+ auto b = map_.get().template snapshot_and_lock_two<locking_inactive>(hv);
+ table_position pos = map_.get().cuckoo_insert_loop(hv, b, k);
+ if (pos.status == ok) {
+ map_.get().add_to_bucket(
+ pos.index, pos.slot, hv.partial, k,
+ std::forward<Args>(val)...);
+ } else {
+ assert(pos.status == failure_key_duplicated);
+ }
+ return std::make_pair(
+ iterator(map_.get().buckets_, pos.index, pos.slot),
+ pos.status == ok);
+ }
+
+ iterator erase(const_iterator pos) {
+ map_.get().del_from_bucket(map_.get().buckets_[pos.index_],
+ pos.index_,
+ pos.slot_);
+ return iterator(map_.get().buckets_, pos.index_, pos.slot_);
+ }
+
+ iterator erase(iterator pos) {
+ map_.get().del_from_bucket(map_.get().buckets_[pos.index_],
+ pos.index_,
+ pos.slot_);
+ return iterator(map_.get().buckets_, pos.index_, pos.slot_);
+ }
+
+ template <typename K>
+ size_type erase(const K& key) {
+ const hash_value hv = map_.get().hashed_key(key);
+ const auto b = map_.get().
+ template snapshot_and_lock_two<locking_inactive>(hv);
+ const table_position pos = map_.get().cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ map_.get().del_from_bucket(map_.get().buckets_[pos.index],
+ pos.index, pos.slot);
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ /**@}*/
+
+ /** @name Lookup */
+ /**@{*/
+
+ template <typename K>
+ iterator find(const K& key) {
+ const hash_value hv = map_.get().hashed_key(key);
+ const auto b = map_.get().
+ template snapshot_and_lock_two<locking_inactive>(hv);
+ const table_position pos = map_.get().cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ return iterator(map_.get().buckets_, pos.index, pos.slot);
+ } else {
+ return end();
+ }
+ }
+
+ template <typename K>
+ const_iterator find(const K& key) const {
+ const hash_value hv = map_.get().hashed_key(key);
+ const auto b = map_.get().
+ template snapshot_and_lock_two<locking_inactive>(hv);
+ const table_position pos = map_.get().cuckoo_find(
+ key, hv.partial, b.first(), b.second());
+ if (pos.status == ok) {
+ return const_iterator(map_.get().buckets_, pos.index, pos.slot);
+ } else {
+ return end();
+ }
+ }
+
+ template <typename K>
+ mapped_type& at(const K& key) {
+ auto it = find(key);
+ if (it == end()) {
+ throw std::out_of_range("key not found in table");
+ } else {
+ return it->second;
+ }
+ }
+
+ template <typename K>
+ const mapped_type& at(const K& key) const {
+ auto it = find(key);
+ if (it == end()) {
+ throw std::out_of_range("key not found in table");
+ } else {
+ return it->second;
+ }
+ }
+
+ /**
+ * This function has the same lifetime properties as @ref
+ * cuckoohash_map::insert, except that if the key is not already in the
+ * table, the mapped value is default-constructed.
+ */
+ template <typename K>
+ T& operator[](K&& key) {
+ auto result = insert(std::forward<K>(key));
+ return result.first->second;
+ }
+
+ template <typename K>
+ size_type count(const K& key) const {
+ const hash_value hv = map_.get().hashed_key(key);
+ const auto b = map_.get().
+ template snapshot_and_lock_two<locking_inactive>(hv);
+ return map_.get().cuckoo_find(
+ key, hv.partial, b.first(), b.second()).status == ok ? 1 : 0;
+ }
+
+ template <typename K>
+ std::pair<iterator, iterator> equal_range(const K& key) {
+ auto it = find(key);
+ if (it == end()) {
+ return std::make_pair(it, it);
+ } else {
+ auto start_it = it++;
+ return std::make_pair(start_it, it);
+ }
+ }
+
+ template <typename K>
+ std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
+ auto it = find(key);
+ if (it == end()) {
+ return std::make_pair(it, it);
+ } else {
+ auto start_it = it++;
+ return std::make_pair(start_it, it);
+ }
+ }
+
+ /**@}*/
+
+ /** @name Re-sizing */
+ /**@{*/
+
+ /**
+ * This has the same behavior as @ref cuckoohash_map::rehash, except
+ * that we don't return anything.
+ */
+ void rehash(size_type n) {
+ map_.get().template cuckoo_rehash<locking_inactive>(n);
+ }
+
+ /**
+ * This has the same behavior as @ref cuckoohash_map::reserve, except
+ * that we don't return anything.
+ */
+ void reserve(size_type n) {
+ map_.get().template cuckoo_reserve<locking_inactive>(n);
+ }
+
+ /**@}*/
+
+ /** @name Comparison */
+ /**@{*/
+
+ bool operator==(const locked_table& lt) const {
+ if (size() != lt.size()) {
+ return false;
+ }
+ for (const auto& elem : lt) {
+ auto it = find(elem.first);
+ if (it == end() || it->second != elem.second) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool operator!=(const locked_table& lt) const {
+ if (size() != lt.size()) {
+ return true;
+ }
+ for (const auto& elem : lt) {
+ auto it = find(elem.first);
+ if (it == end() || it->second != elem.second) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**@}*/
+
+ private:
+ // The constructor locks the entire table. We keep this constructor
+ // private (but expose it to the cuckoohash_map class), since we don't
+ // want users calling it.
+ locked_table(cuckoohash_map& map) noexcept
+ : map_(map), unlocker_(
+ map_.get().template snapshot_and_lock_all<locking_active>())
+ {}
+
+ // A reference to the map owned by the table
+ std::reference_wrapper<cuckoohash_map> map_;
+ // A manager for all the locks we took on the table.
+ AllBuckets<locking_active> unlocker_;
+
+ friend class cuckoohash_map;
+ };
+};
+
+#endif // _CUCKOOHASH_MAP_HH
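
The locked_table interface above is easiest to see in a short sketch. This is only an illustration, not part of the patch: it assumes the usual cuckoohash_map::lock_table() accessor defined earlier in this header, and the key/value types are illustrative.

    // Hedged usage sketch for the locked_table interface documented above.
    // Assumption: the table is obtained via cuckoohash_map::lock_table().
    #include <cassert>
    #include <string>
    #include "cuckoohash_map.hh"

    int main() {
        cuckoohash_map<std::string, int> map;
        {
            auto lt = map.lock_table();   // holds all bucket locks
            lt.insert("answer", 42);      // try_emplace-like semantics
            lt["answer"] += 1;            // operator[] default-constructs missing values
            assert(lt.at("answer") == 43);
            for (const auto &kv : lt) {   // iteration is safe while the table is locked
                (void)kv;
            }
            lt.unlock();                  // idempotent; invalidates iterators and further operations
        }
        return 0;
    }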
diff --git a/ext/include/cuckoo/cuckoohash_util.hh b/ext/include/cuckoo/cuckoohash_util.hh
new file mode 100644
index 0000000..cdb31c6
--- /dev/null
+++ b/ext/include/cuckoo/cuckoohash_util.hh
@@ -0,0 +1,136 @@
+/** \file */
+
+#ifndef _CUCKOOHASH_UTIL_HH
+#define _CUCKOOHASH_UTIL_HH
+
+#include <cstdio>      // fprintf() used by LIBCUCKOO_DBG
+#include <exception>
+#include <thread>
+#include <type_traits> // std::enable_if used by LIBCUCKOO_ENABLE_IF
+#include <utility>
+#include <vector>
+#include "cuckoohash_config.hh" // for LIBCUCKOO_DEBUG
+
+#if LIBCUCKOO_DEBUG
+//! When \ref LIBCUCKOO_DEBUG is nonzero, LIBCUCKOO_DBG prints out status
+//! messages in various situations
+# define LIBCUCKOO_DBG(fmt, ...) \
+ fprintf(stderr, "\x1b[32m""[libcuckoo:%s:%d:%lu] " fmt"" "\x1b[0m", \
+ __FILE__,__LINE__, std::hash<std::thread::id>()(std::this_thread::get_id()), \
+ __VA_ARGS__)
+#else
+//! When \ref LIBCUCKOO_DEBUG is 0, LIBCUCKOO_DBG does nothing
+# define LIBCUCKOO_DBG(fmt, ...) do {} while (0)
+#endif
+
+/**
+ * alignas() requires GCC >= 4.9, so we stick with the alignment attribute for
+ * GCC.
+ */
+#ifdef __GNUC__
+#define LIBCUCKOO_ALIGNAS(x) __attribute__((aligned(x)))
+#else
+#define LIBCUCKOO_ALIGNAS(x) alignas(x)
+#endif
+
+/**
+ * At higher warning levels, MSVC produces an annoying warning that alignment
+ * may cause wasted space: "structure was padded due to __declspec(align())".
+ */
+#ifdef _MSC_VER
+#define LIBCUCKOO_SQUELCH_PADDING_WARNING __pragma(warning(suppress : 4324))
+#else
+#define LIBCUCKOO_SQUELCH_PADDING_WARNING
+#endif
+
+/**
+ * thread_local requires GCC >= 4.8 and is not supported in some clang versions,
+ * so we use __thread if thread_local is not supported
+ */
+#define LIBCUCKOO_THREAD_LOCAL thread_local
+#if defined(__clang__)
+# if !__has_feature(cxx_thread_local)
+# undef LIBCUCKOO_THREAD_LOCAL
+# define LIBCUCKOO_THREAD_LOCAL __thread
+# endif
+#elif defined(__GNUC__)
+# if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+# undef LIBCUCKOO_THREAD_LOCAL
+# define LIBCUCKOO_THREAD_LOCAL __thread
+# endif
+#endif
+
+//! For enabling certain methods based on a condition. Here's an example.
+//! LIBCUCKOO_ENABLE_IF(sizeof(int) == 4, int) method() {
+//! ...
+//! }
+#define LIBCUCKOO_ENABLE_IF(condition, return_type) \
+ template <class Bogus = void*> \
+ typename std::enable_if<sizeof(Bogus) && condition, return_type>::type
+
+/**
+ * Thrown when an automatic expansion is triggered, but the load factor of the
+ * table is below a minimum threshold, which can be set by the \ref
+ * cuckoohash_map::minimum_load_factor method. This can happen if the hash
+ * function does not properly distribute keys, or for certain adversarial
+ * workloads.
+ */
+class libcuckoo_load_factor_too_low : public std::exception {
+public:
+ /**
+ * Constructor
+ *
+ * @param lf the load factor of the table when the exception was thrown
+ */
+ libcuckoo_load_factor_too_low(const double lf)
+ : load_factor_(lf) {}
+
+ /**
+ * @return a descriptive error message
+ */
+ virtual const char* what() const noexcept override {
+ return "Automatic expansion triggered when load factor was below "
+ "minimum threshold";
+ }
+
+ /**
+ * @return the load factor of the table when the exception was thrown
+ */
+ double load_factor() const {
+ return load_factor_;
+ }
+private:
+ const double load_factor_;
+};
+
+/**
+ * Thrown when an expansion is triggered, but the hashpower specified is greater
+ * than the maximum, which can be set with the \ref
+ * cuckoohash_map::maximum_hashpower method.
+ */
+class libcuckoo_maximum_hashpower_exceeded : public std::exception {
+public:
+ /**
+ * Constructor
+ *
+ * @param hp the hash power we were trying to expand to
+ */
+ libcuckoo_maximum_hashpower_exceeded(const size_t hp)
+ : hashpower_(hp) {}
+
+ /**
+ * @return a descriptive error message
+ */
+ virtual const char* what() const noexcept override {
+ return "Expansion beyond maximum hashpower";
+ }
+
+ /**
+ * @return the hashpower we were trying to expand to
+ */
+ size_t hashpower() const {
+ return hashpower_;
+ }
+private:
+ const size_t hashpower_;
+};
+
+#endif // _CUCKOOHASH_UTIL_HH
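
A short, hedged sketch of how the two exception types above surface in practice; it assumes the cuckoohash_map interface that their documentation refers to (insert and the maximum_hashpower setter).

    // Hedged sketch (assumed cuckoohash_map API): capping growth and catching
    // the exceptions declared in cuckoohash_util.hh.
    #include <cstdio>
    #include <string>
    #include "cuckoohash_map.hh"
    #include "cuckoohash_util.hh"

    int main() {
        cuckoohash_map<int, std::string> map;
        map.maximum_hashpower(16);          // any expansion past hashpower 16 must throw
        try {
            for (int i = 0; i < (1 << 20); ++i) {
                map.insert(i, "x");         // eventually forces an expansion
            }
        } catch (const libcuckoo_maximum_hashpower_exceeded &e) {
            std::fprintf(stderr, "refused to grow past hashpower %zu\n", e.hashpower());
        } catch (const libcuckoo_load_factor_too_low &e) {
            std::fprintf(stderr, "expansion triggered at load factor %f\n", e.load_factor());
        }
        return 0;
    }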
diff --git a/ext/include/cuckoo/libcuckoo_lazy_array.hh b/ext/include/cuckoo/libcuckoo_lazy_array.hh
new file mode 100644
index 0000000..99c4b5b
--- /dev/null
+++ b/ext/include/cuckoo/libcuckoo_lazy_array.hh
@@ -0,0 +1,202 @@
+/** \file */
+
+#ifndef _LIBCUCKOO_LAZY_ARRAY_HH
+#define _LIBCUCKOO_LAZY_ARRAY_HH
+
+#include <algorithm>
+#include <array>       // std::array backs the segment table
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+#include "cuckoohash_util.hh"
+
+/**
+ * A fixed-size array, broken up into segments that are dynamically allocated
+ * upon request. It is the user's responsibility to make sure they only access
+ * allocated parts of the array.
+ *
+ * @tparam OFFSET_BITS the number of bits of the index used as the offset within
+ * a segment
+ * @tparam SEGMENT_BITS the number of bits of the index used as the segment
+ * index
+ * @tparam T the type of the elements stored in the container
+ * @tparam Alloc the allocator used to allocate data
+ */
+template <uint8_t OFFSET_BITS, uint8_t SEGMENT_BITS,
+ class T, class Alloc = std::allocator<T>
+ >
+class libcuckoo_lazy_array {
+public:
+ using value_type = T;
+ using allocator_type = Alloc;
+private:
+ using traits_ = std::allocator_traits<allocator_type>;
+public:
+ using size_type = std::size_t;
+ using reference = value_type&;
+ using const_reference = const value_type&;
+
+ static_assert(SEGMENT_BITS + OFFSET_BITS <= sizeof(size_type)*8,
+ "The number of segment and offset bits cannot exceed "
+ " the number of bits in a size_type");
+
+ /**
+ * Default constructor. Creates an empty array with no allocated segments.
+ */
+ libcuckoo_lazy_array(const allocator_type& allocator = Alloc())
+ noexcept(noexcept(Alloc()))
+ : segments_{{nullptr}}, allocated_segments_(0), allocator_(allocator) {}
+
+ /**
+ * Constructs an array with enough segments allocated to fit @p target
+ * elements. Each allocated element is default-constructed.
+ *
+ * @param target the number of elements to allocate space for
+ */
+ libcuckoo_lazy_array(size_type target,
+ const allocator_type& allocator = Alloc())
+ noexcept(noexcept(Alloc()))
+ : libcuckoo_lazy_array(allocator) {
+ segments_.fill(nullptr);
+ resize(target);
+ }
+
+ libcuckoo_lazy_array(const libcuckoo_lazy_array&) = delete;
+ libcuckoo_lazy_array& operator=(const libcuckoo_lazy_array&) = delete;
+
+ /**
+ * Move constructor
+ *
+ * @param arr the array being moved
+ */
+ libcuckoo_lazy_array(libcuckoo_lazy_array&& arr) noexcept
+ : segments_(arr.segments_),
+ allocated_segments_(arr.allocated_segments_),
+ allocator_(std::move(arr.allocator_)) {
+ // Deactivate the array by setting its allocated segment count to 0
+ arr.allocated_segments_ = 0;
+ }
+
+ /**
+ * Destructor. Destroys all elements allocated in the array.
+ */
+ ~libcuckoo_lazy_array()
+ noexcept(std::is_nothrow_destructible<T>::value) {
+ clear();
+ }
+
+ /**
+ * Destroys all elements allocated in the array.
+ */
+ void clear() {
+ for (size_type i = 0; i < allocated_segments_; ++i) {
+ destroy_array(segments_[i]);
+ segments_[i] = nullptr;
+ }
+ }
+
+ /**
+ * Index operator
+ *
+ * @return a reference to the data at the given index
+ */
+ reference operator[](size_type i) {
+ assert(get_segment(i) < allocated_segments_);
+ return segments_[get_segment(i)][get_offset(i)];
+ }
+
+ /**
+ * Const index operator
+ *
+ * @return a const reference to the data at the given index
+ */
+ const_reference operator[](size_type i) const {
+ assert(get_segment(i) < allocated_segments_);
+ return segments_[get_segment(i)][get_offset(i)];
+ }
+
+ /**
+ * Returns the number of elements the array has allocated space for
+ *
+ * @return current size of the array
+ */
+ size_type size() const {
+ return allocated_segments_ * SEGMENT_SIZE;
+ }
+
+ /**
+ * Returns the maximum number of elements the array can hold
+ *
+ * @return maximum size of the array
+ */
+ static constexpr size_type max_size() {
+ return 1UL << (OFFSET_BITS + SEGMENT_BITS);
+ }
+
+ /**
+ * Allocate enough space for @p target elements, not exceeding the capacity
+ * of the array. Under no circumstance will the array be shrunk.
+ *
+ * @param target the number of elements to ensure space is allocated for
+ */
+ void resize(size_type target) {
+ target = std::min(target, max_size());
+ if (target == 0) {
+ return;
+ }
+ const size_type last_segment = get_segment(target - 1);
+ for (size_type i = allocated_segments_; i <= last_segment; ++i) {
+ segments_[i] = create_array();
+ }
+ allocated_segments_ = last_segment + 1;
+ }
+
+private:
+ static constexpr size_type SEGMENT_SIZE = 1UL << OFFSET_BITS;
+ static constexpr size_type NUM_SEGMENTS = 1UL << SEGMENT_BITS;
+ static constexpr size_type OFFSET_MASK = SEGMENT_SIZE - 1;
+
+ std::array<T*, NUM_SEGMENTS> segments_;
+ size_type allocated_segments_;
+ allocator_type allocator_;
+
+ static size_type get_segment(size_type i) {
+ return i >> OFFSET_BITS;
+ }
+
+ static size_type get_offset(size_type i) {
+ return i & OFFSET_MASK;
+ }
+
+ // Allocates a SEGMENT_SIZE-sized array and default-initializes each element
+ typename traits_::pointer create_array() {
+ typename traits_::pointer arr = traits_::allocate(
+ allocator_, SEGMENT_SIZE);
+ // Initialize all the elements, safely deallocating and destroying
+ // everything in case of error.
+ size_type i;
+ try {
+ for (i = 0; i < SEGMENT_SIZE; ++i) {
+ traits_::construct(allocator_, &arr[i]);
+ }
+ } catch (...) {
+ for (size_type j = 0; j < i; ++j) {
+ traits_::destroy(allocator_, &arr[j]);
+ }
+ traits_::deallocate(allocator_, arr, SEGMENT_SIZE);
+ throw;
+ }
+ return arr;
+ }
+
+ // Destroys every element of a SEGMENT_SIZE-sized array and then deallocates
+ // the memory.
+ void destroy_array(typename traits_::pointer arr) {
+ for (size_type i = 0; i < SEGMENT_SIZE; ++i) {
+ traits_::destroy(allocator_, &arr[i]);
+ }
+ traits_::deallocate(allocator_, arr, SEGMENT_SIZE);
+ }
+};
+
+#endif // _LIBCUCKOO_LAZY_ARRAY_HH
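
For reference, a minimal sketch of libcuckoo_lazy_array with small, illustrative template parameters: 4 offset bits (16 elements per segment) and 4 segment bits (16 segments).

    // Minimal sketch of libcuckoo_lazy_array; parameters chosen for illustration.
    #include <cassert>
    #include "libcuckoo_lazy_array.hh"

    int main() {
        libcuckoo_lazy_array<4, 4, int> arr;  // max_size() == 1 << (4 + 4) == 256
        assert(arr.size() == 0);              // no segments allocated yet
        arr.resize(20);                       // allocates segments 0 and 1 (16 elements each)
        assert(arr.size() == 32);             // size() counts whole allocated segments
        arr[19] = 7;                          // only allocated indices may be accessed
        assert(arr[19] == 7);
        return 0;
    }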
diff --git a/ext/include/getopt_pp/getopt_pp.cpp b/ext/include/getopt_pp/getopt_pp.cpp
deleted file mode 100644
index 71ccc65..0000000
--- a/ext/include/getopt_pp/getopt_pp.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
-GetOpt_pp: Yet another C++ version of getopt.
- Copyright (C) 2007, 2008 Daniel Gutson, FuDePAN
-
- This file is part of GetOpt_pp.
-
- GetOpt_pp is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- board-games is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <unistd.h>
-#include "getopt_pp.h"
-
-#if __APPLE__
-extern char** environ;
-#endif
-
-namespace GetOpt {
-
-const char GetOpt_pp::EMPTY_OPTION = 0;
-
-GETOPT_INLINE void GetOpt_pp::_init_flags()
-{
- std::stringstream ss;
- _flags = ss.flags();
-}
-
-GETOPT_INLINE void GetOpt_pp::_parse(int argc, char* argv[])
-{
- OptionData* currentData = NULL;
- _app_name = argv[0];
-
- // parse arguments by their '-' or '--':
- // (this will be a state machine soon)
- for(int i=1; i < argc; i++)
- {
- const char current = argv[i][0];
- const char next = argv[i][1];
-
- if (current == '-' && (isalpha(next) || next == '-' ) )
- {
- // see what's next, differentiate whether it's short or long:
- if (next == '-' && argv[i][2] != 0)
- {
- // long option
- currentData = &_longOps[&argv[i][2]];
- }
- else
- {
- // short option
- // iterate over all of them, keeping the last one in currentData
- // (so the intermediates will generate 'existent' arguments, as of '-abc')
- size_t j=1;
- do
- {
- currentData = &_shortOps[argv[i][j]];
- j++;
- }
- while (argv[i][j] != 0);
- }
- }
- else
- {
- // save value!
- if (currentData == NULL)
- currentData = &_shortOps[EMPTY_OPTION];
-
- currentData->args.push_back(argv[i]);
- }
- }
-
- _last = _Option::OK; // TODO: IMPROVE!!
-}
-
-GETOPT_INLINE void GetOpt_pp::_parse_env()
-{
- // this will be optimized in version 3
- std::string var_name;
- std::string var_value;
- size_t var=0;
- std::string::size_type pos;
- OptionData* data;
-
- while (environ[var] != NULL)
- {
- var_name = environ[var];
- pos = var_name.find('=');
-
- if (pos != std::string::npos)
- {
- var_value = var_name.substr(pos+1);
- var_name = var_name.substr(0, pos);
-
- if (_longOps.find(var_name) == _longOps.end())
- {
- data = &_longOps[var_name];
- data->args.push_back(var_value);
- data->flags = OptionData::Envir;
- }
- }
- else
- (data = &_longOps[var_name])->flags = OptionData::Envir;
-
- var++;
- }
-}
-
-GETOPT_INLINE GetOpt_pp::GetOpt_pp(int argc, char* argv[])
- : _exc(std::ios_base::goodbit)
-{
- _init_flags();
- _parse(argc, argv);
-}
-
-GETOPT_INLINE GetOpt_pp::GetOpt_pp(int argc, char* argv[], _EnvTag)
-{
- _init_flags();
- _parse(argc, argv);
- _parse_env();
-}
-
-GETOPT_INLINE GetOpt_pp& GetOpt_pp::operator >> (const _Option& opt) throw (GetOptEx)
-{
- if (_last != _Option::ParsingError)
- {
- _last = opt(_shortOps, _longOps, _flags);
-
- switch(_last)
- {
- case _Option::OK:
- break;
-
- case _Option::OptionNotFound:
- if (_exc & std::ios_base::eofbit )
- throw OptionNotFoundEx();
- break;
-
- case _Option::BadType:
- if (_exc & std::ios_base::failbit )
- throw InvalidFormatEx();
- break;
-
- case _Option::NoArgs:
- if (_exc & std::ios_base::eofbit )
- throw ArgumentNotFoundEx();
- break;
-
- case _Option::TooManyArgs:
- if (_exc & std::ios_base::failbit )
- throw TooManyArgumentsEx();
- break;
-
- case _Option::OptionNotFound_NoEx:
- break; // Ok, it will be read by casting to bool
-
- case _Option::ParsingError: break; // just to disable warning
- }
- }
- else if (_exc & std::ios_base::failbit )
- throw ParsingErrorEx();
-
- return *this;
-}
-
-GETOPT_INLINE GetOpt_pp& GetOpt_pp::operator >> (std::ios_base& (*iomanip)(std::ios_base&))
-{
- std::stringstream ss;
- ss.flags(_flags);
- _flags = (ss << iomanip).flags();
- return *this;
-}
-
-GETOPT_INLINE bool GetOpt_pp::options_remain() const
-{
- bool remain = false;
- ShortOptions::const_iterator it = _shortOps.begin();
- while (it != _shortOps.end() && !remain)
- {
- remain = (it->second.flags == OptionData::CmdLine_NotExtracted);
- ++it;
- }
-
- if (!remain)
- {
- LongOptions::const_iterator it = _longOps.begin();
- while (it != _longOps.end() && !remain)
- {
- remain = (it->second.flags == OptionData::CmdLine_NotExtracted);
- ++it;
- }
- }
-
- return remain;
-}
-
-}
diff --git a/ext/include/llvm/Support/MathExtras.h b/ext/include/llvm/Support/MathExtras.h
index e6f8ffa..8c0b110 100644
--- a/ext/include/llvm/Support/MathExtras.h
+++ b/ext/include/llvm/Support/MathExtras.h
@@ -19,6 +19,7 @@
#include <cassert>
#include <cstring>
#include <type_traits>
+#include <cstdint>
namespace llvm {
/// \brief The behavior an operation has on an input of 0.
diff --git a/ext/src/CMakeLists.txt b/ext/src/CMakeLists.txt
index b702eb9..9038354 100644
--- a/ext/src/CMakeLists.txt
+++ b/ext/src/CMakeLists.txt
@@ -12,5 +12,7 @@ add_subdirectory(samtools)
add_subdirectory(cppformat)
add_subdirectory(ssw)
add_subdirectory(cityhash)
+add_subdirectory(getopt_pp)
add_subdirectory(llvm)
-add_subdirectory(htrie)
\ No newline at end of file
+add_subdirectory(htrie)
+add_subdirectory(bwa)
diff --git a/ext/src/bwa/CMakeLists.txt b/ext/src/bwa/CMakeLists.txt
new file mode 100644
index 0000000..c1bd5bf
--- /dev/null
+++ b/ext/src/bwa/CMakeLists.txt
@@ -0,0 +1,31 @@
+############################################################################
+# Copyright (c) 2016 Saint Petersburg State University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(bwa C)
+
+find_package(Threads)
+if (CMAKE_USE_PTHREADS_INIT)
+ add_definitions(-DHAVE_PTHREAD)
+endif()
+
+include(CheckFunctionExists)
+include(CheckLibraryExists)
+check_function_exists(shm_open HAVE_SHM_OPEN)
+if (NOT HAVE_SHM_OPEN)
+ check_library_exists(rt shm_open "" HAVE_SHM_OPEN_IN_RT)
+endif()
+if (NOT HAVE_SHM_OPEN AND NOT HAVE_SHM_OPEN_IN_RT)
+ error("Cannot find shm_open()")
+endif()
+
+add_library(bwa STATIC
+ utils.c kthread.c ksw.c bwt.c bntseq.c bwa.c bwamem.c bwamem_pair.c
+ bwamem_extra.c malloc_wrap.c is.c bwtindex.c rope.c rle.c)
+
+target_link_libraries(bwa z m)
+if (HAVE_SHM_OPEN_IN_RT)
+ target_link_libraries(bwa rt)
+endif()
diff --git a/ext/src/bwa/ChangeLog b/ext/src/bwa/ChangeLog
new file mode 100644
index 0000000..403e61f
--- /dev/null
+++ b/ext/src/bwa/ChangeLog
@@ -0,0 +1,3864 @@
+------------------------------------------------------------------------
+r1605 | lh3 | 2010-12-29 20:20:20 -0500 (Wed, 29 Dec 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.9rc1-2 (r1605)
+ * fixed a typo/bug in bwasw
+
+------------------------------------------------------------------------
+r1587 | lh3 | 2010-12-21 18:48:30 -0500 (Tue, 21 Dec 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+a typo in the manual
+
+------------------------------------------------------------------------
+r1586 | lh3 | 2010-12-21 18:47:48 -0500 (Tue, 21 Dec 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/utils.c
+ M /branches/prog/bwa/utils.h
+
+ * bwa-0.5.9rc1-1 (r1586)
+ * a few patches by John
+
+------------------------------------------------------------------------
+r1562 | lh3 | 2010-12-10 01:02:06 -0500 (Fri, 10 Dec 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+
+documentation on specifying @RG
+
+------------------------------------------------------------------------
+r1561 | lh3 | 2010-12-10 00:45:40 -0500 (Fri, 10 Dec 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.9rc1 (r1561)
+
+------------------------------------------------------------------------
+r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/main.c
+
+ * fixed a small memory leak caused by the BAM reader
+ * fixed a memory violation, also in the BAM reader
+
+------------------------------------------------------------------------
+r1559 | lh3 | 2010-12-10 00:10:48 -0500 (Fri, 10 Dec 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/Makefile
+
+change Makefile gcc options
+
+------------------------------------------------------------------------
+r1558 | lh3 | 2010-12-10 00:09:22 -0500 (Fri, 10 Dec 2010) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.8-6 (r1557)
+ * added a little more comments to BWA-SW
+ * randomly choosing a mapping if there are more than one
+
+------------------------------------------------------------------------
+r1557 | lh3 | 2010-12-09 21:58:00 -0500 (Thu, 09 Dec 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2_aux.c
+
+sometimes unmapped reads may not be printed...
+
+------------------------------------------------------------------------
+r1556 | lh3 | 2010-12-09 21:50:26 -0500 (Thu, 09 Dec 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2_aux.c
+
+print unmapped reads
+
+------------------------------------------------------------------------
+r1555 | lh3 | 2010-12-09 21:17:20 -0500 (Thu, 09 Dec 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.8-5 (r1555)
+ * BAM input documentation
+
+------------------------------------------------------------------------
+r1544 | lh3 | 2010-11-23 11:01:41 -0500 (Tue, 23 Nov 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.8-4 (r1544)
+ * supporting adding RG tags and RG lines
+
+------------------------------------------------------------------------
+r1543 | lh3 | 2010-11-23 00:16:40 -0500 (Tue, 23 Nov 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.8-3 (r1543)
+ * fixed a memory leak
+
+------------------------------------------------------------------------
+r1542 | lh3 | 2010-11-22 23:50:56 -0500 (Mon, 22 Nov 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.8-2 (r1542)
+ * fixed a long existing bug in random placement of reads
+
+------------------------------------------------------------------------
+r1541 | lh3 | 2010-11-22 23:27:29 -0500 (Mon, 22 Nov 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ A /branches/prog/bwa/bamlite.c
+ A /branches/prog/bwa/bamlite.h
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+preliminary BAM input support
+
+------------------------------------------------------------------------
+r1537 | lh3 | 2010-10-16 23:46:20 -0400 (Sat, 16 Oct 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwa.1
+
+change version number and ChangeLog
+
+------------------------------------------------------------------------
+r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+
+ * fixed a bug in the scoring matrix
+ * release bwa-0.5.8c (r1536)
+
+------------------------------------------------------------------------
+r1451 | lh3 | 2010-06-15 09:43:52 -0400 (Tue, 15 Jun 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+version change
+
+------------------------------------------------------------------------
+r1450 | lh3 | 2010-06-15 09:42:21 -0400 (Tue, 15 Jun 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+
+ * bwa-0.5.8b (r1450)
+ * fixed a bug in scoring matrix
+
+------------------------------------------------------------------------
+r1445 | lh3 | 2010-06-11 08:58:33 -0400 (Fri, 11 Jun 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+
+fixed a serious bug
+
+------------------------------------------------------------------------
+r1442 | lh3 | 2010-06-08 10:22:14 -0400 (Tue, 08 Jun 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.8 (r1442)
+
+------------------------------------------------------------------------
+r1440 | lh3 | 2010-05-19 13:43:50 -0400 (Wed, 19 May 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-r1440
+ * sorry, forget to remove a debugging line
+
+------------------------------------------------------------------------
+r1439 | lh3 | 2010-05-19 13:43:08 -0400 (Wed, 19 May 2010) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-r1439
+ * fixed a bug in bwasw caused by a recent modification
+ * throwing insane insert size when estimating isize
+
+------------------------------------------------------------------------
+r1425 | lh3 | 2010-04-29 15:15:23 -0400 (Thu, 29 Apr 2010) | 10 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.7-7 (r1425)
+ * fixed a minor bug in bwasw command-line parsing
+ * When band-width is not large enough, bwasw may find two highly
+ overlapping but not completely overlapping alignments. The old
+ version will filter out one of them, which leads to false
+ negatives. The current outputs both. This solution is obviously not
+ ideal. The ideal one would be to increase the band-width and redo the
+ alignment.
+
+
+------------------------------------------------------------------------
+r1399 | lh3 | 2010-04-16 09:20:49 -0400 (Fri, 16 Apr 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.7-6 (r1399)
+ * fixed a typo/bug (by Vaughn Iverson)
+
+------------------------------------------------------------------------
+r1329 | lh3 | 2010-03-19 23:32:46 -0400 (Fri, 19 Mar 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+small correction
+
+------------------------------------------------------------------------
+r1328 | lh3 | 2010-03-19 23:28:44 -0400 (Fri, 19 Mar 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.7-4 (r1328)
+ * automatically adjust ap_prior based on alignment
+
+------------------------------------------------------------------------
+r1327 | lh3 | 2010-03-19 23:02:40 -0400 (Fri, 19 Mar 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.5.7-3 (r1327)
+ * evaluate hits obtained from SW alignment in a more proper way.
+
+------------------------------------------------------------------------
+r1320 | lh3 | 2010-03-17 15:13:22 -0400 (Wed, 17 Mar 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+
+fixed a potential out-of-boundary error. Need more testing.
+
+------------------------------------------------------------------------
+r1319 | lh3 | 2010-03-14 22:44:46 -0400 (Sun, 14 Mar 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+
+insert size is `weird' if the 3rd quatile larger than 100,000bp
+
+------------------------------------------------------------------------
+r1318 | lh3 | 2010-03-14 22:37:35 -0400 (Sun, 14 Mar 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.7-2 (r1318)
+ * in sampe, allow to disable insert size estimate
+
+------------------------------------------------------------------------
+r1317 | lh3 | 2010-03-14 22:14:14 -0400 (Sun, 14 Mar 2010) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/solid2fastq.pl
+
+ * bwa-0.5.7-1 (r1317)
+ * fixed a potential bug in solid2fastq.pl
+ * fixed a bug in calculating mapping quality (by Rodrigo Goya)
+ * fixed a very rare bug (if ever occur) about pairing
+
+------------------------------------------------------------------------
+r1310 | lh3 | 2010-03-01 10:35:45 -0500 (Mon, 01 Mar 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.7
+
+------------------------------------------------------------------------
+r1309 | lh3 | 2010-02-26 21:42:22 -0500 (Fri, 26 Feb 2010) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.6-2 (r1309)
+ * fixed an unfixed bug (by Carol Scott)
+ * fixed some tiny formatting
+
+------------------------------------------------------------------------
+r1305 | lh3 | 2010-02-25 13:47:58 -0500 (Thu, 25 Feb 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.6-1 (r1304)
+ * optionally write output to a file (by Tim Fennel)
+
+------------------------------------------------------------------------
+r1303 | lh3 | 2010-02-10 23:43:48 -0500 (Wed, 10 Feb 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.6
+
+------------------------------------------------------------------------
+r1302 | lh3 | 2010-02-10 11:11:49 -0500 (Wed, 10 Feb 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-10 (r1302)
+ * improve max insert size estimate (method suggested by Gerton Lunter)
+
+------------------------------------------------------------------------
+r1301 | lh3 | 2010-02-09 16:15:28 -0500 (Tue, 09 Feb 2010) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-9 (r1301)
+ * improve mapping quality calculation for abnomalous pairs
+ * fixed a bug in multiple hits
+ * SOLiD multiple hits should work now
+
+------------------------------------------------------------------------
+r1300 | lh3 | 2010-02-09 12:50:02 -0500 (Tue, 09 Feb 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-8 (r1300)
+ * output kurtosis
+
+------------------------------------------------------------------------
+r1299 | lh3 | 2010-02-09 12:33:34 -0500 (Tue, 09 Feb 2010) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-7 (r1299)
+ * calculate skewness in sampe
+ * increase min_len in SW to 20
+ * perform more SW to fix discordant pairs
+
+------------------------------------------------------------------------
+r1298 | lh3 | 2010-02-08 12:40:31 -0500 (Mon, 08 Feb 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/cs2nt.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.5.5-6 (r1297)
+ * prepare to replace all 16-bit CIGAR (patches by Rodrigo Goya)
+
+------------------------------------------------------------------------
+r1297 | lh3 | 2010-02-05 22:26:11 -0500 (Fri, 05 Feb 2010) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/solid2fastq.pl
+
+the old fix seems not working!
+
+------------------------------------------------------------------------
+r1296 | lh3 | 2010-02-05 21:51:03 -0500 (Fri, 05 Feb 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-5 (r1296)
+ * fixed a minor issue that the lower bound of insert size is not correctly set.
+
+------------------------------------------------------------------------
+r1295 | lh3 | 2010-02-05 21:01:10 -0500 (Fri, 05 Feb 2010) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-4 (r1295)
+ * fixed a memory leak
+ * change the behaviour of -n (samse and sampe)
+ * change the default of -n
+
+------------------------------------------------------------------------
+r1294 | lh3 | 2010-02-05 17:24:06 -0500 (Fri, 05 Feb 2010) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-3 (r1294)
+ * improved multi-hit report
+
+------------------------------------------------------------------------
+r1293 | lh3 | 2010-02-05 12:57:38 -0500 (Fri, 05 Feb 2010) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/cs2nt.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/solid2fastq.pl
+
+ * bwa-0.5.5-2 (r1293)
+ * bugfix: truncated quality string
+ * bugfix: quality -1 in solid->fastq conversion
+ * bugfix: color reads on the reverse strand is not complemented
+
+------------------------------------------------------------------------
+r1279 | lh3 | 2009-11-23 22:42:34 -0500 (Mon, 23 Nov 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bntseq.h
+ M /branches/prog/bwa/bwase.c
+ A /branches/prog/bwa/bwase.h
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.5-1 (r1279)
+ * incorporate changes from Matt Hanna for Java bindings.
+
+------------------------------------------------------------------------
+r1275 | lh3 | 2009-11-10 22:13:10 -0500 (Tue, 10 Nov 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+
+update ChangeLog
+
+------------------------------------------------------------------------
+r1273 | lh3 | 2009-11-10 22:08:16 -0500 (Tue, 10 Nov 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+ A /branches/prog/bwa/qualfa2fq.pl
+
+Release bwa-0.5.5 (r1273)
+
+------------------------------------------------------------------------
+r1272 | lh3 | 2009-11-10 22:02:50 -0500 (Tue, 10 Nov 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.4-3 (r1272)
+ * fixed another typo which may lead to incorrect single-end mapping quality
+
+------------------------------------------------------------------------
+r1271 | lh3 | 2009-11-10 21:59:47 -0500 (Tue, 10 Nov 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.4-2 (r1271)
+ * fixed a serious typo/bug which does not hurt if we allow one gap open
+ and work with <200bp reads, but causes segfault for long reads.
+
+------------------------------------------------------------------------
+r1270 | lh3 | 2009-11-09 23:12:42 -0500 (Mon, 09 Nov 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/cs2nt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.4-1 (r1270)
+ * fixed a bug in color alignment
+
+------------------------------------------------------------------------
+r1245 | lh3 | 2009-10-09 07:42:52 -0400 (Fri, 09 Oct 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.4
+
+------------------------------------------------------------------------
+r1244 | lh3 | 2009-10-09 05:53:52 -0400 (Fri, 09 Oct 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+
+ * bwa-0.5.3-4 (r1244)
+ * output the clipped length in XC:i: tag
+ * skip mate alignment when stdaln is buggy
+ * fixed a bug in NM:i: tag
+
+------------------------------------------------------------------------
+r1243 | lh3 | 2009-10-07 08:15:04 -0400 (Wed, 07 Oct 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.3-3 (r1243)
+ * sampe: fixed a bug when a read sequence is identical its reverse complement.
+
+------------------------------------------------------------------------
+r1242 | lh3 | 2009-10-07 07:49:13 -0400 (Wed, 07 Oct 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.3-2 (r1242)
+ * sampe: optionall preload the full index into memory
+ * aln: change the default seed length to 32bp
+
+------------------------------------------------------------------------
+r1238 | lh3 | 2009-09-26 18:38:15 -0400 (Sat, 26 Sep 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/khash.h
+
+Improve portability of khash.h
+
+------------------------------------------------------------------------
+r1228 | lh3 | 2009-09-15 09:20:22 -0400 (Tue, 15 Sep 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/main.c
+
+fixed a typo
+
+------------------------------------------------------------------------
+r1227 | lh3 | 2009-09-15 09:19:35 -0400 (Tue, 15 Sep 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.3-1 (r1226)
+ * in dBWT-SW, optionall use hard clipping instead of soft clipping
+
+------------------------------------------------------------------------
+r1225 | lh3 | 2009-09-15 08:32:30 -0400 (Tue, 15 Sep 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.3 (r1225)
+
+------------------------------------------------------------------------
+r1223 | lh3 | 2009-09-13 07:30:41 -0400 (Sun, 13 Sep 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.2
+
+------------------------------------------------------------------------
+r1222 | lh3 | 2009-09-11 09:11:39 -0400 (Fri, 11 Sep 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.1-5 (r1222)
+ * fixed a typo. No real change
+
+------------------------------------------------------------------------
+r1221 | lh3 | 2009-09-11 09:09:44 -0400 (Fri, 11 Sep 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.1-4 (r1221)
+ * trim reads before alignment
+
+------------------------------------------------------------------------
+r1216 | lh3 | 2009-09-08 17:50:15 -0400 (Tue, 08 Sep 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.1-3 (r1216)
+ * fixed a bug about NM tags for gapped alignment
+ * print SAM header
+
+------------------------------------------------------------------------
+r1215 | lh3 | 2009-09-08 17:14:42 -0400 (Tue, 08 Sep 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.1-2 (r1215)
+ * fixed a bug when read lengths vary (by John Marshall)
+
+------------------------------------------------------------------------
+r1213 | lh3 | 2009-09-06 18:58:15 -0400 (Sun, 06 Sep 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.1-1 (r1213)
+ * change default -T to 30
+
+------------------------------------------------------------------------
+r1209 | lh3 | 2009-09-02 06:06:02 -0400 (Wed, 02 Sep 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.1
+
+------------------------------------------------------------------------
+r1208 | lh3 | 2009-09-02 05:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+
+ * ChangeLog
+
+------------------------------------------------------------------------
+r1206 | lh3 | 2009-08-30 18:27:30 -0400 (Sun, 30 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.0-6 (r1206)
+ * fixed two bugs caused by previous modification
+
+------------------------------------------------------------------------
+r1205 | lh3 | 2009-08-30 17:28:36 -0400 (Sun, 30 Aug 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.0-4 (r1205)
+ * reduce false coordinates and CIGAR when a query bridges two reference
+ sequences, although some very rare cases may fail bwa.
+
+------------------------------------------------------------------------
+r1204 | lh3 | 2009-08-30 06:06:16 -0400 (Sun, 30 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.0-3 (r1204)
+ * choose one repetitive hit to extend
+
+------------------------------------------------------------------------
+r1203 | lh3 | 2009-08-29 18:11:51 -0400 (Sat, 29 Aug 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.0-2 (r1203)
+ * dBWT-SW: change a parameter in calculating mapping quality
+ * fixed a bug in samse
+
+------------------------------------------------------------------------
+r1202 | lh3 | 2009-08-28 19:48:41 -0400 (Fri, 28 Aug 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.5.0-1 (r1202)
+ * change default band width to 50
+ * improve mapping quality a bit
+
+------------------------------------------------------------------------
+r1200 | lh3 | 2009-08-20 06:21:24 -0400 (Thu, 20 Aug 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.5.0 (r1200)
+
+------------------------------------------------------------------------
+r1199 | lh3 | 2009-08-20 04:49:15 -0400 (Thu, 20 Aug 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwa.1
+
+Updated ChangeLog and the manual
+
+------------------------------------------------------------------------
+r1198 | lh3 | 2009-08-19 11:09:15 -0400 (Wed, 19 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-36 (r1198)
+ * simplify duphits removal. The accuracy is changed a tiny bit, sometimes better, sometimes worse.
+
+------------------------------------------------------------------------
+r1197 | lh3 | 2009-08-19 08:15:05 -0400 (Wed, 19 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2_aux.c
+ A /branches/prog/bwa/bwtsw2_chain.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-35 (r1197)
+ * further heuristic acceleration for long queries
+
+------------------------------------------------------------------------
+r1196 | lh3 | 2009-08-18 06:54:03 -0400 (Tue, 18 Aug 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-34 (r1196)
+ * updated the manual page
+ * output base quality if the input is fastq
+
+------------------------------------------------------------------------
+r1195 | lh3 | 2009-08-18 06:23:00 -0400 (Tue, 18 Aug 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/simple_dp.c
+
+ * bwa-0.4.9-33 (r1191)
+ * fixed a bug in sampe/samse when gaps occur to the 5'-end in SW alignment
+ * in dbwtsw adjust -T and -c according to -a
+
+------------------------------------------------------------------------
+r1192 | lh3 | 2009-08-13 05:37:28 -0400 (Thu, 13 Aug 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update manual
+
+------------------------------------------------------------------------
+r1191 | lh3 | 2009-08-12 19:40:51 -0400 (Wed, 12 Aug 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwtsw2_main.c
+
+update documentation
+
+------------------------------------------------------------------------
+r1190 | lh3 | 2009-08-12 08:56:10 -0400 (Wed, 12 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-32 (r1190)
+ * only help messages are changed
+
+------------------------------------------------------------------------
+r1189 | lh3 | 2009-08-11 09:28:55 -0400 (Tue, 11 Aug 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-31 (r1189)
+ * in bwape/bwase, print CIGAR "*" if the read is unmapped
+ * improved the calculation of mapping quality
+
+------------------------------------------------------------------------
+r1181 | lh3 | 2009-08-03 12:09:41 -0400 (Mon, 03 Aug 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+
+fflush()
+
+------------------------------------------------------------------------
+r1180 | lh3 | 2009-08-03 12:08:46 -0400 (Mon, 03 Aug 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-30 (r1180)
+ * fixed a memory problem
+ * multi-threading sometimes does not work...
+
+------------------------------------------------------------------------
+r1179 | lh3 | 2009-08-03 11:04:39 -0400 (Mon, 03 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-29 (r1179)
+ * preliminary mutli-threading support in dbwtsw
+
+------------------------------------------------------------------------
+r1178 | lh3 | 2009-08-03 09:14:54 -0400 (Mon, 03 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-28 (r1178)
+ * fixed a bug in printing repetitive hits
+
+------------------------------------------------------------------------
+r1177 | lh3 | 2009-08-03 05:03:42 -0400 (Mon, 03 Aug 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-27 (r1177)
+ * bwtsw2: fixed a hidden memory leak
+
+------------------------------------------------------------------------
+r1176 | lh3 | 2009-07-31 10:58:24 -0400 (Fri, 31 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-26
+ * change the way mapping quality is calculated
+
+------------------------------------------------------------------------
+r1175 | lh3 | 2009-07-31 09:15:54 -0400 (Fri, 31 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-25
+ * code clean up
+ * automatically adjust ->t and ->is_rev based on input
+
+------------------------------------------------------------------------
+r1174 | lh3 | 2009-07-30 08:50:25 -0400 (Thu, 30 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-24
+ * fixed a bug in printing the hits
+
+------------------------------------------------------------------------
+r1173 | lh3 | 2009-07-29 18:32:43 -0400 (Wed, 29 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-23
+ * allow to skip reverse alignment
+ * increase opt->t to 37
+
+------------------------------------------------------------------------
+r1172 | lh3 | 2009-07-29 17:22:39 -0400 (Wed, 29 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-22
+ * report if the hit is found in both directions
+
+------------------------------------------------------------------------
+r1171 | lh3 | 2009-07-29 17:12:02 -0400 (Wed, 29 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-21
+ * dbwtsw: map to both forward and reverse BWT to reduce false alignment
+
+------------------------------------------------------------------------
+r1170 | lh3 | 2009-07-29 15:25:14 -0400 (Wed, 29 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+save hits before cut_tail()
+
+------------------------------------------------------------------------
+r1169 | lh3 | 2009-07-29 08:06:01 -0400 (Wed, 29 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.4.9-19
+ * use a global memory pool to reduce the CPU time spent on malloc/free().
+
+------------------------------------------------------------------------
+r1168 | lh3 | 2009-07-29 06:13:29 -0400 (Wed, 29 Jul 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-18
+ * reduce unnecessary extension to the 5'-end
+ * allow to use different interval size for the 2 rounds
+ * change default parameters
+
+------------------------------------------------------------------------
+r1167 | lh3 | 2009-07-28 19:06:17 -0400 (Tue, 28 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-17
+ * dbwtsw: fixed THE memory leak.
+
+------------------------------------------------------------------------
+r1166 | lh3 | 2009-07-28 16:31:41 -0400 (Tue, 28 Jul 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+
+ * bwa-0.4.9-16
+ * fixed a memory leak
+ * a small memory leak still occurs to bwtsw2_core(). I will work on that later.
+ * changed the default parameters
+
+------------------------------------------------------------------------
+r1165 | lh3 | 2009-07-28 10:15:40 -0400 (Tue, 28 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+
+ * bwa-0.4.9-15
+ * generate CIGAR right before output. This saves unnecessary computation.
+ * this version may be buggy as I have not tested it.
+
+------------------------------------------------------------------------
+r1164 | lh3 | 2009-07-28 09:04:14 -0400 (Tue, 28 Jul 2009) | 11 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.4.9-14
+
+ * deplete unique hits in dbwtsw and postprocess them with standard sw
+
+ * in principle, this strategy should be faster and more accurate, but I
+ have not tested this point. I may switch back to the old method if
+ this does not work.
+
+ * the code looks quite nasty now. It needs a cleanup...
+
+
+------------------------------------------------------------------------
+r1163 | lh3 | 2009-07-27 17:41:10 -0400 (Mon, 27 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+
+change a default parameter
+
+------------------------------------------------------------------------
+r1162 | lh3 | 2009-07-27 17:04:35 -0400 (Mon, 27 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-13
+ * dbwtsw: switch between small and large Z-best
+
+------------------------------------------------------------------------
+r1161 | lh3 | 2009-07-27 12:17:41 -0400 (Mon, 27 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-12
+ * changed the default -z to 100
+ * heuristically speed up alignments for polyA reads
+
+------------------------------------------------------------------------
+r1160 | lh3 | 2009-07-27 07:50:57 -0400 (Mon, 27 Jul 2009) | 6 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-11
+
+ * dbwtsw potentially generates fewer false alignments, although in
+ practice, the modification brings no improvement.
+
+
+------------------------------------------------------------------------
+r1159 | lh3 | 2009-07-27 04:37:02 -0400 (Mon, 27 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-10
+ * disabled debugging code
+ * add "BAM_FMU" if both ends are unmapped
+
+------------------------------------------------------------------------
+r1158 | lh3 | 2009-07-24 09:36:52 -0400 (Fri, 24 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/main.c
+
+nothing, really
+
+------------------------------------------------------------------------
+r1157 | lh3 | 2009-07-24 09:05:44 -0400 (Fri, 24 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-9
+ * bwtsw2: generate SAM output
+
+------------------------------------------------------------------------
+r1156 | lh3 | 2009-07-24 05:42:47 -0400 (Fri, 24 Jul 2009) | 6 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-8
+
+ * fixed a weird infinite loop which only happens with icc -O3. Thanks to John
+ Marshall for the fix.
+
+
+------------------------------------------------------------------------
+r1155 | lh3 | 2009-07-24 05:28:40 -0400 (Fri, 24 Jul 2009) | 8 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-7
+
+ * fixed a typo in bwtsw2 alignment. Now score from the standard SW
+ seems to agree with score from bwtsw2, except that in reporting
+ alignments, bwtsw2 may report non-optimal segments. This is expected,
+ though. I will improve this in the future.
+
+
+------------------------------------------------------------------------
+r1154 | lh3 | 2009-07-23 17:40:20 -0400 (Thu, 23 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * aln_left_core() seems to work properly
+ * aln_local_core() has a bug... A LONG-STANDING BUG!!!!!!!!!!!
+
+------------------------------------------------------------------------
+r1153 | lh3 | 2009-07-23 17:06:09 -0400 (Thu, 23 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/stdaln.c
+
+removed debugging code...
+
+------------------------------------------------------------------------
+r1152 | lh3 | 2009-07-23 17:01:00 -0400 (Thu, 23 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/stdaln.c
+
+ * radical changes failed...
+ * fixed a bug
+
+------------------------------------------------------------------------
+r1151 | lh3 | 2009-07-23 14:46:35 -0400 (Thu, 23 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/stdaln.c
+
+temporary changes. Will apply some radical changes to this file...
+
+------------------------------------------------------------------------
+r1150 | lh3 | 2009-07-23 10:09:56 -0400 (Thu, 23 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/stdaln.c
+
+fixed a long-existing bug in Smith-Waterman alignment
+
+------------------------------------------------------------------------
+r1149 | lh3 | 2009-07-23 08:50:52 -0400 (Thu, 23 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/simple_dp.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.4.9-6
+ * unexplained inconsistency still occurs, but the results largely look reasonable.
+
+------------------------------------------------------------------------
+r1148 | lh3 | 2009-07-23 08:07:29 -0400 (Thu, 23 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/stdaln.c
+
+half DP
+
+------------------------------------------------------------------------
+r1147 | lh3 | 2009-07-22 08:03:06 -0400 (Wed, 22 Jul 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+
+a bit of code cleanup
+
+------------------------------------------------------------------------
+r1145 | lh3 | 2009-07-21 15:52:05 -0400 (Tue, 21 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-5
+ * fixed a bug in determining sub-optimal hits
+ * removed some debugging code
+
+------------------------------------------------------------------------
+r1144 | lh3 | 2009-07-21 10:17:29 -0400 (Tue, 21 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-4
+ * better cmd interface
+ * faster speed
+
+------------------------------------------------------------------------
+r1143 | lh3 | 2009-07-20 16:38:18 -0400 (Mon, 20 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+bwtsw2 (dBWT-SW) is apparently working...
+
+
+------------------------------------------------------------------------
+r1139 | lh3 | 2009-07-15 05:52:18 -0400 (Wed, 15 Jul 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.9-2
+ * bwtsw2: change cut_tail() such that it is faster but more likely to
+ miss true hits
+
+------------------------------------------------------------------------
+r1138 | lh3 | 2009-07-15 05:18:42 -0400 (Wed, 15 Jul 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ A /branches/prog/bwa/bwt_lite.c
+ A /branches/prog/bwa/bwt_lite.h
+ A /branches/prog/bwa/bwtsw2.h
+ A /branches/prog/bwa/bwtsw2_aux.c
+ A /branches/prog/bwa/bwtsw2_core.c
+ A /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * bwa-0.4.9-1
+ * added back bwtsw2
+
+------------------------------------------------------------------------
+r1075 | lh3 | 2009-05-19 05:14:50 -0400 (Tue, 19 May 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.9
+
+------------------------------------------------------------------------
+r1073 | lh3 | 2009-05-18 17:13:19 -0400 (Mon, 18 May 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.8
+
+------------------------------------------------------------------------
+r1069 | lh3 | 2009-05-14 09:54:54 -0400 (Thu, 14 May 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.7-2
+ * change the default of "aln -R" to 30
+
+------------------------------------------------------------------------
+r1068 | lh3 | 2009-05-14 09:27:55 -0400 (Thu, 14 May 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.7-1
+ * search for suboptimal hits if the top hit is not so repetitive
+
+------------------------------------------------------------------------
+r1066 | lh3 | 2009-05-12 15:31:31 -0400 (Tue, 12 May 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.7
+
+------------------------------------------------------------------------
+r1065 | lh3 | 2009-05-12 15:20:40 -0400 (Tue, 12 May 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.6-9
+ * fixed compiling errors on some Linux machines
+
+------------------------------------------------------------------------
+r1064 | lh3 | 2009-05-12 07:30:46 -0400 (Tue, 12 May 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.6-8
+ * avoid compilation error on some systems.
+
+------------------------------------------------------------------------
+r1035 | lh3 | 2009-05-09 05:41:33 -0400 (Sat, 09 May 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.6-7
+ * fixed an integer overflow caused by previous modifications
+ * made insert size estimation more robust
+
+------------------------------------------------------------------------
+r1008 | lh3 | 2009-04-29 05:41:58 -0400 (Wed, 29 Apr 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.6-5
+ * fixed an integer overflow problem which may cause a segfault in very rare cases
+ * made XN tags more accurate
+
+------------------------------------------------------------------------
+r1005 | lh3 | 2009-04-27 07:37:23 -0400 (Mon, 27 Apr 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/simple_dp.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.4.6-4
+ * heuristic rules to detect suboptimal alignment
+ * stdsw: support double-strand and protein alignment
+
+------------------------------------------------------------------------
+r1003 | lh3 | 2009-04-26 12:48:19 -0400 (Sun, 26 Apr 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/simple_dp.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.4.6-2
+ * improve the functionality of stdsw
+ * allow adding a threshold on the SW alignment. Hope this does not introduce new bugs...
+
+------------------------------------------------------------------------
+r1002 | lh3 | 2009-04-22 03:56:15 -0400 (Wed, 22 Apr 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.6-1
+ * output SM and AM tag
+
+------------------------------------------------------------------------
+r914 | lh3 | 2009-03-09 17:53:50 -0400 (Mon, 09 Mar 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.6
+
+------------------------------------------------------------------------
+r913 | lh3 | 2009-03-09 17:23:24 -0400 (Mon, 09 Mar 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ A /branches/prog/bwa/solid2fastq.pl
+
+ * added notes to bwa
+ * added a script to convert SOLiD reads
+ * updated documentations
+
+------------------------------------------------------------------------
+r912 | lh3 | 2009-03-09 16:57:05 -0400 (Mon, 09 Mar 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/kstring.c
+ M /branches/prog/bwa/main.c
+
+fixed a bug in kstring
+
+------------------------------------------------------------------------
+r881 | lh3 | 2009-03-02 15:36:06 -0500 (Mon, 02 Mar 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtmisc.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.5-7
+ * fixed a bug in pac2cspac
+
+------------------------------------------------------------------------
+r880 | lh3 | 2009-03-01 16:34:08 -0500 (Sun, 01 Mar 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+
+disable debugging
+
+------------------------------------------------------------------------
+r879 | lh3 | 2009-03-01 16:28:04 -0500 (Sun, 01 Mar 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/cs2nt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.5-6
+ * fixed problems with coordinates for color gapped alignment
+
+------------------------------------------------------------------------
+r878 | lh3 | 2009-03-01 13:43:09 -0500 (Sun, 01 Mar 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/cs2nt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.5-5
+ * added support for gapped color alignment
+
+------------------------------------------------------------------------
+r877 | lh3 | 2009-03-01 10:27:52 -0500 (Sun, 01 Mar 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/cs2nt.c
+ M /branches/prog/bwa/main.c
+
+ * convert cs (color-space) reads to nt (nucleotide) reads (for ungapped alignment only)
+
+------------------------------------------------------------------------
+r860 | lh3 | 2009-02-27 08:58:39 -0500 (Fri, 27 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwase.c
+ A /branches/prog/bwa/cs2nt.c
+
+prepare to implement cs->nt conversion (have not yet...)
+
+------------------------------------------------------------------------
+r859 | lh3 | 2009-02-27 07:00:03 -0500 (Fri, 27 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bntseq.h
+ M /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/bwtmisc.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * bwa-0.4.5-3
+ * generate color index from nucleotide fasta reference
+
+------------------------------------------------------------------------
+r857 | lh3 | 2009-02-26 10:22:58 -0500 (Thu, 26 Feb 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.5-2
+ * improved mapping quality a bit if one end falls in a tandem repeat
+ but the mate is unique.
+
+------------------------------------------------------------------------
+r856 | lh3 | 2009-02-26 10:02:29 -0500 (Thu, 26 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.5-1
+ * make bwa work for SOLiD reads
+
+------------------------------------------------------------------------
+r828 | lh3 | 2009-02-18 17:36:41 -0500 (Wed, 18 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.5
+
+------------------------------------------------------------------------
+r827 | lh3 | 2009-02-18 16:48:48 -0500 (Wed, 18 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.4.4-6
+ * fixed a bug in SW alignment when no residue matches
+
+------------------------------------------------------------------------
+r824 | lh3 | 2009-02-17 05:33:07 -0500 (Tue, 17 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.4-5
+ * fixed that boundary bug
+
+------------------------------------------------------------------------
+r823 | lh3 | 2009-02-17 04:54:18 -0500 (Tue, 17 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwape.c
+
+just change some logging information
+
+------------------------------------------------------------------------
+r822 | lh3 | 2009-02-17 04:20:39 -0500 (Tue, 17 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update manual
+
+------------------------------------------------------------------------
+r821 | lh3 | 2009-02-17 04:11:14 -0500 (Tue, 17 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.4-4
+ * fixed a bug on boundary check in pair_sw
+
+------------------------------------------------------------------------
+r820 | lh3 | 2009-02-16 17:43:37 -0500 (Mon, 16 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.4-3
+ * allow changing the mismatch penalty
+
+------------------------------------------------------------------------
+r819 | lh3 | 2009-02-16 17:40:28 -0500 (Mon, 16 Feb 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.4-2
+ * remove timer
+ * allow changing the default gapo and gape penalties at the command line
+
+------------------------------------------------------------------------
+r818 | lh3 | 2009-02-16 09:30:51 -0500 (Mon, 16 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update benchmark
+
+------------------------------------------------------------------------
+r817 | lh3 | 2009-02-16 08:44:40 -0500 (Mon, 16 Feb 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/kvec.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.4-1
+ * automatically detect insert size
+ * use insert size in pairing. This may potentially improve accuracy (untested!)
+
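+ One common way to detect the insert size automatically, shown here only as a
+ hypothetical sketch rather than bwa's actual code, is to take the observed
+ fragment lengths of confidently mapped pairs, drop outliers with an
+ interquartile-range filter, and compute the mean and standard deviation of
+ what remains; infer_isize() below is an invented name.
+
+     #include <stdio.h>
+     #include <stdlib.h>
+     #include <math.h>
+
+     static int cmp_int(const void *a, const void *b)
+     {
+         return *(const int*)a - *(const int*)b;
+     }
+
+     /* estimate mean/sd of the insert size, ignoring observations outside
+      * [Q1 - 3*IQR, Q3 + 3*IQR] of the sorted lengths */
+     void infer_isize(int *len, int n, double *mean, double *sd)
+     {
+         int i, cnt = 0;
+         double q1, q3, lo, hi, sum = 0., sum2 = 0.;
+         qsort(len, n, sizeof(int), cmp_int);
+         q1 = len[n/4]; q3 = len[3*n/4];
+         lo = q1 - 3. * (q3 - q1); hi = q3 + 3. * (q3 - q1);
+         for (i = 0; i < n; ++i) {
+             if (len[i] < lo || len[i] > hi) continue; /* skip outliers */
+             sum += len[i]; sum2 += (double)len[i] * len[i]; ++cnt;
+         }
+         *mean = sum / cnt;
+         *sd = sqrt(sum2 / cnt - *mean * *mean);
+     }
+
+     int main(void)
+     {
+         int obs[] = { 180, 200, 210, 195, 205, 2000, 190, 199, 202, 198 };
+         double m, s;
+         infer_isize(obs, sizeof(obs) / sizeof(obs[0]), &m, &s);
+         printf("mean=%.1f sd=%.1f\n", m, s); /* the 2000 outlier is ignored */
+         return 0;
+     }
+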
+------------------------------------------------------------------------
+r814 | lh3 | 2009-02-15 11:10:23 -0500 (Sun, 15 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.4
+
+------------------------------------------------------------------------
+r813 | lh3 | 2009-02-15 10:22:50 -0500 (Sun, 15 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.3-5
+ * impose boundary check in refine_gapped
+
+------------------------------------------------------------------------
+r811 | lh3 | 2009-02-14 09:46:13 -0500 (Sat, 14 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.3-4
+ * change MD tag to match the latest SAM specification
+
+------------------------------------------------------------------------
+r810 | lh3 | 2009-02-13 04:46:04 -0500 (Fri, 13 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+
+update ChangeLog
+
+------------------------------------------------------------------------
+r799 | lh3 | 2009-02-05 12:01:17 -0500 (Thu, 05 Feb 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+change MD tag to meet the latest SAM specification
+
+------------------------------------------------------------------------
+r796 | lh3 | 2009-02-05 08:35:13 -0500 (Thu, 05 Feb 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.3-2
+ * fixed a bug on counting 'N'
+
+------------------------------------------------------------------------
+r795 | lh3 | 2009-02-05 07:41:27 -0500 (Thu, 05 Feb 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.3-1
+ * fixed potential boundary problems
+ * update benchmark result
+
+------------------------------------------------------------------------
+r791 | lh3 | 2009-01-25 05:20:47 -0500 (Sun, 25 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update some numbers
+
+------------------------------------------------------------------------
+r790 | lh3 | 2009-01-24 15:13:03 -0500 (Sat, 24 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update benchmark
+
+------------------------------------------------------------------------
+r789 | lh3 | 2009-01-22 10:18:44 -0500 (Thu, 22 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtindex.c
+
+a warning message for index
+
+------------------------------------------------------------------------
+r788 | lh3 | 2009-01-22 09:54:06 -0500 (Thu, 22 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/main.c
+
+forgot to change the release number
+
+------------------------------------------------------------------------
+r786 | lh3 | 2009-01-22 06:27:39 -0500 (Thu, 22 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+
+Release bwa-0.4.3
+
+------------------------------------------------------------------------
+r785 | lh3 | 2009-01-22 06:27:16 -0500 (Thu, 22 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+
+Release bwa-0.4.3
+
+------------------------------------------------------------------------
+r784 | lh3 | 2009-01-22 06:19:59 -0500 (Thu, 22 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-10
+ * update documentation
+ * fixed a bug on generating MD tags for SW alignment
+
+------------------------------------------------------------------------
+r782 | lh3 | 2009-01-19 12:08:38 -0500 (Mon, 19 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-9
+ * fixed a bug in samse -n...
+
+------------------------------------------------------------------------
+r781 | lh3 | 2009-01-19 11:26:37 -0500 (Mon, 19 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-8
+ * given -N, the previous version would stop if the top hit is a repeat. Now changed.
+
+------------------------------------------------------------------------
+r780 | lh3 | 2009-01-19 11:20:18 -0500 (Mon, 19 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-7
+ * use a bit-wise flag to replace some member variables in the option struct
+ * allow switching off the iterative strategy
+
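+ Folding several boolean members into one bit-wise flag field keeps the option
+ struct compact and makes switching a mode on or off a single mask operation.
+ The sketch below is purely illustrative; the OPT_* names are invented and are
+ not bwa's real flags.
+
+     #include <stdio.h>
+
+     #define OPT_ITERATIVE 0x1  /* use the iterative search strategy */
+     #define OPT_ALL_HITS  0x2  /* also record suboptimal hits       */
+     #define OPT_LOG_GAP   0x4  /* log-scaled gap penalty            */
+
+     typedef struct {
+         int max_diff;
+         unsigned mode;         /* bit-wise OR of OPT_* flags */
+     } opt_t;
+
+     int main(void)
+     {
+         opt_t opt = { 4, OPT_ITERATIVE | OPT_ALL_HITS };
+         opt.mode &= ~OPT_ITERATIVE;        /* switch the iterative strategy off */
+         if (!(opt.mode & OPT_ITERATIVE))
+             printf("iterative strategy disabled\n");
+         return 0;
+     }
+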
+------------------------------------------------------------------------
+r779 | lh3 | 2009-01-19 10:45:57 -0500 (Mon, 19 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-6
+ * allow dumping multiple hits from samse, albeit in a different format
+
+------------------------------------------------------------------------
+r778 | lh3 | 2009-01-19 06:24:29 -0500 (Mon, 19 Jan 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/kseq.h
+ A /branches/prog/bwa/kstring.c
+ A /branches/prog/bwa/kstring.h
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/simple_dp.c
+
+ * bwa-0.4.2-5
+ * update kseq.h to the latest version
+ * generate MD tag
+ * print mate coordinate if only one end is unmapped
+
+------------------------------------------------------------------------
+r775 | lh3 | 2009-01-18 05:40:35 -0500 (Sun, 18 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-4
+ * fixed a bug for SAM format
+
+------------------------------------------------------------------------
+r774 | lh3 | 2009-01-17 13:48:52 -0500 (Sat, 17 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-3
+ * change default fnr to 0.04
+ * print max_diff for valid fnr
+
+------------------------------------------------------------------------
+r773 | lh3 | 2009-01-17 05:54:37 -0500 (Sat, 17 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-2
+ * automatically choose max_diff
+
+------------------------------------------------------------------------
+r772 | lh3 | 2009-01-16 18:16:14 -0500 (Fri, 16 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.2-1
+ * take N as a mismatch
+
+------------------------------------------------------------------------
+r768 | lh3 | 2009-01-09 11:57:23 -0500 (Fri, 09 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.2
+
+------------------------------------------------------------------------
+r759 | lh3 | 2009-01-07 09:55:43 -0500 (Wed, 07 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.1
+
+------------------------------------------------------------------------
+r758 | lh3 | 2009-01-07 05:36:06 -0500 (Wed, 07 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.0-2
+ * make mate_sw fully working
+
+------------------------------------------------------------------------
+r757 | lh3 | 2009-01-06 18:04:29 -0500 (Tue, 06 Jan 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.4.0-1
+ * do SW alignment for unmapped mate. It is working.
+ * I still need to do some extra work for SW alignment, but it is too late
+ and I am getting tired... I will do it tomorrow.
+
+------------------------------------------------------------------------
+r755 | lh3 | 2009-01-06 10:23:29 -0500 (Tue, 06 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.4.0
+
+------------------------------------------------------------------------
+r754 | lh3 | 2009-01-06 07:45:02 -0500 (Tue, 06 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtgap.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-12
+ * better lock
+
+------------------------------------------------------------------------
+r753 | lh3 | 2009-01-06 06:17:21 -0500 (Tue, 06 Jan 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-11
+ * fixed a small memory leak in bwa_seq_close()
+ * fixed "uninitialized memory" from bwt_aln1_t
+ * multithreading for "aln" command
+
+------------------------------------------------------------------------
+r752 | lh3 | 2009-01-05 17:34:13 -0500 (Mon, 05 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ D /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwt_gen/bwt_gen.c
+ A /branches/prog/bwa/bwtmisc.c (from /branches/prog/bwa/pac2bwt.c:748)
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ D /branches/prog/bwa/pac2bwt.c
+
+ * bwa-0.3.0-10
+ * a little bit of code cleanup
+
+------------------------------------------------------------------------
+r751 | lh3 | 2009-01-05 17:19:04 -0500 (Mon, 05 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-9
+ * use 64-bit integers to speed up the Occ calculation, although just a little bit
+
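+ The Occ calculation boils down to counting how many packed 2-bit symbols in a
+ word equal a given base; working 64 bits at a time halves the number of words
+ to scan compared with 32-bit words. The function below is only an illustrative
+ sketch of such a count; count_base_64() is a made-up name, not the routine
+ used in bwa.
+
+     #include <stdint.h>
+     #include <stdio.h>
+
+     /* count how many of the 32 two-bit bases packed in w equal c (0..3) */
+     static inline int count_base_64(uint64_t w, int c)
+     {
+         /* XOR with a word whose every 2-bit slot holds c; matching slots become 00 */
+         uint64_t x = w ^ (0x5555555555555555ULL * (uint64_t)c);
+         /* mark each all-zero slot with a single 1 bit at its low position */
+         uint64_t y = ~x & (~x >> 1) & 0x5555555555555555ULL;
+         /* SWAR population count of the marked slots */
+         y = y - ((y >> 1) & 0x5555555555555555ULL);
+         y = (y & 0x3333333333333333ULL) + ((y >> 2) & 0x3333333333333333ULL);
+         y = (y + (y >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+         return (int)((y * 0x0101010101010101ULL) >> 56);
+     }
+
+     int main(void)
+     {
+         uint64_t w = 0xe4e4e4e4e4e4e4e4ULL; /* bases 3,2,1,0 (high to low), repeated 8 times */
+         printf("%d\n", count_base_64(w, 2)); /* prints 8 */
+         return 0;
+     }
+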
+------------------------------------------------------------------------
+r750 | lh3 | 2009-01-05 16:44:26 -0500 (Mon, 05 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-8
+ * a little bit of code cleanup
+
+------------------------------------------------------------------------
+r749 | lh3 | 2009-01-05 16:37:28 -0500 (Mon, 05 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-7
+ * accelerate Occ calculation
+
+------------------------------------------------------------------------
+r748 | lh3 | 2009-01-05 16:12:28 -0500 (Mon, 05 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ M /branches/prog/bwa/pac2bwt.c
+
+ * bwa-0.3.0-6
+ * put occ table along with bwt to save another cache miss
+ * this version is already faster than the previous one, and I can still improve it...
+
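+ Storing the cumulative counts right next to the BWT chunk they describe means
+ one cache line brings in both, so an Occ lookup needs a single memory access
+ instead of two. The layout below is a hypothetical sketch, with field sizes
+ chosen only to fill a 64-byte cache line; it is not the actual bwa structure.
+
+     #include <stdint.h>
+
+     typedef struct {
+         uint32_t occ[4];   /* counts of bases 0..3 in the BWT before this block */
+         uint64_t bwt[6];   /* 6 * 32 = 192 packed 2-bit symbols of this block   */
+     } bwt_block_t;         /* 16 + 48 = 64 bytes, i.e. one typical cache line   */
+
+     /* Occ(c, k): load k's block once, then add occ[c] to the number of symbols
+      * equal to c among the first (k % 192) symbols of bwt[] (a short in-block
+      * scan, e.g. with a per-word 2-bit count like the earlier sketch). */
+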
+------------------------------------------------------------------------
+r747 | lh3 | 2009-01-05 10:16:18 -0500 (Mon, 05 Jan 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-5
+ * remove occ_major to save a cache miss; however, OCC_INTERVAL has to be
+ increased to keep memory usage the same. As a result, the speed is in fact
+ a little slower.
+
+------------------------------------------------------------------------
+r746 | lh3 | 2009-01-05 09:50:53 -0500 (Mon, 05 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-4
+ * added back the optimization code (it is a pain...)
+
+------------------------------------------------------------------------
+r745 | lh3 | 2009-01-05 08:23:00 -0500 (Mon, 05 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-3
+ * faster bit operations
+
+------------------------------------------------------------------------
+r744 | lh3 | 2009-01-05 05:58:46 -0500 (Mon, 05 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-2
+ * removed the optimization code again...
+ * use a new method to count the bits
+
+------------------------------------------------------------------------
+r743 | lh3 | 2009-01-04 17:18:38 -0500 (Sun, 04 Jan 2009) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.3.0-1
+ * added back the optimization code
+ * added a new option to aln: max_entries, although this is disabled by default
+ * updated benchmark
+
+------------------------------------------------------------------------
+r742 | lh3 | 2009-01-04 07:56:12 -0500 (Sun, 04 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+add URL
+
+------------------------------------------------------------------------
+r740 | lh3 | 2009-01-04 07:39:43 -0500 (Sun, 04 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.3.0
+
+------------------------------------------------------------------------
+r739 | lh3 | 2009-01-04 06:55:06 -0500 (Sun, 04 Jan 2009) | 2 lines
+Changed paths:
+ A /branches/prog/bwa/COPYING
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bntseq.h
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/utils.c
+ M /branches/prog/bwa/utils.h
+
+added licensing information
+
+------------------------------------------------------------------------
+r738 | lh3 | 2009-01-04 06:18:25 -0500 (Sun, 04 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-31
+ * better mapping quality
+ * update benchmark
+
+------------------------------------------------------------------------
+r737 | lh3 | 2009-01-03 16:00:58 -0500 (Sat, 03 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwa.1
+
+update documentation
+
+------------------------------------------------------------------------
+r736 | lh3 | 2009-01-02 10:26:38 -0500 (Fri, 02 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update documentation
+
+------------------------------------------------------------------------
+r735 | lh3 | 2009-01-02 07:10:20 -0500 (Fri, 02 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-30
+ * reduce memory a little bit
+ * update documentation
+
+------------------------------------------------------------------------
+r734 | lh3 | 2009-01-01 13:45:45 -0500 (Thu, 01 Jan 2009) | 8 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-29
+ * sampe: removed -O option; changed default -o to 100000
+ * sampe: fixed a bug in calculating paired mapping quality
+ * aln: added an option to search for suboptimal hits even if the best is a repeat.
+ This option will make sampe MUCH SLOWER.
+ * sampe: set isize to zero if the two ends map to different chromosomes
+ * update manual (unfinished)
+
+------------------------------------------------------------------------
+r733 | lh3 | 2009-01-01 11:01:20 -0500 (Thu, 01 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-28
+ * fixed a bug in calculating paired mapping quality
+
+------------------------------------------------------------------------
+r732 | lh3 | 2009-01-01 09:27:46 -0500 (Thu, 01 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ A /branches/prog/bwa/khash.h (from /branches/prog/sclib/khash/khash.h:675)
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-27
+ * accelerate sampe by storing visited large intervals
+
+------------------------------------------------------------------------
+r731 | lh3 | 2009-01-01 06:51:21 -0500 (Thu, 01 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-26
+ * remove the optimization code
+
+------------------------------------------------------------------------
+r730 | lh3 | 2009-01-01 06:48:59 -0500 (Thu, 01 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-25
+ * accelerate the OCC calculation by ~7%. However, it does not seem worth
+ complicating the code for this. I will change it back later.
+
+------------------------------------------------------------------------
+r729 | lh3 | 2008-12-31 16:43:56 -0500 (Wed, 31 Dec 2008) | 6 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-24
+ * change command "sai2sam_pe" to "sampe"
+ * print usage for sampe command
+ * in sampe: change default max_occ to 1000
+ * fixed a few compilation warnings in bntseq.c
+
+------------------------------------------------------------------------
+r728 | lh3 | 2008-12-27 07:14:59 -0500 (Sat, 27 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-22
+ * mating information can be printed to SAM
+
+------------------------------------------------------------------------
+r727 | lh3 | 2008-12-26 18:10:59 -0500 (Fri, 26 Dec 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-21
+ * implement pairing (still UNFINISHED)
+ * output all reads even if full of N
+
+------------------------------------------------------------------------
+r726 | lh3 | 2008-12-26 13:31:27 -0500 (Fri, 26 Dec 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ A /branches/prog/bwa/bwape.c
+ M /branches/prog/bwa/bwase.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * bwa-0.2.0-20
+ * remove "-t" from aln cmd
+ * code clean up: move some functions in bwt2fmv.c to other source files
+ * added sai2sam_pe cmd: *UNFINISHED*
+
+------------------------------------------------------------------------
+r725 | lh3 | 2008-12-26 07:04:11 -0500 (Fri, 26 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ A /branches/prog/bwa/bwase.c
+ A /branches/prog/bwa/bwaseqio.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/kseq.h
+ A /branches/prog/bwa/ksort.h (from /branches/prog/sclib/ksort/ksort.h:712)
+ A /branches/prog/bwa/kvec.h (from /branches/prog/sclib/kvec/kvec.h:537)
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-19
+ * considerable code cleanup; no actual changes
+
+------------------------------------------------------------------------
+r724 | lh3 | 2008-12-25 11:32:11 -0500 (Thu, 25 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-18
+ * generate SAM output
+
+------------------------------------------------------------------------
+r723 | lh3 | 2008-12-25 10:48:31 -0500 (Thu, 25 Dec 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * bwa-0.2.0-17
+ * remove bwtsw2-related code
+ * separate searching for SA interval from generating alignments
+
+------------------------------------------------------------------------
+r722 | lh3 | 2008-12-25 08:57:13 -0500 (Thu, 25 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt2fmv.c
+ D /branches/prog/bwa/bwt_lite.c
+ D /branches/prog/bwa/bwt_lite.h
+ M /branches/prog/bwa/bwtgap.c
+ D /branches/prog/bwa/bwtsw2.h
+ D /branches/prog/bwa/bwtsw2_aux.c
+ D /branches/prog/bwa/bwtsw2_core.c
+ D /branches/prog/bwa/bwtsw2_main.c
+ D /branches/prog/bwa/khash.h
+ D /branches/prog/bwa/ksort.h
+ D /branches/prog/bwa/kvec.h
+ M /branches/prog/bwa/main.c
+
+ * added interface to "aln -t"
+ * remove bwtsw2-related code
+
+------------------------------------------------------------------------
+r666 | lh3 | 2008-11-18 18:34:29 -0500 (Tue, 18 Nov 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-16
+ * allow setting the maximum number of mismatches based on read length, but I do not know
+ whether this really works
+
+------------------------------------------------------------------------
+r665 | lh3 | 2008-11-18 08:34:03 -0500 (Tue, 18 Nov 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-15
+ * fixed a bug in sequence parser.
+
+------------------------------------------------------------------------
+r612 | lh3 | 2008-10-28 06:50:53 -0400 (Tue, 28 Oct 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/utils.c
+
+ * bwa-0.2.0-14
+ * fixed a bug caused by the change of the FASTA/Q parser
+
+------------------------------------------------------------------------
+r611 | lh3 | 2008-10-28 06:24:56 -0400 (Tue, 28 Oct 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bntseq.h
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ A /branches/prog/bwa/kseq.h
+ D /branches/prog/bwa/seq.c
+ D /branches/prog/bwa/seq.h
+ M /branches/prog/bwa/simple_dp.c
+ M /branches/prog/bwa/utils.c
+ M /branches/prog/bwa/utils.h
+
+replace seq.* with kseq.h
+
+------------------------------------------------------------------------
+r610 | lh3 | 2008-10-27 13:00:04 -0400 (Mon, 27 Oct 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-13
+ * make bwtsw2 output sub-optimal hits. not completed
+
+------------------------------------------------------------------------
+r609 | lh3 | 2008-10-24 16:52:00 -0400 (Fri, 24 Oct 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/kvec.h
+
+little...
+
+------------------------------------------------------------------------
+r532 | lh3 | 2008-09-19 05:28:45 -0400 (Fri, 19 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/khash.h
+
+improve interface of khash
+
+------------------------------------------------------------------------
+r531 | lh3 | 2008-09-18 06:52:59 -0400 (Thu, 18 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+improve minor things, which make bwtsw2 slower but should miss fewer true hits
+
+------------------------------------------------------------------------
+r530 | lh3 | 2008-09-17 18:19:26 -0400 (Wed, 17 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+ * fixed a bug in calculating ->D
+ * enforce band-width checking
+
+------------------------------------------------------------------------
+r529 | lh3 | 2008-09-17 18:06:49 -0400 (Wed, 17 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+delete a line of code that is never visited
+
+------------------------------------------------------------------------
+r528 | lh3 | 2008-09-17 17:58:51 -0400 (Wed, 17 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+a bit of code cleanup
+
+------------------------------------------------------------------------
+r527 | lh3 | 2008-09-17 10:55:45 -0400 (Wed, 17 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-12
+ * max-depth can be set, although it does not help the speed at all
+
+------------------------------------------------------------------------
+r526 | lh3 | 2008-09-16 17:59:36 -0400 (Tue, 16 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+call cut_tail() after removing duplicates
+
+------------------------------------------------------------------------
+r525 | lh3 | 2008-09-16 17:56:11 -0400 (Tue, 16 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/khash.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-11
+ * improved cut_tail()
+
+------------------------------------------------------------------------
+r524 | lh3 | 2008-09-15 16:53:22 -0400 (Mon, 15 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-10
+ * fixed a bug in cut_tail()
+
+------------------------------------------------------------------------
+r518 | lh3 | 2008-09-15 04:35:59 -0400 (Mon, 15 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+a bit of code cleanup
+
+------------------------------------------------------------------------
+r517 | lh3 | 2008-09-14 18:18:11 -0400 (Sun, 14 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+improve speed (<1%)
+
+------------------------------------------------------------------------
+r516 | lh3 | 2008-09-14 18:08:55 -0400 (Sun, 14 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+ * fixed two potential bugs, although I have not seen their effects
+ * improve speed a bit (<2%)
+
+------------------------------------------------------------------------
+r515 | lh3 | 2008-09-14 17:26:49 -0400 (Sun, 14 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+
+nothing, really
+
+------------------------------------------------------------------------
+r514 | lh3 | 2008-09-14 17:10:13 -0400 (Sun, 14 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+disable X-drop, which has to be reimplemented in the current algorithm
+
+------------------------------------------------------------------------
+r513 | lh3 | 2008-09-14 16:49:42 -0400 (Sun, 14 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt_lite.c
+ M /branches/prog/bwa/bwt_lite.h
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+
+ * temporarily disable cut_tail()
+ * calculate SA in bwt_lite.c
+ * fixed a bug in reversing the sequence
+
+------------------------------------------------------------------------
+r512 | lh3 | 2008-09-13 17:35:40 -0400 (Sat, 13 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ A /branches/prog/bwa/ksort.h
+
+n-best method
+
+------------------------------------------------------------------------
+r507 | lh3 | 2008-09-13 09:06:54 -0400 (Sat, 13 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2_core.c
+
+gives correct results again
+
+------------------------------------------------------------------------
+r506 | lh3 | 2008-09-13 08:12:07 -0400 (Sat, 13 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+I think I know the reason. It needs more work...
+
+------------------------------------------------------------------------
+r505 | lh3 | 2008-09-13 06:20:43 -0400 (Sat, 13 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2_core.c
+
+fixed another bug, but some still remain
+
+------------------------------------------------------------------------
+r504 | lh3 | 2008-09-12 18:13:37 -0400 (Fri, 12 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+fixed another bug
+
+------------------------------------------------------------------------
+r503 | lh3 | 2008-09-12 17:15:56 -0400 (Fri, 12 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/khash.h
+
+ * no longer segfaults, but the result is WRONG!
+ * prepare to remove bsw2_connectivity_check()
+
+------------------------------------------------------------------------
+r502 | lh3 | 2008-09-12 15:52:41 -0400 (Fri, 12 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/kvec.h
+
+more revisions
+
+------------------------------------------------------------------------
+r501 | lh3 | 2008-09-11 18:06:15 -0400 (Thu, 11 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+further simplify the code with kvec.h
+
+------------------------------------------------------------------------
+r500 | lh3 | 2008-09-11 17:42:15 -0400 (Thu, 11 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+part of the revisions... not finished yet
+
+------------------------------------------------------------------------
+r499 | lh3 | 2008-09-11 17:24:15 -0400 (Thu, 11 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/khash.h
+ A /branches/prog/bwa/kvec.h
+
+prepare for abrupt change
+
+------------------------------------------------------------------------
+r496 | lh3 | 2008-09-11 10:34:38 -0400 (Thu, 11 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+fixed a bug; now "bwtsw2 -d" is useless
+
+------------------------------------------------------------------------
+r495 | lh3 | 2008-09-11 09:22:03 -0400 (Thu, 11 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/simple_dp.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+improve speed a little bit
+
+------------------------------------------------------------------------
+r494 | lh3 | 2008-09-11 08:28:08 -0400 (Thu, 11 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+remove debugging code
+
+------------------------------------------------------------------------
+r493 | lh3 | 2008-09-11 07:49:53 -0400 (Thu, 11 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+ * improve the speed a little bit (<5%)
+ * prepare to remove BSW_DEBUG
+
+------------------------------------------------------------------------
+r492 | lh3 | 2008-09-11 06:15:56 -0400 (Thu, 11 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-9
+ * support reverse strand
+ * fixed a bug that causes missing hits
+
+------------------------------------------------------------------------
+r491 | lh3 | 2008-09-11 05:46:16 -0400 (Thu, 11 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-8
+ * better progress report
+
+------------------------------------------------------------------------
+r490 | lh3 | 2008-09-10 17:04:49 -0400 (Wed, 10 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-7
+ * avoid some missing hits
+ * add maximum depth
+
+------------------------------------------------------------------------
+r489 | lh3 | 2008-09-10 11:51:13 -0400 (Wed, 10 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-6
+ * bwtsw2 works, although only on the forward strand for now
+ * better progress information
+
+------------------------------------------------------------------------
+r488 | lh3 | 2008-09-10 10:21:53 -0400 (Wed, 10 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+ * implement memory pool
+ * avoid some rehashing
+
+------------------------------------------------------------------------
+r487 | lh3 | 2008-09-10 09:23:38 -0400 (Wed, 10 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_main.c
+
+ * fixed a memory leak
+ * prepare to implement mempool
+
+------------------------------------------------------------------------
+r486 | lh3 | 2008-09-10 09:10:09 -0400 (Wed, 10 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/khash.h
+
+ * add X-dropoff
+ * remove duplicated results
+ * switch to simple stack
+
+------------------------------------------------------------------------
+r485 | lh3 | 2008-09-10 06:31:20 -0400 (Wed, 10 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+
+ * check whether t-node has been visited
+ * prepare to remove two-level stack
+
+------------------------------------------------------------------------
+r484 | lh3 | 2008-09-10 05:00:57 -0400 (Wed, 10 Sep 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bwa/khash.h
+
+khash library
+
+------------------------------------------------------------------------
+r483 | lh3 | 2008-09-10 04:22:53 -0400 (Wed, 10 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+add inline
+
+------------------------------------------------------------------------
+r482 | lh3 | 2008-09-09 16:34:57 -0400 (Tue, 09 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+
+improve speed
+
+------------------------------------------------------------------------
+r481 | lh3 | 2008-09-09 13:13:00 -0400 (Tue, 09 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2_core.c
+
+Use a 128-bit hash table to keep all (tk,tl,qk,ql). This is slow. Just
+keep a copy in case I need this in the future.
+
+
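+ A 128-bit key is wide enough to hold the four coordinates (tk,tl,qk,ql)
+ exactly (assuming each fits in 32 bits), at the price of bigger table entries
+ and slower hashing, which matches the "this is slow" remark above. The sketch
+ below is illustrative only; key128_t, make_key() and hash128() are invented
+ names, not bwa code.
+
+     #include <stdint.h>
+
+     typedef struct { uint64_t hi, lo; } key128_t;
+
+     static inline key128_t make_key(uint32_t tk, uint32_t tl, uint32_t qk, uint32_t ql)
+     {
+         key128_t k;
+         k.hi = ((uint64_t)tk << 32) | tl;   /* pack two coordinates per word */
+         k.lo = ((uint64_t)qk << 32) | ql;
+         return k;
+     }
+
+     static inline uint64_t hash128(key128_t k)
+     {
+         /* simple multiply-xor mixing; any decent 64-bit mixer would do */
+         uint64_t h = k.hi * 0x9e3779b97f4a7c15ULL;
+         h ^= h >> 32;
+         h ^= k.lo * 0xc2b2ae3d27d4eb4fULL;
+         h ^= h >> 29;
+         return h;
+     }
+
+     static inline int key_equal(key128_t a, key128_t b)
+     {
+         return a.hi == b.hi && a.lo == b.lo;
+     }
+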
+------------------------------------------------------------------------
+r480 | lh3 | 2008-09-09 12:53:32 -0400 (Tue, 09 Sep 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_core.c
+
+ * no principal modification
+
+------------------------------------------------------------------------
+r479 | lh3 | 2008-09-09 11:01:45 -0400 (Tue, 09 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2_core.c
+
+ * fixed a bug which may cause duplicated matching
+ * improve the speed a bit, although using a hash to avoid duplications
+ slows things down in the end
+
+------------------------------------------------------------------------
+r474 | lh3 | 2008-09-03 17:22:57 -0400 (Wed, 03 Sep 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtsw2.h
+ M /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-5
+ * indel seems to work on toy example
+ * add band
+
+------------------------------------------------------------------------
+r469 | lh3 | 2008-09-01 09:18:45 -0400 (Mon, 01 Sep 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt_lite.c
+ M /branches/prog/bwa/bwt_lite.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtsw2.h
+ A /branches/prog/bwa/bwtsw2_aux.c
+ M /branches/prog/bwa/bwtsw2_core.c
+ M /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/is.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ M /branches/prog/bwa/simple_dp.c
+
+ * bwa-0.2.0-4
+ * updated bwtsw2, which seems to work properly on toy examples
+
+------------------------------------------------------------------------
+r447 | lh3 | 2008-08-27 10:05:09 -0400 (Wed, 27 Aug 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-3
+ * tune for longer gaps, but it does not really work with kilo-bp gaps...
+
+------------------------------------------------------------------------
+r446 | lh3 | 2008-08-26 13:30:41 -0400 (Tue, 26 Aug 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-2
+ * changed the way to extend long deletions. Now use max_del_occ.
+
+------------------------------------------------------------------------
+r445 | lh3 | 2008-08-26 13:05:58 -0400 (Tue, 26 Aug 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt_lite.c
+ M /branches/prog/bwa/bwt_lite.h
+
+updated from bwtsw2_lite
+
+------------------------------------------------------------------------
+r436 | lh3 | 2008-08-23 12:28:44 -0400 (Sat, 23 Aug 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.h
+ A /branches/prog/bwa/bwt_lite.c
+ A /branches/prog/bwa/bwt_lite.h
+ A /branches/prog/bwa/bwtsw2.h
+ A /branches/prog/bwa/bwtsw2_core.c
+ A /branches/prog/bwa/bwtsw2_main.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.2.0-1
+ * add bwt_lite: a lightweight version of bwt (NOT TESTED!)
+ * add core codes for bwtsw2: NOT TESTED!!!
+
+------------------------------------------------------------------------
+r427 | lh3 | 2008-08-15 05:38:12 -0400 (Fri, 15 Aug 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.2.0
+
+------------------------------------------------------------------------
+r426 | lh3 | 2008-08-14 11:26:19 -0400 (Thu, 14 Aug 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.6-7
+ * change default seed length to 31
+ * add incomplete support to color sequences (not tested yet!)
+
+------------------------------------------------------------------------
+r425 | lh3 | 2008-08-14 06:23:11 -0400 (Thu, 14 Aug 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.6-6
+ * change default seed length to 33bp
+
+------------------------------------------------------------------------
+r424 | lh3 | 2008-08-14 05:55:33 -0400 (Thu, 14 Aug 2008) | 6 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.6-5
+ * fixed a bug that may miss true alignments. this bug exists in most
+ early versions.
+ * fixed a bug that yields wrong coordinates for reads mapped on the forward
+ strands with gaps.
+
+------------------------------------------------------------------------
+r423 | lh3 | 2008-08-14 04:07:28 -0400 (Thu, 14 Aug 2008) | 2 lines
+Changed paths:
+ D /branches/prog/bwa/Makefile.div
+
+useless
+
+------------------------------------------------------------------------
+r422 | lh3 | 2008-08-13 19:21:14 -0400 (Wed, 13 Aug 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.6-4
+ * fixed one bug
+ * there is another one...
+
+------------------------------------------------------------------------
+r421 | lh3 | 2008-08-13 18:23:33 -0400 (Wed, 13 Aug 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtgap.h
+ M /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.6-3
+ * almost there, but not quite right
+
+------------------------------------------------------------------------
+r419 | lh3 | 2008-08-13 17:27:02 -0400 (Wed, 13 Aug 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtgap.h
+ M /branches/prog/bwa/main.c
+
+ * improve the seeding method
+ * prepare to load two BWTs into memory. A BIG change!
+
+------------------------------------------------------------------------
+r418 | lh3 | 2008-08-13 10:56:54 -0400 (Wed, 13 Aug 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtgap.h
+ M /branches/prog/bwa/main.c
+
+ * added seeding
+ * unfinished yet
+
+------------------------------------------------------------------------
+r413 | lh3 | 2008-08-08 11:48:35 -0400 (Fri, 08 Aug 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.1.6
+
+------------------------------------------------------------------------
+r410 | lh3 | 2008-08-06 15:48:22 -0400 (Wed, 06 Aug 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/simple_dp.c
+
+sw: output alignment score
+
+------------------------------------------------------------------------
+r407 | lh3 | 2008-08-04 10:01:20 -0400 (Mon, 04 Aug 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ A /branches/prog/bwa/simple_dp.c
+ M /branches/prog/bwa/stdaln.c
+ M /branches/prog/bwa/stdaln.h
+
+ * bwa-0.1.5-3
+ * added a simple interface to SW/NW alignment
+ * stdaln-0.9.8 (see header for more details)
+
+------------------------------------------------------------------------
+r406 | lh3 | 2008-08-01 19:21:59 -0400 (Fri, 01 Aug 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+ A /branches/prog/bwa/stdaln.c
+ A /branches/prog/bwa/stdaln.h
+
+ * bwa-0.1.5-2
+ * give accurate gap positions
+
+------------------------------------------------------------------------
+r405 | lh3 | 2008-08-01 19:06:19 -0400 (Fri, 01 Aug 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+
+unfinished, but I am tired...
+
+------------------------------------------------------------------------
+r401 | lh3 | 2008-07-30 05:59:24 -0400 (Wed, 30 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.5-1
+ * fixed a potential bug which may produce an alignment in N regions,
+ although extremely rare.
+
+------------------------------------------------------------------------
+r399 | lh3 | 2008-07-27 11:41:52 -0400 (Sun, 27 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.1.5
+
+------------------------------------------------------------------------
+r398 | lh3 | 2008-07-25 12:14:47 -0400 (Fri, 25 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update documentation
+
+------------------------------------------------------------------------
+r397 | lh3 | 2008-07-25 09:58:56 -0400 (Fri, 25 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ *
+
+------------------------------------------------------------------------
+r396 | lh3 | 2008-07-25 06:42:01 -0400 (Fri, 25 Jul 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.4-4
+ * add timer for debugging
+
+------------------------------------------------------------------------
+r395 | lh3 | 2008-07-24 05:46:21 -0400 (Thu, 24 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.4-3
+ * fixed a bug in the previous code
+ * this version gives identical result to bwa-0.1.4, just 10% faster
+
+------------------------------------------------------------------------
+r394 | lh3 | 2008-07-24 05:18:53 -0400 (Thu, 24 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtgap.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.4-2
+ * further improve the speed
+ * The result is slightly different from bwa-0.1.4 now. I need to check...
+
+------------------------------------------------------------------------
+r393 | lh3 | 2008-07-23 12:04:16 -0400 (Wed, 23 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+
+comments only
+
+------------------------------------------------------------------------
+r392 | lh3 | 2008-07-23 10:34:03 -0400 (Wed, 23 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+further improve the speed in Occ functions
+
+------------------------------------------------------------------------
+r386 | lh3 | 2008-07-22 10:03:54 -0400 (Tue, 22 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.1.4
+
+------------------------------------------------------------------------
+r385 | lh3 | 2008-07-22 09:44:50 -0400 (Tue, 22 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwa.1
+
+update documentation and ChangeLog
+
+------------------------------------------------------------------------
+r384 | lh3 | 2008-07-22 08:50:03 -0400 (Tue, 22 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.3-2
+ * fixed the bug in the last modification
+ * now the alignment should be more clearly defined
+
+------------------------------------------------------------------------
+r383 | lh3 | 2008-07-21 18:32:21 -0400 (Mon, 21 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.3-1
+ * this is a buggy version!
+ * I will fix the bug tomorrow. It is late...
+
+------------------------------------------------------------------------
+r381 | lh3 | 2008-07-21 06:45:32 -0400 (Mon, 21 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.1.3
+
+------------------------------------------------------------------------
+r380 | lh3 | 2008-07-21 06:07:43 -0400 (Mon, 21 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.2-3
+ * improve the speed for gcc on Intel Mac OS X, but not really on icc on Linux
+ * aln: more command-line options
+
+------------------------------------------------------------------------
+r373 | lh3 | 2008-07-17 09:09:46 -0400 (Thu, 17 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.2-2
+ * further improve the speed
+ * this version gives exactly the same result as bwa-0.1.2
+
+------------------------------------------------------------------------
+r372 | lh3 | 2008-07-17 07:51:08 -0400 (Thu, 17 Jul 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.2-1
+ * speed up by about 5%
+
+------------------------------------------------------------------------
+r370 | lh3 | 2008-07-17 05:12:00 -0400 (Thu, 17 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.1.2
+
+------------------------------------------------------------------------
+r368 | lh3 | 2008-07-16 08:51:25 -0400 (Wed, 16 Jul 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ D /branches/prog/bwa/bwt1away.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtgap.h
+ D /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-9
+ * some code cleanup
+ * remove 1away and top2
+
+------------------------------------------------------------------------
+r367 | lh3 | 2008-07-16 08:24:34 -0400 (Wed, 16 Jul 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/is.c
+
+Yuta Mori's implementation of IS algorithm.
+
+------------------------------------------------------------------------
+r365 | lh3 | 2008-07-16 06:58:04 -0400 (Wed, 16 Jul 2008) | 6 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwtgap.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-8
+ * improve gapped alignment
+ * this version will miss more gapped alignments, but the speed is much faster
+ * prepare to remove top2 and 1away algorithms
+ * prepare to add SAIS algorithm for bwt construction
+
+------------------------------------------------------------------------
+r358 | lh3 | 2008-06-09 06:03:04 -0400 (Mon, 09 Jun 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-7
+ * change END_SKIP from 3 to 5, but still gaps may be wrongly added
+ * change default '-g' from 5 to 3
+
+------------------------------------------------------------------------
+r357 | lh3 | 2008-06-09 05:18:36 -0400 (Mon, 09 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-6
+ * fix a bug in nested stack
+
+------------------------------------------------------------------------
+r356 | lh3 | 2008-06-08 18:43:13 -0400 (Sun, 08 Jun 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ A /branches/prog/bwa/bwtgap.h
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-5
+ * replace heap with nested stacks
+ * there are still obvious bugs...
+
+------------------------------------------------------------------------
+r355 | lh3 | 2008-06-08 17:13:44 -0400 (Sun, 08 Jun 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * bwa-0.1.1-4
+ * add interface to affine gap alignment
+ * there are obvious bugs and I will fix them later
+
+------------------------------------------------------------------------
+r354 | lh3 | 2008-06-08 15:39:05 -0400 (Sun, 08 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-3
+ * affine gap seems to work, at least partially
+
+------------------------------------------------------------------------
+r353 | lh3 | 2008-06-08 09:27:18 -0400 (Sun, 08 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ A /branches/prog/bwa/bwtgap.c
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-2
+ * initial gapped alignment. not work at the moment
+
+------------------------------------------------------------------------
+r352 | lh3 | 2008-06-06 04:37:34 -0400 (Fri, 06 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.1-1
+ * ungap: remove a useless variable in top2_entry_t
+
+------------------------------------------------------------------------
+r348 | lh3 | 2008-06-03 09:04:12 -0400 (Tue, 03 Jun 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/ChangeLog
+ A /branches/prog/bwa/NEWS
+ M /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/main.c
+
+Release bwa-0.1.1
+
+------------------------------------------------------------------------
+r347 | lh3 | 2008-06-03 05:45:08 -0400 (Tue, 03 Jun 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwa.1
+
+update documentation
+
+------------------------------------------------------------------------
+r346 | lh3 | 2008-06-02 18:59:50 -0400 (Mon, 02 Jun 2008) | 5 lines
+Changed paths:
+ A /branches/prog/bwa/ChangeLog
+ A /branches/prog/bwa/bwa.1
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-11
+ * improve the approximation of mapping qualities
+ * add documentation
+ * add ChangeLog
+
+------------------------------------------------------------------------
+r345 | lh3 | 2008-06-02 16:04:39 -0400 (Mon, 02 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-10
+ * output a random position for repetitive reads
+
+------------------------------------------------------------------------
+r344 | lh3 | 2008-06-02 15:03:54 -0400 (Mon, 02 Jun 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/pac2bwt.c
+
+ * bwa-0.1.0-9
+ * fix memory leaks
+ * fix a potential bug in converting to the real coordinate
+
+------------------------------------------------------------------------
+r343 | lh3 | 2008-06-02 13:44:51 -0400 (Mon, 02 Jun 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile.div
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-8
+ * fix a bug about strand
+ * update Makefile.div
+ * change top2b as the default method
+
+------------------------------------------------------------------------
+r342 | lh3 | 2008-06-02 11:23:26 -0400 (Mon, 02 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt1away.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-7
+ * use bwt_2occ() and bwt_2occ4() in other functions
+
+------------------------------------------------------------------------
+r341 | lh3 | 2008-06-02 09:31:39 -0400 (Mon, 02 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-6
+ * fix a bug for missing hits
+
+------------------------------------------------------------------------
+r340 | lh3 | 2008-06-02 09:10:18 -0400 (Mon, 02 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-5
+ * accelerate comparisons in heap, a bit
+
+------------------------------------------------------------------------
+r339 | lh3 | 2008-06-02 08:41:31 -0400 (Mon, 02 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-4
+ * avoid marginal repeated calculation in occ
+
+------------------------------------------------------------------------
+r338 | lh3 | 2008-06-02 06:46:51 -0400 (Mon, 02 Jun 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-3
+ * fix a bug caused by the previous change
+ * fix a bug in heap
+ * order the heap by more criteria
+
+------------------------------------------------------------------------
+r337 | lh3 | 2008-06-01 19:11:15 -0400 (Sun, 01 Jun 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+
+ * bwa-0.1.0-2
+ * also sort sa range in heapsort, in an attempt to improve cache performance.
+ Unfortunately, it does not work well at all.
+
+------------------------------------------------------------------------
+r336 | lh3 | 2008-06-01 17:45:23 -0400 (Sun, 01 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/Makefile.div
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/main.c
+
+ * 0.1.0-1
+ * fix a bug in calculating the real coordinate
+
+------------------------------------------------------------------------
+r335 | lh3 | 2008-06-01 16:03:09 -0400 (Sun, 01 Jun 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+
+nothing, really
+
+------------------------------------------------------------------------
+r334 | lh3 | 2008-06-01 15:59:13 -0400 (Sun, 01 Jun 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ A /branches/prog/bwa/Makefile.div
+ M /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/pac2bwt.c
+
+use IS algorithm by default
+
+------------------------------------------------------------------------
+r333 | lh3 | 2008-06-01 15:05:15 -0400 (Sun, 01 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/is.c
+ M /branches/prog/bwa/pac2bwt.c
+
+ * a bit code clean up in is.c
+ * add IS algorithm for constructing BWT, albeit slower
+
+------------------------------------------------------------------------
+r332 | lh3 | 2008-06-01 13:23:08 -0400 (Sun, 01 Jun 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bwa/is.c
+
+IS linear-time algorithm for constructing SA/BWT
+
+------------------------------------------------------------------------
+r331 | lh3 | 2008-06-01 10:35:26 -0400 (Sun, 01 Jun 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bntseq.c
+ A /branches/prog/bwa/bwtindex.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * fix a bug in generating .pac
+ * index in one go
+
+------------------------------------------------------------------------
+r330 | lh3 | 2008-06-01 09:17:05 -0400 (Sun, 01 Jun 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bntseq.h
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwttop2.c
+
+real coordinates can be output
+
+------------------------------------------------------------------------
+r329 | lh3 | 2008-05-31 19:21:02 -0400 (Sat, 31 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwttop2.c
+
+add top2e which is similar to 1away
+
+------------------------------------------------------------------------
+r328 | lh3 | 2008-05-31 18:46:12 -0400 (Sat, 31 May 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwttop2.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * unified cmd-line interface for ungapped alignment
+ * add two alternatives to top2 algorithm
+
+------------------------------------------------------------------------
+r327 | lh3 | 2008-05-31 18:14:46 -0400 (Sat, 31 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+add cmd-line interface to alntop2
+
+------------------------------------------------------------------------
+r326 | lh3 | 2008-05-31 17:59:31 -0400 (Sat, 31 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt1away.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ A /branches/prog/bwa/bwttop2.c
+
+top2 algorithm seems to work. I need to change interface, though
+
+------------------------------------------------------------------------
+r325 | lh3 | 2008-05-31 15:11:49 -0400 (Sat, 31 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt1away.c
+
+change the variable in the structure
+
+------------------------------------------------------------------------
+r324 | lh3 | 2008-05-31 14:52:13 -0400 (Sat, 31 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt1away.c
+
+set a slightly better bound on the maximum allowed mismatches
+
+------------------------------------------------------------------------
+r323 | lh3 | 2008-05-30 18:40:21 -0400 (Fri, 30 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+
+ * output time statistics
+
+------------------------------------------------------------------------
+r322 | lh3 | 2008-05-30 17:58:25 -0400 (Fri, 30 May 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ A /branches/prog/bwa/bwt1away.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+
+ * presumably better way to make use of prefix. But for the moment I do
+ not know whether it is correct or not.
+ * a bit code clean up: separate alignment part
+
+------------------------------------------------------------------------
+r321 | lh3 | 2008-05-30 13:57:43 -0400 (Fri, 30 May 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt_gen/Makefile
+ M /branches/prog/bwa/bwt_gen/bwt_gen.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ M /branches/prog/bwa/pac2bwt.c
+
+ * a bit code clean up
+ * put bwt_gen in bwa
+
+------------------------------------------------------------------------
+r320 | lh3 | 2008-05-30 11:40:11 -0400 (Fri, 30 May 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtio.c
+
+ * improve cmd-line interface
+ * fix a bug in loading .sa
+ * change default sa interval to 32
+
+------------------------------------------------------------------------
+r319 | lh3 | 2008-05-30 10:31:37 -0400 (Fri, 30 May 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwtaln.c
+
+ * fix memory leak (I know that. Just a bit lazy)
+ * change to another method to do 1-away alignment
+
+------------------------------------------------------------------------
+r318 | lh3 | 2008-05-30 09:21:49 -0400 (Fri, 30 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+best unique match is partially finished
+
+------------------------------------------------------------------------
+r317 | lh3 | 2008-05-30 06:33:28 -0400 (Fri, 30 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+remove "ungapped" command and related codes
+
+------------------------------------------------------------------------
+r316 | lh3 | 2008-05-30 06:05:20 -0400 (Fri, 30 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+
+change variable name thick to width
+
+------------------------------------------------------------------------
+r315 | lh3 | 2008-05-29 19:06:13 -0400 (Thu, 29 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ M /branches/prog/bwa/pac2bwt.c
+
+revised algorithm for ungapped alignment. the old one can still be used.
+
+------------------------------------------------------------------------
+r314 | lh3 | 2008-05-29 16:36:11 -0400 (Thu, 29 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwt_gen/bwt_gen.c
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/pac2bwt.c
+
+ * make commands more independent, but ungapped does not work at the moment
+
+------------------------------------------------------------------------
+r313 | lh3 | 2008-05-29 15:56:14 -0400 (Thu, 29 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt_gen/bwt_gen.c
+
+little...
+
+------------------------------------------------------------------------
+r312 | lh3 | 2008-05-29 15:54:01 -0400 (Thu, 29 May 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt_gen/bwt_gen.c
+ M /branches/prog/bwa/bwt_gen/bwt_gen.h
+
+ * add copyright information from the original codes
+ * do not dump .fmv files
+
+------------------------------------------------------------------------
+r311 | lh3 | 2008-05-29 15:44:36 -0400 (Thu, 29 May 2008) | 2 lines
+Changed paths:
+ A /branches/prog/bwa/bwt_gen
+ A /branches/prog/bwa/bwt_gen/Makefile
+ A /branches/prog/bwa/bwt_gen/QSufSort.c
+ A /branches/prog/bwa/bwt_gen/QSufSort.h
+ A /branches/prog/bwa/bwt_gen/bwt_gen.c
+ A /branches/prog/bwa/bwt_gen/bwt_gen.h
+
+codes from BWT-SW, for building BWT from packed file
+
+------------------------------------------------------------------------
+r310 | lh3 | 2008-05-28 17:03:35 -0400 (Wed, 28 May 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * change OCC_INTERVAL to 0x40, which makes bwa twice as fast.
+ * write Occ file as ".occ" as it is using a different interval from
+ .fmv, the BWT-SW correspondance of .occ
+
+------------------------------------------------------------------------
+r309 | lh3 | 2008-05-28 11:39:37 -0400 (Wed, 28 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt2fmv.c
+
+fix a bug
+
+------------------------------------------------------------------------
+r308 | lh3 | 2008-05-28 09:56:16 -0400 (Wed, 28 May 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt2fmv.c
+
+add heuristics to improve the speed, but I have not tested whether the
+results are correct or not.
+
+
+------------------------------------------------------------------------
+r307 | lh3 | 2008-05-28 06:31:34 -0400 (Wed, 28 May 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/bwtaln.c
+ M /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * make ungapped alignment basically work...
+ * but it is very slow in comparison to others...
+ * also I need to improve the interface...
+ * a lot of things to keep me busy today...
+
+------------------------------------------------------------------------
+r306 | lh3 | 2008-05-27 18:41:27 -0400 (Tue, 27 May 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwtaln.c
+
+ * remove recursion
+ * fixed a bug in bwt_occ()
+
+------------------------------------------------------------------------
+r305 | lh3 | 2008-05-27 16:59:44 -0400 (Tue, 27 May 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwtaln.c
+
+ * bwa now tells whether a sequence can be mapped with the maximum allowed
+   mismatches. ONLY ungapped.
+ * this is a recursive version. I will remove recursion later.
+
+
+------------------------------------------------------------------------
+r304 | lh3 | 2008-05-27 09:12:17 -0400 (Tue, 27 May 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+ A /branches/prog/bwa/bwtaln.c
+ A /branches/prog/bwa/bwtaln.h
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ M /branches/prog/bwa/utils.c
+
+ * load .sa and .fmv files
+ * exact alignment now works
+
+------------------------------------------------------------------------
+r303 | lh3 | 2008-05-27 06:33:38 -0400 (Tue, 27 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/utils.c
+ M /branches/prog/bwa/utils.h
+
+add xassert and fix a bug
+
+------------------------------------------------------------------------
+r302 | lh3 | 2008-05-27 06:23:20 -0400 (Tue, 27 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwtio.c
+ A /branches/prog/bwa/utils.c
+ A /branches/prog/bwa/utils.h
+
+improve error message and error handling
+
+------------------------------------------------------------------------
+r301 | lh3 | 2008-05-27 05:37:51 -0400 (Tue, 27 May 2008) | 4 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+ A /branches/prog/bwa/bwtio.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+
+ * move I/O codes to bwtio.c
+ * SA can be dumped and interestingly, it is identical to BWTSW
+ * now, .fmv is still different from BWTSW
+
+------------------------------------------------------------------------
+r299 | lh3 | 2008-05-26 18:07:44 -0400 (Mon, 26 May 2008) | 2 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+
+generate/retrieve SA and Occ
+
+------------------------------------------------------------------------
+r298 | lh3 | 2008-05-26 13:16:49 -0400 (Mon, 26 May 2008) | 3 lines
+Changed paths:
+ M /branches/prog/bwa/bntseq.h
+ M /branches/prog/bwa/bwt.c
+ M /branches/prog/bwa/bwt.h
+ M /branches/prog/bwa/bwt2fmv.c
+
+ * retrieve occ value at any position
+ * move bwt_cal_occ() to bwt.c
+
+------------------------------------------------------------------------
+r297 | lh3 | 2008-05-25 17:43:58 -0400 (Sun, 25 May 2008) | 6 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ A /branches/prog/bwa/bwt.c
+ A /branches/prog/bwa/bwt.h
+ A /branches/prog/bwa/bwt2fmv.c
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ M /branches/prog/bwa/pac2bwt.c
+
+ * add bwt2fmv. It works to some extent. However, I do not understand
+ the purpose of some weird codes in BWT-SW. As a consequence, bwt2fmv
+ could generate a file almost identical, but not exactly identical, to
+ the .fmv file from BWT-SW.
+
+
+------------------------------------------------------------------------
+r296 | lh3 | 2008-05-24 18:35:02 -0400 (Sat, 24 May 2008) | 5 lines
+Changed paths:
+ M /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bntseq.c
+ M /branches/prog/bwa/bntseq.h
+ M /branches/prog/bwa/main.c
+ M /branches/prog/bwa/main.h
+ A /branches/prog/bwa/pac2bwt.c
+
+Burrows-Wheeler Transform now works. At least on one example, the
+current code generates the same BWT as BWT-SW. Kind of magical, I would
+say. :)
+
+
+------------------------------------------------------------------------
+r295 | lh3 | 2008-05-24 11:25:31 -0400 (Sat, 24 May 2008) | 3 lines
+Changed paths:
+ A /branches/prog/bwa/Makefile
+ M /branches/prog/bwa/bntseq.c
+ A /branches/prog/bwa/main.c
+ A /branches/prog/bwa/main.h
+
+ * add Makefile and main.*
+ * improve interface to fa2bns, a bit
+
+------------------------------------------------------------------------
+r293 | lh3 | 2008-05-24 10:57:03 -0400 (Sat, 24 May 2008) | 3 lines
+Changed paths:
+ A /branches/prog/bwa
+ A /branches/prog/bwa/bntseq.c
+ A /branches/prog/bwa/bntseq.h
+ A /branches/prog/bwa/seq.c
+ A /branches/prog/bwa/seq.h
+
+ * Burrows-Wheeler Alignment
+ * initial codes
+
+------------------------------------------------------------------------
diff --git a/ext/src/bwa/LICENSE.txt b/ext/src/bwa/LICENSE.txt
new file mode 100644
index 0000000..efaa7b9
--- /dev/null
+++ b/ext/src/bwa/LICENSE.txt
@@ -0,0 +1,168 @@
+Apache License
+
+Version 2.0, January 2004
+
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction, and
+ distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by the
+ copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all other
+ entities that control, are controlled by, or are under common control with
+ that entity. For the purposes of this definition, "control" means (i) the
+ power, direct or indirect, to cause the direction or management of such
+ entity, whether by contract or otherwise, or (ii) ownership of fifty percent
+ (50%) or more of the outstanding shares, or (iii) beneficial ownership of
+ such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity exercising
+ permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation source, and
+ configuration files.
+
+ "Object" form shall mean any form resulting from mechanical transformation
+ or translation of a Source form, including but not limited to compiled
+ object code, generated documentation, and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or Object form,
+ made available under the License, as indicated by a copyright notice that is
+ included in or attached to the work (an example is provided in the Appendix
+ below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object form,
+ that is based on (or derived from) the Work and for which the editorial
+ revisions, annotations, elaborations, or other modifications represent, as a
+ whole, an original work of authorship. For the purposes of this License,
+ Derivative Works shall not include works that remain separable from, or
+ merely link (or bind by name) to the interfaces of, the Work and Derivative
+ Works thereof.
+
+ "Contribution" shall mean any work of authorship, including the original
+ version of the Work and any modifications or additions to that Work or
+ Derivative Works thereof, that is intentionally submitted to Licensor for
+ inclusion in the Work by the copyright owner or by an individual or Legal
+ Entity authorized to submit on behalf of the copyright owner. For the
+ purposes of this definition, "submitted" means any form of electronic,
+ verbal, or written communication sent to the Licensor or its
+ representatives, including but not limited to communication on electronic
+ mailing lists, source code control systems, and issue tracking systems that
+ are managed by, or on behalf of, the Licensor for the purpose of discussing
+ and improving the Work, but excluding communication that is conspicuously
+ marked or otherwise designated in writing by the copyright owner as "Not a
+ Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity on
+ behalf of whom a Contribution has been received by Licensor and subsequently
+ incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of this
+ License, each Contributor hereby grants to You a perpetual, worldwide,
+ non-exclusive, no-charge, royalty-free, irrevocable copyright license to
+ reproduce, prepare Derivative Works of, publicly display, publicly perform,
+ sublicense, and distribute the Work and such Derivative Works in Source or
+ Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of this
+ License, each Contributor hereby grants to You a perpetual, worldwide,
+ non-exclusive, no-charge, royalty-free, irrevocable (except as stated in
+ this section) patent license to make, have made, use, offer to sell, sell,
+ import, and otherwise transfer the Work, where such license applies only to
+ those patent claims licensable by such Contributor that are necessarily
+ infringed by their Contribution(s) alone or by combination of their
+ Contribution(s) with the Work to which such Contribution(s) was submitted.
+ If You institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+ Contribution incorporated within the Work constitutes direct or contributory
+ patent infringement, then any patent licenses granted to You under this
+ License for that Work shall terminate as of the date such litigation is
+ filed.
+
+4. Redistribution. You may reproduce and distribute copies of the Work or
+ Derivative Works thereof in any medium, with or without modifications, and
+ in Source or Object form, provided that You meet the following conditions:
+
+ (a) You must give any other recipients of the Work or Derivative Works a
+ copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices stating
+ that You changed the files; and
+
+    (c) You must retain, in the Source form of any Derivative Works that You
+ distribute, all copyright, patent, trademark, and attribution notices
+ from the Source form of the Work, excluding those notices that do not
+ pertain to any part of the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its distribution,
+ then any Derivative Works that You distribute must include a readable
+ copy of the attribution notices contained within such NOTICE file,
+ excluding those notices that do not pertain to any part of the
+ Derivative Works, in at least one of the following places: within a
+ NOTICE text file distributed as part of the Derivative Works; within the
+ Source form or documentation, if provided along with the Derivative
+ Works; or, within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents of the
+ NOTICE file are for informational purposes only and do not modify the
+ License. You may add Your own attribution notices within Derivative
+ Works that You distribute, alongside or as an addendum to the NOTICE
+ text from the Work, provided that such additional attribution notices
+ cannot be construed as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and may
+ provide additional or different license terms and conditions for use,
+ reproduction, or distribution of Your modifications, or for any such
+ Derivative Works as a whole, provided Your use, reproduction, and
+ distribution of the Work otherwise complies with the conditions stated in
+ this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise, any
+ Contribution intentionally submitted for inclusion in the Work by You to the
+ Licensor shall be under the terms and conditions of this License, without
+ any additional terms or conditions. Notwithstanding the above, nothing
+ herein shall supersede or modify the terms of any separate license agreement
+ you may have executed with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade names,
+ trademarks, service marks, or product names of the Licensor, except as
+ required for reasonable and customary use in describing the origin of the
+ Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or agreed to in
+ writing, Licensor provides the Work (and each Contributor provides its
+ Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied, including, without limitation, any
+ warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or
+ FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining
+ the appropriateness of using or redistributing the Work and assume any risks
+ associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory, whether in
+ tort (including negligence), contract, or otherwise, unless required by
+ applicable law (such as deliberate and grossly negligent acts) or agreed to
+ in writing, shall any Contributor be liable to You for damages, including
+ any direct, indirect, special, incidental, or consequential damages of any
+ character arising as a result of this License or out of the use or inability
+ to use the Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all other
+ commercial damages or losses), even if such Contributor has been advised of
+ the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing the Work or
+ Derivative Works thereof, You may choose to offer, and charge a fee for,
+ acceptance of support, warranty, indemnity, or other liability obligations
+ and/or rights consistent with this License. However, in accepting such
+ obligations, You may act only on Your own behalf and on Your sole
+ responsibility, not on behalf of any other Contributor, and only if You
+ agree to indemnify, defend, and hold each Contributor harmless for any
+ liability incurred by, or claims asserted against, such Contributor by
+ reason of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
diff --git a/ext/src/bwa/Makefile b/ext/src/bwa/Makefile
new file mode 100644
index 0000000..478aec8
--- /dev/null
+++ b/ext/src/bwa/Makefile
@@ -0,0 +1,88 @@
+CC= gcc
+#CC= clang --analyze
+CFLAGS= -g -Wall -Wno-unused-function -O2
+WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
+AR= ar
+DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
+## moved is.o bwtindex.o rope.o to LOBJS
+LOBJS= utils.o kthread.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o \
+ bwamem_extra.o malloc_wrap.o is.o bwtindex.o rope.o rle.o
+AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
+ bwape.o kopen.o pemerge.o maxk.o \
+ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
+ bwtsw2_chain.o fastmap.o bwtsw2_pair.o
+PROG= bwa
+INCLUDES=
+LIBS= -lm -lz -lpthread
+SUBDIRS= .
+
+ifeq ($(shell uname -s),Linux)
+ LIBS += -lrt
+endif
+
+.SUFFIXES:.c .o .cc
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+bwa:libbwa.a $(AOBJS) main.o
+ $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
+
+bwamem-lite:libbwa.a example.o
+ $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
+
+libbwa.a:$(LOBJS)
+ $(AR) -csru $@ $(LOBJS)
+
+clean:
+ rm -f gmon.out *.o a.out $(PROG) *~ *.a
+
+depend:
+ ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
+
+# DO NOT DELETE THIS LINE -- make depend depends on it.
+
+QSufSort.o: QSufSort.h
+bamlite.o: bamlite.h malloc_wrap.h
+bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h
+bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kvec.h
+bwa.o: kseq.h
+bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
+bwamem.o: ksort.h utils.h kbtree.h
+bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
+bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
+bwamem_pair.o: utils.h ksw.h
+bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h
+bwape.o: ksw.h khash.h
+bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
+bwase.o: bwa.h ksw.h
+bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
+bwashm.o: bwa.h bntseq.h bwt.h
+bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
+bwt_lite.o: bwt_lite.h malloc_wrap.h
+bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h
+bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h
+bwtindex.o: bntseq.h bwt.h utils.h rle.h rope.h malloc_wrap.h
+bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h
+bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h
+bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h
+bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h
+bwtsw2_core.o: khash.h ksort.h
+bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h
+bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
+bwtsw2_pair.o: malloc_wrap.h ksw.h
+example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
+fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
+is.o: malloc_wrap.h
+kopen.o: malloc_wrap.h
+kstring.o: kstring.h malloc_wrap.h
+ksw.o: ksw.h malloc_wrap.h
+main.o: kstring.h malloc_wrap.h utils.h
+malloc_wrap.o: malloc_wrap.h
+maxk.o: bwa.h bntseq.h bwt.h bwamem.h kseq.h malloc_wrap.h
+pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
+rle.o: rle.h
+rope.o: rle.h rope.h
+utils.o: utils.h ksort.h malloc_wrap.h kseq.h
diff --git a/ext/src/bwa/NEWS.md b/ext/src/bwa/NEWS.md
new file mode 100644
index 0000000..4692889
--- /dev/null
+++ b/ext/src/bwa/NEWS.md
@@ -0,0 +1,1146 @@
+Release 0.7.12 (28 December 2014)
+---------------------------------
+
+This release fixed a bug in the paired-end mode when ALT contigs are present.
+The bug led to undercalling in regions overlapping ALT contigs.
+
+(0.7.12: 28 December 2014, r1039)
+
+
+
+Release 0.7.11 (23 December, 2014)
+----------------------------------
+
+A major change to BWA-MEM is the support of mapping to ALT contigs in addition
+to the primary assembly. Part of the ALT mapping strategy is implemented in
+BWA-MEM and the rest in a postprocessing script for now. Due to the extra
+layer of complexity on generating the reference genome and on the two-step
+mapping, we start to provide a wrapper script and precompiled binaries since
+this release. The package may be more convenient to some specific use cases.
+For general uses, the single BWA binary still works like the old way.
+
+Another major addition to BWA-MEM is HLA typing, which is made possible by the
+new ALT mapping strategy. The necessary data and programs are included in the
+binary release. The wrapper script also optionally performs HLA typing when HLA
+genes are included in the reference genome as additional ALT contigs.
+
+Other notable changes to BWA-MEM:
+
+ * Added option `-b` to `bwa index`. This option tunes the batch size used in
+ the construction of BWT. It is advised to use large `-b` for huge reference
+ sequences such as the BLAST *nt* database.
+
+ * Optimized for PacBio data. This includes a change to scoring based on a
+ study done by Aaron Quinlan and a heuristic speedup. Further speedup is
+ possible, but needs more careful investigation.
+
+ * Dropped PacBio read-to-read alignment for now. BWA-MEM is good for finding
+ the best hit, but is not very sensitive to suboptimal hits. Option `-x pbread`
+ is still available, but hidden on the command line. This may be removed in
+ future releases.
+
+ * Added a new pre-setting for Oxford Nanopore 2D reads. LAST is still a little
+   more sensitive on older bacterial data, but bwa-mem is as good on more
+   recent data and is many times faster for mapping against mammalian genomes.
+
+ * Added LAST-like seeding. This improves the accuracy for longer reads.
+
+ * Added option `-H` to insert arbitrary header lines.
+
+ * Smarter option `-p`. Given an interleaved FASTQ stream, old bwa-mem identifies
+   the 2i-th and (2i+1)-th reads as a read pair. The new version identifies
+   adjacent reads with the same read name as a read pair. It is possible to mix
+   single-end and paired-end reads in one FASTQ (see the brief sketch after this
+   list).
+
+ * Improved parallelization. Old bwa-mem waits for I/O. The new version puts
+ I/O on a separate thread. It performs mapping while reading FASTQ and
+ writing SAM. This saves significant wall-clock time when reading from
+ or writing to a slow Unix pipe.
+
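+As a brief sketch of the smarter `-p` behavior above (file names here are
+placeholders):
+
+    # adjacent reads in the interleaved FASTQ that share a read name are paired;
+    # any remaining reads are mapped as single-end
+    bwa mem -p ref.fa interleaved.fq.gz > aln.sam
+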
+With the new release, the recommended way to map Illumina reads to GRCh38 is to
+use the bwakit binary package:
+
+ bwa.kit/run-gen-ref hs38DH
+ bwa.kit/bwa index hs38DH.fa
+ bwa.kit/run-bwamem -t8 -H -o out-prefix hs38DH.fa read1.fq.gz read2.fq.gz | sh
+
+Please check bwa.kit/README.md for details and command line options.
+
+(0.7.11: 23 December 2014, r1034)
+
+
+
+Release 0.7.10 (13 July, 2014)
+------------------------------
+
+Notable changes to BWA-MEM:
+
+ * Fixed a segmentation fault due to an alignment bridging the forward-reverse
+ boundary. This is a bug.
+
+ * Use the PacBio heuristic to map contigs to the reference genome. The old
+ heuristic evaluates the necessity of full extension for each chain. This may
+ not work in long low-complexity regions. The PacBio heuristic performs
+ SSE2-SW around each short seed. It works better. Note that the heuristic is
+ only applied to long query sequences. For Illumina reads, the output is
+ identical to the previous version.
+
+(0.7.10: 13 July 2014, r789)
+
+
+
+Release 0.7.9 (19 May, 2014)
+----------------------------
+
+This release brings several major changes to BWA-MEM. Notably, BWA-MEM now
+formally supports PacBio read-to-reference alignment and experimentally supports
+PacBio read-to-read alignment. BWA-MEM also runs faster at a minor cost of
+accuracy. The speedup is more significant when GRCh38 is in use. More
+specifically:
+
+ * Support PacBio subread-to-reference alignment. Although older BWA-MEM works
+   with PacBio data in principle, the resultant alignments are frequently
+   fragmented. In this release, we fine-tuned existing methods and introduced
+   new heuristics to improve PacBio alignment. These changes are not used by
+   default. Users need to add option "-x pacbio" to enable the feature (see the
+   example after this list).
+
+ * Support PacBio subread-to-subread alignment (EXPERIMENTAL). This feature is
+ enabled with option "-x pbread". In this mode, the output only gives the
+ overlapping region between a pair of reads without detailed alignment.
+
+ * Output alternative hits in the XA tag if there are not so many of them. This
+ is a BWA-backtrack feature.
+
+ * Support mapping to ALT contigs in GRCh38 (EXPERIMENTAL). We provide a script
+ to postprocess hits in the XA tag to adjust the mapping quality and generate
+ new primary alignments to all overlapping ALT contigs. We would *NOT*
+ recommend this feature for production uses.
+
+ * Improved alignments to many short reference sequences. Older BWA-MEM may
+ generate an alignment bridging two or more adjacent reference sequences.
+ Such alignments are split at a later step as postprocessing. This approach
+ is complex and does not always work. This release forbids these alignments
+ from the very beginning. BWA-MEM should not produce an alignment bridging
+ two or more reference sequences any more.
+
+ * Reduced the maximum seed occurrence from 10000 to 500. Reduced the maximum
+ rounds of Smith-Waterman mate rescue from 100 to 50. Added a heuristic to
+ lower the mapping quality if a read contains seeds with excessive
+ occurrences. These changes make BWA-MEM faster at a minor cost of accuracy
+ in highly repetitive regions.
+
+ * Added an option "-Y" to use soft clipping for supplementary alignments.
+
+ * Bugfix: incomplete alignment extension in corner cases.
+
+ * Bugfix: integer overflow when aligning long query sequences.
+
+ * Bugfix: chain score is not computed correctly (almost no practical effect)
+
+ * General code cleanup
+
+ * Added FAQs to README
+
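+A minimal example of the PacBio preset mentioned above (reference and read file
+names are placeholders):
+
+    # "-x pacbio" switches to the PacBio scoring and heuristics of this release
+    bwa mem -x pacbio ref.fa subreads.fq > aln.sam
+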
+Changes in BWA-backtrack:
+
+ * Bugfix: a segmentation fault when an alignment extends past the end of the
+ last chromosome.
+
+(0.7.9: 19 May 2014, r783)
+
+
+
+Release 0.7.8 (31 March, 2014)
+------------------------------
+
+Changes in BWA-MEM:
+
+ * Bugfix: off-diagonal X-dropoff (option -d) not working as intended.
+ Short-read alignment is not affected.
+
+ * Bugfix: unnecessarily large bandwidth used during global alignment,
+   which reduces the mapping speed by ~5% for short reads. Results are not
+   affected.
+
+ * Bugfix: when the matching score is not one, paired-end mapping quality is
+ inaccurate.
+
+ * When the matching score (option -A) is changed, scale all score-related
+   options accordingly unless overridden by users (see the sketch after this
+   list).
+
+ * Allow specifying different gap open (or extension) penalties for deletions
+ and insertions separately.
+
+ * Allow specifying the insert size distribution.
+
+ * Better and more detailed debugging information.
+
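+As a sketch of the score scaling noted above (the value of -A is arbitrary; file
+names are placeholders):
+
+    # with a doubled matching score, the other score-related options are scaled
+    # accordingly unless they are set explicitly on the command line
+    bwa mem -A 2 ref.fa read1.fq read2.fq > aln.sam
+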
+With the default setting, 0.7.8 and 0.7.7 gave identical output on one million
+100bp read pairs.
+
+(0.7.8: 31 March 2014, r455)
+
+
+
+Release 0.7.7 (25 February, 2014)
+---------------------------------
+
+This release fixes incorrect MD tags in the BWA-MEM output.
+
+A note about short-read mapping to GRCh38. The new human reference genome
+GRCh38 contains 60Mbp of program-generated alpha repeat arrays, some of which
+are hard masked as they cannot be localized. These highly repetitive arrays
+make BWA-MEM ~50% slower. If you are concerned about the performance of
+BWA-MEM, you may consider using option "-c2000 -m50". On simulated data, this
+setting helps the performance at a very minor cost in accuracy. I may consider
+changing the default in future releases.
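+
+For example (file names are placeholders; the option string is the one quoted
+above):
+
+    bwa mem -c2000 -m50 hs38.fa read1.fq read2.fq > aln.sam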
+
+(0.7.7: 25 February 2014, r441)
+
+
+
+Release 0.7.6 (31 January, 2014)
+---------------------------------
+
+Changes in BWA-MEM:
+
+ * Changed the way mapping quality is estimated. The new method tends to give
+ the same alignment a higher mapping quality. On paired-end reads, the change
+ is minor as with pairing, the mapping quality is usually high. For short
+ single-end reads, the difference is considerable.
+
+ * Improved load balance when many threads are spawned. However, bwa-mem is
+ still not very thread efficient, probably due to the frequent heap memory
+ allocation. Further improvement is a little difficult and may affect the
+ code stability.
+
+ * Allow using different clipping penalties for the 5'- and 3'-ends. This
+ helps when we do not want to clip one end.
+
+ * Print the @PG line, including the command line options.
+
+ * Improved the band width estimate: a) fixed a bug causing the band width
+ estimated from extension not to be used in the final global alignment;
+ b) try a doubled band width if the global alignment score is smaller.
+ Insufficient band width leads to a wrong CIGAR and spurious
+ mismatches/indels.
+
+ * Added a new option -D to fine tune a heuristic on dropping suboptimal hits.
+ Reducing -D increases accuracy but decreases the mapping speed. If unsure,
+ leave it to the default.
+
+ * Bugfix: for a repetitive single-end read, the reported hit is not randomly
+ distributed among equally best hits.
+
+ * Bugfix: missing paired-end hits due to unsorted list of SE hits.
+
+ * Bugfix: incorrect CIGAR caused by a defect in the global alignment.
+
+ * Bugfix: incorrect CIGAR caused by failed SW rescue.
+
+ * Bugfix: alignments largely mapped to the same position are regarded as
+ distinct from each other, which leads to underestimated mapping quality.
+
+ * Added the MD tag.
+
+There are no changes to BWA-backtrack in this release. However, it has a few
+known issues yet to be fixed. If you prefer BWA-backtrack, it is still advised
+to use bwa-0.6.x.
+
+While I developed BWA-MEM, I also found a few issues with BWA-SW. It is now
+possible to improve BWA-SW with the lessons learned from BWA-MEM. However, as
+BWA-MEM is usually better, I will not improve BWA-SW until I find applications
+where BWA-SW may excel.
+
+(0.7.6: 31 January 2014, r432)
+
+
+
+Release 0.7.5a (30 May, 2013)
+-----------------------------
+
+Fixed a bug in BWA-backtrack which leads to off-by-one mapping errors in rare
+cases.
+
+(0.7.5a: 30 May 2013, r405)
+
+
+
+Release 0.7.5 (29 May, 2013)
+----------------------------
+
+Changes in all components:
+
+ * Improved error checking on memory allocation and file I/O. Patches provided
+ by Rob Davies.
+
+ * Updated README.
+
+ * Bugfix: return code is zero upon errors.
+
+Changes in BWA-MEM:
+
+ * Changed the way a chimeric alignment is reported (conforming to the upcoming
+ SAM spec v1.5). With 0.7.5, if the read has a chimeric alignment, the paired
+ or the top hit uses soft clipping and is marked with neither 0x800 nor 0x100
+ bits. All the other hits part of the chimeric alignment will use hard
+ clipping and be marked with 0x800 if option "-M" is not in use, or marked
+ with 0x100 otherwise.
+
+ * Other hits part of a chimeric alignment are now reported in the SA tag,
+ conforming to the SAM spec v1.5.
+
+ * Better method for resolving an alignment bridging two or more short
+ reference sequences. The current strategy maps the query to the reference
+ sequence that covers the middle point of the alignment. For most
+ applications, this change has no effect.
+
+Changes in BWA-backtrack:
+
+ * Added a magic number to .sai files. This prevents samse/sampe from reading
+ corrupted .sai (e.g. a .sai file containing LSF log) or incompatible .sai
+ generated by a different version of bwa.
+
+ * Bugfix: alignments in the XA:Z: tag were wrong.
+
+ * Keep track of #ins and #del during backtracking. This simplifies the code
+ and reduces errors in rare corner cases. I should have done this in the
+ early days of bwa.
+
+In addition, if you use BWA-MEM or the fastmap command of BWA, please cite:
+
+ - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs
+ with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN].
+
+Thank you.
+
+(0.7.5: 29 May 2013, r404)
+
+
+
+Release 0.7.4 (23 April, 2013)
+------------------------------
+
+This is a bugfix release. Most of the bugs are considered minor and only
+occur very rarely.
+
+ * Bugfix: wrong CIGAR when a query sequence bridges three or more target
+ sequences. This only happens when aligning reads to short assembly contigs.
+
+ * Bugfix: leading "D" operator in CIGAR.
+
+ * Extend more seeds for better alignment around tandem repeats. This is also
+ a cause of the leading "D" operator in CIGAR.
+
+ * Bugfix: SSE2-SSW may occasionally find incorrect query starting position
+ around tandem repeat. This will lead to a suboptimal CIGAR in BWA-MEM and
+ a wrong CIGAR in BWA.
+
+ * Bugfix: clipping penalty does not work as intended when there is a gap
+ towards the end of a read.
+
+ * Fixed an issue caused by a bug in the libc from Mac/Darwin. In Darwin,
+ fread() is unable to read a data block longer than 2GB due to an integer
+ overflow bug in its implementation.
+
+Since version 0.7.4, BWA-MEM is considered to have reached stability similar
+to BWA-backtrack for short-read mapping.
+
+(0.7.4: 23 April 2013, r385)
+
+
+
+Release 0.7.3a (15 March, 2013)
+-------------------------------
+
+In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed
+in another corner case.
+
+(0.7.3a: 15 March 2013, r367)
+
+
+
+Release 0.7.3 (15 March, 2013)
+------------------------------
+
+Changes to BWA-MEM:
+
+ * Bugfix: pairing score is inaccurate when option -A does not take the default
+ value. This is a very minor issue even if it happens.
+
+ * Bugfix: occasionally wrong CIGAR. This happens when in the alignment there
+ is a 1bp deletion and a 1bp insertion which are close to the end of the
+ reads, and there are no other substitutions or indels. BWA-MEM would not do
+ a gapped alignment due to the bug.
+
+ * New feature: output other non-overlapping alignments in the XP tag such that
+ we can see the entire picture of alignment from one SAM line. XP gives the
+ position, CIGAR, NM and mapQ of each aligned subsequence of the query.
+
+BWA-MEM has been used to align ~300Gbp of 100-700bp SE/PE reads. SNP/indel calling
+has also been evaluated on part of these data. BWA-MEM generally gives better
+pre-filtered SNP calls than BWA. No significant issues have been observed since
+0.7.2, though minor improvements or bugs (e.g. the bug fixed in this release)
+are still possible. If you find potential issues, please send bug reports to
+<bio-bwa-help at lists.sourceforge.net> (free registration required).
+
+In addition, more detailed description of the BWA-MEM algorithm can be found at
+<https://github.com/lh3/mem-paper>.
+
+(0.7.3: 15 March 2013, r366)
+
+
+
+Release 0.7.2 (9 March, 2013)
+-----------------------------
+
+Urgent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, the
+flagging of 'properly paired' has also been improved a little.
+
+(0.7.2: 9 March 2013, r351)
+
+
+
+Release 0.7.1 (8 March, 2013)
+-----------------------------
+
+Changes to BWA-MEM:
+
+ * Bugfix: rare segmentation fault caused by a partial hit to the end of the
+ last sequence.
+
+ * Bugfix: occasional mis-pairing given an interleaved fastq.
+
+ * Bugfix: wrong mate information when the mate is unmapped. SAM generated by
+ BWA-MEM can now be validated with Picard.
+
+ * Improved the performance and accuracy for ultra-long query sequences.
+ Short-read alignment is not affected.
+
+Changes to other components:
+
+ * In BWA-backtrack and BWA-SW, replaced the code for global alignment,
+ Smith-Waterman and SW extension. The performance and accuracy of the two
+ algorithms stay the same.
+
+ * Added an experimental subcommand to merge overlapping paired ends. The
+ algorithm is very conservative: it may miss true overlaps but rarely makes
+ mistakes.
+
+An important note is that like BWA-SW, BWA-MEM may output multiple primary
+alignments for a read, which may cause problems to some tools. For aligning
+sequence reads, it is advised to use '-M' to flag extra hits as secondary. This
+option is not the default because multiple primary alignments are theoretically
+possible in sequence alignment.
+
+(0.7.1: 8 March 2013, r347)
+
+
+
+Beta Release 0.7.0 (28 February, 2013)
+--------------------------------------
+
+This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query
+sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap
+algorithm and extends seeds with banded affine-gap-penalty dynamic programming
+(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or
+longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA
+and BWA-SW and is more accurate. It also supports split alignments like BWA-SW
+and may optionally output multiple hits like BWA. BWA-MEM does not guarantee
+to find hits within a certain edit distance, but BWA is not efficient for such
+a task given longer reads anyway, and the edit-distance criterion is arguably
+not as important in long-read alignment.
+
+In addition to the algorithmic improvements, BWA-MEM also implements a few
+handy practical features:
+
+ 1. BWA-MEM automatically switches between local and glocal (global wrt reads;
+ local wrt reference) alignment. It reports the end-to-end glocal alignment
+ if the glocal alignment is not much worse than the optimal local alignment.
+ Glocal alignment reduces reference bias.
+
+ 2. BWA-MEM automatically infers pair orientation from a batch of single-end
+ alignments. It allows more than one orientation if there are sufficient
+ supporting reads. This feature has not been tested on reads from an
+ Illumina jumping library yet. (EXPERIMENTAL)
+
+ 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
+ is possible to convert a name-sorted BAM to an interleaved fastq on the fly
+ and feed the data stream to BWA-MEM for mapping (see the sketch after this
+ list).
+
+ 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which
+ helps to transfer individual read annotations to the output.
+
+ 5. BWA-MEM supports more advanced piping. Users can now run:
+ (bwa mem ref.fa '<bzcat r1.fq.bz2' '<bzcat r2.fq.bz2') to map bzip'd read
+ files without relying on bash features.
+
+ 6. BWA-MEM provides a few basic APIs for single-end mapping. The 'example.c'
+ program in the source code directory implements a full single-end mapper in
+ 50 lines of code.
+
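+To illustrate feature 3 above, the following sketch (file names are
+placeholders; it assumes a samtools build that provides the "bam2fq"
+subcommand) converts a name-sorted BAM to an interleaved fastq on the fly and
+streams it to BWA-MEM via the "-p" option:
+
+    samtools bam2fq name-sorted.bam | bwa mem -p ref.fa - > aln-pe.sam
+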
+The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM
+in production yet. However, when the implementation becomes stable after a
+few release cycles, existing BWA users are recommended to migrate to BWA-MEM
+for 76bp or longer Illumina reads and long query sequences. The original BWA
+short-read algorithm will not deliver satisfactory results for 150bp+ Illumina
+reads. Change of mappers will be necessary sooner or later.
+
+(0.7.0 beta: 28 February 2013, r313)
+
+
+
+Release 0.6.2 (19 June, 2012)
+-----------------------------
+
+This is largely a bug-fix release. Notable changes in BWA-short and BWA-SW:
+
+ * Bugfix: BWA-SW may give bad alignments due to incorrect band width.
+
+ * Bugfix: A segmentation fault due to an out-of-boundary error. The fix is a
+ temporary solution. The real cause has not been identified.
+
+ * Attempt to read index from prefix.64.bwt, such that the 32-bit and 64-bit
+ index can coexist.
+
+ * Added options '-I' and '-S' to control BWA-SW pairing.
+
+(0.6.2: 19 June 2012, r126)
+
+
+
+Release 0.6.1 (28 November, 2011)
+---------------------------------
+
+Notable changes to BWA-short:
+
+ * Bugfix: duplicated alternative hits in the XA tag.
+
+ * Bugfix: when trimming is enabled, bwa-aln trims 1bp less.
+
+ * Disabled the color-space alignment. 0.6.x is not working with SOLiD reads at
+ present.
+
+Notable changes to BWA-SW:
+
+ * Bugfix: segfault due to excessive ambiguous bases.
+
+ * Bugfix: incorrect mate position in the SE mode.
+
+ * Bugfix: rare segfault in the PE mode
+
+ * When macro _NO_SSE2 is in use, fall back to the standard Smith-Waterman
+ instead of SSE2-SW.
+
+ * Optionally mark split hits with lower alignment scores as secondary.
+
+Changes to fastmap:
+
+ * Bugfix: infinite loop caused by ambiguous bases.
+
+ * Optionally output the query sequence.
+
+(0.6.1: 28 November 2011, r104)
+
+
+
+Release 0.5.10 and 0.6.0 (12 November, 2011)
+--------------------------------------------
+
+The 0.6.0 release comes with two major changes. Firstly, the index data
+structure has been changed to support genomes longer than 4GB. The indices of
+the forward and the reverse genome are now integrated into one index. This
+change speeds up BWA-short by about 20% and BWA-SW by 90% with the mapping
+accuracy largely unchanged. A tradeoff is that BWA requires more memory, but
+this is the price almost all mappers that index the genome have to pay.
+
+Secondly, BWA-SW in 0.6.0 now works with paired-end data. It is more accurate
+for highly unique reads and more robust to long indels and structural
+variations. However, BWA-short still has an edge for reads with many
+suboptimal hits. It is not yet known which algorithm is better for variant
+calling.
+
+0.5.10 is a bugfix release only and is likely to be the last release in the
+0.5 branch unless I find critical bugs in the future.
+
+Other notable changes:
+
+ * Added the 'fastmap' command that finds super-maximal exact matches. It does
+ not give the final alignment, but runs much faster. It can be a building
+ block for other alignment algorithms. [0.6.0 only]
+
+ * Output the timing information before BWA exits. This also tells users that
+ the task has been finished instead of being killed or aborted. [0.6.0 only]
+
+ * Sped up multi-threading when using many (>20) CPU cores.
+
+ * Check I/O error.
+
+ * Increased the maximum barcode length to 63bp.
+
+ * Automatically choose the indexing algorithm.
+
+ * Bugfix: very rare segfault due to an uninitialized variable. The bug also
+ affects the placement of suboptimal alignments. The effect is very minor.
+
+This release involves quite a lot of tricky changes. Although it has been
+tested on a few data sets, subtle bugs may still be hidden. It is *NOT*
+recommended to use this release in a production pipeline. In the future,
+however, BWA-SW may be better as reads continue to get longer. I would
+encourage users to try the 0.6 release. I would also like to hear about
+users' experiences. Thank you.
+
+(0.6.0: 12 November 2011, r85)
+
+
+
+Beta Release 0.5.9 (24 January, 2011)
+-------------------------------------
+
+Notable changes:
+
+ * Feature: barcode support via the '-B' option.
+
+ * Feature: Illumina 1.3+ read format support via the '-I' option.
+
+ * Bugfix: RG tags are not attached to unmapped reads.
+
+ * Bugfix: very rare bwasw mismappings
+
+ * Recommend options for PacBio reads in bwasw help message.
+
+
+Also, since January 13, the BWA master repository has been moved to github:
+
+ https://github.com/lh3/bwa
+
+The revision number has been reset. All recent changes will be first
+committed to this repository.
+
+(0.5.9: 24 January 2011, r16)
+
+
+
+Beta Release Candidate 0.5.9rc1 (10 December, 2010)
+---------------------------------------------------
+
+Notable changes in bwasw:
+
+ * Output unmapped reads.
+
+ * For a repetitive read, choose a random hit instead of a fixed
+ one. This is not well tested.
+
+Notable changes in bwa-short:
+
+ * Fixed a bug in the SW scoring system, which may lead to unexpected
+ gaps towards the end of a read.
+
+ * Fixed a bug which invalidates the randomness of repetitive reads.
+
+ * Fixed a rare memory leak.
+
+ * Allowed specifying the read group on the command line.
+
+ * Take name-grouped BAM files as input.
+
+Changes to this release are usually safe in that they do not interfere
+with the key functionality. However, the release has only been tested on
+small samples instead of on large-scale real data. If anything weird
+happens, please report the bugs to the bio-bwa-help mailing list.
+
+(0.5.9rc1: 10 December 2010, r1561)
+
+
+
+Beta Release 0.5.8 (8 June, 2010)
+---------------------------------
+
+Notable changes in bwasw:
+
+ * Fixed an issue of missing alignments. This should happen rarely and
+ only when the contig/read alignment is multi-part. Very rarely, bwasw
+ may still miss a segment in a multi-part alignment. This is difficult
+ to fix, although possible.
+
+Notable changes in bwa-short:
+
+ * Discard the SW alignment when the best single-end alignment is much
+ better. Such an SW alignment may be caused by structural variations, and
+ forcing it to be aligned leads to false alignments. This fix has not
+ been tested thoroughly. It would be great to receive more user
+ feedback on this issue.
+
+ * Fixed a typo/bug in sampe which leads to unnecessarily large memory
+ usage in some cases.
+
+ * Further reduced the chance of reporting 'weird pairing'.
+
+(0.5.8: 8 June 2010, r1442)
+
+
+
+Beta Release 0.5.7 (1 March, 2010)
+----------------------------------
+
+This release only has an effect on paired-end data with a fat insert-size
+distribution. Users are still recommended to update, as the new release
+improves the robustness to poor data.
+
+ * The fix for 'weird pairing' was not working in version 0.5.6, as pointed
+ out by Carol Scott. It should work now.
+
+ * Optionally output to a normal file rather than to stdout (by Tim
+ Fennel).
+
+(0.5.7: 1 March 2010, r1310)
+
+
+
+Beta Release 0.5.6 (10 February, 2010)
+--------------------------------------
+
+Notable changes in bwa-short:
+
+ * Report multiple hits in the SAM format in a new tag XA, encoded as
+ (chr,pos,CIGAR,NM;)* (see the example after this list). By default, if a
+ paired or single-end read has 4 or fewer hits, they will all be reported;
+ if a read in an anomalous pair has 11 or fewer hits, all of them will be
+ reported.
+
+ * Perform Smith-Waterman alignment also for anomalous read pairs when
+ both ends have quality higher than 17. This reduces false positives
+ for some SV discovery algorithms.
+
+ * Do not report "weird pairing" when the insert size distribution is
+ too fat or has a mean close to zero.
+
+ * If a read is bridging two adjacent chromosomes, flag it as unmapped.
+
+ * Fixed a small but long-standing memory leak in paired-end mapping.
+
+ * Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly
+ parsed by solid2fastq.pl; b) truncated quality string is resolved; c)
+ SOLiD read mapped to the reverse strand is complemented.
+
+ * BWA now calculates the skewness and kurtosis of the insert size
+ distribution.
+
+ * Deploy a Bayesian method to estimate the maximum distance for a read
+ pair to be considered properly paired. The method was proposed by
+ Gerton Lunter, but bwa only implements a simplified version.
+
+ * Export more functions for Java bindings, by Matt Hanna (See:
+ http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings)
+
+ * Abstract bwa CIGAR for further extension, by Rodrigo Goya.
+
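+As an illustration of the XA encoding above, a read with two alternative hits
+may carry a tag like the following (values are made up; the sign before the
+position indicates the strand):
+
+    XA:Z:chr3,+123456,76M,1;chr7,-654321,76M,2;
+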
+(0.5.6: 10 February 2010, r1303)
+
+
+
+Beta Release 0.5.5 (10 November, 2009)
+--------------------------------------
+
+This is a bug fix release:
+
+ * Fixed a serious bug/typo in aln which does not show up with short
+ reads but leads to a segfault for >500bp reads. Of course, the aln
+ command is not recommended for reads longer than 200bp, but this is a
+ bug anyway.
+
+ * Fixed a minor bug/typo which leads to incorrect single-end mapping
+ quality when one end is moved to meet the mate-pair requirement.
+
+ * Fixed a bug in samse for mapping in the color space. This bug is
+ caused by quality filtration added since 0.5.1.
+
+(0.5.5: 10 November 2009, r1273)
+
+
+
+Beta Release 0.5.4 (9 October, 2009)
+------------------------------------
+
+Since this version, the default seed length used in the "aln" command is
+changed to 32.
+
+Notable changes in bwa-short:
+
+ * Added a new tag "XC:i" which gives the length of clipped reads.
+
+ * In sampe, skip alignments in case of a bug in the Smith-Waterman
+ alignment module.
+
+ * In sampe, fixed a bug in pairing when the read sequence is identical
+ to its reverse complement.
+
+ * In sampe, optionally preload the entire FM-index into memory to
+ reduce disk operations.
+
+Notable changes in dBWT-SW/BWA-SW:
+
+ * Changed name dBWT-SW to BWA-SW.
+
+ * Optionally use "hard clipping" in the SAM output.
+
+(0.5.4: 9 October 2009, r1245)
+
+
+
+Beta Release 0.5.3 (15 September, 2009)
+---------------------------------------
+
+Fixed a critical bug in bwa-short: reads mapped to the reverse strand
+are not complemented.
+
+(0.5.3: 15 September 2009, r1225)
+
+
+
+Beta Release 0.5.2 (13 September, 2009)
+---------------------------------------
+
+Notable changes in bwa-short:
+
+ * Optionally trim reads before alignment. See the manual page on 'aln
+ -q' for detailed description.
+
+ * Fixed a bug in calculating the NM tag for a gapped alignment.
+
+ * Fixed a bug given a mixture of reads with some longer than the seed
+ length and some shorter.
+
+ * Print SAM header.
+
+Notable changes in dBWT-SW:
+
+ * Changed the default value of -T to 30. As a result, the accuracy is a
+ little higher for short reads at the cost of speed.
+
+(0.5.2: 13 September 2009, r1223)
+
+
+
+Beta Release 0.5.1 (2 September, 2009)
+--------------------------------------
+
+Notable changes in the short read alignment component:
+
+ * Fixed a bug in samse: do not write mate coordinates.
+
+Notable changes in dBWT-SW:
+
+ * Randomly choose one alignment if the read is repetitive.
+
+ * Fixed a flaw when a read is mapped across two adjacent reference
+ sequences. However, wrong alignment reports may still occur rarely in
+ this case.
+
+ * Changed the default band width to 50. The speed is slower due to this
+ change.
+
+ * Improved the mapping quality a little given long query sequences.
+
+(0.5.1: 2 September 2009, r1209)
+
+
+
+Beta Release 0.5.0 (20 August, 2009)
+------------------------------------
+
+This release implements a novel algorithm, dBWT-SW, specifically
+designed for long reads. It is 10-50 times faster than SSAHA2, depending
+on the characteristics of the input data, and achieves comparable
+alignment accuracy while allowing chimera detection. In comparison to
+BLAT, dBWT-SW is several times faster and much more accurate especially
+when the error rate is high. Please read the manual page for more
+information.
+
+The dBWT-SW algorithm is mainly developed for future sequencing technologies
+which produce much longer reads with a somewhat higher error rate. It is
+still at an early development stage. Some features are missing and it may be
+buggy, although I have evaluated it on several simulated and real data sets.
+But following the "release early" paradigm, I would like the users to try it
+first.
+
+Other notable changes in BWA are:
+
+ * Fixed a rare bug in the Smith-Waterman alignment module.
+
+ * Fixed a rare bug about the wrong alignment coordinate when a read is
+ poorly aligned.
+
+ * Fixed a bug in generating the "mate-unmap" SAM tag when both ends in
+ a pair are unmapped.
+
+(0.5.0: 20 August 2009, r1200)
+
+
+
+Beta Release 0.4.9 (19 May, 2009)
+---------------------------------
+
+Interestingly, the integer overflow bug claimed to be fixed in 0.4.7 was in
+fact not fixed. Now I have fixed the bug. Sorry for this, and thanks to Quan
+Long for pointing out the bug (again).
+
+(0.4.9: 19 May 2009, r1075)
+
+
+
+Beta Release 0.4.8 (18 May, 2009)
+---------------------------------
+
+One change to "aln -R". Now by default, if there are no more than '-R'
+equally best hits, bwa will search for suboptimal hits. This change
+affects the ability to find SNPs in segmental duplications.
+
+I have not tested this option thoroughly, but this simple change is less
+likely to cause new bugs. Hope I am right.
+
+(0.4.8: 18 May 2009, r1073)
+
+
+
+Beta Release 0.4.7 (12 May, 2009)
+---------------------------------
+
+Notable changes:
+
+ * Output the SM (single-end mapping quality) and AM (smaller mapping
+ quality among the two ends) tags in the SAM output.
+
+ * Improved the functionality of stdsw.
+
+ * Made the XN tag more accurate.
+
+ * Fixed a very rare segfault caused by integer overflow.
+
+ * Improved the insert size estimation.
+
+ * Fixed compiling errors for some Linux systems.
+
+(0.4.7: 12 May 2009, r1066)
+
+
+
+Beta Release 0.4.6 (9 March, 2009)
+----------------------------------
+
+This release improves the SOLiD support. First, a script for converting
+SOLiD raw data is provided. This script is adapted from solid2fastq.pl
+in the MAQ package. Second, a nucleotide reference file can be directly
+used with 'bwa index'. Third, SOLiD paired-end support is
+completed. Fourth, color-space reads will be converted to nucleotides
+when SAM output is generated. Color errors are corrected in this
+process. Please note that like MAQ, BWA cannot make use of the primer
+base and the first color.
+
+In addition, the calculation of mapping quality is also improved a
+little bit, although end-users may barely observe the difference.
+
+(0.4.6: 9 March 2009, r915)
+
+
+
+Beta Release 0.4.5 (18 February, 2009)
+--------------------------------------
+
+Not much happened, but I think it would be good to let the users use the
+latest version.
+
+Notable changes (thanks to Bob Handsaker for catching the two bugs):
+
+ * Improved boundary check. Previous versions may still give incorrect
+ alignment coordinates in rare cases.
+
+ * Fixed a bug in SW alignment when no residue matches. This only
+ affects the 'sampe' command.
+
+ * Robustly estimate insert size without setting the maximum on the
+ command line. Since this release 'sampe -a' only has an effect if
+ there are not enough good pairs to infer the insert size
+ distribution.
+
+ * Reduced false PE alignments a little bit by using the inferred insert
+ size distribution. This fix may be more important for long insert
+ size libraries.
+
+(0.4.5: 18 February 2009, r829)
+
+
+
+Beta Release 0.4.4 (15 February, 2009)
+--------------------------------------
+
+This is mainly a bug fix release. Notable changes are:
+
+ * Imposed a boundary check when extracting subsequences from the
+ genome. Previously this caused memory problems in rare cases.
+
+ * Fixed a bug that failed to detect whether an alignment overlaps
+ with N on the genome.
+
+ * Changed MD tag to meet the latest SAM specification.
+
+(0.4.4: 15 February 2009, r815)
+
+
+
+Beta Release 0.4.3 (22 January, 2009)
+------------------------------------
+
+Notable changes:
+
+ * Treat an ambiguous base N as a mismatch. Previous versions would not
+ map reads containing any N.
+
+ * Automatically choose the maximum allowed number of differences. This
+ is important when reads of different lengths are mixed together.
+
+ * Print mate coordinate if only one end is unmapped.
+
+ * Generate MD tag. This tag encodes the mismatching positions and the
+ reference bases at these positions. Deletions from the reference will
+ also be printed.
+
+ * Optionally dump multiple hits from samse, in another concise format
+ rather than SAM.
+
+ * Optionally disable iterative search. This is VERY SLOOOOW, though.
+
+ * Fixed a bug in generating SAM output.
+
+(0.4.3: 22 January 2009, r787)
+
+
+
+Beta Release 0.4.2 (9 January, 2009)
+------------------------------------
+
+Aaron Quinlan found a bug in the indexer: the bwa indexer segfaults if
+there are no comment texts in the FASTA header. This is a critical
+bug. Nothing else was changed.
+
+(0.4.2: 9 January 2009, r769)
+
+
+
+Beta Release 0.4.1 (7 January, 2009)
+------------------------------------
+
+I am sorry for the quick updates these days. I would like to set a milestone
+for BWA, and this release seems to be it. For paired-end reads, BWA now also
+does Smith-Waterman alignment for an unmapped read whose mate can be
+mapped confidently. With this strategy BWA achieves accuracy similar to
+maq. The benchmark is also updated accordingly.
+
+(0.4.1: 7 January 2009, r760)
+
+
+
+Beta Release 0.4.0 (6 January, 2009)
+------------------------------------
+
+In comparison to the release two days ago, this release is mainly tuned
+for performance with some tricks I learnt from Bowtie. However, as the
+indexing format has also been changed, I have to increase the version
+number to 0.4.0 to emphasize that *DATABASE MUST BE RE-INDEXED* with
+'bwa index'.
+
+ * Improved the speed by about 20%.
+
+ * Added multi-threading to 'bwa aln'.
+
+(0.4.0: 6 January 2009, r756)
+
+
+
+Beta Release 0.3.0 (4 January, 2009)
+------------------------------------
+
+ * Added paired-end support by separating SA calculation and alignment
+ output.
+
+ * Added SAM output.
+
+ * Added evaluation to the documentation.
+
+(0.3.0: 4 January 2009, r741)
+
+
+
+Beta Release 0.2.0 (15 August, 2008)
+------------------------------------
+
+ * Take the subsequence at the 5'-end as the seed. The seeding strategy
+ greatly improves the speed for long reads, at the cost of missing a few
+ true hits that contain many differences in the seed. Seeding also
+ increases the memory usage by 800MB.
+
+ * Fixed a bug which could cause some gapped alignments to be missed.
+ Fixing the bug also slows the speed a little.
+
+(0.2.0: 15 August 2008, r428)
+
+
+
+Beta Release 0.1.6 (08 August, 2008)
+------------------------------------
+
+ * Give an accurate CIGAR string.
+
+ * Add a simple interface to SW/NW alignment.
+
+(0.1.6: 08 August 2008, r414)
+
+
+
+Beta Release 0.1.5 (27 July, 2008)
+----------------------------------
+
+ * Improve the speed. This version is expected to give the same results.
+
+(0.1.5: 27 July 2008, r400)
+
+
+
+Beta Release 0.1.4 (22 July, 2008)
+----------------------------------
+
+ * Fixed a bug which could cause some gapped alignments to be missed.
+
+ * More clearly define what alignments can be found by BWA (See
+ manual). Now BWA runs a little slower because it will visit more
+ potential gapped alignments.
+
+ * A bit of code cleanup.
+
+(0.1.4: 22 July 2008, r387)
+
+
+
+Beta Release 0.1.3 (21 July, 2008)
+----------------------------------
+
+Improve the speed with some tricks on retrieving occurrences. The results
+should be exactly the same as those of 0.1.2.
+
+(0.1.3: 21 July 2008, r382)
+
+
+
+Beta Release 0.1.2 (17 July, 2008)
+----------------------------------
+
+Support gapped alignment. Code for ungapped alignment has been removed.
+
+(0.1.2: 17 July 2008, r371)
+
+
+
+Beta Release 0.1.1 (03 June, 2008)
+-----------------------------------
+
+This is the first release of BWA, the Burrows-Wheeler Alignment tool. Please
+read the man page for more information about this software.
+
+(0.1.1: 03 June 2008, r349)
diff --git a/ext/src/bwa/README-alt.md b/ext/src/bwa/README-alt.md
new file mode 100644
index 0000000..058ab7a
--- /dev/null
+++ b/ext/src/bwa/README-alt.md
@@ -0,0 +1,178 @@
+## For the Impatient
+
+```sh
+# Download bwakit (or from <http://sourceforge.net/projects/bio-bwa/files/bwakit/> manually)
+wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
+ | bzip2 -dc | tar xf -
+# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
+bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa # create BWA index
+# mapping
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh # skip "|sh" to show command lines
+```
+
+This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA
+genotypes on each gene and `out.hla.all` for other possible HLA genotypes.
+Please check out [bwa/bwakit/README.md][kithelp] for details.
+
+## Background
+
+GRCh38 consists of several components: chromosomal assembly, unlocalized contigs
+(chromosome known but location unknown), unplaced contigs (chromosome unknown)
+and ALT contigs (long clustered variations). The combination of the first three
+components is called the *primary assembly*. It is recommended to use the
+complete primary assembly for all analyses. Using ALT contigs in read mapping is
+tricky.
+
+GRCh38 ALT contigs total 109Mbp in length, spanning 60Mbp of the primary
+assembly. However, sequences that are highly diverged from the primary assembly
+only contribute a few million bp. Most subsequences of ALT contigs are nearly
+identical to the primary assembly. If we align sequence reads to GRCh38+ALT
+blindly, we will get many additional reads with zero mapping quality and miss
+variants on them. It is crucial to make mappers aware of ALTs.
+
+BWA-MEM is ALT-aware. It essentially computes mapping quality across the
+non-redundant content of the primary assembly plus the ALT contigs and is free
+of the problem above.
+
+## Methods
+
+### Sequence alignment
+
+As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
+postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT
+contigs are present. The following picture shows an example of how BWA-MEM
+infers mapping quality and reports alignments after step 2:
+
+![](http://lh3lh3.users.sourceforge.net/images/alt-demo.png)
+
+#### Step 1: BWA-MEM mapping
+
+At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
+the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
+depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
+alignments and assigns mapQ following these two rules:
+
+1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of
+ an ALT hit is computed across all hits.
+
+2. If there are no non-ALT hits, the best ALT hit is output as the primary
+ alignment. If there are both ALT and non-ALT hits, non-ALT hits will be
+ primary and ALT hits will be supplementary (SAM flag 0x800).
+
+In theory, non-ALT alignments from step 1 should be identical to alignments
+against the reference genome with ALT contigs. In practice, the two types of
+alignments may differ in rare cases due to seeding heuristics. When an ALT hit
+is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
+non-ALT hits.
+
+If we don't care about ALT hits, we may skip postprocessing (step 2).
+Nonetheless, postprocessing is recommended as it improves mapQ and gives more
+information about ALT hits.
+
+#### Step 2: Postprocessing
+
+Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
+potential hits reported in the XA tag, lifts ALT hits to the chromosomal
+positions using the ALT-to-ref alignment, groups them based on overlaps between
+their lifted positions, and then re-estimates mapQ across the best scoring hit
+in each group. Being aware of the ALT-to-ref alignment, this script can greatly
+improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also
+writes each hit overlapping the reported hit into a separate SAM line. This
+enables variant calling on each ALT contig independent of others.
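+
+If you would rather run the two steps by hand instead of through
+`run-bwamem`, the pipeline is conceptually a sketch like the following (file
+names and thread count are placeholders; it assumes the bundled `k8`
+javascript engine runs `bwa-postalt.js`, which takes the ".alt" file as its
+argument and reads SAM on stdin):
+
+```sh
+bwa.kit/bwa mem -t8 hs38DH.fa read1.fq read2.fq \
+  | bwa.kit/k8 bwa.kit/bwa-postalt.js hs38DH.fa.alt > out.postalt.sam
+```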
+
+### On the completeness of GRCh38+ALT
+
+While GRCh38 is much more complete than GRCh37, it is still missing some true
+human sequences. To make sure every piece of sequence in the reference assembly
+is correct, the [Genome Reference Consortium][grc] (GRC) requires each ALT contig
+to have enough support from multiple sources before considering adding it to the
+reference assembly. This careful and sophisticated procedure has left out some
+sequences, one of which is [this example][novel], a 10kb contig assembled from
+CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
+[BLAST][blast] to see where it maps.
+
+For a more complete reference genome, we compiled a new set of decoy sequences
+from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
+The sequences are included in `hs38DH-extra.fa` from the [BWA binary
+package][res].
+
+In addition to decoy, we also put multiple alleles of HLA genes in
+`hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
+version 3.18.0 and are used to collect reads sequenced from these genes.
+
+### HLA typing
+
+HLA genes are known to be associated with many autoimmune diseases, infectious
+diseases and drug responses. They are among the most important genes but are
+rarely studied by WGS projects due to the high sequence divergence between
+HLA genes and the reference genome in these regions.
+
+By including the HLA gene regions in the reference assembly as ALT contigs, we
+are able to effectively identify reads coming from these genes. We also provide
+a pipeline, which is included in the [BWA binary package][res], to type the
+several classic HLA genes. The pipeline is conceptually simple. It de novo
+assembles sequence reads mapped to each gene, aligns exon sequences of each
+allele to the assembled contigs and then finds the pairs of alleles that best
+explain the contigs. In practice, however, the completeness of IMGT/HLA and
+copy-number changes related to these genes are not so straightforward to
+resolve. HLA typing may not always be successful. Users may also consider
+using other programs for typing, such as [Warren et al (2012)][hla4], [Liu et
+al (2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1],
+though most of them are distributed under restrictive licenses.
+
+## Preliminary Evaluation
+
+To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
+unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
+(hs38DH), and called small variants from the alignment. CHM1 is haploid.
+Heterozygous calls from it are therefore false positives (FP). NA12878 is
+diploid. The true positive (TP) heterozygous calls from NA12878 are
+approximately equal to the difference between NA12878 and CHM1 heterozygous
+calls. A better assembly
+should yield higher TP and lower FP. The following table shows the numbers for
+these assemblies:
+
+|Assembly|hs37 |hs38 |hs38DH|CHM1_1.1| huref|
+|:------:|------:|------:|------:|------:|------:|
+|FP | 255706| 168068| 142516|307172 | 575634|
+|TP |2142260|2163113|2150844|2167235|2137053|
+
+With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces
+FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
+from hs38 only and found the majority of them are associated with excessive read
+depth, clustered variants or weak alignment. We believe most hs38-only calls are
+problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
+with nearly identical library construction, the difference is ~140k, an order
+of magnitude higher than the difference between hs38 and hs38DH. ALT contigs,
+decoy and HLA genes in hs38DH improve variant calling and enable the analyses of
+ALT contigs and HLA typing at little cost.
+
+## Problems and Future Development
+
+There are some uncertainties about ALT mappings - we are not sure whether they
+help biological discovery and don't know the best way to analyze them. Without
+clear demand from downstream analyses, it is very difficult to design the
+optimal mapping strategy. The current BWA-MEM method is just a start. If it
+turns out to be useful in research, we will probably rewrite bwa-postalt.js in
+C for performance; if not, we may make changes. It is also possible that we
+might make a breakthrough in the representation of multiple genomes, in which
+case we can even get rid of ALT contigs for good.
+
+
+
+[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
+[sb]: https://github.com/GregoryFaust/samblaster
+[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/
+[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc
+[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat
+[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
+[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
+[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/
+[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml
+[hla1]: http://biorxiv.org/content/early/2014/07/08/006973
+[hlalink]: http://www.hladiseaseassociations.com
+[hlatools]: https://www.biostars.org/p/93245/
+[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html
+[hla3]: http://www.biomedcentral.com/1471-2164/15/325
+[hla4]: http://genomemedicine.com/content/4/12/95
+[kithelp]: https://github.com/lh3/bwa/tree/master/bwakit
diff --git a/ext/src/bwa/README.md b/ext/src/bwa/README.md
new file mode 100644
index 0000000..e6f677d
--- /dev/null
+++ b/ext/src/bwa/README.md
@@ -0,0 +1,174 @@
+[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa)
+[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest)
+##Getting started
+
+ git clone https://github.com/lh3/bwa.git
+ cd bwa; make
+ ./bwa index ref.fa
+ ./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz
+ ./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz
+
+##Introduction
+
+BWA is a software package for mapping DNA sequences against a large reference
+genome, such as the human genome. It consists of three algorithms:
+BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
+sequence reads up to 100bp, while the other two are designed for longer
+sequences ranging from 70bp to a few megabases. BWA-MEM and BWA-SW share
+similar features such as the
+support of long reads and chimeric alignment, but BWA-MEM, which is the latest,
+is generally recommended as it is faster and more accurate. BWA-MEM also has
+better performance than BWA-backtrack for 70-100bp Illumina reads.
+
+For all the algorithms, BWA first needs to construct the FM-index for the
+reference genome (the **index** command). Alignment algorithms are invoked with
+different sub-commands: **aln/samse/sampe** for BWA-backtrack,
+**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm.
+
+##Availability
+
+BWA is released under [Apache 2.0][1]. The latest source code is [freely
+available at github][2]. Released packages can [be downloaded][3] at
+SourceForge. After you acquire the source code, simply use `make` to compile
+and copy the single executable `bwa` to the destination you want. The only
+dependency required to build BWA is [zlib][14].
+
+Since 0.7.11, a precompiled binary for x86\_64-linux has been available in
+[bwakit][17]. In addition to BWA, this self-consistent package also comes with
+bwa-associated and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping
+to ALT contigs, adapter trimming, duplicate marking, HLA typing and associated
+data files.
+
+##Seeking help
+
+The detailed usage is described in the man page available together with the
+source code. You can use `man ./bwa.1` to view the man page in a terminal. The
+[HTML version][4] of the man page can be found at the [BWA website][5]. If you
+have questions about BWA, you may [sign up for the mailing list][6] and then
+send your questions to [bio-bwa-help at sourceforge.net][7]. You may also ask
+questions in forums such as [BioStar][8] and [SEQanswers][9].
+
+##Citing BWA
+
+* Li H. and Durbin R. (2009) Fast and accurate short read alignment with
+ Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID:
+ [19451168][10]]. (if you use the BWA-backtrack algorithm)
+
+* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
+ Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID:
+ [20080505][11]]. (if you use the BWA-SW algorithm)
+
+* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs
+ with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM
+ algorithm or the **fastmap** command, or want to cite the whole BWA package)
+
+Please note that the last reference is a preprint hosted at [arXiv.org][13]. I
+do not have a plan to submit it to a peer-reviewed journal in the near future.
+
+##Frequently asked questions (FAQs)
+
+1. [What types of data does BWA work with?](#type)
+2. [Why does a read appear multiple times in the output SAM?](#multihit)
+3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
+4. [Why can one read in a pair have a high mapping quality while the other has zero?](#pe0)
+5. [How can a BWA-backtrack alignment extend past the end of a chromosome?](#endref)
+6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg)
+7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt)
+
+####<a name="type"></a>1. What types of data does BWA work with?
+
+BWA works with a variety of DNA sequence data types, though the optimal
+algorithm and settings may vary. The following list gives the recommended
+settings:
+
+* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
+ contigs up to a few megabases mapped to a closely related reference genome:
+
+ bwa mem ref.fa reads.fq > aln.sam
+
+* Illumina single-end reads shorter than ~70bp:
+
+ bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
+
+* Illumina/454/IonTorrent paired-end reads longer than ~70bp:
+
+ bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
+
+* Illumina paired-end reads shorter than ~70bp:
+
+ bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
+ bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
+
+* PacBio subreads or Oxford Nanopore reads to a reference genome:
+
+ bwa mem -x pacbio ref.fa reads.fq > aln.sam
+ bwa mem -x ont2d ref.fa reads.fq > aln.sam
+
+BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
+error rates (or sequence divergence). Generally, BWA-MEM is more tolerant of
+errors given longer query sequences, as the chance of missing all seeds is small.
+As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
+reads with a sequencing error rate over 20%.
+
+####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
+
+BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene
+fusion or a long deletion, a read bridging the break point may have two hits,
+occupying two lines in the SAM output. With the default setting of BWA-MEM, one
+and only one line is primary and is soft clipped; other lines are tagged with
+0x800 SAM flag (supplementary alignment) and are hard clipped.
+
+####<a name="4gb"></a>3. Does BWA work on reference sequences longer than 4GB in total?
+
+Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over
+4GB. However, individual chromosomes should not be longer than 2GB.
+
+####<a name="pe0"></a>4. Why can one read in a pair have a high mapping quality while the other has zero?
+
+This is correct. Mapping quality is assigned to an individual read, not to a read
+pair. It is possible that one read can be mapped unambiguously, but its mate
+falls in a tandem repeat and thus its accurate position cannot be determined.
+
+####<a name="endref"></a>5. How can a BWA-backtrack alignment extend past the end of a chromosome?
+
+Internally BWA concatenates all reference sequences into one long sequence. A
+read may be mapped to the junction of two adjacent reference sequences. In this
+case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
+position, CIGAR and all the tags. A similar issue may occur with BWA-SW alignment
+as well. BWA-MEM does not have this problem.
+
+####<a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
+
+Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT.
+BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please
+see [README-alt.md][18] for details. Briefly, it is recommended to use
+[bwakit][17], the binary release of BWA, for generating the reference genome
+and for mapping.
+
+####<a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
+
+If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM
+without post-processing. The alignments produced this way are very close to
+alignments against GRCh38 without ALT contigs. Nonetheless, applying
+post-processing helps to reduce false mappings caused by reads from the
+diverged part of ALT contigs and also enables HLA typing. It is recommended to
+run the post-processing script.
+
+
+
+[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License
+[2]: https://github.com/lh3/bwa
+[3]: http://sourceforge.net/projects/bio-bwa/files/
+[4]: http://bio-bwa.sourceforge.net/bwa.shtml
+[5]: http://bio-bwa.sourceforge.net/
+[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help
+[7]: mailto:bio-bwa-help at sourceforge.net
+[8]: http://biostars.org
+[9]: http://seqanswers.com/
+[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168
+[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505
+[12]: http://arxiv.org/abs/1303.3997
+[13]: http://arxiv.org/
+[14]: http://zlib.net/
+[15]: https://github.com/lh3/bwa/tree/mem
+[16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/
+[17]: http://sourceforge.net/projects/bio-bwa/files/bwakit/
+[18]: https://github.com/lh3/bwa/blob/master/README-alt.md
diff --git a/ext/src/bwa/bamlite.c b/ext/src/bwa/bamlite.c
new file mode 100644
index 0000000..3704beb
--- /dev/null
+++ b/ext/src/bwa/bamlite.c
@@ -0,0 +1,210 @@
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include "bamlite.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+/*********************
+ * from bam_endian.c *
+ *********************/
+
+static inline int bam_is_big_endian()
+{
+ long one= 1;
+ return !(*((char *)(&one)));
+}
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+ return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
+}
+static inline void *bam_swap_endian_2p(void *x)
+{
+ *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
+ return x;
+}
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+ v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
+ return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
+}
+static inline void *bam_swap_endian_4p(void *x)
+{
+ *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
+ return x;
+}
+static inline uint64_t bam_swap_endian_8(uint64_t v)
+{
+ v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
+ v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
+ return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
+}
+static inline void *bam_swap_endian_8p(void *x)
+{
+ *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
+ return x;
+}
+
+/**************
+ * from bam.c *
+ **************/
+
+int bam_is_be;
+
+bam_header_t *bam_header_init()
+{
+ bam_is_be = bam_is_big_endian();
+ return (bam_header_t*)calloc(1, sizeof(bam_header_t));
+}
+
+void bam_header_destroy(bam_header_t *header)
+{
+ int32_t i;
+ if (header == 0) return;
+ if (header->target_name) {
+ for (i = 0; i < header->n_targets; ++i)
+ if (header->target_name[i]) free(header->target_name[i]);
+ if (header->target_len) free(header->target_len);
+ free(header->target_name);
+ }
+ if (header->text) free(header->text);
+ free(header);
+}
+
+bam_header_t *bam_header_read(bamFile fp)
+{
+ bam_header_t *header;
+ char buf[4];
+ int magic_len;
+ int32_t i = 1, name_len;
+ // read "BAM1"
+ magic_len = bam_read(fp, buf, 4);
+ if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
+ fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
+ return NULL;
+ }
+ header = bam_header_init();
+ // read plain text and the number of reference sequences
+ if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
+ if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+ header->text = (char*)calloc(header->l_text + 1, 1);
+ if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail;
+ if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
+ if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+ // read reference sequence names and lengths
+ header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+ header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+ for (i = 0; i != header->n_targets; ++i) {
+ if (bam_read(fp, &name_len, 4) != 4) goto fail;
+ if (bam_is_be) bam_swap_endian_4p(&name_len);
+ header->target_name[i] = (char*)calloc(name_len, 1);
+ if (bam_read(fp, header->target_name[i], name_len) != name_len) {
+ goto fail;
+ }
+ if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
+ if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+ }
+ return header;
+ fail:
+ bam_header_destroy(header);
+ return NULL;
+}
+
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
+{
+ uint8_t *s;
+ uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
+ s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+ for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
+ while (s < data + data_len) {
+ uint8_t type;
+ s += 2; // skip key
+ type = toupper(*s); ++s; // skip type
+ if (type == 'C' || type == 'A') ++s;
+ else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
+ else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
+ else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
+ else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
+ }
+}
+
+int bam_read1(bamFile fp, bam1_t *b)
+{
+ bam1_core_t *c = &b->core;
+ int32_t block_len, ret, i;
+ uint32_t x[8];
+
+ if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+ if (ret == 0) return -1; // normal end-of-file
+ else return -2; // truncated
+ }
+ if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3;
+ if (bam_is_be) {
+ bam_swap_endian_4p(&block_len);
+ for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+ }
+ c->tid = x[0]; c->pos = x[1];
+ c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+ c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+ c->l_qseq = x[4];
+ c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+ b->data_len = block_len - sizeof(bam1_core_t);
+ if (b->m_data < b->data_len) {
+ b->m_data = b->data_len;
+ kroundup32(b->m_data);
+ b->data = (uint8_t*)realloc(b->data, b->m_data);
+ }
+ if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+ b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+ if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
+ return 4 + block_len;
+}
+
+
+#ifdef USE_VERBOSE_ZLIB_WRAPPERS
+// Versions of gzopen, gzread and gzclose that print up error messages
+
+gzFile bamlite_gzopen(const char *fn, const char *mode) {
+ gzFile fp;
+ if (strcmp(fn, "-") == 0) {
+ fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode);
+ if (!fp) {
+ fprintf(stderr, "Couldn't open %s : %s",
+ (strstr(mode, "r"))? "stdin" : "stdout",
+ strerror(errno));
+ }
+ return fp;
+ }
+ if ((fp = gzopen(fn, mode)) == 0) {
+ fprintf(stderr, "Couldn't open %s : %s\n", fn,
+ errno ? strerror(errno) : "Out of memory");
+ }
+ return fp;
+}
+
+int bamlite_gzread(gzFile file, void *ptr, unsigned int len) {
+ int ret = gzread(file, ptr, len);
+
+ if (ret < 0) {
+ int errnum = 0;
+ const char *msg = gzerror(file, &errnum);
+ fprintf(stderr, "gzread error: %s\n",
+ Z_ERRNO == errnum ? strerror(errno) : msg);
+ }
+ return ret;
+}
+
+int bamlite_gzclose(gzFile file) {
+ int ret = gzclose(file);
+ if (Z_OK != ret) {
+ fprintf(stderr, "gzclose error: %s\n",
+ Z_ERRNO == ret ? strerror(errno) : zError(ret));
+ }
+
+ return ret;
+}
+#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
diff --git a/ext/src/bwa/bamlite.h b/ext/src/bwa/bamlite.h
new file mode 100644
index 0000000..efab7ac
--- /dev/null
+++ b/ext/src/bwa/bamlite.h
@@ -0,0 +1,114 @@
+#ifndef BAMLITE_H_
+#define BAMLITE_H_
+
+#include <stdint.h>
+#include <zlib.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#define USE_VERBOSE_ZLIB_WRAPPERS
+
+typedef gzFile bamFile;
+#ifdef USE_VERBOSE_ZLIB_WRAPPERS
+/* These print error messages on failure */
+# define bam_open(fn, mode) bamlite_gzopen(fn, mode)
+# define bam_dopen(fd, mode) gzdopen(fd, mode)
+# define bam_close(fp) bamlite_gzclose(fp)
+# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size)
+#else
+# define bam_open(fn, mode) gzopen(fn, mode)
+# define bam_dopen(fd, mode) gzdopen(fd, mode)
+# define bam_close(fp) gzclose(fp)
+# define bam_read(fp, buf, size) gzread(fp, buf, size)
+#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
+
+typedef struct {
+ int32_t n_targets;
+ char **target_name;
+ uint32_t *target_len;
+ size_t l_text, n_text;
+ char *text;
+} bam_header_t;
+
+#define BAM_FPAIRED 1
+#define BAM_FPROPER_PAIR 2
+#define BAM_FUNMAP 4
+#define BAM_FMUNMAP 8
+#define BAM_FREVERSE 16
+#define BAM_FMREVERSE 32
+#define BAM_FREAD1 64
+#define BAM_FREAD2 128
+#define BAM_FSECONDARY 256
+#define BAM_FQCFAIL 512
+#define BAM_FDUP 1024
+
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1)
+
+#define BAM_CMATCH 0
+#define BAM_CINS 1
+#define BAM_CDEL 2
+#define BAM_CREF_SKIP 3
+#define BAM_CSOFT_CLIP 4
+#define BAM_CHARD_CLIP 5
+#define BAM_CPAD 6
+
+typedef struct {
+ int32_t tid;
+ int32_t pos;
+ uint32_t bin:16, qual:8, l_qname:8;
+ uint32_t flag:16, n_cigar:16;
+ int32_t l_qseq;
+ int32_t mtid;
+ int32_t mpos;
+ int32_t isize;
+} bam1_core_t;
+
+typedef struct {
+ bam1_core_t core;
+ int l_aux, data_len, m_data;
+ uint8_t *data;
+} bam1_t;
+
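+/* kroundup32() rounds a 32-bit value up to the next power of two;
+   bam_read1() in bamlite.c uses it to grow b->data geometrically. */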
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
+#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
+#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
+#define bam1_qname(b) ((char*)((b)->data))
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
+#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
+
+#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
+#define bam_destroy1(b) do { \
+ if (b) { free((b)->data); free(b); } \
+ } while (0)
+
+extern int bam_is_be;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bam_header_t *bam_header_init(void);
+ void bam_header_destroy(bam_header_t *header);
+ bam_header_t *bam_header_read(bamFile fp);
+ int bam_read1(bamFile fp, bam1_t *b);
+
+#ifdef USE_VERBOSE_ZLIB_WRAPPERS
+ gzFile bamlite_gzopen(const char *fn, const char *mode);
+ int bamlite_gzread(gzFile file, void *ptr, unsigned int len);
+ int bamlite_gzclose(gzFile file);
+#endif /* USE_VERBOSE_ZLIB_WRAPPERS */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/src/bwa/bntseq.c b/ext/src/bwa/bntseq.c
new file mode 100644
index 0000000..8d43083
--- /dev/null
+++ b/ext/src/bwa/bntseq.c
@@ -0,0 +1,446 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include "bwa/bntseq.h"
+#include "bwa/utils.h"
+
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, int)
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+unsigned char nst_nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+void bns_dump(const bntseq_t *bns, const char *prefix)
+{
+ char str[1024];
+ FILE *fp;
+ int i;
+ { // dump .ann
+ strcpy(str, prefix); strcat(str, ".ann");
+ fp = xopen(str, "w");
+ err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed);
+ for (i = 0; i != bns->n_seqs; ++i) {
+ bntann1_t *p = bns->anns + i;
+ err_fprintf(fp, "%d %s", p->gi, p->name);
+ if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno);
+ else err_fprintf(fp, "\n");
+ err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs);
+ }
+ err_fflush(fp);
+ err_fclose(fp);
+ }
+ { // dump .amb
+ strcpy(str, prefix); strcat(str, ".amb");
+ fp = xopen(str, "w");
+ err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes);
+ for (i = 0; i != bns->n_holes; ++i) {
+ bntamb1_t *p = bns->ambs + i;
+ err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb);
+ }
+ err_fflush(fp);
+ err_fclose(fp);
+ }
+}
+
+bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
+{
+ char str[8192];
+ FILE *fp;
+ const char *fname;
+ bntseq_t *bns;
+ long long xx;
+ int i;
+ int scanres;
+ bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
+ { // read .ann
+ fp = xopen(fname = ann_filename, "r");
+ scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed);
+ if (scanres != 3) goto badread;
+ bns->l_pac = xx;
+ bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t));
+ for (i = 0; i < bns->n_seqs; ++i) {
+ bntann1_t *p = bns->anns + i;
+ char *q = str;
+ int c;
+ // read gi and sequence name
+ scanres = fscanf(fp, "%u%s", &p->gi, str);
+ if (scanres != 2) goto badread;
+ p->name = strdup(str);
+ // read fasta comments
+ while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
+ while (c != '\n' && c != EOF) c = fgetc(fp);
+ if (c == EOF) {
+ scanres = EOF;
+ goto badread;
+ }
+ *q = 0;
+ if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
+ else p->anno = strdup("");
+ // read the rest
+ scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
+ if (scanres != 3) goto badread;
+ p->offset = xx;
+ }
+ err_fclose(fp);
+ }
+ { // read .amb
+ int64_t l_pac;
+ int32_t n_seqs;
+ fp = xopen(fname = amb_filename, "r");
+ scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes);
+ if (scanres != 3) goto badread;
+ l_pac = xx;
+ xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files.");
+ bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0;
+ for (i = 0; i < bns->n_holes; ++i) {
+ bntamb1_t *p = bns->ambs + i;
+ scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str);
+ if (scanres != 3) goto badread;
+ p->offset = xx;
+ p->amb = str[0];
+ }
+ err_fclose(fp);
+ }
+ { // open .pac
+ bns->fp_pac = xopen(pac_filename, "rb");
+ }
+ return bns;
+
+ badread:
+ if (EOF == scanres) {
+ err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file");
+ }
+ err_fatal(__func__, "Parse error reading %s\n", fname);
+}
+
+bntseq_t *bns_restore(const char *prefix)
+{
+ char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024];
+ FILE *fp;
+ bntseq_t *bns;
+ strcat(strcpy(ann_filename, prefix), ".ann");
+ strcat(strcpy(amb_filename, prefix), ".amb");
+ strcat(strcpy(pac_filename, prefix), ".pac");
+ bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
+ if (bns == 0) return 0;
+ if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
+ char str[1024];
+ khash_t(str) *h;
+ int c, i, absent;
+ khint_t k;
+ h = kh_init(str);
+ for (i = 0; i < bns->n_seqs; ++i) {
+ k = kh_put(str, h, bns->anns[i].name, &absent);
+ kh_val(h, k) = i;
+ }
+ i = 0;
+ while ((c = fgetc(fp)) != EOF) {
+ if (c == '\t' || c == '\n' || c == '\r') {
+ str[i] = 0;
+ if (str[0] != '@') {
+ k = kh_get(str, h, str);
+ if (k != kh_end(h))
+ bns->anns[kh_val(h, k)].is_alt = 1;
+ }
+ while (c != '\n' && c != EOF) c = fgetc(fp);
+ i = 0;
+ } else if (i < (int)sizeof(str) - 1) str[i++] = c; // bounds check: never write past the end of str
+ }
+ kh_destroy(str, h);
+ fclose(fp);
+ }
+ return bns;
+}
+
+void bns_destroy(bntseq_t *bns)
+{
+ if (bns == 0) return;
+ else {
+ int i;
+ if (bns->fp_pac) err_fclose(bns->fp_pac);
+ free(bns->ambs);
+ for (i = 0; i < bns->n_seqs; ++i) {
+ free(bns->anns[i].name);
+ free(bns->anns[i].anno);
+ }
+ free(bns->anns);
+ free(bns);
+ }
+}
+
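+// The packed reference (.pac) stores four bases per byte, 2 bits each
+// (A=0, C=1, G=2, T=3); _set_pac()/_get_pac() set/get the 2-bit code of base l.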
+#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
+#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
+
+static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q)
+{
+ bntann1_t *p;
+ int i, lasts;
+ if (bns->n_seqs == *m_seqs) {
+ *m_seqs <<= 1;
+ bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
+ }
+ p = bns->anns + bns->n_seqs;
+ p->name = strdup((char*)seq->name.s);
+ p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
+ p->gi = 0; p->len = seq->seq.l;
+ p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
+ p->n_ambs = 0;
+ for (i = lasts = 0; i < seq->seq.l; ++i) {
+ int c = nst_nt4_table[(int)seq->seq.s[i]];
+ if (c >= 4) { // N
+ if (lasts == seq->seq.s[i]) { // contiguous N
+ ++(*q)->len;
+ } else {
+ if (bns->n_holes == *m_holes) {
+ (*m_holes) <<= 1;
+ bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
+ }
+ *q = bns->ambs + bns->n_holes;
+ (*q)->len = 1;
+ (*q)->offset = p->offset + i;
+ (*q)->amb = seq->seq.s[i];
+ ++p->n_ambs;
+ ++bns->n_holes;
+ }
+ }
+ lasts = seq->seq.s[i];
+ { // fill buffer
+ if (c >= 4) c = lrand48()&3;
+ if (bns->l_pac == *m_pac) { // double the pac size
+ *m_pac <<= 1;
+ pac = realloc(pac, *m_pac/4);
+ memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
+ }
+ _set_pac(pac, bns->l_pac, c);
+ ++bns->l_pac;
+ }
+ }
+ ++bns->n_seqs;
+ return pac;
+}
+
+int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only)
+{
+ extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c
+ kseq_t *seq;
+ char name[1024];
+ bntseq_t *bns;
+ uint8_t *pac = 0;
+ int32_t m_seqs, m_holes;
+ int64_t ret = -1, m_pac, l;
+ bntamb1_t *q;
+ FILE *fp;
+
+ // initialization
+ seq = kseq_init(fp_fa);
+ bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
+ bns->seed = 11; // fixed seed for random generator
+ srand48(bns->seed);
+ m_seqs = m_holes = 8; m_pac = 0x10000;
+ bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
+ bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
+ pac = calloc(m_pac/4, 1);
+ q = bns->ambs;
+ strcpy(name, prefix); strcat(name, ".pac");
+ fp = xopen(name, "wb");
+ // read sequences
+ while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
+ if (!for_only) { // add the reverse complemented sequence
+ m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
+ pac = realloc(pac, m_pac/4);
+ memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
+ for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
+ _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
+ }
+ ret = bns->l_pac;
+ { // finalize .pac file
+ ubyte_t ct;
+ err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp);
+ // the following code makes the .pac file size always (l_pac/4+1+1) bytes
+ if (bns->l_pac % 4 == 0) {
+ ct = 0;
+ err_fwrite(&ct, 1, 1, fp);
+ }
+ ct = bns->l_pac % 4;
+ err_fwrite(&ct, 1, 1, fp);
+ // close .pac file
+ err_fflush(fp);
+ err_fclose(fp);
+ }
+ bns_dump(bns, prefix);
+ bns_destroy(bns);
+ kseq_destroy(seq);
+ free(pac);
+ return ret;
+}
+
+int bwa_fa2pac(int argc, char *argv[])
+{
+ int c, for_only = 0;
+ gzFile fp;
+ while ((c = getopt(argc, argv, "f")) >= 0) {
+ switch (c) {
+ case 'f': for_only = 1; break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "Usage: bwa fa2pac [-f] <in.fasta> [<out.prefix>]\n");
+ return 1;
+ }
+ fp = xzopen(argv[optind], "r");
+ bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only);
+ err_gzclose(fp);
+ return 0;
+}
+
+int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
+{
+ int left, mid, right;
+ if (pos_f >= bns->l_pac) return -1;
+ left = 0; mid = 0; right = bns->n_seqs;
+ while (left < right) { // binary search
+ mid = (left + right) >> 1;
+ if (pos_f >= bns->anns[mid].offset) {
+ if (mid == bns->n_seqs - 1) break;
+ if (pos_f < bns->anns[mid+1].offset) break; // bracketed
+ left = mid + 1;
+ } else right = mid;
+ }
+ return mid;
+}
+
+int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
+{
+ int is_rev, rid_b, rid_e;
+ if (rb < bns->l_pac && re > bns->l_pac) return -2;
+ assert(rb <= re);
+ rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
+ rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
+ return rid_b == rid_e? rid_b : -1;
+}
+
+int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
+{
+ int left, mid, right, nn;
+ if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
+ left = 0; right = bns->n_holes; nn = 0;
+ while (left < right) {
+ mid = (left + right) >> 1;
+ if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1;
+ else if (pos_f + len <= bns->ambs[mid].offset) right = mid;
+ else { // overlap
+ if (pos_f >= bns->ambs[mid].offset) {
+ nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
+ bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len;
+ } else {
+ nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len?
+ bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f);
+ }
+ break;
+ }
+ }
+ return nn;
+}
+
+uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
+{
+ uint8_t *seq = 0;
+ if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap
+ if (end > l_pac<<1) end = l_pac<<1;
+ if (beg < 0) beg = 0;
+ if (beg >= l_pac || end <= l_pac) {
+ int64_t k, l = 0;
+ *len = end - beg;
+ seq = malloc(end - beg);
+ if (beg >= l_pac) { // reverse strand
+ int64_t beg_f = (l_pac<<1) - 1 - end;
+ int64_t end_f = (l_pac<<1) - 1 - beg;
+ for (k = end_f; k > beg_f; --k)
+ seq[l++] = 3 - _get_pac(pac, k);
+ } else { // forward strand
+ for (k = beg; k < end; ++k)
+ seq[l++] = _get_pac(pac, k);
+ }
+ } else *len = 0; // if bridging the forward-reverse boundary, return nothing
+ return seq;
+}
+
+uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid)
+{
+ int64_t far_beg, far_end, len;
+ int is_rev;
+ uint8_t *seq;
+
+ if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap
+ assert(*beg <= mid && mid < *end);
+ *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev));
+ far_beg = bns->anns[*rid].offset;
+ far_end = far_beg + bns->anns[*rid].len;
+ if (is_rev) { // flip to the reverse strand
+ int64_t tmp = far_beg;
+ far_beg = (bns->l_pac<<1) - far_end;
+ far_end = (bns->l_pac<<1) - tmp;
+ }
+ *beg = *beg > far_beg? *beg : far_beg;
+ *end = *end < far_end? *end : far_end;
+ seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len);
+ if (seq == 0 || *end - *beg != len) {
+ fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n",
+ __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end);
+ }
+ assert(seq && *end - *beg == len); // assertion failure should never happen
+ return seq;
+}
diff --git a/ext/src/bwa/bwa.1 b/ext/src/bwa/bwa.1
new file mode 100644
index 0000000..994f96a
--- /dev/null
+++ b/ext/src/bwa/bwa.1
@@ -0,0 +1,825 @@
+.TH bwa 1 "23 December 2014" "bwa-0.7.12-r1034" "Bioinformatics tools"
+.SH NAME
+.PP
+bwa - Burrows-Wheeler Alignment Tool
+.SH SYNOPSIS
+.PP
+bwa index ref.fa
+.PP
+bwa mem ref.fa reads.fq > aln-se.sam
+.PP
+bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
+.PP
+bwa aln ref.fa short_read.fq > aln_sa.sai
+.PP
+bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
+.PP
+bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
+.PP
+bwa bwasw ref.fa long_read.fq > aln.sam
+
+.SH DESCRIPTION
+.PP
+BWA is a software package for mapping low-divergent sequences against a large
+reference genome, such as the human genome. It consists of three algorithms:
+BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
+sequence reads up to 100bp, while the other two are for longer sequences ranging from
+70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
+support and split alignment, but BWA-MEM, which is the latest, is generally
+recommended for high-quality queries as it is faster and more accurate.
+BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
+reads.
+
+For all the algorithms, BWA first needs to construct the FM-index for
+the reference genome (the
+.B index
+command). Alignment algorithms are invoked with different sub-commands:
+.BR aln / samse / sampe
+for BWA-backtrack,
+.B bwasw
+for BWA-SW and
+.B mem
+for the BWA-MEM algorithm.
+
+.SH COMMANDS AND OPTIONS
+.TP
+.B index
+.B bwa index
+.RB [ -p
+.IR prefix ]
+.RB [ -a
+.IR algoType ]
+.I db.fa
+
+Index database sequences in the FASTA format.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -p \ STR
+Prefix of the output database [same as db filename]
+.TP
+.BI -a \ STR
+Algorithm for constructing BWT index. BWA implements three algorithms for BWT
+construction:
+.BR is ,
+.B bwtsw
+and
+.BR rb2 .
+The first algorithm is a little faster for small databases but requires large
+RAM and does not work for databases with a total length longer than 2GB. The
+second algorithm is adapted from the BWT-SW source code. It in theory works
+with databases containing trillions of bases. When this option is not specified, the
+appropriate algorithm will be chosen automatically.
+.RE
+
+.TP
+.B mem
+.B bwa mem
+.RB [ -aCHjMpP ]
+.RB [ -t
+.IR nThreads ]
+.RB [ -k
+.IR minSeedLen ]
+.RB [ -w
+.IR bandWidth ]
+.RB [ -d
+.IR zDropoff ]
+.RB [ -r
+.IR seedSplitRatio ]
+.RB [ -c
+.IR maxOcc ]
+.RB [ -D
+.IR chainShadow ]
+.RB [ -m
+.IR maxMateSW ]
+.RB [ -W
+.IR minSeedMatch ]
+.RB [ -A
+.IR matchScore ]
+.RB [ -B
+.IR mmPenalty ]
+.RB [ -O
+.IR gapOpenPen ]
+.RB [ -E
+.IR gapExtPen ]
+.RB [ -L
+.IR clipPen ]
+.RB [ -U
+.IR unpairPen ]
+.RB [ -R
+.IR RGline ]
+.RB [ -H
+.IR HDlines ]
+.RB [ -v
+.IR verboseLevel ]
+.I db.prefix
+.I reads.fq
+.RI [ mates.fq ]
+
+Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
+algorithm works by seeding alignments with maximal exact matches (MEMs) and
+then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
+
+If
+.I mates.fq
+file is absent and option
+.B -p
+is not set, this command regards the input reads as single-end. If
+.I mates.fq
+is present, this command assumes the
+.IR i -th
+read in
+.I reads.fq
+and the
+.IR i -th
+read in
+.I mates.fq
+constitute a read pair. If
+.B -p
+is used, the command assumes the
+.RI 2 i -th
+and the
+.RI (2 i +1)-th
+read in
+.I reads.fq
+constitute a read pair (such an input file is said to be interleaved). In this case,
+.I mates.fq
+is ignored. In the paired-end mode, the
+.B mem
+command will infer the read orientation and the insert size distribution from a
+batch of reads.
+
+The BWA-MEM algorithm performs local alignment. It may produce multiple primary
+alignments for different parts of a query sequence. This is a crucial feature
+for long sequences. However, some tools such as Picard's markDuplicates do
+not work with split alignments. One may consider using option
+.B -M
+to flag shorter split hits as secondary.
+
+.RS
+.TP 10
+.B ALGORITHM OPTIONS:
+.TP
+.BI -t \ INT
+Number of threads [1]
+.TP
+.BI -k \ INT
+Minimum seed length. Matches shorter than
+.I INT
+will be missed. The alignment speed is usually insensitive to this value unless
+it significantly deviates from 20. [19]
+.TP
+.BI -w \ INT
+Band width. Essentially, gaps longer than
+.I INT
+will not be found. Note that the maximum gap length is also affected by the
+scoring matrix and the hit length, not solely determined by this option. [100]
+.TP
+.BI -d \ INT
+Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between
+the best and the current extension score is above
+.RI | i - j |* A + INT ,
+where
+.I i
+and
+.I j
+are the current positions of the query and reference, respectively, and
+.I A
+is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it
+doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not
+only avoids unnecessary extension, but also reduces poor alignments inside a
+long good alignment. [100]
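+
+As an illustration only (this is not code from bwa), the condition above can be
+written as a small C predicate; the names are made up for the example:
+
+```c
+#include <stdlib.h>
+
+/* Z-dropoff test as described above: stop extending once the current score
+ * falls below the best score seen so far by more than |i - j| * a + zdrop,
+ * where i/j are the current query/reference positions, a is the matching
+ * score (-A) and zdrop is the -d value. */
+int zdropoff_stop(int best_score, int cur_score, int i, int j, int a, int zdrop)
+{
+    return best_score - cur_score > abs(i - j) * a + zdrop;
+}
+```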
+.TP
+.BI -r \ FLOAT
+Trigger re-seeding for a MEM longer than
+.IR minSeedLen * FLOAT .
+This is a key heuristic parameter for tuning the performance. Larger value
+yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
+.TP
+.BI -c \ INT
+Discard a MEM if it has more than
+.I INT
+occurrences in the genome. This is an insensitive parameter. [500]
+.TP
+.BI -D \ FLOAT
+Drop chains shorter than
+.I FLOAT
+fraction of the longest overlapping chain [0.5]
+.TP
+.BI -m \ INT
+Perform at most
+.I INT
+rounds of mate-SW [50]
+.TP
+.BI -W \ INT
+Drop a chain if the number of bases in seeds is smaller than
+.IR INT .
+This option is primarily used for longer contigs/reads. When positive, it also
+affects seed filtering. [0]
+.TP
+.B -P
+In the paired-end mode, perform SW to rescue missing hits only but do not try to find
+hits that fit a proper pair.
+
+.TP
+.B SCORING OPTIONS:
+.TP
+.BI -A \ INT
+Matching score. [1]
+.TP
+.BI -B \ INT
+Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
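+
+For orientation, the relation can be evaluated directly (an added illustration,
+not part of bwa); with the defaults A=1 and B=4 it gives roughly 0.3%:
+
+```c
+#include <math.h>
+#include <stdio.h>
+
+/* Error rate implied by match score A and mismatch penalty B, using the
+ * approximation quoted above: 0.75 * exp(-log(4) * B/A) = 0.75 * 4^(-B/A). */
+double implied_error_rate(double A, double B)
+{
+    return 0.75 * exp(-log(4.0) * B / A);
+}
+
+int main(void)
+{
+    printf("%.4f\n", implied_error_rate(1.0, 4.0)); /* prints 0.0029 */
+    return 0;
+}
+```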
+.TP
+.BI -O \ INT[,INT]
+Gap open penalty. If two numbers are specified, the first is the penalty of
+opening a deletion and the second for opening an insertion. [6]
+.TP
+.BI -E \ INT[,INT]
+Gap extension penalty. If two numbers are specified, the first is the penalty
+of extending a deletion and the second for extending an insertion. A gap of length
+k costs O + k*E (i.e.
+.B -O
+is for opening a zero-length gap). [1]
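+
+As a worked example of this convention (added for clarity): with the defaults
+-O 6 -E 1, a single 3bp deletion costs 6 + 3*1 = 9, whereas three separate 1bp
+deletions cost 3*(6+1) = 21, so one longer gap is strongly preferred over
+several short ones.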
+.TP
+.BI -L \ INT[,INT]
+Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
+score reaching the end of query. If this score is larger than the best SW score
+minus the clipping penalty, clipping will not be applied. Note that in this
+case, the SAM AS tag reports the best SW score; clipping penalty is not
+deduced. If two numbers are provided, the first is for 5'-end clipping and
+second for 3'-end clipping. [5]
+.TP
+.BI -U \ INT
+Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
+.RI scoreRead1+scoreRead2- INT
+and scores a paired one as scoreRead1+scoreRead2-insertPenalty. It compares these
+two scores to determine whether we should force pairing. A larger value leads to
+more aggressive read pairing. [17]
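+
+A minimal C sketch of that comparison (an illustration of the rule above, not
+bwa's implementation; insertPenalty stands for whatever penalty the insert-size
+model assigns to the observed pair):
+
+```c
+/* Pairing decision as described for -U: keep the pair if
+ * score1 + score2 - insert_penalty >= score1 + score2 - unpair_penalty,
+ * which reduces to insert_penalty <= unpair_penalty (default -U 17). */
+int prefer_paired(int score1, int score2, int insert_penalty, int unpair_penalty)
+{
+    int paired_score   = score1 + score2 - insert_penalty;
+    int unpaired_score = score1 + score2 - unpair_penalty;
+    return paired_score >= unpaired_score;
+}
+```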
+
+.TP
+.B INPUT/OUTPUT OPTIONS:
+.TP
+.B -p
+Smart pairing. If two adjacent reads have the same name, they are considered
+to form a read pair. This way, paired-end and single-end reads can be mixed
+in a single FASTA/Q stream.
+.TP
+.BI -R \ STR
+Complete read group header line. '\\t' can be used in
+.I STR
+and will be converted to a TAB in the output SAM. The read group ID will be
+attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
+[null]
+.TP
+.BI -H \ ARG
+If ARG starts with @, it is interpreted as a string and gets inserted into the
+output SAM header; otherwise, ARG is interpreted as a file, and all of its lines
+starting with @ are inserted into the SAM header. [null]
+.TP
+.BI -T \ INT
+Don't output alignments with a score lower than
+.IR INT .
+This option affects output and occasionally SAM flag 2. [30]
+.TP
+.BI -j
+Treat ALT contigs as part of the primary assembly (i.e. ignore the
+.I db.prefix.alt
+file).
+.TP
+.BI -h \ INT[,INT2]
+If a query has no more than
+.I INT
+hits with score higher than 80% of the best hit, output them all in the XA tag.
+If
+.I INT2
+is specified, BWA-MEM outputs up to
+.I INT2
+hits if the list contains a hit to an ALT contig. [5,200]
+.TP
+.B -a
+Output all found alignments for single-end or unpaired paired-end reads. These
+alignments will be flagged as secondary alignments.
+.TP
+.B -C
+Append the FASTA/Q comment to the SAM output. This option can be used to
+transfer read meta information (e.g. barcode) to the SAM output. Note that the
+FASTA/Q comment (the string after a space in the header line) must conform to the SAM
+spec (e.g. BC:Z:CGTAC). Malformatted comments lead to incorrect SAM output.
+.TP
+.B -Y
+Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM
+uses soft clipping for the primary alignment and hard clipping for
+supplementary alignments.
+.TP
+.B -M
+Mark shorter split hits as secondary (for Picard compatibility).
+.TP
+.BI -v \ INT
+Control the verbosity level of the output. This option has not been fully
+supported throughout BWA. Ideally, a value of 0 disables all output to
+stderr; 1 outputs errors only; 2 outputs warnings and errors; 3 outputs
+all normal messages; and 4 or higher enables debugging. When this option takes value
+4, the output is not SAM. [3]
+.TP
+.BI -I \ FLOAT[,FLOAT[,INT[,INT]]]
+Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma
+from the mean if absent) and min (4 sigma if absent) of the insert size
+distribution. Only applicable to the FR orientation. By default, BWA-MEM infers
+these numbers and the pair orientations given enough reads. [inferred]
+
+.RE
+
+.TP
+.B aln
+bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
+nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc]
+[-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta> <in.query.fq> >
+<out.sai>
+
+Find the SA coordinates of the input reads. Maximum
+.I maxSeedDiff
+differences are allowed in the first
+.I seedLen
+subsequence and maximum
+.I maxDiff
+differences are allowed in the whole sequence.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -n \ NUM
+Maximum edit distance if the value is INT, or the fraction of missing
+alignments given 2% uniform base error rate if FLOAT. In the latter
+case, the maximum edit distance is automatically chosen for different
+read lengths. [0.04]
+.TP
+.BI -o \ INT
+Maximum number of gap opens [1]
+.TP
+.BI -e \ INT
+Maximum number of gap extensions, -1 for k-difference mode (disallowing
+long gaps) [-1]
+.TP
+.BI -d \ INT
+Disallow a long deletion within INT bp towards the 3'-end [16]
+.TP
+.BI -i \ INT
+Disallow an indel within INT bp towards the ends [5]
+.TP
+.BI -l \ INT
+Take the first INT subsequence as seed. If INT is larger than the query
+sequence, seeding will be disabled. For long reads, this option is
+typically set in the range of 25 to 35 for `-k 2'. [inf]
+.TP
+.BI -k \ INT
+Maximum edit distance in the seed [2]
+.TP
+.BI -t \ INT
+Number of threads (multi-threading mode) [1]
+.TP
+.BI -M \ INT
+Mismatch penalty. BWA will not search for suboptimal hits with a score
+lower than (bestScore-misMsc). [3]
+.TP
+.BI -O \ INT
+Gap open penalty [11]
+.TP
+.BI -E \ INT
+Gap extension penalty [4]
+.TP
+.BI -R \ INT
+Proceed with suboptimal alignments if there are no more than INT equally
+best hits. This option only affects paired-end mapping. Increasing this
+threshold helps to improve the pairing accuracy at the cost of speed,
+especially for short reads (~32bp).
+.TP
+.B -c
+Reverse query but not complement it, which is required for alignment in
+the color space. (Disabled since 0.6.x)
+.TP
+.B -N
+Disable iterative search. All hits with no more than
+.I maxDiff
+differences will be found. This mode is much slower than the default.
+.TP
+.BI -q \ INT
+Parameter for read trimming. BWA trims a read down to
+argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT where l is the original
+read length. [0]
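+
+Spelled out in C (a sketch of the documented rule only; bwa's own trimming code
+may additionally enforce a minimum retained read length):
+
+```c
+#include <string.h>
+
+/* Quality trimming as described for -q.  qual is a Phred+33 quality string and
+ * thr is the -q value; returns the length x to keep, i.e. the x maximising
+ * sum_{i=x+1..l}(thr - q_i), applied only when q_l < thr. */
+int trim_length(const char *qual, int thr)
+{
+    int l = (int)strlen(qual), x = l, best = 0, sum = 0, i;
+    if (thr < 1 || l == 0 || qual[l-1] - 33 >= thr) return l;
+    for (i = l - 1; i >= 0; --i) {          /* walk in from the 3' end */
+        sum += thr - (qual[i] - 33);
+        if (sum > best) best = sum, x = i;  /* best cut keeps bases 0..x-1 */
+    }
+    return x;
+}
+```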
+.TP
+.B -I
+The input is in the Illumina 1.3+ read format (quality equals ASCII-64).
+.TP
+.BI -B \ INT
+Length of barcode starting from the 5'-end. When
+.I INT
+is positive, the barcode of each read will be trimmed before mapping and will
+be written at the
+.B BC
+SAM tag. For paired-end reads, the barcodes from both ends are concatenated. [0]
+.TP
+.B -b
+Specify that the input read sequence file is in the BAM format. For paired-end
+data, two ends in a pair must be grouped together and options
+.B -1
+or
+.B -2
+are usually applied to specify which end should be mapped. Typical
+command lines for mapping paired-end data in the BAM format are:
+
+ bwa aln ref.fa -b1 reads.bam > 1.sai
+ bwa aln ref.fa -b2 reads.bam > 2.sai
+ bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam
+.TP
+.B -0
+When
+.B -b
+is specified, only use single-end reads in mapping.
+.TP
+.B -1
+When
+.B -b
+is specified, only use the first read in a read pair in mapping (skip
+single-end reads and the second reads).
+.TP
+.B -2
+When
+.B -b
+is specified, only use the second read in a read pair in mapping.
+.RE
+
+.TP
+.B samse
+bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
+
+Generate alignments in the SAM format given single-end reads. Repetitive
+hits will be randomly chosen.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -n \ INT
+Maximum number of alignments to output in the XA tag for reads paired
+properly. If a read has more than INT hits, the XA tag will not be
+written. [3]
+.TP
+.BI -r \ STR
+Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
+.RE
+
+.TP
+.B sampe
+bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis]
+[-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam>
+
+Generate alignments in the SAM format given paired-end reads. Repetitive
+read pairs will be placed randomly.
+
+.B OPTIONS:
+.RS
+.TP 8
+.BI -a \ INT
+Maximum insert size for a read pair to be considered being mapped
+properly. Since 0.4.5, this option is only used when there are not
+enough good alignment to infer the distribution of insert sizes. [500]
+.TP
+.BI -o \ INT
+Maximum occurrences of a read for pairing. A read with more occurrences
+will be treated as a single-end read. Reducing this parameter speeds up
+pairing. [100000]
+.TP
+.B -P
+Load the entire FM-index into memory to reduce disk operations
+(base-space reads only). With this option, at least 1.25N bytes of
+memory are required, where N is the length of the genome.
+.TP
+.BI -n \ INT
+Maximum number of alignments to output in the XA tag for reads paired
+properly. If a read has more than INT hits, the XA tag will not be
+written. [3]
+.TP
+.BI -N \ INT
+Maximum number of alignments to output in the XA tag for discordant
+read pairs (excluding singletons). If a read has more than INT hits, the
+XA tag will not be written. [10]
+.TP
+.BI -r \ STR
+Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null]
+.RE
+
+.TP
+.B bwasw
+bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t
+nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N
+nHspRev] [-c thresCoef] <in.db.fasta> <in.fq> [mate.fq]
+
+Align query sequences in the
+.I in.fq
+file. When
+.I mate.fq
+is present, perform paired-end alignment. The paired-end mode only works
+for reads from Illumina short-insert libraries. In the paired-end mode, BWA-SW
+may still output split alignments but they are all marked as not properly
+paired; the mate positions will not be written if the mate has multiple
+local hits.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -a \ INT
+Score of a match [1]
+.TP
+.BI -b \ INT
+Mismatch penalty [3]
+.TP
+.BI -q \ INT
+Gap open penalty [5]
+.TP
+.BI -r \ INT
+Gap extension penalty. The penalty for a contiguous gap of size k is
+q+k*r. [2]
+.TP
+.BI -t \ INT
+Number of threads in the multi-threading mode [1]
+.TP
+.BI -w \ INT
+Band width in the banded alignment [33]
+.TP
+.BI -T \ INT
+Minimum score threshold divided by a [37]
+.TP
+.BI -c \ FLOAT
+Coefficient for threshold adjustment according to query length. Given an
+l-long query, the threshold for a hit to be retained is
+a*max{T,c*log(l)}. [5.5]
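+
+In code form (an added illustration, not from bwa): with the defaults a=1, T=37
+and c=5.5, a 500bp query keeps the fixed threshold 37, while a 10kb query raises
+it to about 50.7.
+
+```c
+#include <math.h>
+
+/* Retention threshold described above: a * max(T, c * log(l)),
+ * with a = match score (-a), T = -T, c = -c and l the query length. */
+double bwasw_threshold(int a, int T, double c, int l)
+{
+    double t = c * log((double)l);
+    return a * (t > T ? t : (double)T);
+}
+```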
+.TP
+.BI -z \ INT
+Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]
+.TP
+.BI -s \ INT
+Maximum SA interval size for initiating a seed. Higher -s increases
+accuracy at the cost of speed. [3]
+.TP
+.BI -N \ INT
+Minimum number of seeds supporting the resultant alignment to skip
+reverse alignment. [5]
+.RE
+
+.SH SAM ALIGNMENT FORMAT
+.PP
+The output of the
+.B `aln'
+command is binary and designed for BWA use only. BWA outputs the final
+alignment in the SAM (Sequence Alignment/Map) format. Each line consists
+of:
+
+.TS
+center box;
+cb | cb | cb
+n | l | l .
+Col Field Description
+_
+1 QNAME Query (pair) NAME
+2 FLAG bitwise FLAG
+3 RNAME Reference sequence NAME
+4 POS 1-based leftmost POSition/coordinate of clipped sequence
+5 MAPQ MAPping Quality (Phred-scaled)
+6 CIGAR extended CIGAR string
+7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME)
+8 MPOS 1-based Mate POSition
+9 ISIZE Inferred insert SIZE
+10 SEQ query SEQuence on the same strand as the reference
+11 QUAL query QUALity (ASCII-33 gives the Phred base quality)
+12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE
+.TE
+
+.PP
+Each bit in the FLAG field is defined as:
+
+.TS
+center box;
+cb | cb | cb
+c | l | l .
+Chr Flag Description
+_
+p 0x0001 the read is paired in sequencing
+P 0x0002 the read is mapped in a proper pair
+u 0x0004 the query sequence itself is unmapped
+U 0x0008 the mate is unmapped
+r 0x0010 strand of the query (1 for reverse)
+R 0x0020 strand of the mate
+1 0x0040 the read is the first read in a pair
+2 0x0080 the read is the second read in a pair
+s 0x0100 the alignment is not primary
+f 0x0200 QC failure
+d 0x0400 optical or PCR duplicate
+S 0x0800 supplementary alignment
+.TE
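+
+These are bit masks, so individual flags are tested with a bitwise AND (the
+bamlite.h header added elsewhere in this import defines matching BAM_F*
+constants). A small stand-alone illustration:
+
+```c
+#include <stdio.h>
+
+/* Values mirror the FLAG table above. */
+#define FPAIRED       0x0001
+#define FPROPER_PAIR  0x0002
+#define FUNMAP        0x0004
+#define FREAD1        0x0040
+
+int main(void)
+{
+    int flag = 99; /* 0x63: paired, proper pair, mate on reverse strand, first in pair */
+    if ((flag & FPAIRED) && (flag & FPROPER_PAIR) && !(flag & FUNMAP))
+        printf("mapped in a proper pair, %s read of the pair\n",
+               (flag & FREAD1) ? "first" : "second");
+    return 0;
+}
+```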
+
+.PP
+Please check <http://samtools.sourceforge.net> for the format
+specification and the tools for post-processing the alignment.
+
+BWA generates the following optional fields. Tags starting with `X' are
+specific to BWA.
+
+.TS
+center box;
+cb | cb
+cB | l .
+Tag Meaning
+_
+NM Edit distance
+MD Mismatching positions/bases
+AS Alignment score
+BC Barcode sequence
+SA Supplementary alignments
+_
+X0 Number of best hits
+X1 Number of suboptimal hits found by BWA
+XN Number of ambiguous bases in the reference
+XM Number of mismatches in the alignment
+XO Number of gap opens
+XG Number of gap extensions
+XT Type: Unique/Repeat/N/Mate-sw
+XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/
+_
+XS Suboptimal alignment score
+XF Support from forward/reverse alignment
+XE Number of supporting seeds
+.TE
+
+.PP
+Note that XO and XG are generated by the BWT search, while the CIGAR string
+is generated by Smith-Waterman alignment. These two tags may be inconsistent with the
+CIGAR string. This is not a bug.
+
+.SH NOTES ON SHORT-READ ALIGNMENT
+.SS Alignment Accuracy
+.PP
+When seeding is disabled, BWA guarantees to find an alignment
+containing at most
+.I maxDiff
+differences including
+.I maxGapO
+gap opens which do not occur within
+.I nIndelEnd
+bp towards either end of the query. Longer gaps may be found if
+.I maxGapE
+is positive, but it is not guaranteed to find all hits. When seeding is
+enabled, BWA further requires that the first
+.I seedLen
+subsequence contains no more than
+.I maxSeedDiff
+differences.
+.PP
+When gapped alignment is disabled, BWA is expected to generate the same
+alignment as Eland version 1, the Illumina alignment program. However, as BWA
+changes `N' in the database sequence to random nucleotides, hits to these
+random sequences will also be counted. As a consequence, BWA may mark a
+unique hit as a repeat, if the random sequences happen to be identical
+to the sequences which should be unique in the database.
+.PP
+By default, if the best hit is not highly repetitive (controlled by -R), BWA
+also finds all hits containing one more mismatch; otherwise, BWA finds all
+equally best hits only. Base quality is NOT considered in evaluating
+hits. In the paired-end mode, BWA pairs all hits it found. It further
+performs Smith-Waterman alignment for unmapped reads to rescue reads with a
+high error rate, and for high-quality anomalous pairs to fix potential alignment
+errors.
+
+.SS Estimating Insert Size Distribution
+.PP
+BWA estimates the insert size distribution per 256*1024 read pairs. It
+first collects pairs of reads with both ends mapped with a single-end
+quality 20 or higher and then calculates median (Q2), lower and higher
+quartile (Q1 and Q3). It estimates the mean and the variance of the
+insert size distribution from pairs whose insert sizes are within
+interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair
+considered to be properly paired (SAM flag 0x2) is calculated by solving
+equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the
+standard error of the insert size distribution, L is the length of the
+genome, p0 is prior of anomalous pair and Phi() is the standard
+cumulative distribution function. For mapping Illumina short-insert
+reads to the human genome, x is about 6-7 sigma away from the
+mean. Quartiles, mean, variance and x will be printed to the standard
+error output.
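+
+The quartile-based filtering step is easy to restate in code. The following C
+sketch (an illustration, not bwa's implementation; the final Phi() equation for
+x is left out) estimates the mean and standard deviation exactly as described
+above:
+
+```c
+#include <math.h>
+#include <stdlib.h>
+
+static int cmp_int(const void *a, const void *b)
+{
+    return *(const int*)a - *(const int*)b;
+}
+
+/* Estimate insert-size mean/stddev: take Q1 and Q3 of the observed insert
+ * sizes and average only the values inside [Q1 - 2(Q3-Q1), Q3 + 2(Q3-Q1)].
+ * isize must hold n > 0 observations; it is sorted in place. */
+void infer_isize(int *isize, int n, double *mean, double *stddev)
+{
+    int q1, q3, lo, hi, i, k = 0;
+    double sum = 0., sum2 = 0., var;
+    qsort(isize, n, sizeof(int), cmp_int);
+    q1 = isize[n / 4]; q3 = isize[3 * n / 4];
+    lo = q1 - 2 * (q3 - q1); hi = q3 + 2 * (q3 - q1);
+    for (i = 0; i < n; ++i)
+        if (isize[i] >= lo && isize[i] <= hi)
+            sum += isize[i], sum2 += (double)isize[i] * isize[i], ++k;
+    *mean = sum / k;
+    var = sum2 / k - *mean * *mean;
+    *stddev = var > 0. ? sqrt(var) : 0.;
+}
+```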
+
+.SS Memory Requirement
+.PP
+With the bwtsw algorithm, 5GB of memory is required for indexing the complete
+human genome sequences. For short reads, the
+.B aln
+command uses ~3.2GB memory and the
+.B sampe
+command uses ~5.4GB.
+
+.SS Speed
+.PP
+Indexing the human genome sequences takes 3 hours with the bwtsw
+algorithm. Indexing smaller genomes with the IS algorithm is
+faster, but requires more memory.
+.PP
+The speed of alignment is largely determined by the error rate of the query
+sequences (r). Firstly, BWA runs much faster for near perfect hits than
+for hits with many differences, and it stops searching for a hit with
+l+2 differences if an l-difference hit is found. This means BWA will be
+very slow if r is high because in this case BWA has to visit hits with
+many differences and looking for these hits is expensive. Secondly, the
+underlying alignment algorithm makes the speed sensitive to [k log(N)/m],
+where k is the maximum allowed differences, N the size of the database and m
+the length of a query. In practice, we choose k w.r.t. r and therefore r
+is the leading factor. I would not recommend using BWA on data with
+r>0.02.
+.PP
+Pairing is slower for shorter reads. This is mainly because shorter
+reads have more spurious hits and converting SA coordinates to
+chromosomal coordinates is very costly.
+
+.SH CHANGES IN BWA-0.6
+.PP
+Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
+This feature makes it possible to integrate the forward and reverse complemented
+genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff,
+BWA uses more memory because it has to keep all positions and ranks in 64-bit
+integers, twice the size of the 32-bit integers used in the previous versions.
+
+The latest BWA-SW also works for paired-end reads longer than 100bp. In
+comparison to BWA-short, BWA-SW tends to be more accurate for highly unique
+reads and more robust to relatively long INDELs and structural variants.
+Nonetheless, BWA-short usually has higher power to distinguish the optimal hit
+from many suboptimal hits. The choice of the mapping algorithm may depend on
+the application.
+
+.SH SEE ALSO
+BWA website <http://bio-bwa.sourceforge.net>, Samtools website
+<http://samtools.sourceforge.net>
+
+.SH AUTHOR
+Heng Li at the Sanger Institute wrote the key source codes and
+integrated the following codes for BWT construction: bwtsw
+<http://i.cs.hku.hk/~ckwong3/bwtsw/>, implemented by Chi-Kwong Wong at
+the University of Hong Kong and IS
+<http://yuta.256.googlepages.com/sais> originally proposed by Nong Ge
+<http://www.cs.sysu.edu.cn/nong/> at the Sun Yat-Sen University and
+implemented by Yuta Mori.
+
+.SH LICENSE AND CITATION
+.PP
+The full BWA package is distributed under GPLv3 as it uses source codes
+from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
+libraries are distributed under the MIT license.
+.PP
+If you use the BWA-backtrack algorithm, please cite the following
+paper:
+.PP
+Li H. and Durbin R. (2009) Fast and accurate short read alignment with
+Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
+.PP
+If you use the BWA-SW algorithm, please cite:
+.PP
+Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
+Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
+.PP
+If you use BWA-MEM or the fastmap component of BWA, please cite:
+.PP
+Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with
+BWA-MEM. arXiv:1303.3997v1 [q-bio.GN].
+.PP
+It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed
+journal.
+
+.SH HISTORY
+BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
+and mimics its binary file formats; BWA-SW resembles BWT-SW in several
+ways. The initial idea about BWT-based alignment also came from the
+group who developed BWT-SW. At the same time, BWA is different enough
+from BWT-SW. The short-read alignment algorithm bears no similarity to
+the Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it
+introduces heuristics that can hardly be applied to the original
+algorithm. In all, BWA does not guarantee to find all local hits the way
+BWT-SW is designed to do, but it is much faster than BWT-SW on both
+short and long query sequences.
+
+I started to write the first piece of code on 24 May 2008 and got the
+initial stable version on 02 June 2008. During this period, I learned
+that Professor Tak-Wah Lam, the first author of the BWT-SW paper,
+was collaborating with the Beijing Genomics Institute on SOAP2, the successor
+to SOAP (Short Oligonucleotide Analysis Package). SOAP2 came out in
+November 2008. According to the SourceForge download page, the third
+BWT-based short read aligner, bowtie, was first released in August
+2008. At the time of writing this manual, at least three more BWT-based
+short-read aligners are being implemented.
+
+The BWA-SW algorithm is a new component of BWA. It was conceived in
+November 2008 and implemented ten months later.
+
+The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
+matches (SMEMs), which was first published with the fermi assembler paper
+in 2012. I first implemented the basic SMEM algorithm in the
+.B fastmap
+command for an experiment and then extended the basic algorithm and added the
+extension part in February 2013 to make BWA-MEM a fully featured mapper.
+
diff --git a/ext/src/bwa/bwa.c b/ext/src/bwa/bwa.c
new file mode 100644
index 0000000..1b543b2
--- /dev/null
+++ b/ext/src/bwa/bwa.c
@@ -0,0 +1,447 @@
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+#include <assert.h>
+#include "bwa/bntseq.h"
+#include "bwa/bwa.h"
+#include "ksw.h"
+#include "bwa/utils.h"
+#include "kstring.h"
+#include "kvec.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+int bwa_verbose = 3;
+char bwa_rg_id[256];
+char *bwa_pg;
+
+/************************
+ * Batch FASTA/Q reader *
+ ************************/
+
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+static inline void trim_readno(kstring_t *s)
+{
+ if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
+ s->l -= 2, s->s[s->l] = 0;
+}
+
+static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
+{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
+ s->name = strdup(ks->name.s);
+ s->comment = ks->comment.l? strdup(ks->comment.s) : 0;
+ s->seq = strdup(ks->seq.s);
+ s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
+ s->l_seq = strlen(s->seq);
+}
+
+bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
+{
+ kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
+ int size = 0, m, n;
+ bseq1_t *seqs;
+ m = n = 0; seqs = 0;
+ while (kseq_read(ks) >= 0) {
+ if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
+ fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
+ break;
+ }
+ if (n >= m) {
+ m = m? m<<1 : 256;
+ seqs = realloc(seqs, m * sizeof(bseq1_t));
+ }
+ trim_readno(&ks->name);
+ kseq2bseq1(ks, &seqs[n]);
+ seqs[n].id = n;
+ size += seqs[n++].l_seq;
+ if (ks2) {
+ trim_readno(&ks2->name);
+ kseq2bseq1(ks2, &seqs[n]);
+ seqs[n].id = n;
+ size += seqs[n++].l_seq;
+ }
+ if (size >= chunk_size && (n&1) == 0) break;
+ }
+ if (size == 0) { // test if the 2nd file is finished
+ if (ks2 && kseq_read(ks2) >= 0)
+ fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
+ }
+ *n_ = n;
+ return seqs;
+}
+
+void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
+{
+ int i, has_last;
+ kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
+ for (i = 1, has_last = 1; i < n; ++i) {
+ if (has_last) {
+ if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
+ kv_push(bseq1_t, a[1], seqs[i-1]);
+ kv_push(bseq1_t, a[1], seqs[i]);
+ has_last = 0;
+ } else kv_push(bseq1_t, a[0], seqs[i-1]);
+ } else has_last = 1;
+ }
+ if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
+ sep[0] = a[0].a, m[0] = a[0].n;
+ sep[1] = a[1].a, m[1] = a[1].n;
+}
+
+/*****************
+ * CIGAR related *
+ *****************/
+
+void bwa_fill_scmat(int a, int b, int8_t mat[25])
+{
+ int i, j, k;
+ for (i = k = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ mat[k++] = i == j? a : -b;
+ mat[k++] = -1; // ambiguous base
+ }
+ for (j = 0; j < 5; ++j) mat[k++] = -1;
+}
+
+// Generate CIGAR when the alignment end points are known
+uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
+{
+ uint32_t *cigar = 0;
+ uint8_t tmp, *rseq;
+ int i;
+ int64_t rlen;
+ kstring_t str;
+ const char *int2base;
+
+ if (n_cigar) *n_cigar = 0;
+ if (NM) *NM = -1;
+ if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
+ rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
+ if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
+ if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
+ for (i = 0; i < l_query>>1; ++i)
+ tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+ for (i = 0; i < rlen>>1; ++i)
+ tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
+ }
+ if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
+ // UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance.
+ if (n_cigar) {
+ cigar = malloc(4);
+ cigar[0] = l_query<<4 | 0;
+ *n_cigar = 1;
+ }
+ for (i = 0, *score = 0; i < l_query; ++i)
+ *score += mat[rseq[i]*5 + query[i]];
+ } else {
+ int w, max_gap, max_ins, max_del, min_w;
+ // set the band-width
+ max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.);
+ max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.);
+ max_gap = max_ins > max_del? max_ins : max_del;
+ max_gap = max_gap > 1? max_gap : 1;
+ w = (max_gap + abs(rlen - l_query) + 1) >> 1;
+ w = w < w_? w : w_;
+ min_w = abs(rlen - l_query) + 3;
+ w = w > min_w? w : min_w;
+ // NW alignment
+ if (bwa_verbose >= 4) {
+ printf("* Global bandwidth: %d\n", w);
+ printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+ printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
+ }
+ *score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar);
+ }
+ if (NM && n_cigar) {// compute NM and MD
+ int k, x, y, u, n_mm = 0, n_gap = 0;
+ str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR
+ int2base = rb < l_pac? "ACGTN" : "TGCAN";
+ for (k = 0, x = y = u = 0; k < *n_cigar; ++k) {
+ int op, len;
+ cigar = (uint32_t*)str.s;
+ op = cigar[k]&0xf, len = cigar[k]>>4;
+ if (op == 0) { // match
+ for (i = 0; i < len; ++i) {
+ if (query[x + i] != rseq[y + i]) {
+ kputw(u, &str);
+ kputc(int2base[rseq[y+i]], &str);
+ ++n_mm; u = 0;
+ } else ++u;
+ }
+ x += len; y += len;
+ } else if (op == 2) { // deletion
+ if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR
+ kputw(u, &str); kputc('^', &str);
+ for (i = 0; i < len; ++i)
+ kputc(int2base[rseq[y+i]], &str);
+ u = 0; n_gap += len;
+ }
+ y += len;
+ } else if (op == 1) x += len, n_gap += len; // insertion
+ }
+ kputw(u, &str); kputc(0, &str);
+ *NM = n_mm + n_gap;
+ cigar = (uint32_t*)str.s;
+ }
+ if (rb >= l_pac) // reverse back query
+ for (i = 0; i < l_query>>1; ++i)
+ tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+
+ret_gen_cigar:
+ free(rseq);
+ return cigar;
+}
+
+uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
+{
+ return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM);
+}
+
+/*********************
+ * Full index reader *
+ *********************/
+
+char *bwa_idx_infer_prefix(const char *hint)
+{
+ char *prefix;
+ int l_hint;
+ FILE *fp;
+ l_hint = strlen(hint);
+ prefix = malloc(l_hint + 3 + 4 + 1);
+ strcpy(prefix, hint);
+ strcpy(prefix + l_hint, ".64.bwt");
+ if ((fp = fopen(prefix, "rb")) != 0) {
+ fclose(fp);
+ prefix[l_hint + 3] = 0;
+ return prefix;
+ } else {
+ strcpy(prefix + l_hint, ".bwt");
+ if ((fp = fopen(prefix, "rb")) == 0) {
+ free(prefix);
+ return 0;
+ } else {
+ fclose(fp);
+ prefix[l_hint] = 0;
+ return prefix;
+ }
+ }
+}
+
+bwt_t *bwa_idx_load_bwt(const char *hint)
+{
+ char *tmp, *prefix;
+ bwt_t *bwt;
+ prefix = bwa_idx_infer_prefix(hint);
+ if (prefix == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
+ return 0;
+ }
+ tmp = calloc(strlen(prefix) + 5, 1);
+ strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
+ bwt = bwt_restore_bwt(tmp);
+ strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
+ bwt_restore_sa(tmp, bwt);
+ free(tmp); free(prefix);
+ return bwt;
+}
+
+bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
+{
+ bwaidx_t *idx;
+ char *prefix;
+ prefix = bwa_idx_infer_prefix(hint);
+ if (prefix == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
+ return 0;
+ }
+ idx = calloc(1, sizeof(bwaidx_t));
+ if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
+ if (which & BWA_IDX_BNS) {
+ int i, c;
+ idx->bns = bns_restore(prefix);
+ for (i = c = 0; i < idx->bns->n_seqs; ++i)
+ if (idx->bns->anns[i].is_alt) ++c;
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
+ if (which & BWA_IDX_PAC) {
+ idx->pac = calloc(idx->bns->l_pac/4+1, 1);
+ err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
+ err_fclose(idx->bns->fp_pac);
+ idx->bns->fp_pac = 0;
+ }
+ }
+ free(prefix);
+ return idx;
+}
+
+bwaidx_t *bwa_idx_load(const char *hint, int which)
+{
+ return bwa_idx_load_from_disk(hint, which);
+}
+
+void bwa_idx_destroy(bwaidx_t *idx)
+{
+ if (idx == 0) return;
+ if (idx->mem == 0) {
+ if (idx->bwt) bwt_destroy(idx->bwt);
+ if (idx->bns) bns_destroy(idx->bns);
+ if (idx->pac) free(idx->pac);
+ } else {
+ free(idx->bwt); free(idx->bns->anns); free(idx->bns);
+ if (!idx->is_shm) free(idx->mem);
+ }
+ free(idx);
+}
+
+int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
+{
+ int64_t k = 0, x;
+ int i;
+
+ // generate idx->bwt
+ x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
+ x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
+ x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
+
+ // generate idx->bns and idx->pac
+ x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
+ x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
+ x = idx->bns->n_seqs * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
+ for (i = 0; i < idx->bns->n_seqs; ++i) {
+ idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
+ idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
+ }
+ idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
+ assert(k == l_mem);
+
+ idx->l_mem = k; idx->mem = mem;
+ return 0;
+}
+
+int bwa_idx2mem(bwaidx_t *idx)
+{
+ int i;
+ int64_t k, x, tmp;
+ uint8_t *mem;
+
+ // copy idx->bwt
+ x = idx->bwt->bwt_size * 4;
+ mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
+ memmove(mem + sizeof(bwt_t), mem, x);
+ memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
+ x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
+ free(idx->bwt->sa);
+ free(idx->bwt); idx->bwt = 0;
+
+ // copy idx->bns
+ tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
+ for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
+ tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
+ mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
+ x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
+ x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
+ free(idx->bns->ambs);
+ x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
+ for (i = 0; i < idx->bns->n_seqs; ++i) {
+ x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
+ x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
+ free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
+ }
+ free(idx->bns->anns);
+
+ // copy idx->pac
+ x = idx->bns->l_pac/4+1;
+ mem = realloc(mem, k + x);
+ memcpy(mem + k, idx->pac, x); k += x;
+ free(idx->bns); idx->bns = 0;
+ free(idx->pac); idx->pac = 0;
+
+ return bwa_mem2idx(k, mem, idx);
+}
+
+/***********************
+ * SAM header routines *
+ ***********************/
+
+void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
+{
+ int i, n_SQ = 0;
+ extern char *bwa_pg;
+ if (hdr_line) {
+ const char *p = hdr_line;
+ while ((p = strstr(p, "@SQ\t")) != 0) {
+ if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
+ p += 4;
+ }
+ }
+ if (n_SQ == 0) {
+ for (i = 0; i < bns->n_seqs; ++i)
+ err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
+ } else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
+ fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
+ if (hdr_line) err_printf("%s\n", hdr_line);
+ if (bwa_pg) err_printf("%s\n", bwa_pg);
+}
+
+static char *bwa_escape(char *s)
+{
+ char *p, *q;
+ for (p = q = s; *p; ++p) {
+ if (*p == '\\') {
+ ++p;
+ if (*p == 't') *q++ = '\t';
+ else if (*p == 'n') *q++ = '\n';
+ else if (*p == 'r') *q++ = '\r';
+ else if (*p == '\\') *q++ = '\\';
+ } else *q++ = *p;
+ }
+ *q = '\0';
+ return s;
+}
+
+char *bwa_set_rg(const char *s)
+{
+ char *p, *q, *r, *rg_line = 0;
+ memset(bwa_rg_id, 0, 256);
+ if (strstr(s, "@RG") != s) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
+ goto err_set_rg;
+ }
+ rg_line = strdup(s);
+ bwa_escape(rg_line);
+ if ((p = strstr(rg_line, "\tID:")) == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
+ goto err_set_rg;
+ }
+ p += 4;
+ for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
+ if (q - p + 1 > 256) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
+ goto err_set_rg;
+ }
+ for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
+ *r++ = *q;
+ return rg_line;
+
+err_set_rg:
+ free(rg_line);
+ return 0;
+}
+
+char *bwa_insert_header(const char *s, char *hdr)
+{
+ int len = 0;
+ if (s == 0 || s[0] != '@') return hdr;
+ if (hdr) {
+ len = strlen(hdr);
+ hdr = realloc(hdr, len + strlen(s) + 2);
+ hdr[len++] = '\n';
+ strcpy(hdr + len, s);
+ } else hdr = strdup(s);
+ bwa_escape(hdr + len);
+ return hdr;
+}
diff --git a/ext/src/bwa/bwakit/README.md b/ext/src/bwa/bwakit/README.md
new file mode 100644
index 0000000..b7a67ea
--- /dev/null
+++ b/ext/src/bwa/bwakit/README.md
@@ -0,0 +1,115 @@
+## Introduction
+
+Bwakit is a self-consistent installation-free package of scripts and precompiled
+binaries, providing an end-to-end solution for read mapping. In addition to the
+basic mapping functionality implemented in bwa, bwakit is able to generate a
+proper human reference genome and to take advantage of ALT contigs, if present,
+to improve read mapping and to perform HLA typing for high-coverage human data.
+It can remap name- or coordinate-sorted BAM with read group and barcode
+information retained. Bwakit also *optionally* trims adapters (via
+[trimadap][ta]), marks duplicates (via [samblaster][sb]) and sorts the final
+alignment (via [samtools][smtl]).
+
+Bwakit has two entry scripts: `run-gen-ref` which downloads and generates human
+reference genomes, and `run-bwamem` which prints mapping command lines on the
+standard output that can be piped to `sh` to execute. The two scripts will call
+other programs or use data in `bwa.kit`. The following shows an example of how
+to use bwakit:
+
+```sh
+# Download the bwakit-0.7.12 binary package (download link may change)
+wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.12_x64-linux.tar.bz2/download \
+ | bzip2 -dc | tar xf -
+# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
+bwa.kit/run-gen-ref hs38DH # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa # create BWA index
+# mapping
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh
+```
+
+The last mapping command line will generate the following files:
+
+* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this
+ file, one read may be placed on multiple overlapping ALT contigs at the same
+ time even if the read is mapped better to some contigs than others. This makes
+ it possible to analyze each contig independently of the others.
+
+* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes.
+
+* `out.hla.all`: other possible genotypes on the six HLA genes.
+
+* `out.log.*`: bwa-mem, samblaster and HLA typing log files.
+
+Bwakit can be [downloaded here][res]. It is only available for x86_64-linux. The
+scripts in the package are available in the [bwa/bwakit][kit] directory.
+Packaging is done manually for now.
+
+## Limitations
+
+* HLA typing only works for high-coverage human data. The typing accuracy can
+ still be improved. We encourage researchers to develop better HLA typing tools
+ based on the intermediate output of bwakit (for each HLA gene included in the
+ index, bwakit writes all reads matching it to a separate file).
+
+* Duplicate marking only works when all reads from a single paired-end library
+ are provided as the input. This limitation is a necessary tradeoff of the fast
+ duplicate marking provided by samblaster.
+
+* The adapter trimmer was chosen because it is fast, pipe-friendly and does not
+ discard reads. However, it is conservative and suboptimal. If this is a
+ concern, it is recommended to preprocess input reads with a more sophisticated
+ adapter trimmer. We also hope existing trimmers can be modified to operate on
+ an interleaved FASTQ stream. We will replace trimadap once a better trimmer
+ meets our needs.
+
+* Bwakit can be memory demanding, depending on the functionality invoked. For 30X
+ human data, bwa-mem takes about 11GB RAM with 32 threads, samblaster uses
+ close to 10GB and BAM shuffling (if the input is sorted BAM) uses several GB.
+ In the current setting, sorting uses about 10GB.
+
+
+## Package Contents
+```
+bwa.kit
+|-- README.md This README file.
+|-- run-bwamem *Entry script* for the entire mapping pipeline.
+|-- bwa *BWA binary*
+|-- k8 Interpreter for *.js scripts.
+|-- bwa-postalt.js Post-process alignments to ALT contigs/decoys/HLA genes.
+|-- htsbox Used by run-bwamem for shuffling BAMs and BAM=>FASTQ.
+|-- samblaster MarkDuplicates for reads from the same library. v0.1.20
+|-- samtools SAMtools for sorting and SAM=>BAM conversion. v1.1
+|-- seqtk For FASTQ manipulation.
+|-- trimadap Trim Illumina PE sequencing adapters.
+|
+|-- run-gen-ref *Entry script* for generating human reference genomes.
+|-- resource-GRCh38 Resources for generating GRCh38
+| |-- hs38DH-extra.fa Decoy and HLA gene sequences. Used by run-gen-ref.
+| `-- hs38DH.fa.alt ALT-to-GRCh38 alignment. Used by run-gen-ref.
+|
+|-- run-HLA HLA typing for sequences extracted by bwa-postalt.js.
+|-- typeHLA.sh Type one HLA-gene. Called by run-HLA.
+|-- typeHLA.js HLA typing from exon-to-contig alignment. Used by typeHLA.sh.
+|-- typeHLA-selctg.js Select contigs overlapping HLA exons. Used by typeHLA.sh.
+|-- fermi2.pl Fermi2 wrapper. Used by typeHLA.sh for de novo assembly.
+|-- fermi2 Fermi2 binary. Used by fermi2.pl.
+|-- ropebwt2 RopeBWT2 binary. Used by fermi2.pl.
+|-- resource-human-HLA Resources for HLA typing
+| |-- HLA-ALT-exons.bed Exonic regions of HLA ALT contigs. Used by typeHLA.sh.
+| |-- HLA-CDS.fa CDS of HLA-{A,B,C,DQA1,DQB1,DRB1} genes from IMGT/HLA-3.18.0.
+| |-- HLA-ALT-type.txt HLA types for each HLA ALT contig. Not used.
+| `-- HLA-ALT-idx BWA indices of each HLA ALT contig. Used by typeHLA.sh
+| `-- (...)
+|
+`-- doc BWA documentation
+ |-- bwa.1 Manpage
+ |-- NEWS.md Release Notes
+ |-- README.md GitHub README page
+ `-- README-alt.md Documentation for ALT mapping
+```
+
+[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
+[sb]: https://github.com/GregoryFaust/samblaster
+[ta]: https://github.com/lh3/seqtk/blob/master/trimadap.c
+[smtl]: http://www.htslib.org
+[kit]: https://github.com/lh3/bwa/tree/master/bwakit
diff --git a/ext/src/bwa/bwakit/bwa-postalt.js b/ext/src/bwa/bwakit/bwa-postalt.js
new file mode 100644
index 0000000..bfc4190
--- /dev/null
+++ b/ext/src/bwa/bwakit/bwa-postalt.js
@@ -0,0 +1,524 @@
+/*****************************************************************
+ * The K8 Javascript interpreter is required to run this script. *
+ * *
+ * Source code: https://github.com/attractivechaos/k8 *
+ * Binary: http://sourceforge.net/projects/lh3/files/k8/ *
+ * *
+ * Data file used for generating GRCh38 ALT alignments: *
+ * *
+ * http://sourceforge.net/projects/bio-bwa/files/ *
+ *****************************************************************/
+
+/******************
+ *** From k8.js ***
+ ******************/
+
+// Parse command-line options. A BSD getopt() clone in javascript.
+var getopt = function(args, ostr) {
+ var oli; // option letter list index
+ if (typeof(getopt.place) == 'undefined')
+ getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+ if (getopt.place == -1) { // update scanning pointer
+ if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+ getopt.place = -1;
+ return null;
+ }
+ if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+ ++getopt.ind;
+ getopt.place = -1;
+ return null;
+ }
+ }
+ var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+ if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+ if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
+ if (getopt.place < 0) ++getopt.ind;
+ return '?';
+ }
+ if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+ getopt.arg = null;
+ if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+ } else { // need an argument
+ if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+ getopt.arg = args[getopt.ind].substr(getopt.place);
+ else if (args.length <= ++getopt.ind) { // no arg
+ getopt.place = -1;
+ if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+ return '?';
+ } else getopt.arg = args[getopt.ind]; // white space
+ getopt.place = -1;
+ ++getopt.ind;
+ }
+ return optopt;
+}
+
+// reverse a string
+Bytes.prototype.reverse = function()
+{
+ for (var i = 0; i < this.length>>1; ++i) {
+ var tmp = this[i];
+ this[i] = this[this.length - i - 1];
+ this[this.length - i - 1] = tmp;
+ }
+}
+
+// reverse complement a DNA string
+Bytes.prototype.revcomp = function()
+{
+ if (Bytes.rctab == null) {
+ var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
+ var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
+ Bytes.rctab = [];
+ for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0;
+ for (var i = 0; i < s1.length; ++i)
+ Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
+ }
+ for (var i = 0; i < this.length>>1; ++i) {
+ var tmp = this[this.length - i - 1];
+ this[this.length - i - 1] = Bytes.rctab[this[i]];
+ this[i] = Bytes.rctab[tmp];
+ }
+ if (this.length&1)
+ this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
+}
+
+// create index for a list of intervals for fast interval queries; ported from bedidx.c in samtools
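+// Illustrative usage (made-up intervals): with ov = intv_ovlp([[100, 200, "a"], [150, 300, "b"]]),
+// ov(120, 160) returns both intervals, while ov(250, 260) returns only [150, 300, "b"].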
+function intv_ovlp(intv, bits)
+{
+ if (typeof bits == "undefined") bits = 13;
+ intv.sort(function(a,b) {return a[0]-b[0];});
+ // create the index
+ var idx = [], max = 0;
+ for (var i = 0; i < intv.length; ++i) {
+ var b = intv[i][0]>>bits;
+ var e = (intv[i][1]-1)>>bits;
+ if (b != e) {
+ for (var j = b; j <= e; ++j)
+ if (idx[j] == null) idx[j] = i;
+ } else if (idx[b] == null) idx[b] = i;
+ max = max > e? max : e;
+ }
+ // closure
+ return function(_b, _e) {
+ var x = _b >> bits;
+ if (x > max) return [];
+ var off = idx[x];
+ if (off == null) {
+ var i;
+ for (i = ((_e - 1) >> bits) - 1; i >= 0; --i)
+ if (idx[i] != null) break;
+ off = i < 0? 0 : idx[i];
+ }
+ var ovlp = [];
+ for (var i = off; i < intv.length && intv[i][0] < _e; ++i)
+ if (intv[i][1] > _b) ovlp.push(intv[i]);
+ return ovlp;
+ }
+}
+
+var re_cigar = /(\d+)([MIDSHN])/g;
+
+/******************************
+ *** Generate ALT alignment ***
+ ******************************/
+
+// given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
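+// Illustrative example (hypothetical CIGAR): for cigar = [['S',10],['M',100],['D',5],['M',50]],
+// cigar2pos(cigar, 5) = -1 (inside the clip), cigar2pos(cigar, 15) = 5, and
+// cigar2pos(cigar, 120) = 115 (the 5 bp deletion shifts the REF coordinate).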
+function cigar2pos(cigar, pos)
+{
+ var x = 0, y = 0;
+ for (var i = 0; i < cigar.length; ++i) {
+ var op = cigar[i][0], len = cigar[i][1];
+ if (op == 'M') {
+ if (y <= pos && pos < y + len)
+ return x + (pos - y);
+ x += len, y += len;
+ } else if (op == 'D') {
+ x += len;
+ } else if (op == 'I') {
+ if (y <= pos && pos < y + len)
+ return x;
+ y += len;
+ } else if (op == 'S' || op == 'H') {
+ if (y <= pos && pos < y + len)
+ return -1;
+ y += len;
+ }
+ }
+ return -1;
+}
+
+// Parse a hit. $s is an array that looks something like ["chr1", "+12345", "100M", 5]
+// Return an object keeping various information about the alignment.
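+// Illustrative example (hypothetical hit): with the default opt (a=1, b=4, o=6, e=1),
+// parse_hit(["chr1", "+1001", "90M10S", 2], opt) gives start=1000, end=1090, l_query=100,
+// NM=2 and score = floor((1*90 - (1+4)*2)/1 + .499) = 80.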
+function parse_hit(s, opt)
+{
+ var h = {};
+ h.ctg = s[0];
+ h.start = parseInt(s[1].substr(1)) - 1;
+ h.rev = (s[1].charAt(0) == '-');
+ h.cigar = s[2];
+ h.NM = parseInt(s[3]);
+ h.hard = false;
+ var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip;
+ l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0;
+ while ((m = re_cigar.exec(h.cigar)) != null) {
+ var l = parseInt(m[1]);
+ if (m[2] == 'M') l_match += l;
+ else if (m[2] == 'D') ++n_del, l_del += l;
+ else if (m[2] == 'I') ++n_ins, l_ins += l;
+ else if (m[2] == 'N') l_skip += l;
+ else if (m[2] == 'H' || m[2] == 'S') {
+ l_clip += l;
+ if (m[2] == 'H') h.hard = true;
+ }
+ }
+ h.end = h.start + l_match + l_del + l_skip;
+ h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins;
+ h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499);
+ h.l_query = l_match + l_ins + l_clip;
+ return h;
+}
+
+function print_buffer(buf2, fp_hla, hla) // output alignments
+{
+ if (buf2.length == 0) return;
+ for (var i = 0; i < buf2.length; ++i)
+ print(buf2[i].join("\t"));
+ if (fp_hla != null) {
+ var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+');
+ for (var x in hla) {
+ if (fp_hla[x] != null)
+ fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n');
+ }
+ }
+}
+
+function collect_hla_hits(idx, ctg, start, end, hla) // collect reads that hit HLA genes
+{
+ var m, ofunc = idx[ctg];
+ if (ofunc == null) return;
+ var ovlp_alt = ofunc(start, end);
+ for (var i = 0; i < ovlp_alt.length; ++i)
+ if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null)
+ hla[m[1]] = true;
+}
+
+function bwa_postalt(args)
+{
+ var version = "r985";
+ var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 };
+
+ while ((c = getopt(args, 'vp:r:')) != null) {
+ if (c == 'p') opt.pre = getopt.arg;
+ else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
+ else if (c == 'v') { print(version); exit(0); }
+ }
+ if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;
+
+ if (args.length == getopt.ind) {
+ print("");
+ print("Usage: k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
+ print("Options: -p STR prefix of output files containting sequences matching HLA genes [null]");
+ print(" -r FLOAT reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
+ print(" -v show version number");
+ print("");
+ print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
+ print(" the primary assembly, groups them and then estimates mapQ across groups. If");
+ print(" a non-ALT hit overlaps a lifted ALT hit, its mapping quality is set to the");
+ print(" smaller between its original mapQ and the adjusted mapQ of the ALT hit. If");
+ print(" multiple ALT hits are lifted to the same position, they will yield new SAM");
+ print(" lines with the same mapQ.");
+ print("");
+ exit(1);
+ }
+
+ var aux = new Bytes(); // used for reverse and reverse complement
+ var buf = new Bytes(); // line reading buffer
+
+ // read ALT-to-REF alignment
+ var intv_alt = {}, intv_pri = {}, hla_ctg = {}, is_alt = {}, hla_chr = null;
+ var file = new File(args[getopt.ind]);
+ while (file.readline(buf) >= 0) {
+ var line = buf.toString();
+ if (line.charAt(0) == '@') continue;
+ var t = line.split("\t");
+ if (t.length < 11) continue; // incomplete lines
+ is_alt[t[0]] = true;
+ var pos = parseInt(t[3]) - 1;
+ var flag = parseInt(t[1]);
+ if ((flag&4) || t[2] == '*') continue;
+ var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
+ if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
+ if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
+ ++hla_ctg[m[1]];
+ hla_chr = t[2];
+ }
+ while ((m = re_cigar.exec(t[5])) != null) {
+ var l = parseInt(m[1]);
+ cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
+ if (m[2] == 'M') l_qaln += l, l_tlen += l;
+ else if (m[2] == 'I') l_qaln += l;
+ else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
+ else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
+ }
+ var j = flag&16? cigar.length-1 : 0;
+ var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
+ if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
+ intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
+ if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
+ intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
+ }
+ file.close();
+ var idx_alt = {}, idx_pri = {};
+ for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
+ for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
+
+ // initialize the list of HLA contigs
+ var fp_hla = null;
+ if (opt.pre) {
+ fp_hla = {};
+ for (var h in hla_ctg)
+ fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
+ }
+
+ // process SAM
+ var buf2 = [], hla = {};
+ file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
+ while (file.readline(buf) >= 0) {
+ var m, line = buf.toString();
+
+ if (line.charAt(0) == '@') { // print and then skip the header line
+ print(line);
+ continue;
+ }
+
+ var t = line.split("\t");
+ t[1] = parseInt(t[1]); t[3] = parseInt(t[3]); t[4] = parseInt(t[4]);
+
+ // print buffered reads
+ if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) {
+ print_buffer(buf2, fp_hla, hla);
+ buf2 = [], hla = {};
+ }
+
+ // skip unmapped lines
+ if (t[1]&4) {
+ buf2.push(t);
+ continue;
+ }
+
+ // parse the reported hit
+ var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
+ var flag = t[1];
+ var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
+ if (t[2] == hla_chr) collect_hla_hits(idx_pri, h.ctg, h.start, h.end, hla);
+
+ if (h.hard) { // the following does not work with hard clipped alignments
+ buf2.push(t);
+ continue;
+ }
+ var hits = [h];
+
+ // parse hits in the XA tag
+ if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
+ var XA_strs = m[1].split(";");
+ for (var i = 0; i < XA_strs.length; ++i)
+ if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
+ hits.push(parse_hit(XA_strs[i].split(","), opt));
+ }
+
+ // check if there are ALT hits
+ var has_alt = false;
+ for (var i = 0; i < hits.length; ++i)
+ if (is_alt[hits[i].ctg] != null) {
+ has_alt = true;
+ break;
+ }
+ if (!has_alt) {
+ buf2.push(t);
+ continue;
+ }
+
+ // lift mapping positions to the primary assembly
+ var n_rpt_lifted = 0, rpt_lifted = null;
+ for (var i = 0; i < hits.length; ++i) {
+ var a, h = hits[i];
+
+ if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
+ continue;
+
+ // find the approximate position on the primary assembly
+ var lifted = [];
+ for (var j = 0; j < a.length; ++j) {
+ var s, e;
+ if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
+ s = cigar2pos(a[j][6], h.start);
+ e = cigar2pos(a[j][6], h.end - 1) + 1;
+ } else {
+ s = cigar2pos(a[j][6], a[j][2] - h.end);
+ e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
+ }
+ if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
+ s += a[j][5]; e += a[j][5];
+ lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
+ if (i == 0) ++n_rpt_lifted;
+ }
+ if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
+ if (lifted.length) hits[i].lifted = lifted;
+ }
+
+ // prepare for hits grouping
+ for (var i = 0; i < hits.length; ++i) { // set keys for sorting
+ if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
+ hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
+ else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
+ hits[i].i = i; // keep the original index
+ }
+
+ // group hits based on the lifted positions on non-ALT sequences
+ if (hits.length > 1) {
+ hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
+ var last_chr = null, end = 0, g = -1;
+ for (var i = 0; i < hits.length; ++i) {
+ if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
+ else if (hits[i].pstart >= end) ++g;
+ hits[i].g = g;
+ end = end > hits[i].pend? end : hits[i].pend;
+ }
+ } else hits[0].g = 0;
+
+ // find the index and group id of the reported hit; find the size of the reported group
+ var reported_g = null, reported_i = null, n_group0 = 0;
+ if (hits.length > 1) {
+ for (var i = 0; i < hits.length; ++i)
+ if (hits[i].i == 0)
+ reported_g = hits[i].g, reported_i = i;
+ for (var i = 0; i < hits.length; ++i)
+ if (hits[i].g == reported_g)
+ ++n_group0;
+ } else {
+ if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
+ buf2.push(t);
+ continue;
+ }
+ reported_g = reported_i = 0, n_group0 = 1;
+ }
+
+ // re-estimate mapping quality if necessary
+ var mapQ, ori_mapQ = t[4];
+ if (n_group0 > 1) {
+ var group_max = [];
+ for (var i = 0; i < hits.length; ++i) {
+ var g = hits[i].g;
+ if (group_max[g] == null || group_max[g][0] < hits[i].score)
+ group_max[g] = [hits[i].score, g];
+ }
+ if (group_max.length > 1)
+ group_max.sort(function(x,y) {return y[0]-x[0]});
+ if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM
+ mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
+ } else mapQ = 0;
+ mapQ = mapQ < 60? mapQ : 60;
+ if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
+ else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
+ } else mapQ = t[4];
+
+ // find out whether the read is overlapping HLA genes
+ if (hits[reported_i].pctg == hla_chr) {
+ var rpt_start = 1<<30, rpt_end = 0;
+ for (var i = 0; i < hits.length; ++i) {
+ var h = hits[i];
+ if (h.g == reported_g) {
+ rpt_start = rpt_start < h.pstart? rpt_start : h.pstart;
+ rpt_end = rpt_end > h.pend ? rpt_end : h.pend;
+ }
+ }
+ collect_hla_hits(idx_pri, hla_chr, rpt_start, rpt_end, hla);
+ }
+
+ // adjust the mapQ of the primary hits
+ if (n_rpt_lifted <= 1) {
+ var l = n_rpt_lifted == 1? rpt_lifted : null;
+ for (var i = 0; i < buf2.length; ++i) {
+ var s = buf2[i], is_ovlp = true;
+ if (l != null) {
+ if (l[0] != s[2]) is_ovlp = false; // different chr
+ else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
+ else {
+ var start = s[3] - 1, end = start;
+ while ((m = re_cigar.exec(t[5])) != null)
+ if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
+ end += parseInt(m[1]);
+ if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
+ }
+ } else is_ovlp = false;
+ // get the "pa" tag if present
+ var om = -1, pa = 10.;
+ for (var j = 11; j < s.length; ++j)
+ if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
+ om = parseInt(m[1]);
+ else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null)
+ pa = parseFloat(m[1]);
+ if (is_ovlp) { // overlapping the lifted hit
+ if (om > 0) s[4] = om;
+ s[4] = s[4] < mapQ? s[4] : mapQ;
+ } else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa
+ if (om < 0) s.push("om:i:" + s[4]);
+ s[4] = 0;
+ }
+ }
+ }
+
+ // generate lifted_str
+ for (var i = 0; i < hits.length; ++i) {
+ if (hits[i].lifted && hits[i].lifted.length) {
+ var u = '', lifted = hits[i].lifted;
+ for (var j = 0; j < lifted.length; ++j)
+ u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
+ hits[i].lifted_str = u;
+ }
+ }
+
+ // stage the reported hit
+ t[4] = mapQ;
+ if (n_group0 > 1) t.push("om:i:"+ori_mapQ);
+ if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str);
+ buf2.push(t);
+
+ // stage the hits generated from the XA tag
+ var cnt = 0, rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence
+ var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? m[1] : null;
+ for (var i = 0; i < hits.length; ++i) {
+ if (hits[i].g != reported_g || i == reported_i) continue;
+ if (idx_alt[hits[i].ctg] == null) continue;
+ var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]];
+ if (t[6] == '=' && s[2] != t[2]) s[6] = t[2];
+ // print sequence/quality and set the rev flag
+ if (hits[i].rev == hits[reported_i].rev) {
+ s.push(t[9], t[10]);
+ s[1] = flag | 0x800;
+ } else { // we need to write the reverse sequence
+ if (rs == null || rq == null) {
+ aux.length = 0;
+ aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
+ aux.set(t[10],0); aux.reverse(); rq = aux.toString();
+ }
+ s.push(rs, rq);
+ s[1] = (flag ^ 0x10) | 0x800;
+ }
+ s.push("NM:i:" + hits[i].NM);
+ if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
+ if (rg != null) s.push(rg);
+ buf2.push(s);
+ }
+ }
+ print_buffer(buf2, fp_hla, hla);
+ file.close();
+ if (fp_hla != null)
+ for (var h in fp_hla)
+ fp_hla[h].close();
+
+ buf.destroy();
+ aux.destroy();
+}
+
+bwa_postalt(arguments);
diff --git a/ext/src/bwa/bwakit/run-HLA b/ext/src/bwa/bwakit/run-HLA
new file mode 100755
index 0000000..4fee16f
--- /dev/null
+++ b/ext/src/bwa/bwakit/run-HLA
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+ctg_opt=""
+if [ $# -gt 1 ] && [ $1 == '-A' ]; then
+ ctg_opt="-A"
+ shift
+fi
+
+if [ $# -eq 0 ]; then
+ echo "Usage: $0 <prefix>"
+ exit 1
+fi
+
+for f in $1.HLA-*.fq; do
+ gene=`echo $f | perl -pe 's/^.*(HLA-[A-Z]+[0-9]*).*fq$/$1/'`
+ echo -e "\n*** Processing gene $gene...\n" >&2
+ `dirname $0`/typeHLA.sh $ctg_opt $1 $gene
+done
+
+ls $1.HLA-*.gt | xargs -i echo grep ^GT {} \| head -1 | sh | sed "s,^GT,$1,"
diff --git a/ext/src/bwa/bwakit/run-bwamem b/ext/src/bwa/bwakit/run-bwamem
new file mode 100755
index 0000000..165f93e
--- /dev/null
+++ b/ext/src/bwa/bwakit/run-bwamem
@@ -0,0 +1,186 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+my %opts = (t=>1);
+getopts("SadskHo:R:x:t:", \%opts);
+
+die('
+Usage: run-bwamem [options] <idxbase> <file1> [file2]
+
+Options: -o STR prefix for output files [inferred from input]
+ -R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null]
+ -x STR read type: pacbio, ont2d or intractg [default]
+ intractg: intra-species contig (kb query, highly similar)
+ pacbio: pacbio subreads (~10kb query, high error rate)
+ ont2d: Oxford Nanopore reads (~10kb query, higher error rate)
+ -t INT number of threads [1]
+
+ -H apply HLA typing
+ -a trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
+ -d mark duplicate (via samblaster)
+ -S for BAM input, don\'t shuffle
+ -s sort the output alignment (via samtools; requiring more RAM)
+ -k keep temporary files generated by typeHLA
+
+Examples:
+
+ * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
+
+ run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz
+
+ Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
+ with the quality of input. It is only intended for research purposes, not for diagnostics.
+
+ * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and
+ sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
+ the two types. Specifying -R stops read group transfer.
+
+ run-bwamem -sao prefix hs38DH.fa old-srt.bam
+
+ Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
+ mapping pipeline better. It is conservative and suboptimal. A more sophisticated
+ trimmer is recommended if this becomes a concern.
+
+ * Remap name-grouped BAM and mark duplicates:
+
+ run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam
+
+ Note: streamed duplicate marking requires all reads from a single paired-end library
+ to be aligned at the same time.
+
+Output files:
+
+ {-o}.aln.bam - final alignment
+ {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
+ {-o}.hla.all - additional HLA genotypes consistent with data
+ {-o}.log.* - log files
+
+') if @ARGV < 2;
+
+my $idx = $ARGV[0];
+
+my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
+my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
+$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root);
+die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
+
+die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
+ unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
+
+if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
+ warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
+ $#ARGV = 1; # keep only <idxbase> and the SAM/BAM file
+}
+
+if (defined($opts{p}) && @ARGV >= 3) {
+ warn("WARNING: option -P is ignored as there are two input sequence files.\n");
+ delete $opts{p};
+}
+
+my $prefix;
+if (defined $opts{o}) {
+ $prefix = $opts{o};
+} elsif (@ARGV >= 3) {
+ my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
+ my $i;
+ for ($i = 0; $i < $len; ++$i) {
+ last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
+ }
+ $prefix = substr($ARGV[1], 0, $i) if $i > 0;
+} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
+ $prefix = $1;
+}
+die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);
+
+my $size = 0;
+my $comp_ratio = 3.;
+for my $f (@ARGV[1..$#ARGV]) {
+ my @a = stat($f);
+ my $s = $a[7];
+ die("ERROR: failed to read file $f\n") if !defined($s);
+ $s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
+ $size += int($s) + 1;
+}
+
+my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
+my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
+
+if (defined($opts{x})) {
+ delete($opts{d}); delete($opts{a}); delete $opts{p};
+}
+
+# for BAM input, find @RG header lines
+my @RG_lines = ();
+if ($is_bam && !defined($opts{R})) {
+ my $fh;
+ open($fh, "$root/samtools view -H $ARGV[1] |") || die;
+ while (<$fh>) {
+ chomp;
+ if (/^\@RG\t/) {
+ s/\t/\\t/g;
+ push(@RG_lines, "-H'$_'");
+ }
+ }
+ close($fh);
+}
+
+warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0;
+
+my $cmd = '';
+if ($is_bam) {
+ my $cmd_sam2bam = "cat $ARGV[1] \\\n";
+ my $ntmps = int($size / 4e9) + 1;
+ my $cmd_shuf = !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
+ my $bam2fq_opt = @RG_lines > 0? " -t" : "";
+ my $cmd_bam2fq = " | $root/htsbox bam2fq -O$bam2fq_opt - \\\n";
+ $cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
+} elsif (@ARGV >= 3) {
+ $cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
+} else {
+ $cmd = "cat $ARGV[1] \\\n";
+}
+
+my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
+$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0;
+
+$cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
+$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
+$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
+
+my $has_hla = 0;
+if (-f "$ARGV[0].alt") {
+ my $fh;
+ open($fh, "$ARGV[0].alt") || die;
+ while (<$fh>) {
+ $has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
+ }
+ close($fh);
+ my $hla_pre = $has_hla? "-p $prefix.hla " : "";
+ $cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
+}
+
+my $t_sort = $opts{t} < 4? $opts{t} : 4;
+$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n";
+
+if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
+ $cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
+ $cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
+ $cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
+}
+
+print $cmd;
+
+sub which
+{
+ my $file = shift;
+ my $path = (@_)? shift : $ENV{PATH};
+ return if (!defined($path));
+ foreach my $x (split(":", $path)) {
+ $x =~ s/\/$//;
+ return "$x/$file" if (-x "$x/$file");
+ }
+ return;
+}
diff --git a/ext/src/bwa/bwakit/run-gen-ref b/ext/src/bwa/bwakit/run-gen-ref
new file mode 100755
index 0000000..3ed63b2
--- /dev/null
+++ b/ext/src/bwa/bwakit/run-gen-ref
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+root=`dirname $0`
+
+url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
+url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
+
+if [ $# -eq 0 ]; then
+ echo "Usage: $0 <hs38|hs38a|hs38DH|hs37|hs37d5>"
+ echo "Analysis sets:"
+ echo " hs38 primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV"
+ echo " hs38a hs38 plus ALT contigs"
+ echo " hs38DH hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)"
+ echo " hs37 primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome"
+ echo " hs37d5 hs37 plus decoy contigs (used by 1000g phase 3)"
+ echo ""
+ echo "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional"
+ echo " sequences and ALT-to-REF mapping included in the bwa.kit package."
+ exit 1;
+fi
+
+if [ $1 == "hs38DH" ]; then
+ (wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38DH-extra.fa) > $1.fa
+ [ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38DH.fa.alt $1.fa.alt
+elif [ $1 == "hs38a" ]; then
+ wget -O- $url38 | gzip -dc > $1.fa
+ [ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38DH.fa.alt > $1.fa.alt
+elif [ $1 == "hs38" ]; then
+ wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa
+elif [ $1 == "hs37d5" ]; then
+ wget -O- $url37d5 | gzip -dc > $1.fa 2>/dev/null
+elif [ $1 == "hs37" ]; then
+ wget -O- $url37d5 | gzip -dc 2>/dev/null | awk '/^>/{f=/>hs37d5/?0:1}f' > $1.fa
+else
+ echo "ERROR: unknown genome build"
+fi
+
+[ ! -f $1.fa.bwt ] && echo -e "\nPlease run 'bwa index $1.fa'...\n"
+
diff --git a/ext/src/bwa/bwakit/typeHLA-selctg.js b/ext/src/bwa/bwakit/typeHLA-selctg.js
new file mode 100644
index 0000000..0e02a65
--- /dev/null
+++ b/ext/src/bwa/bwakit/typeHLA-selctg.js
@@ -0,0 +1,62 @@
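+// Select de novo contigs that plausibly come from the target HLA gene: a contig name is printed
+// only when its best-scoring hit against the HLA ALT contigs overlaps the gene's exons (from the
+// BED file) by at least min_ovlp bp and no equally-scoring hit is ambiguous (AS equal to XS).
+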
+var min_ovlp = 30;
+
+if (arguments.length < 3) {
+ print("Usage: k8 selctg.js <HLA-gene> <HLA-ALT-exons.bed> <ctg-to-ALT.sam> [min_ovlp="+min_ovlp+"]");
+ exit(1);
+}
+
+if (arguments.length >= 4) min_ovlp = parseInt(arguments[3]);
+var gene = arguments[0];
+
+var buf = new Bytes();
+
+var h = {};
+var file = new File(arguments[1]);
+while (file.readline(buf) >= 0) {
+ var t = buf.toString().split("\t");
+ if (t[3] != gene) continue;
+ if (h[t[0]] == null) h[t[0]] = [];
+ h[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
+}
+file.close();
+
+var s = {}, re = /(\d+)([MIDSHN])/g;
+file = new File(arguments[2]);
+while (file.readline(buf) >= 0) {
+ var line = buf.toString();
+ var m, t = line.split("\t");
+ var x = h[t[2]];
+ if (x == null) continue;
+
+ var start = parseInt(t[3]) - 1, end = start;
+ while ((m = re.exec(t[5])) != null) // parse CIGAR to get the end position
+ if (m[2] == 'M' || m[2] == 'D')
+ end += parseInt(m[1]);
+
+ var max_ovlp = 0;
+ for (var i = 0; i < x.length; ++i) {
+ var max_left = x[i][0] > start? x[i][0] : start;
+ var min_rght = x[i][1] < end ? x[i][1] : end;
+ max_ovlp = max_ovlp > min_rght - max_left? max_ovlp : min_rght - max_left;
+ }
+
+ var AS = null, XS = null;
+ if ((m = /AS:i:(\d+)/.exec(line)) != null) AS = parseInt(m[1]);
+ if ((m = /XS:i:(\d+)/.exec(line)) != null) XS = parseInt(m[1]);
+
+ if (s[t[0]] == null) s[t[0]] = [];
+ s[t[0]].push([AS, XS, max_ovlp]);
+}
+file.close();
+
+buf.destroy();
+
+for (var x in s) {
+ var is_rejected = false, y = s[x];
+ y.sort(function(a,b) {return b[0]-a[0]});
+ for (var i = 0; i < y.length && y[i][0] == y[0][0]; ++i)
+ if (y[0][2] < min_ovlp || y[i][0] == y[i][1])
+ is_rejected = true;
+ if (is_rejected) continue;
+ print(x);
+}
diff --git a/ext/src/bwa/bwakit/typeHLA.js b/ext/src/bwa/bwakit/typeHLA.js
new file mode 100644
index 0000000..b265d07
--- /dev/null
+++ b/ext/src/bwa/bwakit/typeHLA.js
@@ -0,0 +1,496 @@
+/*****************************************************************
+ * The K8 Javascript interpreter is required to run this script. *
+ * *
+ * Source code: https://github.com/attractivechaos/k8 *
+ * Binary: http://sourceforge.net/projects/lh3/files/k8/ *
+ *****************************************************************/
+
+var getopt = function(args, ostr) {
+ var oli; // option letter list index
+ if (typeof(getopt.place) == 'undefined')
+ getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+ if (getopt.place == -1) { // update scanning pointer
+ if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+ getopt.place = -1;
+ return null;
+ }
+ if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+ ++getopt.ind;
+ getopt.place = -1;
+ return null;
+ }
+ }
+ var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+ if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+ if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null.
+ if (getopt.place < 0) ++getopt.ind;
+ return '?';
+ }
+ if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+ getopt.arg = null;
+ if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+ } else { // need an argument
+ if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+ getopt.arg = args[getopt.ind].substr(getopt.place);
+ else if (args.length <= ++getopt.ind) { // no arg
+ getopt.place = -1;
+ if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+ return '?';
+ } else getopt.arg = args[getopt.ind]; // white space
+ getopt.place = -1;
+ ++getopt.ind;
+ }
+ return optopt;
+}
+
+/************************
+ * Command line parsing *
+ ************************/
+
+var ver = "r19";
+var c, thres_len = 50, thres_ratio = .8, thres_nm = 5, thres_frac = .33, dbg = false;
+
+// parse command line options
+while ((c = getopt(arguments, "vdl:n:f:")) != null) {
+ if (c == 'l') thres_len = parseInt(getopt.arg);
+ else if (c == 'n') thres_nm = parseInt(getopt.arg);
+ else if (c == 'd') dbg = true;
+ else if (c == 'f') thres_frac = parseFloat(getopt.arg);
+ else if (c == 'v') { print(ver); exit(0); }
+}
+if (arguments.length == getopt.ind) {
+ print("");
+ print("Usage: k8 typeHLA.js [options] <exon-to-contig.sam>\n");
+ print("Options: -n INT drop a contig if the edit distance to the closest gene is >INT ["+thres_nm+"]");
+ print(" -l INT drop a contig if its match too short ["+thres_len+"]");
+ print(" -f FLOAT drop inconsistent contigs if their length <FLOAT fraction of total length ["+thres_ratio.toFixed(2)+"]");
+ print(" -d output extra info for debugging");
+ print(" -v show version number");
+ print("");
+ print("Note: The output is TAB delimited with each GT line consisting of allele1, allele2,");
+ print(" #mismatches/gaps on primary exons, #mismatches/gaps on other exons and #exons");
+ print(" used in typing. If unusure, use the first GT line as the final genotype.\n");
+ exit(1);
+}
+
+/*********************************
+ * Read gene-to-contig alignment *
+ *********************************/
+
+var file = new File(arguments[getopt.ind]);
+var buf = new Bytes();
+var re_cigar = /(\d+)([MIDSH])/g;
+
+var len = {}, list = [], gcnt = [];
+while (file.readline(buf) >= 0) {
+ var m, mm, line = buf.toString();
+ var t = line.split("\t");
+ var flag = parseInt(t[1]);
+ // SAM header
+ if (t[0].charAt(0) == '@') {
+ if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null)
+ len[mm[1]] = parseInt(m[1]);
+ continue;
+ }
+ // parse gene name and exon number
+ var gene = null, exon = null;
+ if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) {
+ gene = m[1], exon = parseInt(m[2]) - 1;
+ if (gcnt[exon] == null) gcnt[exon] = {};
+ gcnt[exon][gene] = true;
+ }
+ if (gene == null || exon == null || t[2] == '*') continue;
+ // parse clipping and aligned length
+ var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0];
+ while ((m = re_cigar.exec(t[5])) != null) {
+ var l = parseInt(m[1]);
+ if (m[2] == 'M') x += l, te += l;
+ else if (m[2] == 'I') x += l;
+ else if (m[2] == 'D') te += l;
+ else if (m[2] == 'S' || m[2] == 'H') clip[x==0?0:1] = l;
+ }
+ var tl = len[t[2]];
+ var left = ts < clip[0]? ts : clip[0];
+ var right = tl - te < clip[1]? tl - te : clip[1];
+ var qs, qe, ql = clip[0] + x + clip[1];
+ if (flag & 16) qs = clip[1], qe = ql - clip[0];
+ else qs = clip[0], qe = ql - clip[1];
+ var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
+ list.push([t[2], gene, exon, ts, te, nm, left + right, qs, qe, ql]); // left+right should be 0 given a prefix-suffix alignment
+}
+
+buf.destroy();
+file.close();
+
+/**************************************
+ * Prepare data structures for typing *
+ **************************************/
+
+// identify the primary exons, the exons associated with most genes
+var pri_exon = [], n_pri_exons;
+{
+ var cnt = [], max = 0;
+ // count the number of genes per exon and track the max
+ for (var e = 0; e < gcnt.length; ++e) {
+ if (gcnt[e] != null) {
+ var c = 0, h = gcnt[e];
+ for (var x in h) ++c;
+ cnt[e] = c;
+ max = max > c? max : c;
+ } else cnt[e] = 0;
+ }
+ warn("- Number of genes for each exon: [" +cnt.join(",") + "]");
+ // find primary exons
+ var pri_list = [];
+ for (var e = 0; e < cnt.length; ++e) {
+ if (cnt[e] == max) pri_list.push(e + 1);
+ pri_exon[e] = cnt[e] == max? 1 : 0;
+ }
+ warn("- List of primary exon(s): ["+pri_list.join(",")+"]");
+ n_pri_exons = pri_list.length;
+}
+
+// convert strings to integers (for performance)
+var ghash = {}, glist = [], chash = {}, clist = [], elist = [];
+for (var i = 0; i < list.length; ++i) {
+ if (ghash[list[i][1]] == null) {
+ ghash[list[i][1]] = glist.length;
+ glist.push(list[i][1]);
+ }
+ if (chash[list[i][0]] == null) {
+ chash[list[i][0]] = clist.length;
+ clist.push(list[i][0]);
+ }
+ var g = ghash[list[i][1]];
+ if (elist[g] == null) elist[g] = {};
+ elist[g][list[i][2]] = true;
+}
+
+// extract the 3rd and 4th digits
+var gsub = [], gsuf = [];
+for (var i = 0; i < glist.length; ++i) {
+ var m = /^HLA-[^*\s]+\*\d+:(\d+).*([A-Z]?)$/.exec(glist[i]);
+ gsub[i] = parseInt(m[1]);
+ gsuf[i] = /[A-Z]$/.test(glist[i])? 1 : 0;
+}
+
+/*************************************************
+ * Collect genes with perfect matches on primary *
+ *************************************************/
+
+// collect exons fully covered by perfect match(es)
+var perf_exons = [];
+
+function push_perf_exons(matches, last)
+{
+ matches.sort(function(a, b) { return a[0]-b[0]; });
+ var cov = 0, start = 0, end = 0;
+ for (var i = 0; i < matches.length; ++i) {
+ if (matches[i][3] > 0) continue;
+ if (matches[i][0] <= end)
+ end = end > matches[i][1]? end : matches[i][1];
+ else cov += end - start, start = matches[i][0], end = matches[i][1];
+ }
+ cov += end - start;
+ if (matches[0][2] == cov) {
+ if (perf_exons[last[1]] == null) perf_exons[last[1]] = [];
+ //print(last[0], last[1], ghash[last[0]]);
+ perf_exons[last[1]].push(ghash[last[0]]);
+ }
+}
+
+var last = [null, -1], matches = [];
+for (var i = 0; i < list.length; ++i) {
+ var li = list[i];
+ if (last[0] != li[1] || last[1] != li[2]) {
+ if (matches.length) push_perf_exons(matches, last);
+ matches = [];
+ last = [li[1], li[2]];
+ }
+ matches.push([li[7], li[8], li[9], li[5]+li[6]]);
+}
+if (matches.length) push_perf_exons(matches, last);
+
+// for each gene, count how many primary exons are perfect
+var pg_aux_cnt = {};
+for (var e = 0; e < perf_exons.length; ++e) {
+ if (!pri_exon[e]) continue;
+ var pe = perf_exons[e];
+ var n = pe? pe.length : 0;
+ for (var i = 0; i < n; ++i) {
+ var g = pe[i];
+ if (pg_aux_cnt[g] == null) pg_aux_cnt[g] = 1;
+ else ++pg_aux_cnt[g];
+ }
+}
+
+// find genes with perfect matches on the primary exons
+var perf_genes = [];
+for (var g in pg_aux_cnt)
+ if (pg_aux_cnt[g] == n_pri_exons)
+ perf_genes.push(parseInt(g));
+warn("- Found " +perf_genes.length+ " genes fully covered by perfect matches on the primary exon(s)");
+
+var h_perf_genes = {};
+for (var i = 0; i < perf_genes.length; ++i) {
+ if (dbg) print("PG", glist[perf_genes[i]]);
+ h_perf_genes[perf_genes[i]] = true;
+}
+
+/*******************
+ * Filter hit list *
+ *******************/
+
+// reorganize hits to exons
+function list2exons(list, flt_flag, perf_hash)
+{
+ var exons = [];
+ for (var i = 0; i < list.length; ++i) {
+ var li = list[i], c = chash[li[0]], g = ghash[li[1]];
+ if (flt_flag != null && flt_flag[c] == 1) continue;
+ if (perf_hash != null && !perf_hash[g]) continue;
+ if (exons[li[2]] == null) exons[li[2]] = [];
+ exons[li[2]].push([c, g, li[5] + li[6], li[4] - li[3]]);
+ }
+ return exons;
+}
+
+var exons = list2exons(list), flt_flag = [], ovlp_len = [];
+for (var c = 0; c < clist.length; ++c) flt_flag[c] = ovlp_len[c] = 0;
+for (var e = 0; e < exons.length; ++e) {
+ if (!pri_exon[e]) continue;
+ var ee = exons[e];
+ var max_len = [];
+ for (var c = 0; c < clist.length; ++c) max_len[c] = 0;
+ for (var i = 0; i < ee.length; ++i) {
+ var l = ee[i][3] - ee[i][2];
+ if (l < 1) l = 1;
+ if (max_len[ee[i][0]] < l) max_len[ee[i][0]] = l;
+ }
+ for (var c = 0; c < clist.length; ++c) ovlp_len[c] += max_len[c];
+ for (var i = 0; i < ee.length; ++i)
+ flt_flag[ee[i][0]] |= (!h_perf_genes[ee[i][1]] || ee[i][2])? 1 : 1<<1;
+}
+
+var l_cons = 0, l_incons = 0;
+for (var c = 0; c < clist.length; ++c)
+ if (flt_flag[c]&2) l_cons += ovlp_len[c];
+ else if (flt_flag[c] == 1) l_incons += ovlp_len[c];
+
+warn("- Total length of contigs consistent/inconsistent with perfect genes: " +l_cons+ "/" +l_incons);
+var attempt_perf = (l_incons/(l_cons+l_incons) < thres_frac);
+
+/********************************
+ * Core function for genotyping *
+ ********************************/
+
+function type_gene(perf_mode)
+{
+ if (perf_mode) {
+ var flt_list = [];
+ for (var c = 0; c < clist.length; ++c)
+ if (flt_flag[c] == 1) flt_list.push(clist[c]);
+ warn(" - Filtered " +flt_list.length+ " inconsistent contig(s): [" +flt_list.join(",")+ "]");
+ exons = list2exons(list, flt_flag, h_perf_genes);
+ } else exons = list2exons(list);
+
+ /***********************
+ * Score each genotype *
+ ***********************/
+
+ // initialize genotype scores
+ var pair = [];
+ for (var i = 0; i < glist.length; ++i) {
+ pair[i] = [];
+ for (var j = 0; j <= i; ++j)
+ pair[i][j] = 0;
+ }
+
+ // these two arrays are used to output debugging information
+ var score = [], ctg = [];
+
+ function type_exon(e, gt_list)
+ {
+ function update_pair(x, m, is_pri)
+ {
+ var y, z;
+ y = (x>>14&0xff) + m < 0xff? (x>>14&0xff) + m : 0xff;
+ if (is_pri) z = (x>>22) + m < 0xff? (x>>22) + m : 0xff;
+ else z = x>>22;
+ return z<<22 | y<<14 | ((x&0x3fff) + (1<<6|is_pri));
+ }
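+ // update_pair() packs four counters into each pair[i][j] entry:
+ //   bits 0-5   number of primary exons contributing (later compared with n_pri_exons)
+ //   bits 6-13  total number of exons contributing
+ //   bits 14-21 cumulative edit distance over all contributing exons (capped at 0xff)
+ //   bits 22+   cumulative edit distance over primary exons only (capped at 0xff)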
+
+ score[e] = []; ctg[e] = [];
+ if (exons[e] == null) return;
+ var ee = exons[e], is_pri = pri_exon[e]? 1 : 0;
+ // find contigs and genes associated with the current exon
+ var ch = {}, gh = {};
+ for (var i = 0; i < ee.length; ++i)
+ if (elist[ee[i][1]][e] != null)
+ ch[ee[i][0]] = true, gh[ee[i][1]] = true;
+ var ga = [], ca = ctg[e];
+ for (var c in ch) ca.push(parseInt(c));
+ for (var g in gh) ga.push(parseInt(g));
+ var named_ca = [];
+ for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]);
+ warn(" - Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])...");
+ // set unmapped entries to high mismatch
+ var sc = score[e];
+ for (var k = 0; k < ga.length; ++k) {
+ var g = ga[k];
+ if (sc[g] == null) sc[g] = [];
+ for (var i = 0; i < ca.length; ++i)
+ sc[g][ca[i]] = 0xff;
+ }
+ // convert representation again and compute max_len[]
+ var max_len = [];
+ for (var i = 0; i < ee.length; ++i) {
+ var c = ee[i][0], g = ee[i][1];
+ if (gh[g] == null || ch[c] == null) continue;
+ sc[g][c] = sc[g][c] < ee[i][2]? sc[g][c] : ee[i][2];
+ if (max_len[c] == null) max_len[c] = 0;
+ max_len[c] = max_len[c] > ee[i][3]? max_len[c] : ee[i][3];
+ }
+ // drop mismapped contigs
+ var max_max_len = 0;
+ for (var k = 0; k < ca.length; ++k)
+ max_max_len = max_max_len > max_len[ca[k]]? max_max_len : max_len[ca[k]];
+ var dropped = [];
+ for (var k = 0; k < ca.length; ++k) {
+ var min = 0x7fffffff, c = ca[k];
+ for (var i = 0; i < ga.length; ++i) {
+ var g = ga[i];
+ min = min < sc[g][c]? min : sc[g][c];
+ }
+ dropped[c] = min > thres_nm? true : false;
+ if (max_len[c] < thres_len && max_len[c] < thres_ratio * max_max_len) dropped[c] = true;
+ if (dropped[c]) warn(" . Dropped low-quality contig " +clist[c]+ " (minNM=" +min+ "; maxLen=" +max_len[c]+ ")");
+ }
+ // fill the pair array
+ if (gt_list == null) {
+ for (var i = 0; i < ga.length; ++i) {
+ var m = 0, gi = ga[i], g1 = sc[gi];
+ // homozygous
+ for (var k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) m += g1[c];
+ }
+ pair[gi][gi] = update_pair(pair[gi][gi], m, is_pri);
+ // heterozygous
+ for (var j = i + 1; j < ga.length; ++j) {
+ var gj = ga[j], g2 = sc[gj], m = 0, a = [0, 0];
+ for (var k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) {
+ m += g1[c] < g2[c]? g1[c] : g2[c];
+ ++a[g1[c]<g2[c]? 0:1];
+ }
+ }
+ if (a[0] == 0 || a[1] == 0) m = 0xff; // if all contigs are assigned to one gene, it is not good
+ if (gi < gj) pair[gj][gi] = update_pair(pair[gj][gi], m, is_pri);
+ else pair[gi][gj] = update_pair(pair[gi][gj], m, is_pri);
+ }
+ }
+ } else {
+ var tmp_pairs = [], min = 0xff;
+ for (var i = 0; i < gt_list.length; ++i) {
+ var gt = gt_list[i], m = 0;
+ var g1 = sc[gt[0]], g2 = sc[gt[1]], a = [0, 0];
+ if (g1 == null || g2 == null) continue;
+ if (gt[0] == gt[1]) {
+ for (var k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) m += g1[c];
+ }
+ } else {
+ var a = [0, 0];
+ for (k = 0; k < ca.length; ++k) {
+ var c = ca[k];
+ if (!dropped[c]) {
+ m += g1[c] < g2[c]? g1[c] : g2[c];
+ ++a[g1[c]<g2[c]? 0:1];
+ }
+ }
+ if (a[0] == 0 || a[1] == 0) m = 0xff;
+ }
+ tmp_pairs.push([gt[0], gt[1], m]);
+ min = min < m? min : m;
+ }
+ if (min < 0xff) {
+ for (var i = 0; i < tmp_pairs.length; ++i) {
+ var t = tmp_pairs[i];
+ pair[t[0]][t[1]] = update_pair(pair[t[0]][t[1]], t[2], is_pri);
+ }
+ } else warn(" . Skipped exon " +(e+1)+ " as the assembly may be incomplete");
+ }
+ }
+
+ // type primary exons
+ warn(" - Processing primary exon(s)...");
+ for (var e = 0; e < exons.length; ++e)
+ if (pri_exon[e]) type_exon(e);
+
+ // generate the list of best genotypes on primary exons
+ var min_nm_pri = 0x7fffffff;
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons)
+ min_nm_pri = min_nm_pri < pair[i][j]>>22? min_nm_pri : pair[i][j]>>22;
+
+ var gt_list = [];
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>22 == min_nm_pri)
+ gt_list.push([i, j]);
+
+ warn(" - Collected " +gt_list.length+ " top genotypes on the primary exon(s); minimal edit distance: " +min_nm_pri);
+
+ // type other exons
+ warn(" - Processing other exon(s)...");
+ for (var e = 0; e < exons.length; ++e)
+ if (!pri_exon[e]) type_exon(e, gt_list);
+
+ /*****************************
+ * Choose the best genotypes *
+ *****************************/
+
+ // genotyping
+ var min_nm = 0x7fffffff;
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons)
+ min_nm = min_nm < pair[i][j]>>14? min_nm : pair[i][j]>>14;
+
+ var out = [];
+ for (var i = 0; i < glist.length; ++i)
+ for (var j = 0; j <= i; ++j)
+ if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>14 <= min_nm + 1)
+ out.push([pair[i][j]>>14, pair[i][j]>>6&0xff, i, j, (gsuf[i] + gsuf[j])<<16|(gsub[i] + gsub[j])]);
+
+ out.sort(function(a, b) { return a[0]!=b[0]? a[0]-b[0] : a[1]!=b[1]? b[1]-a[1] : a[4]!=b[4]? a[4]-b[4] : a[2]!=b[2]? a[2]-b[2] : a[3]-b[3]});
+
+ return out;
+}
+
+/**********************
+ * Perform genotyping *
+ **********************/
+
+warn("- Typing in the imperfect mode...");
+var rst = type_gene(false);
+if (attempt_perf) {
+ warn("- Typing in the perfect mode...");
+ var rst_perf = type_gene(true);
+ warn("- Imperfect vs perfect mode: [" +(rst[0][0]>>8&0xff)+ "," +(rst[0][0]&0xff)+ "] vs [" +(rst_perf[0][0]>>8&0xff)+ "," +(rst_perf[0][0]&0xff)+ "]");
+ if (rst_perf[0][0] < rst[0][0]) {
+ warn("- Chose the result from the perfect mode");
+ rst = rst_perf;
+ } else warn("- Chose the result from the imperfect mode");
+} else warn("- Perfect mode is not attempted");
+
+/**********
+ * Output *
+ **********/
+
+for (var i = 0; i < rst.length; ++i)
+ print("GT", glist[rst[i][3]], glist[rst[i][2]], rst[i][0]>>8&0xff, rst[i][0]&0xff, rst[i][1]);
diff --git a/ext/src/bwa/bwakit/typeHLA.sh b/ext/src/bwa/bwakit/typeHLA.sh
new file mode 100755
index 0000000..b73100d
--- /dev/null
+++ b/ext/src/bwa/bwakit/typeHLA.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+is_ctg=0
+
+if [ $# -gt 1 ] && [ $1 == '-A' ]; then
+ is_ctg=1
+ shift
+fi
+
+if [ $# -lt 2 ]; then
+ echo "Usage: $0 [-A] <prefix> <gene>"
+ exit 1
+fi
+
+preres="resource-human-HLA"
+root=`dirname $0`
+pre=$1.$2
+touch $pre.gt
+
+if [ ! -s $pre.fq ]; then
+ echo '** Empty input file. Abort!' >&2
+ exit 0
+fi
+
+if [ $is_ctg -eq 0 ]; then
+ echo "** De novo assembling..." >&2
+ len=`$root/seqtk comp $pre.fq | awk '{++x;y+=$2}END{printf("%.0f\n", y/x)}'`
+ $root/fermi2.pl unitig -f $root/fermi2 -r $root/ropebwt2 -t2 -l$len -p $pre.tmp $pre.fq > $pre.tmp.mak
+ make -f $pre.tmp.mak >&2
+ cp $pre.tmp.mag.gz $pre.mag.gz
+else
+ rm -f $pre.tmp.mag.gz
+ ln -s $pre.fq $pre.tmp.mag.gz
+fi
+
+echo "** Selecting contigs overlapping target exons..." >&2
+(ls $root/$preres/HLA-ALT-idx/*.fa.bwt | sed s,.bwt,, | xargs -i $root/bwa mem -t2 -B1 -O1 -E1 {} $pre.tmp.mag.gz 2>/dev/null) | grep -v ^@ | sort -k3,3 -k4,4n | gzip > $pre.tmp.ALT.sam.gz
+$root/k8 $root/typeHLA-selctg.js $2 $root/$preres/HLA-ALT-exons.bed $pre.tmp.ALT.sam.gz | $root/seqtk subseq $pre.tmp.mag.gz - | gzip -1 > $pre.tmp.fq.gz
+
+echo "** Mapping exons to de novo contigs..." >&2
+$root/bwa index -p $pre.tmp $pre.tmp.fq.gz 2>/dev/null
+$root/seqtk comp $root/$preres/HLA-CDS.fa | cut -f1 | grep ^$2 | $root/seqtk subseq $root/$preres/HLA-CDS.fa - | $root/bwa mem -aD.1 -t2 $pre.tmp - 2>/dev/null | gzip -1 > $pre.sam.gz
+
+echo "** Typing..." >&2
+$root/k8 $root/typeHLA.js $pre.sam.gz > $pre.gt
+
+# delete temporary files
+rm -f $pre.tmp.*
+[ $is_ctg -eq 1 ] && rm -f $pre.mag.gz
diff --git a/ext/src/bwa/bwamem.c b/ext/src/bwa/bwamem.c
new file mode 100644
index 0000000..91d551a
--- /dev/null
+++ b/ext/src/bwa/bwamem.c
@@ -0,0 +1,1201 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+
+#include "kstring.h"
+#include "bwa/bwamem.h"
+#include "bwa/bntseq.h"
+#include "ksw.h"
+#include "kvec.h"
+#include "ksort.h"
+#include "bwa/utils.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+/* Theory on probability and scoring *ungapped* alignment
+ *
+ * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution
+ * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate
+ *
+ * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x
+ *
+ * If the matching score is x and mismatch penalty is -y, we can compute error rate e:
+ * e = .75 * exp[-log(4) * y/x]
+ *
+ * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)}
+ * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)
+ *
+ * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale:
+ * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)
+ *
+ *
+ * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)
+ * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)
+ *
+ * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)
+ */
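+/* For example, with the default scoring used below (a=1 for a match, b=4 for a mismatch), the
+ * implied per-base error rate is e = 0.75 * exp(-log(4) * 4/1) = 0.75/256, i.e. roughly 0.3%,
+ * and a read with l matched bases and total score S gets Q(seq) = 6.02 * (l - S/1).
+ */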
+
+static const bntseq_t *global_bns = 0; // for debugging only
+
+mem_opt_t *mem_opt_init()
+{
+ mem_opt_t *o;
+ o = calloc(1, sizeof(mem_opt_t));
+ o->flag = 0;
+ o->a = 1; o->b = 4;
+ o->o_del = o->o_ins = 6;
+ o->e_del = o->e_ins = 1;
+ o->w = 100;
+ o->T = 30;
+ o->zdrop = 100;
+ o->pen_unpaired = 17;
+ o->pen_clip5 = o->pen_clip3 = 5;
+
+ o->max_mem_intv = 20;
+
+ o->min_seed_len = 19;
+ o->split_width = 10;
+ o->max_occ = 500;
+ o->max_chain_gap = 10000;
+ o->max_ins = 10000;
+ o->mask_level = 0.50;
+ o->drop_ratio = 0.50;
+ o->XA_drop_ratio = 0.80;
+ o->split_factor = 1.5;
+ o->chunk_size = 10000000;
+ o->n_threads = 1;
+ o->max_XA_hits = 5;
+ o->max_XA_hits_alt = 200;
+ o->max_matesw = 50;
+ o->mask_level_redun = 0.95;
+ o->min_chain_weight = 0;
+ o->max_chain_extend = 1<<30;
+ o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len);
+ bwa_fill_scmat(o->a, o->b, o->mat);
+ return o;
+}
+
+/***************************
+ * Collecting SA intervals *
+ ***************************/
+
+#define intv_lt(a, b) ((a).info < (b).info)
+KSORT_INIT(mem_intv, bwtintv_t, intv_lt)
+
+typedef struct {
+ bwtintv_v mem, mem1, *tmpv[2];
+} smem_aux_t;
+
+static smem_aux_t *smem_aux_init()
+{
+ smem_aux_t *a;
+ a = calloc(1, sizeof(smem_aux_t));
+ a->tmpv[0] = calloc(1, sizeof(bwtintv_v));
+ a->tmpv[1] = calloc(1, sizeof(bwtintv_v));
+ return a;
+}
+
+static void smem_aux_destroy(smem_aux_t *a)
+{
+ free(a->tmpv[0]->a); free(a->tmpv[0]);
+ free(a->tmpv[1]->a); free(a->tmpv[1]);
+ free(a->mem.a); free(a->mem1.a);
+ free(a);
+}
+
+static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a)
+{
+ int i, k, x = 0, old_n;
+ int start_width = 1;
+ int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
+ a->mem.n = 0;
+ // first pass: find all SMEMs
+ while (x < len) {
+ if (seq[x] < 4) {
+ x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv);
+ for (i = 0; i < a->mem1.n; ++i) {
+ bwtintv_t *p = &a->mem1.a[i];
+ int slen = (uint32_t)p->info - (p->info>>32); // seed length
+ if (slen >= opt->min_seed_len)
+ kv_push(bwtintv_t, a->mem, *p);
+ }
+ } else ++x;
+ }
+ // second pass: find MEMs inside a long SMEM
+ old_n = a->mem.n;
+ for (k = 0; k < old_n; ++k) {
+ bwtintv_t *p = &a->mem.a[k];
+ int start = p->info>>32, end = (int32_t)p->info;
+ if (end - start < split_len || p->x[2] > opt->split_width) continue;
+ bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
+ for (i = 0; i < a->mem1.n; ++i)
+ if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
+ kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
+ }
+ // third pass: LAST-like
+ if (opt->max_mem_intv > 0) {
+ x = 0;
+ while (x < len) {
+ if (seq[x] < 4) {
+ if (1) {
+ bwtintv_t m;
+ x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
+ if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
+ } else { // for now we never reach this block; it is slower
+ x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
+ for (i = 0; i < a->mem1.n; ++i)
+ kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
+ }
+ } else ++x;
+ }
+ }
+ // sort
+ ks_introsort(mem_intv, a->mem.n, a->mem.a);
+}
+
+/************
+ * Chaining *
+ ************/
+
+typedef struct {
+ int64_t rbeg;
+ int32_t qbeg, len;
+ int score;
+} mem_seed_t; // unaligned memory
+
+typedef struct {
+ int n, m, first, rid;
+ uint32_t w:29, kept:2, is_alt:1;
+ float frac_rep;
+ int64_t pos;
+ mem_seed_t *seeds;
+} mem_chain_t;
+
+typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v;
+
+#include "kbtree.h"
+
+#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
+KBTREE_INIT(chn, mem_chain_t, chain_cmp)
+
+// return 1 if the seed is merged into the chain
+static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid)
+{
+ int64_t qend, rend, x, y;
+ const mem_seed_t *last = &c->seeds[c->n-1];
+ qend = last->qbeg + last->len;
+ rend = last->rbeg + last->len;
+ if (seed_rid != c->rid) return 0; // different chr; request a new chain
+ if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
+ return 1; // contained seed; do nothing
+ if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand
+ x = p->qbeg - last->qbeg; // always non-negative
+ y = p->rbeg - last->rbeg;
+ if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
+ if (c->n == c->m) {
+ c->m <<= 1;
+ c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
+ }
+ c->seeds[c->n++] = *p;
+ return 1;
+ }
+ return 0; // request to add a new chain
+}
+
+int mem_chain_weight(const mem_chain_t *c)
+{
+ int64_t end;
+ int j, w = 0, tmp;
+ for (j = 0, end = 0; j < c->n; ++j) {
+ const mem_seed_t *s = &c->seeds[j];
+ if (s->qbeg >= end) w += s->len;
+ else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
+ end = end > s->qbeg + s->len? end : s->qbeg + s->len;
+ }
+ tmp = w; w = 0;
+ for (j = 0, end = 0; j < c->n; ++j) {
+ const mem_seed_t *s = &c->seeds[j];
+ if (s->rbeg >= end) w += s->len;
+ else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
+ end = end > s->rbeg + s->len? end : s->rbeg + s->len;
+ }
+ w = w < tmp? w : tmp;
+ return w < 1<<30? w : (1<<30)-1;
+}
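+
+/* Example: a chain with two seeds covering query intervals [0,30) and
+ * [20,50) contributes 30 + (50-30) = 50 bases on the query; the same sum is
+ * computed over the reference intervals, and the smaller of the two totals
+ * is the chain weight returned above. */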
+
+void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
+{
+ int i, j;
+ for (i = 0; i < chn->n; ++i) {
+ mem_chain_t *p = &chn->a[i];
+ err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p));
+ for (j = 0; j < p->n; ++j) {
+ bwtint_t pos;
+ int is_rev;
+ pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev);
+ if (is_rev) pos -= p->seeds[j].len - 1;
+ err_printf("\t%d;%d;%d,%ld(%s:%c%ld)", p->seeds[j].score, p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[p->rid].name, "+-"[is_rev], (long)(pos - bns->anns[p->rid].offset) + 1);
+ }
+ err_putchar('\n');
+ }
+}
+
+mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq, void *buf)
+{
+ int i, b, e, l_rep;
+ int64_t l_pac = bns->l_pac;
+ mem_chain_v chain;
+ kbtree_t(chn) *tree;
+ smem_aux_t *aux;
+
+ kv_init(chain);
+ if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
+ tree = kb_init(chn, KB_DEFAULT_SIZE);
+
+ aux = buf? (smem_aux_t*)buf : smem_aux_init();
+ mem_collect_intv(opt, bwt, len, seq, aux);
+ for (i = 0, b = e = l_rep = 0; i < aux->mem.n; ++i) { // compute frac_rep
+ bwtintv_t *p = &aux->mem.a[i];
+ int sb = (p->info>>32), se = (uint32_t)p->info;
+ if (p->x[2] <= opt->max_occ) continue;
+ if (sb > e) l_rep += e - b, b = sb, e = se;
+ else e = e > se? e : se;
+ }
+ l_rep += e - b;
+ for (i = 0; i < aux->mem.n; ++i) {
+ bwtintv_t *p = &aux->mem.a[i];
+ int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
+ int64_t k;
+ // if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
+ step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
+ for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
+ mem_chain_t tmp, *lower, *upper;
+ mem_seed_t s;
+ int rid, to_add = 0;
+ s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
+ s.qbeg = p->info>>32;
+ s.score= s.len = slen;
+ rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len);
+ if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary; TODO: split the seed; don't discard it!!!
+ if (kb_size(tree)) {
+ kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
+ if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1;
+ } else to_add = 1;
+ if (to_add) { // add the seed as a new chain
+ tmp.n = 1; tmp.m = 4;
+ tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
+ tmp.seeds[0] = s;
+ tmp.rid = rid;
+ tmp.is_alt = !!bns->anns[rid].is_alt;
+ kb_putp(chn, tree, &tmp);
+ }
+ }
+ }
+ if (buf == 0) smem_aux_destroy(aux);
+
+ kv_resize(mem_chain_t, chain, kb_size(tree));
+
+ #define traverse_func(p_) (chain.a[chain.n++] = *(p_))
+ __kb_traverse(mem_chain_t, tree, traverse_func);
+ #undef traverse_func
+
+ for (i = 0; i < chain.n; ++i) chain.a[i].frac_rep = (float)l_rep / len;
+ if (bwa_verbose >= 4) printf("* fraction of repetitive seeds: %.3f\n", (float)l_rep / len);
+
+ kb_destroy(chn, tree);
+ return chain;
+}
+
+/********************
+ * Filtering chains *
+ ********************/
+
+#define chn_beg(ch) ((ch).seeds->qbeg)
+#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len)
+
+#define flt_lt(a, b) ((a).w > (b).w)
+KSORT_INIT(mem_flt, mem_chain_t, flt_lt)
+
+int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
+{
+ int i, k;
+ kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains
+ if (n_chn == 0) return 0; // no need to filter
+ // compute the weight of each chain and drop chains with small weight
+ for (i = k = 0; i < n_chn; ++i) {
+ mem_chain_t *c = &a[i];
+ c->first = -1; c->kept = 0;
+ c->w = mem_chain_weight(c);
+ if (c->w < opt->min_chain_weight) free(c->seeds);
+ else a[k++] = *c;
+ }
+ n_chn = k;
+ ks_introsort(mem_flt, n_chn, a);
+ // pairwise chain comparisons
+ a[0].kept = 3;
+ kv_push(int, chains, 0);
+ for (i = 1; i < n_chn; ++i) {
+ int large_ovlp = 0;
+ for (k = 0; k < chains.n; ++k) {
+ int j = chains.a[k];
+ int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
+ int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]);
+ if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary
+ int li = chn_end(a[i]) - chn_beg(a[i]);
+ int lj = chn_end(a[j]) - chn_beg(a[j]);
+ int min_l = li < lj? li : lj;
+ if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap
+ large_ovlp = 1;
+ if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate
+ if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
+ break;
+ }
+ }
+ }
+ if (k == chains.n) {
+ kv_push(int, chains, i);
+ a[i].kept = large_ovlp? 2 : 3;
+ }
+ }
+ for (i = 0; i < chains.n; ++i) {
+ mem_chain_t *c = &a[chains.a[i]];
+ if (c->first >= 0) a[c->first].kept = 1;
+ }
+ free(chains.a);
+ for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains
+ if (a[i].kept == 0 || a[i].kept == 3) continue;
+ if (++k >= opt->max_chain_extend) break;
+ }
+ for (; i < n_chn; ++i)
+ if (a[i].kept < 3) a[i].kept = 0;
+ for (i = k = 0; i < n_chn; ++i) { // free discarded chains
+ mem_chain_t *c = &a[i];
+ if (c->kept == 0) free(c->seeds);
+ else a[k++] = a[i];
+ }
+ return k;
+}
+
+/******************************
+ * De-overlap single-end hits *
+ ******************************/
+
+#define alnreg_slt2(a, b) ((a).re < (b).re)
+KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
+
+#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
+KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
+
+#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash))))
+KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
+
+#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
+KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
+
+#define PATCH_MAX_R_BW 0.05f
+#define PATCH_MIN_SC_RATIO 0.90f
+
+int mem_patch_reg(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *a, const mem_alnreg_t *b, int *_w)
+{
+ int w, score, q_s, r_s;
+ double r;
+ if (bns == 0 || pac == 0 || query == 0) return 0;
+ assert(a->rid == b->rid && a->rb <= b->rb);
+ if (a->rb < bns->l_pac && b->rb >= bns->l_pac) return 0; // on different strands
+ if (a->qb >= b->qb || a->qe >= b->qe || a->re >= b->re) return 0; // not colinear
+ w = (a->re - b->rb) - (a->qe - b->qb); // required bandwidth
+ w = w > 0? w : -w; // l = abs(l)
+ r = (double)(a->re - b->rb) / (b->re - a->rb) - (double)(a->qe - b->qb) / (b->qe - a->qb); // relative bandwidth
+ r = r > 0.? r : -r; // r = fabs(r)
+ if (bwa_verbose >= 4)
+ printf("* potential hit merge between [%d,%d)<=>[%ld,%ld) and [%d,%d)<=>[%ld,%ld), @ %s; w=%d, r=%.4g\n",
+ a->qb, a->qe, (long)a->rb, (long)a->re, b->qb, b->qe, (long)b->rb, (long)b->re, bns->anns[a->rid].name, w, r);
+ if (a->re < b->rb || a->qe < b->qb) { // no overlap on query or on ref
+ if (w > opt->w<<1 || r >= PATCH_MAX_R_BW) return 0; // the bandwidth or the relative bandwidth is too large
+ } else if (w > opt->w<<2 || r >= PATCH_MAX_R_BW*2) return 0; // more permissive if overlapping on both ref and query
+ // global alignment
+ w += a->w + b->w;
+ w = w < opt->w<<2? w : opt->w<<2;
+ if (bwa_verbose >= 4) printf("* test potential hit merge with global alignment; w=%d\n", w);
+ bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w, bns->l_pac, pac, b->qe - a->qb, query + a->qb, a->rb, b->re, &score, 0, 0);
+ q_s = (int)((double)(b->qe - a->qb) / ((b->qe - b->qb) + (a->qe - a->qb)) * (b->score + a->score) + .499); // predicted score from query
+ r_s = (int)((double)(b->re - a->rb) / ((b->re - b->rb) + (a->re - a->rb)) * (b->score + a->score) + .499); // predicted score from ref
+ if (bwa_verbose >= 4) printf("* score=%d;(%d,%d)\n", score, q_s, r_s);
+ if ((double)score / (q_s > r_s? q_s : r_s) < PATCH_MIN_SC_RATIO) return 0;
+ *_w = w;
+ return score;
+}
+
+int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a)
+{
+ int m, i, j;
+ if (n <= 1) return n;
+ ks_introsort(mem_ars2, n, a); // sort by the END position, not START!
+ for (i = 0; i < n; ++i) a[i].n_comp = 1;
+ for (i = 1; i < n; ++i) {
+ mem_alnreg_t *p = &a[i];
+ if (p->rid != a[i-1].rid || p->rb >= a[i-1].re + opt->max_chain_gap) continue; // then no need to go into the loop below
+ for (j = i - 1; j >= 0 && p->rid == a[j].rid && p->rb < a[j].re + opt->max_chain_gap; --j) {
+ mem_alnreg_t *q = &a[j];
+ int64_t or, oq, mr, mq;
+ int score, w;
+ if (q->qe == q->qb) continue; // a[j] has been excluded
+ or = q->re - p->rb; // overlap length on the reference
+ oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query
+ mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment
+ mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment
+ if (or > opt->mask_level_redun * mr && oq > opt->mask_level_redun * mq) { // one of the hits is redundant
+ if (p->score < q->score) {
+ p->qe = p->qb;
+ break;
+ } else q->qe = q->qb;
+ } else if (q->rb < p->rb && (score = mem_patch_reg(opt, bns, pac, query, q, p, &w)) > 0) { // then merge q into p
+ p->n_comp += q->n_comp + 1;
+ p->seedcov = p->seedcov > q->seedcov? p->seedcov : q->seedcov;
+ p->sub = p->sub > q->sub? p->sub : q->sub;
+ p->csub = p->csub > q->csub? p->csub : q->csub;
+ p->qb = q->qb, p->rb = q->rb;
+ p->truesc = p->score = score;
+ p->w = w;
+ q->qb = q->qe;
+ }
+ }
+ }
+ for (i = 0, m = 0; i < n; ++i) // exclude identical hits
+ if (a[i].qe > a[i].qb) {
+ if (m != i) a[m++] = a[i];
+ else ++m;
+ }
+ n = m;
+ ks_introsort(mem_ars, n, a);
+ for (i = 1; i < n; ++i) { // mark identical hits
+ if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb)
+ a[i].qe = a[i].qb;
+ }
+ for (i = 1, m = 1; i < n; ++i) // exclude identical hits
+ if (a[i].qe > a[i].qb) {
+ if (m != i) a[m++] = a[i];
+ else ++m;
+ }
+ return m;
+}
+
+typedef kvec_t(int) int_v;
+
+static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
+{ // similar to the loop in mem_chain_flt()
+ int i, k, tmp;
+ tmp = opt->a + opt->b;
+ tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
+ tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
+ z->n = 0;
+ kv_push(int, *z, 0);
+ for (i = 1; i < n; ++i) {
+ for (k = 0; k < z->n; ++k) {
+ int j = z->a[k];
+ int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
+ int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
+ if (e_min > b_max) { // have overlap
+ int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
+ if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
+ if (a[j].sub == 0) a[j].sub = a[i].score;
+ if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
+ ++a[j].sub_n;
+ break;
+ }
+ }
+ }
+ if (k == z->n) kv_push(int, *z, i);
+ else a[i].secondary = z->a[k];
+ }
+}
+
+int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
+{
+ int i, n_pri;
+ int_v z = {0,0,0};
+ if (n == 0) return 0;
+ for (i = n_pri = 0; i < n; ++i) {
+ a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
+ if (!a[i].is_alt) ++n_pri;
+ }
+ ks_introsort(mem_ars_hash, n, a);
+ mem_mark_primary_se_core(opt, n, a, &z);
+ for (i = 0; i < n; ++i) {
+ mem_alnreg_t *p = &a[i];
+ p->secondary_all = i; // keep the rank in the first round
+ if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
+ p->alt_sc = a[p->secondary].score;
+ }
+ if (n_pri >= 0 && n_pri < n) {
+ kv_resize(int, z, n);
+ if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
+ for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
+ for (i = 0; i < n; ++i) {
+ if (a[i].secondary >= 0) {
+ a[i].secondary_all = z.a[a[i].secondary];
+ if (a[i].is_alt) a[i].secondary = INT_MAX;
+ } else a[i].secondary_all = -1;
+ }
+ if (n_pri > 0) { // mark primary for hits to the primary assembly only
+ for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
+ mem_mark_primary_se_core(opt, n_pri, a, &z);
+ }
+ } else {
+ for (i = 0; i < n; ++i)
+ a[i].secondary_all = a[i].secondary;
+ }
+ free(z.a);
+ return n_pri;
+}
+
+/*********************************
+ * Test if a seed is good enough *
+ *********************************/
+
+#define MEM_SHORT_EXT 50
+#define MEM_SHORT_LEN 200
+
+#define MEM_HSP_COEF 1.1f
+#define MEM_MINSC_COEF 5.5f
+#define MEM_SEEDSW_COEF 0.05f
+
+int mem_seed_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_seed_t *s)
+{
+ int qb, qe, rid;
+ int64_t rb, re, mid, l_pac = bns->l_pac;
+ uint8_t *rseq = 0;
+ kswr_t x;
+
+ if (s->len >= MEM_SHORT_LEN) return -1; // the seed is longer than the max-extend; no need to do SW
+ qb = s->qbeg, qe = s->qbeg + s->len;
+ rb = s->rbeg, re = s->rbeg + s->len;
+ mid = (rb + re) >> 1;
+ qb -= MEM_SHORT_EXT; qb = qb > 0? qb : 0;
+ qe += MEM_SHORT_EXT; qe = qe < l_query? qe : l_query;
+ rb -= MEM_SHORT_EXT; rb = rb > 0? rb : 0;
+ re += MEM_SHORT_EXT; re = re < l_pac<<1? re : l_pac<<1;
+ if (rb < l_pac && l_pac < re) {
+ if (mid < l_pac) re = l_pac;
+ else rb = l_pac;
+ }
+ if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return -1; // the seed seems good enough; no need to do SW
+
+ rseq = bns_fetch_seq(bns, pac, &rb, mid, &re, &rid);
+ x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0);
+ free(rseq);
+ return x.score;
+}
+
+void mem_flt_chained_seeds(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, int n_chn, mem_chain_t *a)
+{
+ double min_l = opt->min_chain_weight? MEM_HSP_COEF * opt->min_chain_weight : MEM_MINSC_COEF * log(l_query);
+ int i, j, k, min_HSP_score = (int)(opt->a * min_l + .499);
+ if (min_l > MEM_SEEDSW_COEF * l_query) return; // don't run the following for short reads
+ for (i = 0; i < n_chn; ++i) {
+ mem_chain_t *c = &a[i];
+ for (j = k = 0; j < c->n; ++j) {
+ mem_seed_t *s = &c->seeds[j];
+ s->score = mem_seed_sw(opt, bns, pac, l_query, query, s);
+ if (s->score < 0 || s->score >= min_HSP_score) {
+ s->score = s->score < 0? s->len * opt->a : s->score;
+ c->seeds[k++] = *s;
+ }
+ }
+ c->n = k;
+ }
+}
+
+/****************************************
+ * Construct the alignment from a chain *
+ ****************************************/
+
+static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
+{
+ int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.);
+ int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.);
+ int l = l_del > l_ins? l_del : l_ins;
+ l = l > 1? l : 1;
+ return l < opt->w<<1? l : opt->w<<1;
+}
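+
+/* Example with the defaults from mem_opt_init() (a=1, o_del=o_ins=6,
+ * e_del=e_ins=1, w=100): for qlen=100, l_del = l_ins = (100-6)/1 + 1 = 95,
+ * which is below the 2*w = 200 cap, so cal_max_gap() returns 95. */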
+
+#define MAX_BAND_TRY 2
+
+void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av)
+{
+ int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension
+ int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0;
+ const mem_seed_t *s;
+ uint8_t *rseq = 0;
+ uint64_t *srt;
+
+ if (c->n == 0) return;
+ // get the max possible span
+ rmax[0] = l_pac<<1; rmax[1] = 0;
+ for (i = 0; i < c->n; ++i) {
+ int64_t b, e;
+ const mem_seed_t *t = &c->seeds[i];
+ b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
+ e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
+ rmax[0] = rmax[0] < b? rmax[0] : b;
+ rmax[1] = rmax[1] > e? rmax[1] : e;
+ if (t->len > max) max = t->len;
+ }
+ rmax[0] = rmax[0] > 0? rmax[0] : 0;
+ rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
+ if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
+ if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand
+ else rmax[0] = l_pac;
+ }
+ // retrieve the reference sequence
+ rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid);
+ assert(c->rid == rid);
+
+ srt = malloc(c->n * 8);
+ for (i = 0; i < c->n; ++i)
+ srt[i] = (uint64_t)c->seeds[i].score<<32 | i;
+ ks_introsort_64(c->n, srt);
+
+ for (k = c->n - 1; k >= 0; --k) {
+ mem_alnreg_t *a;
+ s = &c->seeds[(uint32_t)srt[k]];
+
+ for (i = 0; i < av->n; ++i) { // test whether extension has been made before
+ mem_alnreg_t *p = &av->a[i];
+ int64_t rd;
+ int qd, w, max_gap;
+ if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
+ if (s->len - p->seedlen0 > .1 * l_query) continue; // this seed may give a better alignment
+ // qd: distance ahead of the seed on query; rd: on reference
+ qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
+ max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
+ w = max_gap < p->w? max_gap : p->w; // bounded by the band width
+ if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
+ // similar to the previous four lines, but this time we look at the region behind
+ qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
+ max_gap = cal_max_gap(opt, qd < rd? qd : rd);
+ w = max_gap < p->w? max_gap : p->w;
+ if (qd - rd < w && rd - qd < w) break;
+ }
+ if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
+ if (bwa_verbose >= 4)
+ printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment [%d,%d) <=> [%ld,%ld)\n",
+ k, (long)s->len, (long)s->qbeg, (long)s->rbeg, av->a[i].qb, av->a[i].qe, (long)av->a[i].rb, (long)av->a[i].re);
+ for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain
+ const mem_seed_t *t;
+ if (srt[i] == 0) continue;
+ t = &c->seeds[(uint32_t)srt[i]];
+ if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping
+ if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break;
+ if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break;
+ }
+ if (i == c->n) { // no overlapping seeds; then skip extension
+ srt[k] = 0; // mark that seed extension has not been performed
+ continue;
+ }
+ if (bwa_verbose >= 4)
+ printf("** Seed(%d) might lead to a different alignment even though it is contained. Extension will be performed.\n", k);
+ }
+
+ a = kv_pushp(mem_alnreg_t, *av);
+ memset(a, 0, sizeof(mem_alnreg_t));
+ a->w = aw[0] = aw[1] = opt->w;
+ a->score = a->truesc = -1;
+ a->rid = c->rid;
+
+ if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] @ %s <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg, bns->anns[c->rid].name);
+ if (s->qbeg) { // left extension
+ uint8_t *rs, *qs;
+ int qle, tle, gtle, gscore;
+ qs = malloc(s->qbeg);
+ for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i];
+ tmp = s->rbeg - rmax[0];
+ rs = malloc(tmp);
+ for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
+ for (i = 0; i < MAX_BAND_TRY; ++i) {
+ int prev = a->score;
+ aw[0] = opt->w << i;
+ if (bwa_verbose >= 4) {
+ int j;
+ printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n');
+ printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n');
+ }
+ a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
+ if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
+ if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
+ }
+ // check whether we prefer to reach the end of the query
+ if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension
+ a->qb = s->qbeg - qle, a->rb = s->rbeg - tle;
+ a->truesc = a->score;
+ } else { // to-end extension
+ a->qb = 0, a->rb = s->rbeg - gtle;
+ a->truesc = gscore;
+ }
+ free(qs); free(rs);
+ } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg;
+
+ if (s->qbeg + s->len != l_query) { // right extension
+ int qle, tle, qe, re, gtle, gscore, sc0 = a->score;
+ qe = s->qbeg + s->len;
+ re = s->rbeg + s->len - rmax[0];
+ assert(re >= 0);
+ for (i = 0; i < MAX_BAND_TRY; ++i) {
+ int prev = a->score;
+ aw[1] = opt->w << i;
+ if (bwa_verbose >= 4) {
+ int j;
+ printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n');
+ printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n');
+ }
+ a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
+ if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
+ if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
+ }
+ // similar to the above
+ if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension
+ a->qe = qe + qle, a->re = rmax[0] + re + tle;
+ a->truesc += a->score - sc0;
+ } else { // to-end extension
+ a->qe = l_query, a->re = rmax[0] + re + gtle;
+ a->truesc += gscore - sc0;
+ }
+ } else a->qe = l_query, a->re = s->rbeg + s->len;
+ if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]);
+
+ // compute seedcov
+ for (i = 0, a->seedcov = 0; i < c->n; ++i) {
+ const mem_seed_t *t = &c->seeds[i];
+ if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
+ a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
+ }
+ a->w = aw[0] > aw[1]? aw[0] : aw[1];
+ a->seedlen0 = s->len;
+
+ a->frac_rep = c->frac_rep;
+ }
+ free(srt); free(rseq);
+}
+
+/*****************************
+ * Basic hit->SAM conversion *
+ *****************************/
+
+static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
+{
+ int w;
+ if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps
+ w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.);
+ if (w < abs(l1 - l2)) w = abs(l1 - l2);
+ return w;
+}
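+
+/* Example with a=1, q=6, r=1 and two 100 bp sequences: a score of 90 needs
+ * no gaps (100*1 - 90 = 10 < (6+1-1)<<1 = 12, so 0 is returned), whereas a
+ * score of 80 gives w = (100 - 80 - 6)/1 + 2 = 16. */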
+
+static inline int get_rlen(int n_cigar, const uint32_t *cigar)
+{
+ int k, l;
+ for (k = l = 0; k < n_cigar; ++k) {
+ int op = cigar[k]&0xf;
+ if (op == 0 || op == 2)
+ l += cigar[k]>>4;
+ }
+ return l;
+}
+
+void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_)
+{
+ int i, l_name;
+ mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
+
+ if (m_) mtmp = *m_, m = &mtmp;
+ // set flag
+ p->flag |= m? 0x1 : 0; // is paired in sequencing
+ p->flag |= p->rid < 0? 0x4 : 0; // is mapped
+ p->flag |= m && m->rid < 0? 0x8 : 0; // is mate mapped
+ if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment
+ p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0;
+ if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate
+ m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0;
+ p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand
+ p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand
+
+ // print up to CIGAR
+ l_name = strlen(s->name);
+ ks_resize(str, str->l + s->l_seq + l_name + (s->qual? s->l_seq : 0) + 20);
+ kputsn(s->name, l_name, str); kputc('\t', str); // QNAME
+ kputw((p->flag&0xffff) | (p->flag&0x10000? 0x100 : 0), str); kputc('\t', str); // FLAG
+ if (p->rid >= 0) { // with coordinate
+ kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME
+ kputl(p->pos + 1, str); kputc('\t', str); // POS
+ kputw(p->mapq, str); kputc('\t', str); // MAPQ
+ if (p->n_cigar) { // aligned
+ for (i = 0; i < p->n_cigar; ++i) {
+ int c = p->cigar[i]&0xf;
+ if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4))
+ c = which? 4 : 3; // use hard clipping for supplementary alignments
+ kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
+ }
+ } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true)
+ } else kputsn("*\t0\t0\t*", 7, str); // without coordinate
+ kputc('\t', str);
+
+ // print the mate position if applicable
+ if (m && m->rid >= 0) {
+ if (p->rid == m->rid) kputc('=', str);
+ else kputs(bns->anns[m->rid].name, str);
+ kputc('\t', str);
+ kputl(m->pos + 1, str); kputc('\t', str);
+ if (p->rid == m->rid) {
+ int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0);
+ int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0);
+ if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str);
+ else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str);
+ } else kputc('0', str);
+ } else kputsn("*\t0\t0", 5, str);
+ kputc('\t', str);
+
+ // print SEQ and QUAL
+ if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL
+ kputsn("*\t*", 3, str);
+ } else if (!p->is_rev) { // the forward strand
+ int i, qb = 0, qe = s->l_seq;
+ if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { // have cigar && not the primary alignment && not softclip all
+ if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4;
+ if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4;
+ }
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
+ kputc('\t', str);
+ if (s->qual) { // print qual
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
+ str->s[str->l] = 0;
+ } else kputc('*', str);
+ } else { // the reverse strand
+ int i, qb = 0, qe = s->l_seq;
+ if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) {
+ if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4;
+ if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4;
+ }
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]];
+ kputc('\t', str);
+ if (s->qual) { // print qual
+ ks_resize(str, str->l + (qe - qb) + 1);
+ for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i];
+ str->s[str->l] = 0;
+ } else kputc('*', str);
+ }
+
+ // print optional tags
+ if (p->n_cigar) {
+ kputsn("\tNM:i:", 6, str); kputw(p->NM, str);
+ kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str);
+ }
+ if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
+ if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
+ if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
+ if (!(p->flag & 0x100)) { // not multi-hit
+ for (i = 0; i < n; ++i)
+ if (i != which && !(list[i].flag&0x100)) break;
+ if (i < n) { // there are other primary hits; output them
+ kputsn("\tSA:Z:", 6, str);
+ for (i = 0; i < n; ++i) {
+ const mem_aln_t *r = &list[i];
+ int k;
+ if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
+ kputs(bns->anns[r->rid].name, str); kputc(',', str);
+ kputl(r->pos+1, str); kputc(',', str);
+ kputc("+-"[r->is_rev], str); kputc(',', str);
+ for (k = 0; k < r->n_cigar; ++k) {
+ kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str);
+ }
+ kputc(',', str); kputw(r->mapq, str);
+ kputc(',', str); kputw(r->NM, str);
+ kputc(';', str);
+ }
+ }
+ if (p->alt_sc > 0)
+ ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
+ }
+ if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
+ if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+ if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) {
+ int tmp;
+ kputsn("\tXR:Z:", 6, str);
+ tmp = str->l;
+ kputs(bns->anns[p->rid].anno, str);
+ for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE
+ if (str->s[i] == '\t') str->s[i] = ' ';
+ }
+ kputc('\n', str);
+}
+
+/************************
+ * Integrated interface *
+ ************************/
+
+int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
+{
+ int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a;
+ double identity;
+ sub = a->csub > sub? a->csub : sub;
+ if (sub >= a->score) return 0;
+ l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb;
+ identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l;
+ if (a->score == 0) {
+ mapq = 0;
+ } else if (opt->mapQ_coef_len > 0) {
+ double tmp;
+ tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l);
+ tmp *= identity * identity;
+ mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499);
+ } else {
+ mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499);
+ mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq;
+ }
+ if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499);
+ if (mapq > 60) mapq = 60;
+ if (mapq < 0) mapq = 0;
+ mapq = (int)(mapq * (1. - a->frac_rep) + .499);
+ return mapq;
+}
+
+// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
+void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
+{
+ extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
+ kstring_t str;
+ kvec_t(mem_aln_t) aa;
+ int k, l;
+ char **XA = 0;
+
+ if (!(opt->flag & MEM_F_ALL))
+ XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
+ kv_init(aa);
+ str.l = str.m = 0; str.s = 0;
+ for (k = l = 0; k < a->n; ++k) {
+ mem_alnreg_t *p = &a->a[k];
+ mem_aln_t *q;
+ if (p->score < opt->T) continue;
+ if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue;
+ if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
+ q = kv_pushp(mem_aln_t, aa);
+ *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
+ assert(q->rid >= 0); // this should not happen with the new code
+ q->XA = XA? XA[k] : 0;
+ q->flag |= extra_flag; // flag secondary
+ if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
+ if (l && p->secondary < 0) // if supplementary
+ q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
+ if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+ ++l;
+ }
+ if (aa.n == 0) { // no alignments good enough; then write an unaligned record
+ mem_aln_t t;
+ t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
+ t.flag |= extra_flag;
+ mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m);
+ } else {
+ for (k = 0; k < aa.n; ++k)
+ mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m);
+ for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
+ free(aa.a);
+ }
+ s->sam = str.s;
+ if (XA) {
+ for (k = 0; k < a->n; ++k) free(XA[k]);
+ free(XA);
+ }
+}
+
+mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf)
+{
+ int i;
+ mem_chain_v chn;
+ mem_alnreg_v regs;
+
+ for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
+ seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
+
+ chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq, buf);
+ chn.n = mem_chain_flt(opt, chn.n, chn.a);
+ mem_flt_chained_seeds(opt, bns, pac, l_seq, (uint8_t*)seq, chn.n, chn.a);
+ if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
+
+ kv_init(regs);
+ for (i = 0; i < chn.n; ++i) {
+ mem_chain_t *p = &chn.a[i];
+ if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i);
+ mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs);
+ free(chn.a[i].seeds);
+ }
+ free(chn.a);
+ regs.n = mem_sort_dedup_patch(opt, bns, pac, (uint8_t*)seq, regs.n, regs.a);
+ if (bwa_verbose >= 4) {
+ err_printf("* %ld chains remain after removing duplicated chains\n", regs.n);
+ for (i = 0; i < regs.n; ++i) {
+ mem_alnreg_t *p = &regs.a[i];
+ printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
+ }
+ }
+ for (i = 0; i < regs.n; ++i) {
+ mem_alnreg_t *p = &regs.a[i];
+ if (p->rid >= 0 && bns->anns[p->rid].is_alt)
+ p->is_alt = 1;
+ }
+ return regs;
+}
+
+mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar)
+{
+ mem_aln_t a;
+ int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD;
+ int64_t pos, rb, re;
+ uint8_t *query;
+
+ memset(&a, 0, sizeof(mem_aln_t));
+ if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record
+ a.rid = -1; a.pos = -1; a.flag |= 0x4;
+ return a;
+ }
+ qb = ar->qb, qe = ar->qe;
+ rb = ar->rb, re = ar->re;
+ query = malloc(l_query);
+ for (i = 0; i < l_query; ++i) // convert to the nt4 encoding
+ query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]];
+ a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0;
+ if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment
+ tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del);
+ w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins);
+ w2 = w2 > tmp? w2 : tmp;
+ if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w);
+ if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w;
+ i = 0; a.cigar = 0;
+ do {
+ free(a.cigar);
+ w2 = w2 < opt->w<<2? w2 : opt->w<<2;
+ a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
+ if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc);
+ if (score == last_sc || w2 == opt->w<<2) break; // it is possible that global alignment and local alignment give different scores
+ last_sc = score;
+ w2 <<= 1;
+ } while (++i < 3 && score < ar->truesc - opt->a);
+ l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1;
+ a.NM = NM;
+ pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
+ a.is_rev = is_rev;
+ if (a.n_cigar > 0) { // squeeze out leading or trailing deletions
+ if ((a.cigar[0]&0xf) == 2) {
+ pos += a.cigar[0]>>4;
+ --a.n_cigar;
+ memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD);
+ } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) {
+ --a.n_cigar;
+ memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly
+ }
+ }
+ if (qb != 0 || qe != l_query) { // add clipping to CIGAR
+ int clip5, clip3;
+ clip5 = is_rev? l_query - qe : qb;
+ clip3 = is_rev? qb : l_query - qe;
+ a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD);
+ if (clip5) {
+ memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping
+ a.cigar[0] = clip5<<4 | 3;
+ ++a.n_cigar;
+ }
+ if (clip3) {
+ memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping
+ a.cigar[a.n_cigar++] = clip3<<4 | 3;
+ }
+ }
+ a.rid = bns_pos2rid(bns, pos);
+ assert(a.rid == ar->rid);
+ a.pos = pos - bns->anns[a.rid].offset;
+ a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
+ a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc;
+ free(query);
+ return a;
+}
+
+typedef struct {
+ const mem_opt_t *opt;
+ const bwt_t *bwt;
+ const bntseq_t *bns;
+ const uint8_t *pac;
+ const mem_pestat_t *pes;
+ smem_aux_t **aux;
+ bseq1_t *seqs;
+ mem_alnreg_v *regs;
+ int64_t n_processed;
+} worker_t;
+
+static void worker1(void *data, int i, int tid)
+{
+ worker_t *w = (worker_t*)data;
+ if (!(w->opt->flag&MEM_F_PE)) {
+ if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name);
+ w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq, w->aux[tid]);
+ } else {
+ if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name);
+ w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq, w->aux[tid]);
+ if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name);
+ w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq, w->aux[tid]);
+ }
+}
+
+static void worker2(void *data, int i, int tid)
+{
+ extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
+ extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a);
+ worker_t *w = (worker_t*)data;
+ if (!(w->opt->flag&MEM_F_PE)) {
+ if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name);
+ mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
+ mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
+ free(w->regs[i].a);
+ } else {
+ if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name);
+ mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]);
+ free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
+ }
+}
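+
+/* mem_process_seqs() below drives one batch: worker1 maps each read (or
+ * each read pair) with mem_align1_core() via kt_for(); for paired-end data
+ * the insert-size distribution is then copied from pes0 or estimated with
+ * mem_pestat(); finally worker2 converts the alignment regions to SAM via
+ * mem_reg2sam() (single-end) or mem_sam_pe() (paired-end). */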
+
+void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0)
+{
+ extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
+ worker_t w;
+ mem_pestat_t pes[4];
+ double ctime, rtime;
+ int i;
+
+ ctime = cputime(); rtime = realtime();
+ global_bns = bns;
+ w.regs = malloc(n * sizeof(mem_alnreg_v));
+ w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
+ w.seqs = seqs; w.n_processed = n_processed;
+ w.pes = &pes[0];
+ w.aux = malloc(opt->n_threads * sizeof(smem_aux_t));
+ for (i = 0; i < opt->n_threads; ++i)
+ w.aux[i] = smem_aux_init();
+ kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions
+ for (i = 0; i < opt->n_threads; ++i)
+ smem_aux_destroy(w.aux[i]);
+ free(w.aux);
+ if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
+ if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
+ else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data
+ }
+ kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
+ free(w.regs);
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);
+}
diff --git a/ext/src/bwa/bwamem_extra.c b/ext/src/bwa/bwamem_extra.c
new file mode 100644
index 0000000..bb520fd
--- /dev/null
+++ b/ext/src/bwa/bwamem_extra.c
@@ -0,0 +1,140 @@
+#include <limits.h>
+#include "bwa/bwa.h"
+#include "bwa/bwamem.h"
+#include "bwa/bntseq.h"
+#include "kstring.h"
+
+/***************************
+ * SMEM iterator interface *
+ ***************************/
+
+struct __smem_i {
+ const bwt_t *bwt;
+ const uint8_t *query;
+ int start, len;
+ int min_intv, max_len;
+ uint64_t max_intv;
+ bwtintv_v *matches; // matches; to be returned by smem_next()
+ bwtintv_v *sub; // sub-matches inside the longest match; temporary
+ bwtintv_v *tmpvec[2]; // temporary arrays
+};
+
+smem_i *smem_itr_init(const bwt_t *bwt)
+{
+ smem_i *itr;
+ itr = calloc(1, sizeof(smem_i));
+ itr->bwt = bwt;
+ itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
+ itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
+ itr->matches = calloc(1, sizeof(bwtintv_v));
+ itr->sub = calloc(1, sizeof(bwtintv_v));
+ itr->min_intv = 1;
+ itr->max_len = INT_MAX;
+ itr->max_intv = 0;
+ return itr;
+}
+
+void smem_itr_destroy(smem_i *itr)
+{
+ free(itr->tmpvec[0]->a); free(itr->tmpvec[0]);
+ free(itr->tmpvec[1]->a); free(itr->tmpvec[1]);
+ free(itr->matches->a); free(itr->matches);
+ free(itr->sub->a); free(itr->sub);
+ free(itr);
+}
+
+void smem_set_query(smem_i *itr, int len, const uint8_t *query)
+{
+ itr->query = query;
+ itr->start = 0;
+ itr->len = len;
+}
+
+void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
+{
+ itr->min_intv = min_intv;
+ itr->max_len = max_len;
+ itr->max_intv = max_intv;
+}
+
+const bwtintv_v *smem_next(smem_i *itr)
+{
+ int ori_start;
+ itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
+ if (itr->start >= itr->len || itr->start < 0) return 0;
+ while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
+ if (itr->start == itr->len) return 0;
+ ori_start = itr->start;
+ itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
+ return itr->matches;
+}
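+
+/* Minimal iterator sketch (assumes a loaded bwt_t index and a query already
+ * converted to the 2-bit encoding, with 4 for N). Each returned bwtintv_t
+ * packs the query interval into .info (start in the high 32 bits, end in
+ * the low 32 bits) and the number of occurrences into .x[2]:
+ *
+ *   smem_i *itr = smem_itr_init(bwt);
+ *   const bwtintv_v *a;
+ *   smem_set_query(itr, len, seq);
+ *   while ((a = smem_next(itr)) != 0) {
+ *       size_t i;
+ *       for (i = 0; i < a->n; ++i) {
+ *           bwtintv_t *p = &a->a[i];
+ *           printf("SMEM [%d,%d) x %ld\n", (int)(p->info>>32),
+ *                  (int)(uint32_t)p->info, (long)p->x[2]);
+ *       }
+ *   }
+ *   smem_itr_destroy(itr);
+ */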
+
+/***********************
+ *** Extra functions ***
+ ***********************/
+
+mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_)
+{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence
+ extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq, void *buf);
+ extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
+ mem_alnreg_v ar;
+ char *seq;
+ seq = malloc(l_seq);
+ memcpy(seq, seq_, l_seq); // makes a copy of seq_
+ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq, 0);
+ mem_mark_primary_se(opt, ar.n, ar.a, lrand48());
+ free(seq);
+ return ar;
+}
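+
+/* Single-read sketch built on the function above (idx is assumed to be a
+ * bwaidx_t obtained elsewhere, e.g. via bwa_idx_load(); seq/l_seq are the
+ * read bases and length):
+ *
+ *   mem_opt_t *opt = mem_opt_init();
+ *   mem_alnreg_v ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, l_seq, seq);
+ *   size_t i;
+ *   for (i = 0; i < ar.n; ++i) {
+ *       mem_aln_t a = mem_reg2aln(opt, idx->bns, idx->pac, l_seq, seq, &ar.a[i]);
+ *       // a.rid, a.pos, a.is_rev, a.mapq and a.cigar describe one hit
+ *       free(a.cigar);
+ *   }
+ *   free(ar.a); free(opt);
+ */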
+
+static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
+{
+ int k = a[i].secondary_all;
+ if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
+ return -1;
+}
+
+// Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
+char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
+{
+ int i, k, r, *cnt, tot;
+ kstring_t *aln = 0, str = {0,0,0};
+ char **XA = 0, *has_alt;
+
+ cnt = calloc(a->n, sizeof(int));
+ has_alt = calloc(a->n, 1);
+ for (i = 0, tot = 0; i < a->n; ++i) {
+ r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
+ if (r >= 0) {
+ ++cnt[r], ++tot;
+ if (a->a[i].is_alt) has_alt[r] = 1;
+ }
+ }
+ if (tot == 0) goto end_gen_alt;
+ aln = calloc(a->n, sizeof(kstring_t));
+ for (i = 0; i < a->n; ++i) {
+ mem_aln_t t;
+ if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
+ if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
+ t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
+ str.l = 0;
+ kputs(bns->anns[t.rid].name, &str);
+ kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str);
+ kputc(',', &str);
+ for (k = 0; k < t.n_cigar; ++k) {
+ kputw(t.cigar[k]>>4, &str);
+ kputc("MIDSHN"[t.cigar[k]&0xf], &str);
+ }
+ kputc(',', &str); kputw(t.NM, &str);
+ kputc(';', &str);
+ free(t.cigar);
+ kputsn(str.s, str.l, &aln[r]);
+ }
+ XA = calloc(a->n, sizeof(char*));
+ for (k = 0; k < a->n; ++k)
+ XA[k] = aln[k].s;
+
+end_gen_alt:
+ free(has_alt); free(cnt); free(aln); free(str.s);
+ return XA;
+}
diff --git a/ext/src/bwa/bwamem_pair.c b/ext/src/bwa/bwamem_pair.c
new file mode 100644
index 0000000..395f73c
--- /dev/null
+++ b/ext/src/bwa/bwamem_pair.c
@@ -0,0 +1,388 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include "kstring.h"
+#include "bwa/bwamem.h"
+#include "kvec.h"
+#include "bwa/utils.h"
+#include "ksw.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+
+#define MIN_RATIO 0.8
+#define MIN_DIR_CNT 10
+#define MIN_DIR_RATIO 0.05
+#define OUTLIER_BOUND 2.0
+#define MAPPING_BOUND 3.0
+#define MAX_STDDEV 4.0
+
+static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
+{
+ int64_t p2;
+ int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac);
+ p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand
+ *dist = p2 > b1? p2 - b1 : b1 - p2;
+ return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3);
+}
+
+static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
+{
+ int j;
+ for (j = 1; j < r->n; ++j) { // choose unique alignment
+ int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;
+ int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;
+ if (e_min > b_max) { // have overlap
+ int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb;
+ if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap
+ }
+ }
+ return j < r->n? r->a[j].score : opt->min_seed_len * opt->a;
+}
+
+void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
+{
+ int i, d, max;
+ uint64_v isize[4];
+ memset(pes, 0, 4 * sizeof(mem_pestat_t));
+ memset(isize, 0, sizeof(kvec_t(int)) * 4);
+ for (i = 0; i < n>>1; ++i) {
+ int dir;
+ int64_t is;
+ mem_alnreg_v *r[2];
+ r[0] = (mem_alnreg_v*)&regs[i<<1|0];
+ r[1] = (mem_alnreg_v*)&regs[i<<1|1];
+ if (r[0]->n == 0 || r[1]->n == 0) continue;
+ if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue;
+ if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
+ if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr
+ dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
+ if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
+ }
+ if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n);
+ for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
+ mem_pestat_t *r = &pes[d];
+ uint64_v *q = &isize[d];
+ int p25, p50, p75, x;
+ if (q->n < MIN_DIR_CNT) {
+ fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+ r->failed = 1;
+ free(q->a);
+ continue;
+ } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+ ks_introsort_64(q->n, q->a);
+ p25 = q->a[(int)(.25 * q->n + .499)];
+ p50 = q->a[(int)(.50 * q->n + .499)];
+ p75 = q->a[(int)(.75 * q->n + .499)];
+ r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
+ if (r->low < 1) r->low = 1;
+ r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
+ fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
+ fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
+ for (i = x = 0, r->avg = 0; i < q->n; ++i)
+ if (q->a[i] >= r->low && q->a[i] <= r->high)
+ r->avg += q->a[i], ++x;
+ r->avg /= x;
+ for (i = 0, r->std = 0; i < q->n; ++i)
+ if (q->a[i] >= r->low && q->a[i] <= r->high)
+ r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
+ r->std = sqrt(r->std / x);
+ fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
+ r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
+ r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
+ if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499);
+ if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499);
+ if (r->low < 1) r->low = 1;
+ fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
+ free(q->a);
+ }
+ for (d = 0, max = 0; d < 4; ++d)
+ max = max > isize[d].n? max : isize[d].n;
+ for (d = 0; d < 4; ++d)
+ if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
+ pes[d].failed = 1;
+ fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+ }
+}
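+
+/* Worked example of the bounds above: with insert-size quartiles p25=350 and
+ * p75=450 (IQR=100), mean and std.dev are computed over
+ * [350 - 2*100, 450 + 2*100] = [150, 650] (OUTLIER_BOUND), while proper pairs
+ * are accepted in [350 - 3*100, 450 + 3*100] = [50, 750] (MAPPING_BOUND),
+ * with the low end pushed down to avg - MAX_STDDEV*std when that is smaller
+ * and clamped below at 1. */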
+
+int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
+{
+ extern int mem_sort_dedup_patch(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int n, mem_alnreg_t *a);
+ int64_t l_pac = bns->l_pac;
+ int i, r, skip[4], n = 0, rid;
+ for (r = 0; r < 4; ++r)
+ skip[r] = pes[r].failed? 1 : 0;
+ for (i = 0; i < ma->n; ++i) { // check which orientation has been found
+ int64_t dist;
+ r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
+ if (dist >= pes[r].low && dist <= pes[r].high)
+ skip[r] = 1;
+ }
+ if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // a consistent pair exists; no need to perform SW
+ for (r = 0; r < 4; ++r) {
+ int is_rev, is_larger;
+ uint8_t *seq, *rev = 0, *ref = 0;
+ int64_t rb, re;
+ if (skip[r]) continue;
+ is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
+ is_larger = !(r>>1); // whether the mate has larger coordinate
+ if (is_rev) {
+ rev = malloc(l_ms); // this is the reverse complement of $ms
+ for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
+ seq = rev;
+ } else seq = (uint8_t*)ms;
+ if (!is_rev) {
+ rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
+ re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
+ } else {
+ rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
+ re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
+ }
+ if (rb < 0) rb = 0;
+ if (re > l_pac<<1) re = l_pac<<1;
+ if (rb < re) ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid);
+ if (a->rid == rid && re - rb >= opt->min_seed_len) { // no funny things happening
+ kswr_t aln;
+ mem_alnreg_t b;
+ int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a);
+ aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0);
+ memset(&b, 0, sizeof(mem_alnreg_t));
+ if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
+ b.rid = a->rid;
+ b.is_alt = a->is_alt;
+ b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;
+ b.qe = is_rev? l_ms - aln.qb : aln.qe + 1;
+ b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
+ b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1;
+ b.score = aln.score;
+ b.csub = aln.score2;
+ b.secondary = -1;
+ b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1;
+// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
+ kv_push(mem_alnreg_t, *ma, b); // make room for a new element
+ // move b s.t. ma is sorted
+ for (i = 0; i < ma->n - 1; ++i) // find the insertion point
+ if (ma->a[i].score < b.score) break;
+ tmp = i;
+ for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1];
+ ma->a[i] = b;
+ }
+ ++n;
+ }
+ if (n) ma->n = mem_sort_dedup_patch(opt, 0, 0, 0, ma->n, ma->a);
+ if (rev) free(rev);
+ free(ref);
+ }
+ return n;
+}
+
+int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
+{
+ pair64_v v, u;
+ int r, i, k, y[4], ret; // y[] keeps the last hit
+ int64_t l_pac = bns->l_pac;
+ kv_init(v); kv_init(u);
+ for (r = 0; r < 2; ++r) { // loop through read number
+ for (i = 0; i < n_pri[r]; ++i) {
+ pair64_t key;
+ mem_alnreg_t *e = &a[r].a[i];
+ key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
+ key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset);
+ key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r;
+ kv_push(pair64_t, v, key);
+ }
+ }
+ ks_introsort_128(v.n, v.a);
+ y[0] = y[1] = y[2] = y[3] = -1;
+ //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
+ for (i = 0; i < v.n; ++i) {
+ for (r = 0; r < 2; ++r) { // loop through direction
+ int dir = r<<1 | (v.a[i].y>>1&1), which;
+ if (pes[dir].failed) continue; // invalid orientation
+ which = r<<1 | ((v.a[i].y&1)^1);
+ if (y[which] < 0) continue; // no previous hits
+ for (k = y[which]; k >= 0; --k) { // TODO: this is an O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt it)
+ int64_t dist;
+ int q;
+ double ns;
+ pair64_t *p;
+ if ((v.a[k].y&3) != which) continue;
+ dist = (int64_t)v.a[i].x - v.a[k].x;
+ //printf("%d: %lld\n", k, dist);
+ if (dist > pes[dir].high) break;
+ if (dist < pes[dir].low) continue;
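+ // ns is the deviation of the observed insert size from the mean in standard-deviation units; the pair
+ // score q adds the two single-end scores plus a penalty proportional to the log tail probability of such
+ // a deviation under a normal model, with .721 = 1/ln(4) converting the natural log into match-score (opt->a) units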
+ ns = (dist - pes[dir].avg) / pes[dir].std;
+ q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4)
+ if (q < 0) q = 0;
+ p = kv_pushp(pair64_t, u);
+ p->y = (uint64_t)k<<32 | i;
+ p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU);
+ //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
+ }
+ }
+ y[v.a[i].y&3] = i;
+ }
+ if (u.n) { // found at least one proper pair
+ int tmp = opt->a + opt->b;
+ tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del;
+ tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins;
+ ks_introsort_128(u.n, u.a);
+ i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32;
+ z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
+ z[v.a[k].y&1] = v.a[k].y<<32>>34;
+ ret = u.a[u.n-1].x >> 32;
+ *sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
+ for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
+ if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
+ } else ret = 0, *sub = 0, *n_sub = 0;
+ free(u.a); free(v.a);
+ return ret;
+}
+
+void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m);
+
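+// raw_mapq converts a score difference into a Phred-scaled quality; 6.02 is presumably 10*log10(4), so a
+// difference of one match score (a) counts as roughly one base of evidence under the usual base-4 model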
+#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
+
+int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
+{
+ extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
+ extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
+ extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
+ extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
+
+ int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
+ kstring_t str;
+ mem_aln_t h[2], g[2], aa[2][2];
+
+ str.l = str.m = 0; str.s = 0;
+ memset(h, 0, sizeof(mem_aln_t) * 2);
+ memset(g, 0, sizeof(mem_aln_t) * 2);
+ n_aa[0] = n_aa[1] = 0;
+ if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
+ mem_alnreg_v b[2];
+ kv_init(b[0]); kv_init(b[1]);
+ for (i = 0; i < 2; ++i)
+ for (j = 0; j < a[i].n; ++j)
+ if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired)
+ kv_push(mem_alnreg_t, b[i], a[i].a[j]);
+ for (i = 0; i < 2; ++i)
+ for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
+ n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
+ free(b[0].a); free(b[1].a);
+ }
+ n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
+ n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
+ if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
+ // pairing single-end hits
+ if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
+ int is_multi[2], q_pe, score_un, q_se[2];
+ char **XA[2];
+ // check if an end has multiple hits even after mate-SW
+ for (i = 0; i < 2; ++i) {
+ for (j = 1; j < n_pri[i]; ++j)
+ if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
+ is_multi[i] = j < n_pri[i]? 1 : 0;
+ }
+ if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
+ // compute mapQ for the best SE hit
+ score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired;
+ //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
+ subo = subo > score_un? subo : score_un;
+ q_pe = raw_mapq(o - subo, opt->a);
+ if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
+ if (q_pe < 0) q_pe = 0;
+ if (q_pe > 60) q_pe = 60;
+ q_pe = (int)(q_pe * (1. - .5 * (a[0].a[0].frac_rep + a[1].a[0].frac_rep)) + .499);
+ // the following assumes no split hits
+ if (o > score_un) { // paired alignment is preferred
+ mem_alnreg_t *c[2];
+ c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];
+ for (i = 0; i < 2; ++i) {
+ if (c[i]->secondary >= 0)
+ c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
+ q_se[i] = mem_approx_mapq_se(opt, c[i]);
+ }
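+ // raise each single-end mapQ towards the pair mapQ, but by at most 40 over its original value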
+ q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40;
+ q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
+ extra_flag |= 2;
+ // cap at the tandem repeat score
+ q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a);
+ q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a);
+ } else { // the unpaired alignment is preferred
+ z[0] = z[1] = 0;
+ q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
+ q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
+ }
+ for (i = 0; i < 2; ++i) {
+ int k = a[i].a[z[i]].secondary_all;
+ if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
+ assert(a[i].a[k].secondary_all < 0);
+ for (j = 0; j < a[i].n; ++j)
+ if (a[i].a[j].secondary_all == k || j == k)
+ a[i].a[j].secondary_all = z[i];
+ a[i].a[z[i]].secondary_all = -1;
+ }
+ }
+ if (!(opt->flag & MEM_F_ALL)) {
+ for (i = 0; i < 2; ++i)
+ XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
+ } else XA[0] = XA[1] = 0;
+ // write SAM
+ for (i = 0; i < 2; ++i) {
+ h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
+ h[i].mapq = q_se[i];
+ h[i].flag |= 0x40<<i | extra_flag;
+ h[i].XA = XA[i]? XA[i][z[i]] : 0;
+ aa[i][n_aa[i]++] = h[i];
+ if (n_pri[i] < a[i].n) { // the read has ALT hits
+ mem_alnreg_t *p = &a[i].a[n_pri[i]];
+ if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) continue;
+ g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
+ g[i].flag |= 0x800 | 0x40<<i | extra_flag;
+ g[i].XA = XA[i]? XA[i][n_pri[i]] : 0;
+ aa[i][n_aa[i]++] = g[i];
+ }
+ }
+ for (i = 0; i < n_aa[0]; ++i)
+ mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
+ s[0].sam = strdup(str.s); str.l = 0;
+ for (i = 0; i < n_aa[1]; ++i)
+ mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
+ s[1].sam = str.s;
+ if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
+ // free
+ for (i = 0; i < 2; ++i) {
+ free(h[i].cigar); free(g[i].cigar);
+ if (XA[i] == 0) continue;
+ for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
+ free(XA[i]);
+ }
+ } else goto no_pairing;
+ return n;
+
+no_pairing:
+ for (i = 0; i < 2; ++i) {
+ int which = -1;
+ if (a[i].n) {
+ if (a[i].a[0].score >= opt->T) which = 0;
+ else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
+ which = n_pri[i];
+ }
+ if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
+ else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
+ }
+ if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
+ int64_t dist;
+ int d;
+ d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
+ if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
+ }
+ mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
+ mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
+ if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
+ free(h[0].cigar); free(h[1].cigar);
+ return n;
+}
diff --git a/ext/src/bwa/bwape.c b/ext/src/bwa/bwape.c
new file mode 100644
index 0000000..a5dc3ad
--- /dev/null
+++ b/ext/src/bwa/bwape.c
@@ -0,0 +1,783 @@
+#include <unistd.h>
+#include <math.h>
+#include <stdlib.h>
+#include <time.h>
+#include <stdio.h>
+#include <string.h>
+#include "bwtaln.h"
+#include "kvec.h"
+#include "bntseq.h"
+#include "utils.h"
+#include "bwase.h"
+#include "bwa.h"
+#include "ksw.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+typedef struct {
+ int n;
+ bwtint_t *a;
+} poslist_t;
+
+typedef struct {
+ double avg, std, ap_prior;
+ bwtint_t low, high, high_bayesian;
+} isize_info_t;
+
+#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
+#define b128_hash(a) ((uint32_t)(a).x)
+
+#include "khash.h"
+KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
+
+typedef struct {
+ pair64_v arr;
+ pair64_v pos[2];
+ kvec_t(bwt_aln1_t) aln[2];
+} pe_data_t;
+
+#define MIN_HASH_WIDTH 1000
+
+extern int g_log_n[256]; // in bwase.c
+static kh_b128_t *g_hash;
+
+void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi);
+void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
+int bwa_approx_mapQ(const bwa_seq_t *p, int mm);
+void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2);
+bntseq_t *bwa_open_nt(const char *prefix);
+void bwa_print_sam_SQ(const bntseq_t *bns);
+
+pe_opt_t *bwa_init_pe_opt()
+{
+ pe_opt_t *po;
+ po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t));
+ po->max_isize = 500;
+ po->force_isize = 0;
+ po->max_occ = 100000;
+ po->n_multi = 3;
+ po->N_multi = 10;
+ po->type = BWA_PET_STD;
+ po->is_sw = 1;
+ po->ap_prior = 1e-5;
+ return po;
+}
+/*
+static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
+{
+ const double a = 0.140012;
+ double b, c;
+ b = log(x * (2 - x));
+ c = 2./M_PI/a + b / 2.;
+ return sqrt(sqrt(c * c - b / a) - c);
+}
+*/
+
+// for a normal distribution, this is about 3 std
+#define OUTLIER_BOUND 2.0
+
+static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L)
+{
+ uint64_t x, *isizes, n_ap = 0;
+ int n, i, tot, p25, p75, p50, max_len = 1, tmp;
+ double skewness = 0.0, kurtosis = 0.0, y;
+
+ ii->avg = ii->std = -1.0;
+ ii->low = ii->high = ii->high_bayesian = 0;
+ isizes = (uint64_t*)calloc(n_seqs, 8);
+ for (i = 0, tot = 0; i != n_seqs; ++i) {
+ bwa_seq_t *p[2];
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
+ if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) {
+ x = (p[0]->pos < p[1]->pos)? p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos;
+ if (x < 100000) isizes[tot++] = x;
+ }
+ if (p[0]->len > max_len) max_len = p[0]->len;
+ if (p[1]->len > max_len) max_len = p[1]->len;
+ }
+ if (tot < 20) {
+ fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n");
+ free(isizes);
+ return -1;
+ }
+ ks_introsort_64(tot, isizes);
+ p25 = isizes[(int)(tot*0.25 + 0.5)];
+ p50 = isizes[(int)(tot*0.50 + 0.5)];
+ p75 = isizes[(int)(tot*0.75 + 0.5)];
+ tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
+ ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned
+ ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
+ if (ii->low > ii->high) {
+ fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n");
+ free(isizes);
+ return -1;
+ }
+ for (i = 0, x = n = 0; i < tot; ++i)
+ if (isizes[i] >= ii->low && isizes[i] <= ii->high)
+ ++n, x += isizes[i];
+ ii->avg = (double)x / n;
+ for (i = 0; i < tot; ++i) {
+ if (isizes[i] >= ii->low && isizes[i] <= ii->high) {
+ double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg);
+ ii->std += tmp;
+ skewness += tmp * (isizes[i] - ii->avg);
+ kurtosis += tmp * tmp;
+ }
+ }
+ kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3;
+ ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large
+ skewness = skewness / n / (ii->std * ii->std * ii->std);
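+ // scan y (in std units) until the normal upper-tail probability 0.5*erfc(y/sqrt(2)) falls below
+ // ap_prior/L times the candidate insert size y*std + avg; that insert size becomes high_bayesian,
+ // which is later used as the upper bound on the insert size for calling a proper pair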
+ for (y = 1.0; y < 10.0; y += 0.01)
+ if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
+ ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
+ for (i = 0; i < tot; ++i)
+ if (isizes[i] > ii->high_bayesian) ++n_ap;
+ ii->ap_prior = .01 * (n_ap + .01) / tot;
+ if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior;
+ free(isizes);
+ fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75);
+ if (isnan(ii->std) || p75 > 100000) {
+ ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0;
+ fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n");
+ return -1;
+ }
+ for (y = 1.0; y < 10.0; y += 0.01)
+ if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break;
+ ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499);
+ fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high);
+ fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std);
+ fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior);
+ fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y);
+ return 0;
+}
+
+static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii)
+{
+ int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
+ uint64_t o_score, subo_score;
+ pair64_t last_pos[2][2], o_pos[2];
+ max_len = p[0]->full_len;
+ if (max_len < p[1]->full_len) max_len = p[1]->full_len;
+ if (low_bound < max_len) low_bound = max_len;
+
+ // here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize
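+ // __pairing_aux packs each candidate pair into a 64-bit key: the high 32 bits hold ten times the summed
+ // single-end scores plus an insert-size penalty (smaller is better), the low 32 bits hold a position hash
+ // used only to break ties deterministically; o_score/o_pos track the smallest key seen so far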
+#define __pairing_aux(u,v) do { \
+ bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \
+ if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \
+ && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \
+ { \
+ uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \
+ s *= 10; \
+ if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \
+ s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \
+ if (s>>32 == o_score>>32) ++o_n; \
+ else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \
+ else ++subo_n; \
+ if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \
+ else if (s < subo_score) subo_score = s; \
+ } \
+ } while (0)
+
+#define __pairing_aux2(q, w) do { \
+ const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \
+ (q)->extra_flag |= SAM_FPP; \
+ if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \
+ (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \
+ (q)->score = r->score; \
+ (q)->pos = (w).x; \
+ if ((q)->mapQ > 0) ++cnt_chg; \
+ } \
+ } while (0)
+
+ o_score = subo_score = (uint64_t)-1;
+ o_n = subo_n = 0;
+ ks_introsort_128(d->arr.n, d->arr.a);
+ for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
+ if (opt->type == BWA_PET_STD) {
+ for (i = 0; i < d->arr.n; ++i) {
+ pair64_t x = d->arr.a[i];
+ int strand = x.y>>1&1;
+ if (strand == 1) { // reverse strand, then check
+ int y = 1 - (x.y&1);
+ __pairing_aux(last_pos[y][1], x);
+ __pairing_aux(last_pos[y][0], x);
+ } else { // forward strand, then push
+ last_pos[x.y&1][0] = last_pos[x.y&1][1];
+ last_pos[x.y&1][1] = x;
+ }
+ }
+ } else {
+ fprintf(stderr, "[paring] not implemented yet!\n");
+ exit(1);
+ }
+ // set pairing
+ //fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n);
+ if (o_score != (uint64_t)-1) {
+ int mapQ_p = 0; // this is the maximum mapping quality when one end is moved
+ //fprintf(stderr, "%d, %d\n", o_n, subo_n);
+ if (o_n == 1) {
+ if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair
+ else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair
+ else {
+ int n = subo_n > 255? 255 : subo_n;
+ mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n];
+ if (mapQ_p < 0) mapQ_p = 0;
+ }
+ }
+ if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved
+ if (p[0]->mapQ > 0 && p[1]->mapQ > 0) {
+ int mapQ = p[0]->mapQ + p[1]->mapQ;
+ if (mapQ > 60) mapQ = 60;
+ p[0]->mapQ = p[1]->mapQ = mapQ;
+ } else {
+ if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ;
+ if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ;
+ }
+ } else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved
+ p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ;
+ if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p;
+ } else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved
+ p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ;
+ if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p;
+ } else { // both ends moved
+ p[0]->seQ = p[1]->seQ = 0;
+ mapQ_p -= 20;
+ if (mapQ_p < 0) mapQ_p = 0;
+ p[0]->mapQ = p[1]->mapQ = mapQ_p;
+ }
+ __pairing_aux2(p[0], o_pos[0]);
+ __pairing_aux2(p[1], o_pos[1]);
+ }
+ return cnt_chg;
+}
+
+typedef struct {
+ kvec_t(bwt_aln1_t) aln;
+} aln_buf_t;
+
+int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii,
+ const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii)
+{
+ int i, j, cnt_chg = 0;
+ char str[1024];
+ bwt_t *bwt;
+ pe_data_t *d;
+ aln_buf_t *buf[2];
+
+ d = (pe_data_t*)calloc(1, sizeof(pe_data_t));
+ buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
+ buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));
+
+ if (_bwt == 0) { // load forward SA
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
+ } else bwt = _bwt;
+
+ // SE
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *p[2];
+ for (j = 0; j < 2; ++j) {
+ int n_aln;
+ p[j] = seqs[j] + i;
+ p[j]->n_multi = 0;
+ p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2);
+ err_fread_noeof(&n_aln, 4, 1, fp_sa[j]);
+ if (n_aln > kv_max(d->aln[j]))
+ kv_resize(bwt_aln1_t, d->aln[j], n_aln);
+ d->aln[j].n = n_aln;
+ err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]);
+ kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j]
+ // generate SE alignment and mapping quality
+ bwa_aln2seq(n_aln, d->aln[j].a, p[j]);
+ if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) {
+ int strand;
+ int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff;
+ p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff);
+ p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand);
+ p[j]->strand = strand;
+ if (p[j]->pos == (bwtint_t)-1) p[j]->type = BWA_TYPE_NO_MATCH;
+ }
+ }
+ }
+
+ // infer isize
+ infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2);
+ if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii;
+ if (opt->force_isize) {
+ fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__);
+ ii->low = ii->high = 0; ii->avg = ii->std = -1.0;
+ }
+
+ // PE
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *p[2];
+ for (j = 0; j < 2; ++j) {
+ p[j] = seqs[j] + i;
+ kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln);
+ }
+ if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
+ && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
+ { // only when both ends mapped
+ pair64_t x;
+ int j, k;
+ long long n_occ[2];
+ for (j = 0; j < 2; ++j) {
+ n_occ[j] = 0;
+ for (k = 0; k < d->aln[j].n; ++k)
+ n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1;
+ }
+ if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue;
+ d->arr.n = 0;
+ for (j = 0; j < 2; ++j) {
+ for (k = 0; k < d->aln[j].n; ++k) {
+ bwt_aln1_t *r = d->aln[j].a + k;
+ bwtint_t l;
+ if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
+ pair64_t key;
+ int ret;
+ key.x = r->k; key.y = r->l;
+ khint_t iter = kh_put(b128, g_hash, key, &ret);
+ if (ret) { // not in the hash table; ret must equal 1 as we never remove elements
+ poslist_t *z = &kh_val(g_hash, iter);
+ z->n = r->l - r->k + 1;
+ z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n);
+ for (l = r->k; l <= r->l; ++l) {
+ int strand;
+ z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1;
+ z->a[l - r->k] |= strand;
+ }
+ }
+ for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
+ x.x = kh_val(g_hash, iter).a[l]>>1;
+ x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
+ kv_push(pair64_t, d->arr, x);
+ }
+ } else { // then calculate on the fly
+ for (l = r->k; l <= r->l; ++l) {
+ int strand;
+ x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand);
+ x.y = k<<2 | strand<<1 | j;
+ kv_push(pair64_t, d->arr, x);
+ }
+ }
+ }
+ }
+ cnt_chg += pairing(p, d, opt, gopt->s_mm, ii);
+ }
+
+ if (opt->N_multi || opt->n_multi) {
+ for (j = 0; j < 2; ++j) {
+ if (p[j]->type != BWA_TYPE_NO_MATCH) {
+ int k, n_multi;
+ if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) {
+ bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi);
+ } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi);
+ for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) {
+ int strand;
+ bwt_multi1_t *q = p[j]->multi + k;
+ q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand);
+ q->strand = strand;
+ if (q->pos != p[j]->pos)
+ p[j]->multi[n_multi++] = *q;
+ }
+ p[j]->n_multi = n_multi;
+ }
+ }
+ }
+ }
+
+ // free
+ for (i = 0; i < n_seqs; ++i) {
+ kv_destroy(buf[0][i].aln);
+ kv_destroy(buf[1][i].aln);
+ }
+ free(buf[0]); free(buf[1]);
+ if (_bwt == 0) bwt_destroy(bwt);
+ kv_destroy(d->arr);
+ kv_destroy(d->pos[0]); kv_destroy(d->pos[1]);
+ kv_destroy(d->aln[0]); kv_destroy(d->aln[1]);
+ free(d);
+ return cnt_chg;
+}
+
+#define SW_MIN_MATCH_LEN 20
+#define SW_MIN_MAPQ 17
+
+// cnt = n_mm<<16 | n_gapo<<8 | n_gape
+bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt)
+{
+ kswr_t r;
+ uint32_t *cigar32 = 0;
+ bwa_cigar_t *cigar = 0;
+ ubyte_t *ref_seq;
+ bwtint_t k, x, y, l;
+ int xtra, gscore;
+ int8_t mat[25];
+
+ bwa_fill_scmat(1, 3, mat);
+ // check whether there are too many N's
+ if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0;
+ for (k = 0, x = 0; k < len; ++k)
+ if (seq[k] >= 4) ++x;
+ if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0;
+
+ // get reference subsequence
+ ref_seq = (ubyte_t*)calloc(reglen, 1);
+ for (k = *beg, l = 0; l < reglen && k < l_pac; ++k)
+ ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3;
+
+ // do alignment
+ xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0);
+ r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0);
+ gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32);
+ cigar = (bwa_cigar_t*)cigar32;
+ for (k = 0; k < *n_cigar; ++k)
+ cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
+
+ if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment
+ free(cigar); free(ref_seq); *n_cigar = 0;
+ return 0;
+ }
+
+ // check whether the alignment is good enough
+ for (k = 0, x = y = 0; k < *n_cigar; ++k) {
+ bwa_cigar_t c = cigar[k];
+ if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c);
+ else if (__cigar_op(c) == FROM_D) x += __cigar_len(c);
+ else y += __cigar_len(c);
+ }
+ if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough
+ free(cigar); free(ref_seq);
+ *n_cigar = 0;
+ return 0;
+ }
+
+ { // update cigar and coordinate;
+ int start = r.qb, end = r.qe + 1;
+ *beg += r.tb;
+ cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2));
+ if (start) {
+ memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar));
+ cigar[0] = __cigar_create(3, start);
+ ++(*n_cigar);
+ }
+ if (end < len) {
+ /*cigar[*n_cigar] = 3<<14 | (len - end);*/
+ cigar[*n_cigar] = __cigar_create(3, (len - end));
+ ++(*n_cigar);
+ }
+ }
+
+ { // set *cnt
+ int n_mm, n_gapo, n_gape;
+ n_mm = n_gapo = n_gape = 0;
+ x = r.tb; y = r.qb;
+ for (k = 0; k < *n_cigar; ++k) {
+ bwa_cigar_t c = cigar[k];
+ if (__cigar_op(c) == FROM_M) {
+ for (l = 0; l < (__cigar_len(c)); ++l)
+ if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm;
+ x += __cigar_len(c), y += __cigar_len(c);
+ } else if (__cigar_op(c) == FROM_D) {
+ x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
+ } else if (__cigar_op(c) == FROM_I) {
+ y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1;
+ }
+ }
+ *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape;
+ }
+
+ free(ref_seq);
+ return cigar;
+}
+
+ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii)
+{
+ ubyte_t *pacseq;
+ int i;
+ uint64_t n_tot[2], n_mapped[2];
+
+ // load reference sequence
+ if (_pacseq == 0) {
+ pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
+ err_rewind(bns->fp_pac);
+ err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
+ } else pacseq = (ubyte_t*)_pacseq;
+ if (!popt->is_sw || ii->avg < 0.0) return pacseq;
+
+ // perform mate alignment
+ n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0;
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *p[2];
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
+ if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ
+ int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2];
+ int64_t beg[2], end[2];
+ bwa_cigar_t *cigar[2];
+ uint32_t cnt[2];
+
+ /* In the following, _pref points to the reference read
+ * which must be aligned; _pmate points to its mate which is
+ * considered to be modified. */
+
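+ /* The mate is searched in a window centered on the expected insert size: it starts roughly
+ * avg - 3*std away from the anchored read and spans 6*std plus twice the mate length,
+ * clamped so it stays on the reference and does not overlap the anchored read. */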
+#define __set_rght_coor(_a, _b, _pref, _pmate) do { \
+ (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \
+ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
+ if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \
+ if ((_b) > bns->l_pac) (_b) = bns->l_pac; \
+ } while (0)
+
+#define __set_left_coor(_a, _b, _pref, _pmate) do { \
+ (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \
+ (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \
+ if ((_a) < 0) (_a) = 0; \
+ if ((_b) > _pref->pos) (_b) = _pref->pos; \
+ } while (0)
+
+#define __set_fixed(_pref, _pmate, _beg, _cnt) do { \
+ _pmate->type = BWA_TYPE_MATESW; \
+ _pmate->pos = _beg; \
+ _pmate->seQ = _pref->seQ; \
+ _pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \
+ _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \
+ _pmate->extra_flag |= SAM_FPP; \
+ _pref->extra_flag |= SAM_FPP; \
+ } while (0)
+
+ mq_adjust[0] = mq_adjust[1] = 255; // not effective
+ is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0;
+
+ ++n_tot[is_singleton];
+ cigar[0] = cigar[1] = 0;
+ n_cigar[0] = n_cigar[1] = 0;
+ if (popt->type != BWA_PET_STD) continue; // other types of pairing are not considered
+ for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
+ ubyte_t *seq;
+ if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
+ { // note that popt->type == BWA_PET_STD is always true here; older versions had a branch for color-space FF/RR reads
+ if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
+ __set_rght_coor(beg[k], end[k], p[1-k], p[k]);
+ seq = p[k]->rseq;
+ } else { // then the mate is on the forward strand and has the smaller coordinate
+ __set_left_coor(beg[k], end[k], p[1-k], p[k]);
+ seq = p[k]->seq;
+ seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; it will be reversed back shortly
+ }
+ }
+ // perform SW alignment
+ cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
+ if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k]
+ int s_old, clip = 0, s_new;
+ if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]);
+ if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]);
+ s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499);
+ s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. + .499);
+ s_old += -4.343 * log(ii->ap_prior / bns->l_pac);
+ s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma
+ if (s_old < s_new) { // reject SW alignment
+ mq_adjust[k] = s_new - s_old;
+ free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0;
+ } else mq_adjust[k] = s_old - s_new;
+ }
+ // now reverse the sequence back so that p[*]->seq looks untouched
+ if (popt->type == BWA_PET_STD) {
+ if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0);
+ } else {
+ if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0);
+ }
+ }
+ k = -1; // no read to be changed
+ if (cigar[0] && cigar[1]) {
+ k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed
+ mapQ = abs(p[1]->mapQ - p[0]->mapQ);
+ } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ;
+ else if (cigar[1]) k = 1, mapQ = p[0]->mapQ;
+ if (k >= 0 && p[k]->pos != beg[k]) {
+ ++n_mapped[is_singleton];
+ { // recalculate mapping quality
+ int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8;
+ if (tmp <= 0) tmp = 1;
+ if (mapQ > tmp) mapQ = tmp;
+ p[k]->mapQ = p[1-k]->mapQ = mapQ;
+ p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? p[1-k]->seQ : mapQ;
+ if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k];
+ if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k];
+ }
+ // update CIGAR
+ free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0;
+ p[k]->n_cigar = n_cigar[k];
+ // update the rest of information
+ __set_fixed(p[1-k], p[k], beg[k], cnt[k]);
+ }
+ free(cigar[0]); free(cigar[1]);
+ }
+ }
+ fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n",
+ (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ);
+ fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n",
+ (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ);
+ return pacseq;
+}
+
+void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
+{
+ extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
+ int i, j, n_seqs, tot_seqs = 0;
+ bwa_seq_t *seqs[2];
+ bwa_seqio_t *ks[2];
+ clock_t t;
+ bntseq_t *bns;
+ FILE *fp_sa[2];
+ gap_opt_t opt, opt0;
+ khint_t iter;
+ isize_info_t last_ii; // this is for the last batch of reads
+ char str[1024], magic[2][4];
+ bwt_t *bwt;
+ uint8_t *pac;
+
+ // initialization
+ bwase_initialize(); // initialize g_log_n[] in bwase.c
+ pac = 0; bwt = 0;
+ for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
+ bns = bns_restore(prefix);
+ srand48(bns->seed);
+ fp_sa[0] = xopen(fn_sa[0], "r");
+ fp_sa[1] = xopen(fn_sa[1], "r");
+ g_hash = kh_init(b128);
+ last_ii.avg = -1.0;
+
+ err_fread_noeof(magic[0], 1, 4, fp_sa[0]);
+ err_fread_noeof(magic[1], 1, 4, fp_sa[1]);
+ if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) {
+ fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
+ exit(1);
+ }
+ err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]);
+ ks[0] = bwa_open_reads(opt.mode, fn_fa[0]);
+ opt0 = opt;
+ err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
+ ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
+ { // for Illumina alignment only
+ if (popt->is_preload) {
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
+ pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
+ err_rewind(bns->fp_pac);
+ err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
+ }
+ }
+
+ // core loop
+ bwa_print_sam_hdr(bns, rg_line);
+ while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
+ int cnt_chg;
+ isize_info_t ii;
+ ubyte_t *pacseq;
+
+ seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual);
+ tot_seqs += n_seqs;
+ t = clock();
+
+ fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n");
+ cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii);
+ fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+ fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg);
+
+ fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n");
+ pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii);
+ fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+
+ fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
+ for (j = 0; j < 2; ++j)
+ bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+ if (pac == 0) free(pacseq);
+
+ fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... ");
+ for (i = 0; i < n_seqs; ++i) {
+ bwa_seq_t *p[2];
+ p[0] = seqs[0] + i; p[1] = seqs[1] + i;
+ if (p[0]->bc[0] || p[1]->bc[0]) {
+ strcat(p[0]->bc, p[1]->bc);
+ strcpy(p[1]->bc, p[0]->bc);
+ }
+ bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2);
+ bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2);
+ if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name);
+ }
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+
+ for (j = 0; j < 2; ++j)
+ bwa_free_read_seq(n_seqs, seqs[j]);
+ fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs);
+ last_ii = ii;
+ }
+
+ // destroy
+ bns_destroy(bns);
+ for (i = 0; i < 2; ++i) {
+ bwa_seq_close(ks[i]);
+ err_fclose(fp_sa[i]);
+ }
+ for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter)
+ if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a);
+ kh_destroy(b128, g_hash);
+ if (pac) {
+ free(pac); bwt_destroy(bwt);
+ }
+}
+
+int bwa_sai2sam_pe(int argc, char *argv[])
+{
+ int c;
+ pe_opt_t *popt;
+ char *prefix, *rg_line = 0;
+
+ popt = bwa_init_pe_opt();
+ while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
+ switch (c) {
+ case 'r':
+ if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
+ break;
+ case 'a': popt->max_isize = atoi(optarg); break;
+ case 'o': popt->max_occ = atoi(optarg); break;
+ case 's': popt->is_sw = 0; break;
+ case 'P': popt->is_preload = 1; break;
+ case 'n': popt->n_multi = atoi(optarg); break;
+ case 'N': popt->N_multi = atoi(optarg); break;
+ case 'c': popt->ap_prior = atof(optarg); break;
+ case 'f': xreopen(optarg, "w", stdout); break;
+ case 'A': popt->force_isize = 1; break;
+ default: return 1;
+ }
+ }
+
+ if (optind + 5 > argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa sampe [options] <prefix> <in1.sai> <in2.sai> <in1.fq> <in2.fq>\n\n");
+ fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize);
+ fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ);
+ fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi);
+ fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi);
+ fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior);
+ fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
+ fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n");
+ fprintf(stderr, " -P preload index into memory (for base-space reads only)\n");
+ fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n");
+ fprintf(stderr, " -A disable insert size estimate (force -s)\n\n");
+ fprintf(stderr, "Notes: 1. For SOLiD reads, <in1.fq> corresponds R3 reads and <in2.fq> to F3.\n");
+ fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n");
+ fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
+ fprintf(stderr, "[%s] fail to locate the index\n", __func__);
+ return 1;
+ }
+ bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
+ free(prefix); free(popt);
+ return 0;
+}
diff --git a/ext/src/bwa/bwase.c b/ext/src/bwa/bwase.c
new file mode 100644
index 0000000..cb912ec
--- /dev/null
+++ b/ext/src/bwa/bwase.c
@@ -0,0 +1,602 @@
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <assert.h>
+#include "bwase.h"
+#include "bwtaln.h"
+#include "bntseq.h"
+#include "utils.h"
+#include "kstring.h"
+#include "bwa.h"
+#include "ksw.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+int g_log_n[256];
+
+void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
+{
+ int i, cnt, best;
+ if (n_aln == 0) {
+ s->type = BWA_TYPE_NO_MATCH;
+ s->c1 = s->c2 = 0;
+ return;
+ }
+
+ if (set_main) {
+ best = aln[0].score;
+ for (i = cnt = 0; i < n_aln; ++i) {
+ const bwt_aln1_t *p = aln + i;
+ if (p->score > best) break;
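+ // reservoir-style sampling among the SA intervals tied for the best score: the current pick is replaced
+ // with probability (interval size)/(hits seen so far + interval size), and a random position is then drawn
+ // within the chosen interval, so the reported hit is uniform over all equally-best placements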
+ if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
+ s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape;
+ s->ref_shift = (int)p->n_del - (int)p->n_ins;
+ s->score = p->score;
+ s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
+ }
+ cnt += p->l - p->k + 1;
+ }
+ s->c1 = cnt;
+ for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1;
+ s->c2 = cnt - s->c1;
+ s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
+ }
+
+ if (n_multi) {
+ int k, rest, n_occ, z = 0;
+ for (k = n_occ = 0; k < n_aln; ++k) {
+ const bwt_aln1_t *q = aln + k;
+ n_occ += q->l - q->k + 1;
+ }
+ if (s->multi) free(s->multi);
+ if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
+ s->multi = 0; s->n_multi = 0;
+ return;
+ }
+ /* The following code is more flexible than what is required
+ * here. In principle, due to the requirement above, we can
+ * simply output all hits, but the following samples "rest"
+ * number of random hits. */
+ rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
+ s->multi = calloc(rest, sizeof(bwt_multi1_t));
+ for (k = 0; k < n_aln; ++k) {
+ const bwt_aln1_t *q = aln + k;
+ if (q->l - q->k + 1 <= rest) {
+ bwtint_t l;
+ for (l = q->k; l <= q->l; ++l) {
+ s->multi[z].pos = l;
+ s->multi[z].gap = q->n_gapo + q->n_gape;
+ s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins;
+ s->multi[z++].mm = q->n_mm;
+ }
+ rest -= q->l - q->k + 1;
+ } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here.
+ int j, i;
+ for (j = rest, i = q->l - q->k + 1; j > 0; --j) {
+ double p = 1.0, x = drand48();
+ while (x < p) p -= p * j / (i--);
+ s->multi[z].pos = q->l - i;
+ s->multi[z].gap = q->n_gapo + q->n_gape;
+ s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins;
+ s->multi[z++].mm = q->n_mm;
+ }
+ rest = 0;
+ break;
+ }
+ }
+ s->n_multi = z;
+ }
+}
+
+void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s)
+{
+ bwa_aln2seq_core(n_aln, aln, s, 1, 0);
+}
+
+int bwa_approx_mapQ(const bwa_seq_t *p, int mm)
+{
+ int n;
+ if (p->c1 == 0) return 23;
+ if (p->c1 > 1) return 0;
+ if (p->n_mm == mm) return 25;
+ if (p->c2 == 0) return 37;
+ n = (p->c2 >= 255)? 255 : p->c2;
+ return (23 < g_log_n[n])? 0 : 23 - g_log_n[n];
+}
+
+bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand)
+{
+ bwtint_t pos_f;
+ int is_rev;
+ pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate
+ if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1;
+ pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base
+ *strand = !is_rev;
+ if (is_rev) pos_f = pos_f + 1 < ref_len? 0 : pos_f - ref_len + 1; // position of the first base
+ return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset
+}
+
+/**
+ * Derive the actual position of the read on the reference from the given
+ * suffix array coordinates. Note that the position will be approximate,
+ * depending on whether indels appear in the read and whether calculations
+ * are performed from the start or end of the read.
+ */
+void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr)
+{
+ int max_diff, strand;
+ if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return;
+ max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm;
+ seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
+ //fprintf(stderr, "%d\n", seq->ref_shift);
+ seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand);
+ seq->strand = strand;
+ seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff);
+ if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH;
+}
+
+void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr)
+{
+ int i, j, strand, n_multi;
+ char str[1024];
+ bwt_t *bwt;
+ // load forward SA
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
+ strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *p = &seqs[i];
+ bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr);
+ for (j = n_multi = 0; j < p->n_multi; ++j) {
+ bwt_multi1_t *q = p->multi + j;
+ q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand);
+ q->strand = strand;
+ if (q->pos != p->pos && q->pos != (bwtint_t)-1)
+ p->multi[n_multi++] = *q;
+ }
+ p->n_multi = n_multi;
+ }
+ bwt_destroy(bwt);
+}
+
+#define SW_BW 50
+
+bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar)
+{
+ bwa_cigar_t *cigar = 0;
+ uint32_t *cigar32 = 0;
+ ubyte_t *rseq;
+ int64_t k, rb, re, rlen;
+ int8_t mat[25];
+
+ bwa_fill_scmat(1, 3, mat);
+ rb = *_rb; re = rb + len + ref_shift;
+ assert(re <= l_pac);
+ rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen);
+ assert(re - rb == rlen);
+ ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > abs(rlen - len) * 1.5? SW_BW : abs(rlen - len) * 1.5, n_cigar, &cigar32);
+ assert(*n_cigar > 0);
+ if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change ending ins to soft clipping
+ if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping
+ if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete ending del
+ if ((cigar32[0]&0xf) == 2) { // delete beginning del
+ *_rb += cigar32[0]>>4;
+ --*n_cigar;
+ memmove(cigar32, cigar32+1, (*n_cigar) * 4);
+ }
+ cigar = (bwa_cigar_t*)cigar32;
+ for (k = 0; k < *n_cigar; ++k)
+ cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4));
+ free(rseq);
+ return cigar;
+}
+
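+// Build the SAM MD string for one alignment: runs of matching bases are written as numbers, the reference
+// base is written at each mismatch, and deletions appear as '^' followed by the deleted reference bases;
+// *_nm receives the edit distance (mismatches plus inserted and deleted bases), with soft clips excluded.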
+char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq,
+ bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm)
+{
+ bwtint_t x, y;
+ int z, u, c, nm = 0;
+ str->l = 0; // reset
+ x = pos; y = 0;
+ if (cigar) {
+ int k, l;
+ for (k = u = 0; k < n_cigar; ++k) {
+ l = __cigar_len(cigar[k]);
+ if (__cigar_op(cigar[k]) == FROM_M) {
+ for (z = 0; z < l && x+z < l_pac; ++z) {
+ c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
+ if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
+ ksprintf(str, "%d", u);
+ kputc("ACGTN"[c], str);
+ ++nm;
+ u = 0;
+ } else ++u;
+ }
+ x += l; y += l;
+ } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) {
+ y += l;
+ if (__cigar_op(cigar[k]) == FROM_I) nm += l;
+ } else if (__cigar_op(cigar[k]) == FROM_D) {
+ ksprintf(str, "%d", u);
+ kputc('^', str);
+ for (z = 0; z < l && x+z < l_pac; ++z)
+ kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str);
+ u = 0;
+ x += l; nm += l;
+ }
+ }
+ } else { // no gaps
+ for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) {
+ c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
+ if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) {
+ ksprintf(str, "%d", u);
+ kputc("ACGTN"[c], str);
+ ++nm;
+ u = 0;
+ } else ++u;
+ }
+ }
+ ksprintf(str, "%d", u);
+ *_nm = nm;
+ return strdup(str->s);
+}
+
+void bwa_correct_trimmed(bwa_seq_t *s)
+{
+ if (s->len == s->full_len) return;
+ if (s->strand == 0) { // forward
+ if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S
+ s->cigar[s->n_cigar-1] += s->full_len - s->len;
+ } else {
+ if (s->cigar == 0) {
+ s->n_cigar = 2;
+ s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
+ s->cigar[0] = __cigar_create(0, s->len);
+ } else {
+ ++s->n_cigar;
+ s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
+ }
+ s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len));
+ }
+ } else { // reverse
+ if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S
+ s->cigar[0] += s->full_len - s->len;
+ } else {
+ if (s->cigar == 0) {
+ s->n_cigar = 2;
+ s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t));
+ s->cigar[1] = __cigar_create(0, s->len);
+ } else {
+ ++s->n_cigar;
+ s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t));
+ memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t));
+ }
+ s->cigar[0] = __cigar_create(3, (s->full_len - s->len));
+ }
+ }
+ s->len = s->full_len;
+}
+
+void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
+{
+ ubyte_t *pacseq;
+ int i, j, k;
+ kstring_t *str;
+
+ if (!_pacseq) {
+ pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
+ err_rewind(bns->fp_pac);
+ err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac);
+ } else pacseq = _pacseq;
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *s = seqs + i;
+ seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!!
+ for (j = k = 0; j < s->n_multi; ++j) {
+ bwt_multi1_t *q = s->multi + j;
+ int n_cigar;
+ if (q->gap) { // gapped alignment
+ q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar);
+ q->n_cigar = n_cigar;
+ if (q->cigar) s->multi[k++] = *q;
+ } else s->multi[k++] = *q;
+ }
+ s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation
+ if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue;
+ s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar);
+ if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH;
+ }
+ // generate MD tag
+ str = (kstring_t*)calloc(1, sizeof(kstring_t));
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *s = seqs + i;
+ if (s->type != BWA_TYPE_NO_MATCH) {
+ int nm;
+ s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm);
+ s->nm = nm;
+ }
+ }
+ free(str->s); free(str);
+
+ // correct for trimmed reads
+ for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
+
+ if (!_pacseq) free(pacseq);
+}
+
+int64_t pos_end(const bwa_seq_t *p)
+{
+ if (p->cigar) {
+ int j;
+ int64_t x = p->pos;
+ for (j = 0; j != p->n_cigar; ++j) {
+ int op = __cigar_op(p->cigar[j]);
+ if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
+ }
+ return x;
+ } else return p->pos + p->len;
+}
+
+int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogous to pos_end()
+{
+ if (p->cigar) {
+ int j;
+ int64_t x = p->pos;
+ for (j = 0; j != p->n_cigar; ++j) {
+ int op = __cigar_op(p->cigar[j]);
+ if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]);
+ }
+ return x;
+ } else return p->pos + len;
+}
+
+static int64_t pos_5(const bwa_seq_t *p)
+{
+ if (p->type != BWA_TYPE_NO_MATCH)
+ return p->strand? pos_end(p) : p->pos;
+ return -1;
+}
+
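+// Print the stored read bases in fixed 4 kB chunks; forward-strand hits are written as stored, while
+// reverse-strand hits are emitted as the reverse complement of the stored sequence.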
+void bwa_print_seq(FILE *stream, bwa_seq_t *seq) {
+ char buffer[4096];
+ const int bsz = sizeof(buffer);
+ int i, j, l;
+
+ if (seq->strand == 0) {
+ for (i = 0; i < seq->full_len; i += bsz) {
+ l = seq->full_len - i > bsz ? bsz : seq->full_len - i;
+ for (j = 0; j < l; j++) buffer[j] = "ACGTN"[seq->seq[i + j]];
+ err_fwrite(buffer, 1, l, stream);
+ }
+ } else {
+ for (i = seq->full_len - 1; i >= 0; i -= bsz) {
+ l = i + 1 > bsz ? bsz : i + 1;
+ for (j = 0; j < l; j++) buffer[j] = "TGCAN"[seq->seq[i - j]];
+ err_fwrite(buffer, 1, l, stream);
+ }
+ }
+}
+
+void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2)
+{
+ int j;
+ if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) {
+ int seqid, nn, am = 0, flag = p->extra_flag;
+ char XT;
+
+ if (p->type == BWA_TYPE_NO_MATCH) {
+ p->pos = mate->pos;
+ p->strand = mate->strand;
+ flag |= SAM_FSU;
+ j = 1;
+ } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment
+
+ // get seqid
+ nn = bns_cnt_ambi(bns, p->pos, j, &seqid);
+ if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len)
+ flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences
+
+ // update flag and print it
+ if (p->strand) flag |= SAM_FSR;
+ if (mate) {
+ if (mate->type != BWA_TYPE_NO_MATCH) {
+ if (mate->strand) flag |= SAM_FMR;
+ } else flag |= SAM_FMU;
+ }
+ err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name);
+ err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ);
+
+ // print CIGAR
+ if (p->cigar) {
+ for (j = 0; j != p->n_cigar; ++j)
+ err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]);
+ } else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*");
+ else err_printf("%dM", p->len);
+
+ // print mate coordinate
+ if (mate && mate->type != BWA_TYPE_NO_MATCH) {
+ int m_seqid;
+ long long isize;
+ am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
+ // redundant calculation here, but should not matter too much
+ bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
+ err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
+ isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
+ if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
+ err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize);
+ } else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1));
+ else err_printf("\t*\t0\t0\t");
+
+ // print sequence and quality
+ bwa_print_seq(stdout, p);
+ err_putchar('\t');
+ if (p->qual) {
+ if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
+ err_printf("%s", p->qual);
+ } else err_printf("*");
+
+ if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
+ if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
+ if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
+ if (p->type != BWA_TYPE_NO_MATCH) {
+ int i;
+ // calculate XT tag
+ XT = "NURM"[p->type];
+ if (nn > 10) XT = 'N';
+ // print tags
+ err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm);
+ if (nn) err_printf("\tXN:i:%d", nn);
+ if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am);
+ if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment
+ err_printf("\tX0:i:%d", p->c1);
+ if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2);
+ }
+ err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape);
+ if (p->md) err_printf("\tMD:Z:%s", p->md);
+ // print multiple hits
+ if (p->n_multi) {
+ err_printf("\tXA:Z:");
+ for (i = 0; i < p->n_multi; ++i) {
+ bwt_multi1_t *q = p->multi + i;
+ int k;
+ j = pos_end_multi(q, p->len) - q->pos;
+ nn = bns_cnt_ambi(bns, q->pos, j, &seqid);
+ err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+',
+ (int)(q->pos - bns->anns[seqid].offset + 1));
+ if (q->cigar) {
+ for (k = 0; k < q->n_cigar; ++k)
+ err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]);
+ } else err_printf("%dM", p->len);
+ err_printf(",%d;", q->gap + q->mm);
+ }
+ }
+ }
+ err_putchar('\n');
+ } else { // this read has no match
+ //ubyte_t *s = p->strand? p->rseq : p->seq;
+ int flag = p->extra_flag | SAM_FSU;
+ if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU;
+ err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag);
+ // Why did this work differently from the version above?
+ //for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]);
+ bwa_print_seq(stdout, p);
+ err_putchar('\t');
+ if (p->qual) {
+ if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
+ err_printf("%s", p->qual);
+ } else err_printf("*");
+ if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
+ if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
+ if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
+ err_putchar('\n');
+ }
+}
+
+void bwase_initialize()
+{
+ int i;
+ for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
+}
+
+void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
+{
+ extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
+ int i, n_seqs, tot_seqs = 0, m_aln;
+ bwt_aln1_t *aln = 0;
+ bwa_seq_t *seqs;
+ bwa_seqio_t *ks;
+ clock_t t;
+ bntseq_t *bns;
+ FILE *fp_sa;
+ gap_opt_t opt;
+ char magic[4];
+
+ // initialization
+ bwase_initialize();
+ bns = bns_restore(prefix);
+ srand48(bns->seed);
+ fp_sa = xopen(fn_sa, "r");
+
+ m_aln = 0;
+ err_fread_noeof(magic, 1, 4, fp_sa);
+ if (strncmp(magic, SAI_MAGIC, 4) != 0) {
+ fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__);
+ exit(1);
+ }
+ err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa);
+ bwa_print_sam_hdr(bns, rg_line);
+ // set ks
+ ks = bwa_open_reads(opt.mode, fn_fa);
+ // core loop
+ while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) {
+ tot_seqs += n_seqs;
+ t = clock();
+
+ // read alignment
+ for (i = 0; i < n_seqs; ++i) {
+ bwa_seq_t *p = seqs + i;
+ int n_aln;
+ err_fread_noeof(&n_aln, 4, 1, fp_sa);
+ if (n_aln > m_aln) {
+ m_aln = n_aln;
+ aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln);
+ }
+ err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa);
+ bwa_aln2seq_core(n_aln, aln, p, 1, n_occ);
+ }
+
+ fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... ");
+ bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+
+ fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
+ bwa_refine_gapped(bns, n_seqs, seqs, 0);
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
+
+ fprintf(stderr, "[bwa_aln_core] print alignments... ");
+ for (i = 0; i < n_seqs; ++i)
+ bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2);
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+
+ bwa_free_read_seq(n_seqs, seqs);
+ fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
+ }
+
+ // destroy
+ bwa_seq_close(ks);
+ bns_destroy(bns);
+ err_fclose(fp_sa);
+ free(aln);
+}
+
+int bwa_sai2sam_se(int argc, char *argv[])
+{
+ int c, n_occ = 3;
+ char *prefix, *rg_line = 0;
+ while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
+ switch (c) {
+ case 'h': break;
+ case 'r':
+ if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
+ break;
+ case 'n': n_occ = atoi(optarg); break;
+ case 'f': xreopen(optarg, "w", stdout); break;
+ default: return 1;
+ }
+ }
+
+ if (optind + 3 > argc) {
+ fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
+ return 1;
+ }
+ if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
+		fprintf(stderr, "[%s] failed to locate the index\n", __func__);
+ return 1;
+ }
+ bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
+ free(prefix);
+ return 0;
+}
diff --git a/ext/src/bwa/bwase.h b/ext/src/bwa/bwase.h
new file mode 100644
index 0000000..26a9f68
--- /dev/null
+++ b/ext/src/bwa/bwase.h
@@ -0,0 +1,29 @@
+#ifndef BWASE_H
+#define BWASE_H
+
+#include "bntseq.h"
+#include "bwt.h"
+#include "bwtaln.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // Initialize mapping tables in the bwa single-end mapper.
+ void bwase_initialize();
+ // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
+ void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
+ // Refine the approximate position of the sequence to an actual placement for the sequence.
+ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
+ // Backfill certain alignment properties mainly centering around number of matches.
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
+ // Calculate the end position of a read given a certain sequence.
+ int64_t pos_end(const bwa_seq_t *p);
+ //
+ bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // BWASE_H
diff --git a/ext/src/bwa/bwaseqio.c b/ext/src/bwa/bwaseqio.c
new file mode 100644
index 0000000..d850307
--- /dev/null
+++ b/ext/src/bwa/bwaseqio.c
@@ -0,0 +1,235 @@
+#include <zlib.h>
+#include <ctype.h>
+#include "bwtaln.h"
+#include "utils.h"
+#include "bamlite.h"
+
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+extern unsigned char nst_nt4_table[256];
+static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+struct __bwa_seqio_t {
+ // for BAM input
+ int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE
+ bamFile fp;
+ // for fastq input
+ kseq_t *ks;
+};
+
+bwa_seqio_t *bwa_bam_open(const char *fn, int which)
+{
+ bwa_seqio_t *bs;
+ bam_header_t *h;
+ bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
+ bs->is_bam = 1;
+ bs->which = which;
+ bs->fp = bam_open(fn, "r");
+ if (0 == bs->fp) err_fatal_simple("Couldn't open bam file");
+ h = bam_header_read(bs->fp);
+ bam_header_destroy(h);
+ return bs;
+}
+
+bwa_seqio_t *bwa_seq_open(const char *fn)
+{
+ gzFile fp;
+ bwa_seqio_t *bs;
+ bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
+ fp = xzopen(fn, "r");
+ bs->ks = kseq_init(fp);
+ return bs;
+}
+
+void bwa_seq_close(bwa_seqio_t *bs)
+{
+ if (bs == 0) return;
+ if (bs->is_bam) {
+ if (0 != bam_close(bs->fp)) err_fatal_simple("Error closing bam file");
+ } else {
+ err_gzclose(bs->ks->f->f);
+ kseq_destroy(bs->ks);
+ }
+ free(bs);
+}
+
+void seq_reverse(int len, ubyte_t *seq, int is_comp)
+{
+ int i;
+ if (is_comp) {
+ for (i = 0; i < len>>1; ++i) {
+ char tmp = seq[len-1-i];
+ if (tmp < 4) tmp = 3 - tmp;
+ seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
+ seq[i] = tmp;
+ }
+ if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
+ } else {
+ for (i = 0; i < len>>1; ++i) {
+ char tmp = seq[len-1-i];
+ seq[len-1-i] = seq[i]; seq[i] = tmp;
+ }
+ }
+}
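seq_reverse() operates on bwa's 2-bit codes (A=0, C=1, G=2, T=3, anything >= 4 means N); with is_comp set it builds the reverse complement in place and leaves Ns untouched. A self-contained copy exercising it on the toy read ACGTN (names and values here are illustrative only, not part of the upstream patch):

#include <stdio.h>

typedef unsigned char ubyte_t;

/* verbatim copy of seq_reverse() above so the example compiles on its own */
static void seq_reverse_demo(int len, ubyte_t *seq, int is_comp)
{
	int i;
	if (is_comp) {
		for (i = 0; i < len>>1; ++i) {
			char tmp = seq[len-1-i];
			if (tmp < 4) tmp = 3 - tmp;
			seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
			seq[i] = tmp;
		}
		if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i];
	} else {
		for (i = 0; i < len>>1; ++i) {
			char tmp = seq[len-1-i];
			seq[len-1-i] = seq[i]; seq[i] = tmp;
		}
	}
}

int main(void)
{
	ubyte_t read[5] = {0, 1, 2, 3, 4}; /* ACGTN in 2-bit encoding, 4 = N */
	int i;
	seq_reverse_demo(5, read, 1);      /* reverse complement in place */
	for (i = 0; i < 5; ++i) putchar("ACGTN"[read[i]]); /* prints NACGT */
	putchar('\n');
	return 0;
}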
+
+int bwa_trim_read(int trim_qual, bwa_seq_t *p)
+{
+ int s = 0, l, max = 0, max_l = p->len;
+ if (trim_qual < 1 || p->qual == 0) return 0;
+ for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) {
+ s += trim_qual - (p->qual[l] - 33);
+ if (s < 0) break;
+ if (s > max) max = s, max_l = l;
+ }
+ p->clip_len = p->len = max_l;
+ return p->full_len - p->len;
+}
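bwa_trim_read() is the classic -q quality trimming: walking in from the 3' end it accumulates (threshold - quality) and clips the read where that running sum peaks, never shortening it below BWA_MIN_RDLEN bases. A simplified re-statement of the same rule on a plain Phred+33 quality string (toy values, illustrative only):

#include <stdio.h>
#include <string.h>

#define MIN_RDLEN 35 /* mirrors BWA_MIN_RDLEN from bwtaln.h */

/* returns the kept length: clip at the 3' position where the running sum of
 * (threshold - quality) is maximal, but keep at least MIN_RDLEN bases */
static int trim_len(int trim_qual, int len, const char *qual /* Phred+33 */)
{
	int s = 0, l, max = 0, max_l = len;
	if (trim_qual < 1 || qual == 0) return len;
	for (l = len - 1; l >= MIN_RDLEN; --l) {
		s += trim_qual - (qual[l] - 33);
		if (s < 0) break;
		if (s > max) max = s, max_l = l;
	}
	return max_l;
}

int main(void)
{
	/* hypothetical 40bp read: 36 good bases (Q30, '?') then 4 bad ones (Q2, '#') */
	char qual[41];
	memset(qual, '?', 36); memset(qual + 36, '#', 4); qual[40] = 0;
	printf("kept %d of 40 bases at -q 15\n", trim_len(15, 40, qual)); /* prints 36 */
	return 0;
}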
+
+static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
+{
+ bwa_seq_t *seqs, *p;
+ int n_seqs, l, i;
+ long n_trimmed = 0, n_tot = 0;
+ bam1_t *b;
+ int res;
+
+ b = bam_init1();
+ n_seqs = 0;
+ seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
+ while ((res = bam_read1(bs->fp, b)) >= 0) {
+ uint8_t *s, *q;
+ int go = 0;
+ if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
+ if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
+		if ((bs->which & 4) && !(b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) go = 1;
+ if (go == 0) continue;
+ l = b->core.l_qseq;
+ p = &seqs[n_seqs++];
+		p->tid = -1; // not assigned to a thread

+ p->qual = 0;
+ p->full_len = p->clip_len = p->len = l;
+ n_tot += p->full_len;
+ s = bam1_seq(b); q = bam1_qual(b);
+ p->seq = (ubyte_t*)calloc(p->len + 1, 1);
+ p->qual = (ubyte_t*)calloc(p->len + 1, 1);
+ for (i = 0; i != p->full_len; ++i) {
+ p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
+ p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
+ }
+ if (bam1_strand(b)) { // then reverse
+ seq_reverse(p->len, p->seq, 1);
+ seq_reverse(p->len, p->qual, 0);
+ }
+ if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
+ p->rseq = (ubyte_t*)calloc(p->full_len, 1);
+ memcpy(p->rseq, p->seq, p->len);
+ seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
+ seq_reverse(p->len, p->rseq, is_comp);
+ p->name = strdup((const char*)bam1_qname(b));
+ if (n_seqs == n_needed) break;
+ }
+ if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
+ *n = n_seqs;
+ if (n_seqs && trim_qual >= 1)
+ fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
+ if (n_seqs == 0) {
+ free(seqs);
+ bam_destroy1(b);
+ return 0;
+ }
+ bam_destroy1(b);
+ return seqs;
+}
+
+#define BARCODE_LOW_QUAL 13
+
+bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
+{
+ bwa_seq_t *seqs, *p;
+ kseq_t *seq = bs->ks;
+ int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
+ long n_trimmed = 0, n_tot = 0;
+
+ if (l_bc > BWA_MAX_BCLEN) {
+ fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
+ return 0;
+ }
+ if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
+ n_seqs = 0;
+ seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
+ while ((l = kseq_read(seq)) >= 0) {
+ if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
+ // skip reads that are marked to be filtered by Casava
+ char *s = index(seq->comment.s, ':');
+ if (s && *(++s) == 'Y') {
+ continue;
+ }
+ }
+ if (is_64 && seq->qual.l)
+ for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
+		if (seq->seq.l <= l_bc) continue; // the sequence is no longer than the barcode; skip it
+ p = &seqs[n_seqs++];
+ if (l_bc) { // then trim barcode
+ for (i = 0; i < l_bc; ++i)
+ p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]);
+ p->bc[i] = 0;
+ for (; i < seq->seq.l; ++i)
+ seq->seq.s[i - l_bc] = seq->seq.s[i];
+ seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
+ if (seq->qual.l) {
+ for (i = l_bc; i < seq->qual.l; ++i)
+ seq->qual.s[i - l_bc] = seq->qual.s[i];
+ seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
+ }
+ l = seq->seq.l;
+ } else p->bc[0] = 0;
+		p->tid = -1; // not assigned to a thread
+ p->qual = 0;
+ p->full_len = p->clip_len = p->len = l;
+ n_tot += p->full_len;
+ p->seq = (ubyte_t*)calloc(p->full_len, 1);
+ for (i = 0; i != p->full_len; ++i)
+ p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]];
+ if (seq->qual.l) { // copy quality
+ p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
+ if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
+ }
+ p->rseq = (ubyte_t*)calloc(p->full_len, 1);
+ memcpy(p->rseq, p->seq, p->len);
+ seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
+ seq_reverse(p->len, p->rseq, is_comp);
+ p->name = strdup((const char*)seq->name.s);
+ { // trim /[12]$
+ int t = strlen(p->name);
+ if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
+ }
+ if (n_seqs == n_needed) break;
+ }
+ *n = n_seqs;
+ if (n_seqs && trim_qual >= 1)
+ fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
+ if (n_seqs == 0) {
+ free(seqs);
+ return 0;
+ }
+ return seqs;
+}
+
+void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
+{
+ int i, j;
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *p = seqs + i;
+ for (j = 0; j < p->n_multi; ++j)
+ if (p->multi[j].cigar) free(p->multi[j].cigar);
+ free(p->name);
+ free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
+ free(p->cigar);
+ }
+ free(seqs);
+}
diff --git a/ext/src/bwa/bwashm.c b/ext/src/bwa/bwashm.c
new file mode 100644
index 0000000..163f764
--- /dev/null
+++ b/ext/src/bwa/bwashm.c
@@ -0,0 +1,213 @@
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include "bwa.h"
+
+int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
+{
+ const char *name;
+ uint8_t *shm, *shm_idx;
+ uint16_t *cnt;
+ int shmid, to_init = 0, l;
+ char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn;
+
+ if (hint == 0 || hint[0] == 0) return -1;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+
+ if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) {
+ shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
+ to_init = 1;
+ }
+ if (shmid < 0) return -1;
+ ftruncate(shmid, BWA_CTL_SIZE);
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ if (to_init) {
+ memset(shm, 0, BWA_CTL_SIZE);
+ cnt[1] = 4;
+ }
+
+ if (idx->mem == 0) bwa_idx2mem(idx);
+
+ if (tmpfn) {
+ FILE *fp;
+ if ((fp = fopen(tmpfn, "wb")) != 0) {
+ int64_t rest = idx->l_mem;
+ while (rest > 0) {
+ int64_t l = rest < 0x1000000? rest : 0x1000000;
+ rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp);
+ }
+ fclose(fp);
+ free(idx->mem); idx->mem = 0;
+ } else {
+			fprintf(stderr, "[W::%s] failed to create the temporary file. Option '-f' is ignored.\n", __func__);
+ tmpfn = 0;
+ }
+ }
+
+ strcat(strcpy(path, "/bwaidx-"), name);
+ if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
+ shm_unlink(path);
+ perror("shm_open()");
+ return -1;
+ }
+ l = 8 + strlen(name) + 1;
+ if (cnt[1] + l > BWA_CTL_SIZE) return -1;
+ memcpy(shm + cnt[1], &idx->l_mem, 8);
+ memcpy(shm + cnt[1] + 8, name, l - 8);
+ cnt[1] += l; ++cnt[0];
+ ftruncate(shmid, idx->l_mem);
+ shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+ if (tmpfn) {
+ FILE *fp;
+ fp = fopen(tmpfn, "rb");
+ int64_t rest = idx->l_mem;
+ while (rest > 0) {
+ int64_t l = rest < 0x1000000? rest : 0x1000000;
+ rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp);
+ }
+ fclose(fp);
+ unlink(tmpfn);
+ } else {
+ memcpy(shm_idx, idx->mem, idx->l_mem);
+ free(idx->mem);
+ }
+ bwa_mem2idx(idx->l_mem, shm_idx, idx);
+ idx->is_shm = 1;
+ return 0;
+}
+
+bwaidx_t *bwa_idx_load_from_shm(const char *hint)
+{
+ const char *name;
+ uint8_t *shm, *shm_idx;
+ uint16_t *cnt, i;
+ char *p, path[PATH_MAX + 1];
+ int shmid;
+ int64_t l_mem;
+ bwaidx_t *idx;
+
+ if (hint == 0 || hint[0] == 0) return 0;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ if (cnt[0] == 0) return 0;
+ for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
+ memcpy(&l_mem, p, 8); p += 8;
+ if (strcmp(p, name) == 0) break;
+ p += strlen(p) + 1;
+ }
+ if (i == cnt[0]) return 0;
+
+ strcat(strcpy(path, "/bwaidx-"), name);
+ if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0;
+ shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
+ idx = calloc(1, sizeof(bwaidx_t));
+ bwa_mem2idx(l_mem, shm_idx, idx);
+ idx->is_shm = 1;
+ return idx;
+}
+
+int bwa_shm_test(const char *hint)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ const char *name;
+
+ if (hint == 0 || hint[0] == 0) return 0;
+ for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+ ++name;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ if (strcmp(p + 8, name) == 0) return 1;
+ p += strlen(p) + 9;
+ }
+ return 0;
+}
+
+int bwa_shm_list(void)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ int64_t l_mem;
+ memcpy(&l_mem, p, 8); p += 8;
+ printf("%s\t%ld\n", p, (long)l_mem);
+ p += strlen(p) + 1;
+ }
+ return 0;
+}
+
+int bwa_shm_destroy(void)
+{
+ int shmid;
+ uint16_t *cnt, i;
+ char *p, *shm;
+ char path[PATH_MAX + 1];
+
+ if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
+ shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+ cnt = (uint16_t*)shm;
+ for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+ int64_t l_mem;
+ memcpy(&l_mem, p, 8); p += 8;
+ strcat(strcpy(path, "/bwaidx-"), p);
+ shm_unlink(path);
+ p += strlen(p) + 1;
+ }
+ munmap(shm, BWA_CTL_SIZE);
+ shm_unlink("/bwactl");
+ return 0;
+}
+
+int main_shm(int argc, char *argv[])
+{
+ int c, to_list = 0, to_drop = 0, ret = 0;
+ char *tmpfn = 0;
+ while ((c = getopt(argc, argv, "ldf:")) >= 0) {
+ if (c == 'l') to_list = 1;
+ else if (c == 'd') to_drop = 1;
+ else if (c == 'f') tmpfn = optarg;
+ }
+ if (optind == argc && !to_list && !to_drop) {
+ fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
+ fprintf(stderr, "Options: -d destroy all indices in shared memory\n");
+ fprintf(stderr, " -l list names of indices in shared memory\n");
+ fprintf(stderr, " -f FILE temporary file to reduce peak memory\n\n");
+ return 1;
+ }
+ if (optind < argc && (to_list || to_drop)) {
+		fprintf(stderr, "[E::%s] option -l or -d cannot be used when 'idxbase' is present\n", __func__);
+ return 1;
+ }
+ if (optind < argc) {
+ if (bwa_shm_test(argv[optind]) == 0) {
+ bwaidx_t *idx;
+ idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
+ if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
+ fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
+ ret = 1;
+ }
+ bwa_idx_destroy(idx);
+ } else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
+ }
+ if (to_list) bwa_shm_list();
+ if (to_drop) bwa_shm_destroy();
+ return ret;
+}
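For orientation, the /bwactl control block manipulated above begins with two uint16 counters (number of staged indices and bytes used so far), followed by packed records of an 8-byte index size plus a NUL-terminated index name; bwa_shm_list() simply walks those records. A sketch of the same walk over an in-memory copy, assuming that layout (the buffer contents below are hypothetical, not part of the upstream patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* walk a copy of the /bwactl control block: two uint16 counters, then for each
 * staged index an 8-byte size followed by a NUL-terminated name */
static void list_records(const uint8_t *blk)
{
	uint16_t n, i;
	const char *p = (const char *)(blk + 4);
	memcpy(&n, blk, 2);
	for (i = 0; i < n; ++i) {
		int64_t l_mem;
		memcpy(&l_mem, p, 8); p += 8;
		printf("%s\t%ld\n", p, (long)l_mem);
		p += strlen(p) + 1;
	}
}

int main(void)
{
	/* hypothetical block describing one staged index named "ref.fa" of 1234 bytes */
	uint8_t blk[64] = {0};
	uint16_t n = 1, used = 4 + 8 + 7;
	int64_t l_mem = 1234;
	memcpy(blk, &n, 2);
	memcpy(blk + 2, &used, 2);
	memcpy(blk + 4, &l_mem, 8);
	memcpy(blk + 12, "ref.fa", 7);
	list_records(blk); /* prints: ref.fa  1234 */
	return 0;
}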
diff --git a/ext/src/bwa/bwt.c b/ext/src/bwa/bwt.c
new file mode 100644
index 0000000..859b16a
--- /dev/null
+++ b/ext/src/bwa/bwt.c
@@ -0,0 +1,469 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <limits.h>
+#include "bwa/utils.h"
+#include "bwa/bwt.h"
+#include "kvec.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+void bwt_gen_cnt_table(bwt_t *bwt)
+{
+ int i, j;
+ for (i = 0; i != 256; ++i) {
+ uint32_t x = 0;
+ for (j = 0; j != 4; ++j)
+ x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
+ bwt->cnt_table[i] = x;
+ }
+}
+
+static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
+{
+ bwtint_t x = k - (k > bwt->primary);
+ x = bwt_B0(bwt, x);
+ x = bwt->L2[x] + bwt_occ(bwt, k, x);
+ return k == bwt->primary? 0 : x;
+}
+
+// bwt->bwt and bwt->occ must be precalculated
+void bwt_cal_sa(bwt_t *bwt, int intv)
+{
+ bwtint_t isa, sa, i; // S(isa) = sa
+ int intv_round = intv;
+
+ kv_roundup32(intv_round);
+ xassert(intv_round == intv, "SA sample interval is not a power of 2.");
+ xassert(bwt->bwt, "bwt_t::bwt is not initialized.");
+
+ if (bwt->sa) free(bwt->sa);
+ bwt->sa_intv = intv;
+ bwt->n_sa = (bwt->seq_len + intv) / intv;
+ bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
+ // calculate SA value
+ isa = 0; sa = bwt->seq_len;
+ for (i = 0; i < bwt->seq_len; ++i) {
+ if (isa % intv == 0) bwt->sa[isa/intv] = sa;
+ --sa;
+ isa = bwt_invPsi(bwt, isa);
+ }
+ if (isa % intv == 0) bwt->sa[isa/intv] = sa;
+ bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len
+}
+
+bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k)
+{
+ bwtint_t sa = 0, mask = bwt->sa_intv - 1;
+ while (k & mask) {
+ ++sa;
+ k = bwt_invPsi(bwt, k);
+ }
+ /* without setting bwt->sa[0] = -1, the following line should be
+ changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */
+ return sa + bwt->sa[k/bwt->sa_intv];
+}
+
+static inline int __occ_aux(uint64_t y, int c)
+{
+ // reduce nucleotide counting to bits counting
+ y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
+ // count the number of 1s in y
+ y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
+ return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
+}
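__occ_aux() packs 32 two-bit bases into one 64-bit word and counts how many of them equal a given base c: the first line turns every matching base into the bit pattern 01 and everything else into 00, and the remaining lines are a standard SWAR population count. A standalone copy checking it on a word that contains each base exactly eight times (demo only, not part of the upstream patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static int occ_aux_demo(uint64_t y, int c)
{
	y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
	y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
	return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
}

int main(void)
{
	/* word encoding the bases A,C,G,T,A,C,G,T,... (2 bits per base, A=0..T=3) */
	uint64_t w = 0;
	int i;
	for (i = 0; i < 32; ++i) w = w << 2 | (uint64_t)(i & 3);
	for (i = 0; i < 4; ++i) assert(occ_aux_demo(w, i) == 8); /* 8 of each base */
	printf("each base occurs %d times\n", occ_aux_demo(w, 0));
	return 0;
}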
+
+bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
+{
+ bwtint_t n;
+ uint32_t *p, *end;
+
+ if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
+ if (k == (bwtint_t)(-1)) return 0;
+ k -= (k >= bwt->primary); // because $ is not in bwt
+
+ // retrieve Occ at k/OCC_INTERVAL
+ n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
+ p += sizeof(bwtint_t); // jump to the start of the first BWT cell
+
+ // calculate Occ up to the last k/32
+ end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
+ for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
+
+ // calculate Occ
+ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
+ if (c == 0) n -= ~k&31; // corrected for the masked bits
+
+ return n;
+}
+
+// an analogy to bwt_occ() but more efficient, requiring k <= l
+void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
+{
+ bwtint_t _k, _l;
+ _k = (k >= bwt->primary)? k-1 : k;
+ _l = (l >= bwt->primary)? l-1 : l;
+ if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
+ *ok = bwt_occ(bwt, k, c);
+ *ol = bwt_occ(bwt, l, c);
+ } else {
+ bwtint_t m, n, i, j;
+ uint32_t *p;
+ if (k >= bwt->primary) --k;
+ if (l >= bwt->primary) --l;
+ n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
+ p += sizeof(bwtint_t);
+ // calculate *ok
+ j = k >> 5 << 5;
+ for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2)
+ n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
+ m = n;
+ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
+ if (c == 0) n -= ~k&31; // corrected for the masked bits
+ *ok = n;
+ // calculate *ol
+ j = l >> 5 << 5;
+ for (; i < j; i += 32, p += 2)
+ m += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
+ m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c);
+ if (c == 0) m -= ~l&31; // corrected for the masked bits
+ *ol = m;
+ }
+}
+
+#define __occ_aux4(bwt, b) \
+ ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \
+ + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
+
+void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
+{
+ bwtint_t x;
+ uint32_t *p, tmp, *end;
+ if (k == (bwtint_t)(-1)) {
+ memset(cnt, 0, 4 * sizeof(bwtint_t));
+ return;
+ }
+ k -= (k >= bwt->primary); // because $ is not in bwt
+ p = bwt_occ_intv(bwt, k);
+ memcpy(cnt, p, 4 * sizeof(bwtint_t));
+ p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
+ end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
+ for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
+ tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
+ x += __occ_aux4(bwt, tmp) - (~k&15);
+ cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
+}
+
+// an analogy to bwt_occ4() but more efficient, requiring k <= l
+void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
+{
+ bwtint_t _k, _l;
+ _k = k - (k >= bwt->primary);
+ _l = l - (l >= bwt->primary);
+ if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
+ bwt_occ4(bwt, k, cntk);
+ bwt_occ4(bwt, l, cntl);
+ } else {
+ bwtint_t x, y;
+ uint32_t *p, tmp, *endk, *endl;
+ k -= (k >= bwt->primary); // because $ is not in bwt
+ l -= (l >= bwt->primary);
+ p = bwt_occ_intv(bwt, k);
+ memcpy(cntk, p, 4 * sizeof(bwtint_t));
+ p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
+ // prepare cntk[]
+ endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
+ endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
+ for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
+ y = x;
+ tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
+ x += __occ_aux4(bwt, tmp) - (~k&15);
+ // calculate cntl[] and finalize cntk[]
+ for (; p < endl; ++p) y += __occ_aux4(bwt, *p);
+ tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
+ y += __occ_aux4(bwt, tmp) - (~l&15);
+ memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
+ cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
+ cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
+ }
+}
+
+int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end)
+{
+ bwtint_t k, l, ok, ol;
+ int i;
+ k = 0; l = bwt->seq_len;
+ for (i = len - 1; i >= 0; --i) {
+ ubyte_t c = str[i];
+ if (c > 3) return 0; // no match
+ bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
+ k = bwt->L2[c] + ok + 1;
+ l = bwt->L2[c] + ol;
+ if (k > l) break; // no match
+ }
+ if (k > l) return 0; // no match
+ if (sa_begin) *sa_begin = k;
+ if (sa_end) *sa_end = l;
+ return l - k + 1;
+}
+
+int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0)
+{
+ int i;
+ bwtint_t k, l, ok, ol;
+ k = *k0; l = *l0;
+ for (i = len - 1; i >= 0; --i) {
+ ubyte_t c = str[i];
+ if (c > 3) return 0; // there is an N here. no match
+ bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
+ k = bwt->L2[c] + ok + 1;
+ l = bwt->L2[c] + ol;
+ if (k > l) return 0; // no match
+ }
+ *k0 = k; *l0 = l;
+ return l - k + 1;
+}
+
+/*********************
+ * Bidirectional BWT *
+ *********************/
+
+void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back)
+{
+ bwtint_t tk[4], tl[4];
+ int i;
+ bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl);
+ for (i = 0; i != 4; ++i) {
+ ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i];
+ ok[i].x[2] = tl[i] - tk[i];
+ }
+ ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary);
+ ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2];
+ ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2];
+ ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2];
+}
+
+static void bwt_reverse_intvs(bwtintv_v *p)
+{
+ if (p->n > 1) {
+ int j;
+ for (j = 0; j < p->n>>1; ++j) {
+ bwtintv_t tmp = p->a[p->n - 1 - j];
+ p->a[p->n - 1 - j] = p->a[j];
+ p->a[j] = tmp;
+ }
+ }
+}
+// NOTE: $max_intv is not currently used in BWA-MEM
+int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+{
+ int i, j, c, ret;
+ bwtintv_t ik, ok[4];
+ bwtintv_v a[2], *prev, *curr, *swap;
+
+ mem->n = 0;
+ if (q[x] > 3) return x + 1;
+ if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
+ kv_init(a[0]); kv_init(a[1]);
+ prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
+ curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
+ bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
+ ik.info = x + 1;
+
+ for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
+ if (ik.x[2] < max_intv) { // an interval small enough
+ kv_push(bwtintv_t, *curr, ik);
+ break;
+ } else if (q[i] < 4) { // an A/C/G/T base
+ c = 3 - q[i]; // complement of q[i]
+ bwt_extend(bwt, &ik, ok, 0);
+ if (ok[c].x[2] != ik.x[2]) { // change of the interval size
+ kv_push(bwtintv_t, *curr, ik);
+ if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
+ }
+ ik = ok[c]; ik.info = i + 1;
+ } else { // an ambiguous base
+ kv_push(bwtintv_t, *curr, ik);
+			break; // always terminate extension at an ambiguous base; in this case, i < len always holds
+ }
+ }
+ if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
+ bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
+ ret = curr->a[0].info; // this will be the returned value
+ swap = curr; curr = prev; prev = swap;
+
+ for (i = x - 1; i >= -1; --i) { // backward search for MEMs
+ c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
+ for (j = 0, curr->n = 0; j < prev->n; ++j) {
+ bwtintv_t *p = &prev->a[j];
+ if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
+ if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
+ if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
+ if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
+ ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
+ kv_push(bwtintv_t, *mem, ik);
+ }
+ } // otherwise the match is contained in another longer match
+ } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
+ ok[c].info = p->info;
+ kv_push(bwtintv_t, *curr, ok[c]);
+ }
+ }
+ if (curr->n == 0) break;
+ swap = curr; curr = prev; prev = swap;
+ }
+ bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
+
+ if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
+ if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
+ return ret;
+}
+
+int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+{
+ return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
+}
+
+int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
+{
+ int i, c;
+ bwtintv_t ik, ok[4];
+
+ memset(mem, 0, sizeof(bwtintv_t));
+ if (q[x] > 3) return x + 1;
+ bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
+ for (i = x + 1; i < len; ++i) { // forward search
+ if (q[i] < 4) { // an A/C/G/T base
+ c = 3 - q[i]; // complement of q[i]
+ bwt_extend(bwt, &ik, ok, 0);
+ if (ok[c].x[2] < max_intv && i - x >= min_len) {
+ *mem = ok[c];
+ mem->info = (uint64_t)x<<32 | (i + 1);
+ return i + 1;
+ }
+ ik = ok[c];
+ } else return i + 1;
+ }
+ return len;
+}
+
+/*************************
+ * Read/write BWT and SA *
+ *************************/
+
+void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
+{
+ FILE *fp;
+ fp = xopen(fn, "wb");
+ err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
+ err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
+ err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
+ err_fflush(fp);
+ err_fclose(fp);
+}
+
+void bwt_dump_sa(const char *fn, const bwt_t *bwt)
+{
+ FILE *fp;
+ fp = xopen(fn, "wb");
+ err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
+ err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
+ err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
+ err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
+ err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
+ err_fflush(fp);
+ err_fclose(fp);
+}
+
+static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
+{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks
+ const int bufsize = 0x1000000; // 16M block
+ bwtint_t offset = 0;
+ while (size) {
+ int x = bufsize < size? bufsize : size;
+ if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break;
+ size -= x; offset += x;
+ }
+ return offset;
+}
+
+void bwt_restore_sa(const char *fn, bwt_t *bwt)
+{
+ char skipped[256];
+ FILE *fp;
+ bwtint_t primary;
+
+ fp = xopen(fn, "rb");
+ err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
+ xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
+ err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip
+ err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
+ err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp);
+ xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
+
+ bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
+ bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
+ bwt->sa[0] = -1;
+
+ fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1);
+ err_fclose(fp);
+}
+
+bwt_t *bwt_restore_bwt(const char *fn)
+{
+ bwt_t *bwt;
+ FILE *fp;
+
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+ fp = xopen(fn, "rb");
+ err_fseek(fp, 0, SEEK_END);
+ bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
+ bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
+ err_fseek(fp, 0, SEEK_SET);
+ err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp);
+ err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp);
+ fread_fix(fp, bwt->bwt_size<<2, bwt->bwt);
+ bwt->seq_len = bwt->L2[4];
+ err_fclose(fp);
+ bwt_gen_cnt_table(bwt);
+
+ return bwt;
+}
+
+void bwt_destroy(bwt_t *bwt)
+{
+ if (bwt == 0) return;
+ free(bwt->sa); free(bwt->bwt);
+ free(bwt);
+}
diff --git a/ext/src/bwa/bwt_lite.c b/ext/src/bwa/bwt_lite.c
new file mode 100644
index 0000000..f7946f5
--- /dev/null
+++ b/ext/src/bwa/bwt_lite.c
@@ -0,0 +1,98 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "bwt_lite.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+int is_sa(const uint8_t *T, int *SA, int n);
+int is_bwt(uint8_t *T, int n);
+
+bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq)
+{
+ bwtl_t *b;
+ int i;
+ b = (bwtl_t*)calloc(1, sizeof(bwtl_t));
+ b->seq_len = len;
+
+ { // calculate b->bwt
+ uint8_t *s;
+ b->sa = (uint32_t*)calloc(len + 1, 4);
+ is_sa(seq, (int*)b->sa, len);
+ s = (uint8_t*)calloc(len + 1, 1);
+ for (i = 0; i <= len; ++i) {
+ if (b->sa[i] == 0) b->primary = i;
+ else s[i] = seq[b->sa[i] - 1];
+ }
+ for (i = b->primary; i < len; ++i) s[i] = s[i + 1];
+ b->bwt_size = (len + 15) / 16;
+ b->bwt = (uint32_t*)calloc(b->bwt_size, 4);
+ for (i = 0; i < len; ++i)
+ b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1);
+ free(s);
+ }
+ { // calculate b->occ
+ uint32_t c[4];
+ b->n_occ = (len + 15) / 16 * 4;
+ b->occ = (uint32_t*)calloc(b->n_occ, 4);
+ memset(c, 0, 16);
+ for (i = 0; i < len; ++i) {
+ if (i % 16 == 0)
+ memcpy(b->occ + (i/16) * 4, c, 16);
+ ++c[bwtl_B0(b, i)];
+ }
+ memcpy(b->L2+1, c, 16);
+ for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1];
+ }
+ { // generate cnt_table
+ for (i = 0; i != 256; ++i) {
+ u_int32_t j, x = 0;
+ for (j = 0; j != 4; ++j)
+ x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3);
+ b->cnt_table[i] = x;
+ }
+ }
+ return b;
+}
+uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
+{
+ uint32_t n, b;
+ if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
+ if (k == (uint32_t)(-1)) return 0;
+ if (k >= bwt->primary) --k; // because $ is not in bwt
+ n = bwt->occ[k/16<<2|c];
+ b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1);
+ n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
+ + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff;
+ if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
+ return n;
+}
+void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
+{
+ uint32_t x, b;
+ if (k == (uint32_t)(-1)) {
+ memset(cnt, 0, 16);
+ return;
+ }
+ if (k >= bwt->primary) --k; // because $ is not in bwt
+ memcpy(cnt, bwt->occ + (k>>4<<2), 16);
+ b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1);
+ x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff]
+ + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24];
+ x -= 15 - (k&15);
+ cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
+}
+void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
+{
+ bwtl_occ4(bwt, k, cntk);
+ bwtl_occ4(bwt, l, cntl);
+}
+void bwtl_destroy(bwtl_t *bwt)
+{
+ if (bwt) {
+ free(bwt->occ); free(bwt->bwt); free(bwt->sa);
+ free(bwt);
+ }
+}
diff --git a/ext/src/bwa/bwt_lite.h b/ext/src/bwa/bwt_lite.h
new file mode 100644
index 0000000..4fadcce
--- /dev/null
+++ b/ext/src/bwa/bwt_lite.h
@@ -0,0 +1,29 @@
+#ifndef BWT_LITE_H_
+#define BWT_LITE_H_
+
+#include <stdint.h>
+
+typedef struct {
+ uint32_t seq_len, bwt_size, n_occ;
+ uint32_t primary;
+ uint32_t *bwt, *occ, *sa, L2[5];
+ uint32_t cnt_table[256];
+} bwtl_t;
+
+#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq);
+ uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c);
+ void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]);
+ void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]);
+ void bwtl_destroy(bwtl_t *bwt);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/src/bwa/bwtaln.c b/ext/src/bwa/bwtaln.c
new file mode 100644
index 0000000..20b01cd
--- /dev/null
+++ b/ext/src/bwa/bwtaln.c
@@ -0,0 +1,320 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include "bwtaln.h"
+#include "bwtgap.h"
+#include "utils.h"
+#include "bwa.h"
+
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+gap_opt_t *gap_init_opt()
+{
+ gap_opt_t *o;
+ o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t));
+ /* IMPORTANT: s_mm*10 should be about the average base error
+	   rate. Violating this requirement will break pairing! */
+ o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4;
+ o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6;
+ o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000;
+ o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD;
+ o->seed_len = 32; o->max_seed_diff = 2;
+ o->fnr = 0.04;
+ o->n_threads = 1;
+ o->max_top2 = 30;
+ o->trim_qual = 0;
+ return o;
+}
+
+int bwa_cal_maxdiff(int l, double err, double thres)
+{
+ double elambda = exp(-l * err);
+ double sum, y = 1.0;
+ int k, x = 1;
+ for (k = 1, sum = elambda; k < 1000; ++k) {
+ y *= l * err;
+ x *= k;
+ sum += elambda * y / x;
+ if (1.0 - sum < thres) return k;
+ }
+ return 2;
+}
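bwa_cal_maxdiff() drives the -n defaults: it models the number of sequencing errors in an l-bp read as Poisson with mean l*err and returns the smallest k for which the tail probability P(errors > k) falls below thres. With the defaults above (err = BWA_AVG_ERR = 0.02, thres = 0.04) a 100 bp read gets max_diff = 5; the standalone copy below reproduces that (link with -lm; illustrative only):

#include <math.h>
#include <stdio.h>

/* copy of bwa_cal_maxdiff() above: smallest k with Poisson tail P(X > k) < thres */
static int cal_maxdiff(int l, double err, double thres)
{
	double elambda = exp(-l * err);
	double sum, y = 1.0;
	int k, x = 1;
	for (k = 1, sum = elambda; k < 1000; ++k) {
		y *= l * err;
		x *= k;
		sum += elambda * y / x;
		if (1.0 - sum < thres) return k;
	}
	return 2;
}

int main(void)
{
	/* defaults used above: per-base error 0.02 (BWA_AVG_ERR), threshold 0.04 (-n) */
	printf("100bp reads: max_diff = %d\n", cal_maxdiff(100, 0.02, 0.04)); /* prints 5 */
	return 0;
}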
+
+// width must be filled as zero
+int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width)
+{
+ bwtint_t k, l, ok, ol;
+ int i, bid;
+ bid = 0;
+ k = 0; l = bwt->seq_len;
+ for (i = 0; i < len; ++i) {
+ ubyte_t c = str[i];
+ if (c < 4) {
+ bwt_2occ(bwt, k - 1, l, c, &ok, &ol);
+ k = bwt->L2[c] + ok + 1;
+ l = bwt->L2[c] + ol;
+ }
+ if (k > l || c > 3) { // then restart
+ k = 0;
+ l = bwt->seq_len;
+ ++bid;
+ }
+ width[i].w = l - k + 1;
+ width[i].bid = bid;
+ }
+ width[len].w = 0;
+ width[len].bid = ++bid;
+ return bid;
+}
+
+void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt)
+{
+ int i, j, max_l = 0, max_len;
+ gap_stack_t *stack;
+ bwt_width_t *w, *seed_w;
+ gap_opt_t local_opt = *opt;
+
+ // initiate priority stack
+ for (i = max_len = 0; i != n_seqs; ++i)
+ if (seqs[i].len > max_len) max_len = seqs[i].len;
+ if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr);
+ if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff;
+ stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt);
+
+ seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t));
+ w = 0;
+ for (i = 0; i != n_seqs; ++i) {
+ bwa_seq_t *p = seqs + i;
+#ifdef HAVE_PTHREAD
+ if (i % opt->n_threads != tid) continue;
+#endif
+ p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0;
+ if (max_l < p->len) {
+ max_l = p->len;
+ w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t));
+ memset(w, 0, (max_l + 1) * sizeof(bwt_width_t));
+ }
+ bwt_cal_width(bwt, p->len, p->seq, w);
+ if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr);
+ local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff;
+ if (p->len > opt->seed_len)
+ bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w);
+ // core function
+ for (j = 0; j < p->len; ++j) // we need to complement
+ p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j];
+ p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack);
+ //fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo);
+ // clean up the unused data in the record
+ free(p->name); free(p->seq); free(p->rseq); free(p->qual);
+ p->name = 0; p->seq = p->rseq = p->qual = 0;
+ }
+ free(seed_w); free(w);
+ gap_destroy_stack(stack);
+}
+
+#ifdef HAVE_PTHREAD
+typedef struct {
+ int tid;
+ bwt_t *bwt;
+ int n_seqs;
+ bwa_seq_t *seqs;
+ const gap_opt_t *opt;
+} thread_aux_t;
+
+static void *worker(void *data)
+{
+ thread_aux_t *d = (thread_aux_t*)data;
+ bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt);
+ return 0;
+}
+#endif
+
+bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa)
+{
+ bwa_seqio_t *ks;
+ if (mode & BWA_MODE_BAM) { // open BAM
+ int which = 0;
+ if (mode & BWA_MODE_BAM_SE) which |= 4;
+ if (mode & BWA_MODE_BAM_READ1) which |= 1;
+ if (mode & BWA_MODE_BAM_READ2) which |= 2;
+ if (which == 0) which = 7; // then read all reads
+ ks = bwa_bam_open(fn_fa, which);
+ } else ks = bwa_seq_open(fn_fa);
+ return ks;
+}
+
+void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
+{
+ int i, n_seqs, tot_seqs = 0;
+ bwa_seq_t *seqs;
+ bwa_seqio_t *ks;
+ clock_t t;
+ bwt_t *bwt;
+
+ // initialization
+ ks = bwa_open_reads(opt->mode, fn_fa);
+
+ { // load BWT
+ char *str = (char*)calloc(strlen(prefix) + 10, 1);
+ strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
+ free(str);
+ }
+
+ // core loop
+ err_fwrite(SAI_MAGIC, 1, 4, stdout);
+ err_fwrite(opt, sizeof(gap_opt_t), 1, stdout);
+ while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) {
+ tot_seqs += n_seqs;
+ t = clock();
+
+ fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... ");
+
+#ifdef HAVE_PTHREAD
+ if (opt->n_threads <= 1) { // no multi-threading at all
+ bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
+ } else {
+ pthread_t *tid;
+ pthread_attr_t attr;
+ thread_aux_t *data;
+ int j;
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+ data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
+ tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
+ for (j = 0; j < opt->n_threads; ++j) {
+ data[j].tid = j; data[j].bwt = bwt;
+ data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt;
+ pthread_create(&tid[j], &attr, worker, data + j);
+ }
+ for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
+ free(data); free(tid);
+ }
+#else
+ bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt);
+#endif
+
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+
+ t = clock();
+ fprintf(stderr, "[bwa_aln_core] write to the disk... ");
+ for (i = 0; i < n_seqs; ++i) {
+ bwa_seq_t *p = seqs + i;
+ err_fwrite(&p->n_aln, 4, 1, stdout);
+ if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout);
+ }
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+
+ bwa_free_read_seq(n_seqs, seqs);
+ fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs);
+ }
+
+ // destroy
+ bwt_destroy(bwt);
+ bwa_seq_close(ks);
+}
+
+int bwa_aln(int argc, char *argv[])
+{
+ int c, opte = -1;
+ gap_opt_t *opt;
+ char *prefix;
+
+ opt = gap_init_opt();
+ while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
+ switch (c) {
+ case 'n':
+ if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
+ else opt->max_diff = atoi(optarg), opt->fnr = -1.0;
+ break;
+ case 'o': opt->max_gapo = atoi(optarg); break;
+ case 'e': opte = atoi(optarg); break;
+ case 'M': opt->s_mm = atoi(optarg); break;
+ case 'O': opt->s_gapo = atoi(optarg); break;
+ case 'E': opt->s_gape = atoi(optarg); break;
+ case 'd': opt->max_del_occ = atoi(optarg); break;
+ case 'i': opt->indel_end_skip = atoi(optarg); break;
+ case 'l': opt->seed_len = atoi(optarg); break;
+ case 'k': opt->max_seed_diff = atoi(optarg); break;
+ case 'm': opt->max_entries = atoi(optarg); break;
+ case 't': opt->n_threads = atoi(optarg); break;
+ case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
+ case 'R': opt->max_top2 = atoi(optarg); break;
+ case 'q': opt->trim_qual = atoi(optarg); break;
+ case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
+ case 'f': xreopen(optarg, "wb", stdout); break;
+ case 'b': opt->mode |= BWA_MODE_BAM; break;
+ case '0': opt->mode |= BWA_MODE_BAM_SE; break;
+ case '1': opt->mode |= BWA_MODE_BAM_READ1; break;
+ case '2': opt->mode |= BWA_MODE_BAM_READ2; break;
+ case 'I': opt->mode |= BWA_MODE_IL13; break;
+ case 'Y': opt->mode |= BWA_MODE_CFY; break;
+ case 'B': opt->mode |= atoi(optarg) << 24; break;
+ default: return 1;
+ }
+ }
+ if (opte > 0) {
+ opt->max_gape = opte;
+ opt->mode &= ~BWA_MODE_GAPE;
+ }
+
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa aln [options] <prefix> <in.fq>\n\n");
+ fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n",
+ BWA_AVG_ERR, opt->fnr);
+ fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo);
+ fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n");
+ fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip);
+ fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ);
+ fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len);
+ fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff);
+ fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries);
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
+ fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm);
+ fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo);
+ fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape);
+ fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2);
+ fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
+ fprintf(stderr, " -f FILE file to write output to instead of stdout\n");
+ fprintf(stderr, " -B INT length of barcode\n");
+ fprintf(stderr, " -L log-scaled gap penalty for long deletions\n");
+ fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n");
+ fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n");
+ fprintf(stderr, " -b the input read file is in the BAM format\n");
+ fprintf(stderr, " -0 use single-end reads only (effective with -b)\n");
+ fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n");
+ fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n");
+ fprintf(stderr, " -Y filter Casava-filtered sequences\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ if (opt->fnr > 0.0) {
+ int i, k;
+ for (i = 17, k = 0; i <= 250; ++i) {
+ int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
+ if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l);
+ k = l;
+ }
+ }
+ if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
+		fprintf(stderr, "[%s] failed to locate the index\n", __func__);
+ free(opt);
+ return 1;
+ }
+ bwa_aln_core(prefix, argv[optind+1], opt);
+ free(opt); free(prefix);
+ return 0;
+}
diff --git a/ext/src/bwa/bwtaln.h b/ext/src/bwa/bwtaln.h
new file mode 100644
index 0000000..4616ff5
--- /dev/null
+++ b/ext/src/bwa/bwtaln.h
@@ -0,0 +1,153 @@
+#ifndef BWTALN_H
+#define BWTALN_H
+
+#include <stdint.h>
+#include "bwt.h"
+
+#define BWA_TYPE_NO_MATCH 0
+#define BWA_TYPE_UNIQUE 1
+#define BWA_TYPE_REPEAT 2
+#define BWA_TYPE_MATESW 3
+
+#define SAM_FPD 1 // paired
+#define SAM_FPP 2 // properly paired
+#define SAM_FSU 4 // self-unmapped
+#define SAM_FMU 8 // mate-unmapped
+#define SAM_FSR 16 // self on the reverse strand
+#define SAM_FMR 32 // mate on the reverse strand
+#define SAM_FR1 64 // this is read one
+#define SAM_FR2 128 // this is read two
+#define SAM_FSC 256 // secondary alignment
+
+#define BWA_AVG_ERR 0.02
+#define BWA_MIN_RDLEN 35 // for read trimming
+
+#define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum
+
+#ifndef bns_pac
+#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3)
+#endif
+
+#define FROM_M 0
+#define FROM_I 1
+#define FROM_D 2
+#define FROM_S 3
+
+#define SAI_MAGIC "SAI\1"
+
+typedef struct {
+ bwtint_t w;
+ int bid;
+} bwt_width_t;
+
+typedef struct {
+ uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10;
+ bwtint_t k, l;
+} bwt_aln1_t;
+
+typedef uint16_t bwa_cigar_t;
+/* rgoya: If changing order of bytes, beware of operations like:
+ * s->cigar[0] += s->full_len - s->len;
+ */
+#define CIGAR_OP_SHIFT 14
+#define CIGAR_LN_MASK 0x3fff
+
+#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT)
+#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK)
+#define __cigar_create(__op, __len) ((__op)<<CIGAR_OP_SHIFT | (__len))
+
+typedef struct {
+ uint32_t n_cigar:15, gap:8, mm:8, strand:1;
+ int ref_shift;
+ bwtint_t pos;
+ bwa_cigar_t *cigar;
+} bwt_multi1_t;
+
+typedef struct {
+ char *name;
+ ubyte_t *seq, *rseq, *qual;
+ uint32_t len:20, strand:1, type:2, dummy:1, extra_flag:8;
+ uint32_t n_mm:8, n_gapo:8, n_gape:8, mapQ:8;
+ int score;
+ int clip_len;
+ // alignments in SA coordinates
+ int n_aln;
+ bwt_aln1_t *aln;
+ // multiple hits
+ int n_multi;
+ bwt_multi1_t *multi;
+ // alignment information
+ bwtint_t sa, pos;
+ uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
+ int ref_shift;
+ int n_cigar;
+ bwa_cigar_t *cigar;
+ // for multi-threading only
+ int tid;
+ // barcode
+ char bc[BWA_MAX_BCLEN+1]; // null terminated; up to BWA_MAX_BCLEN bases
+ // NM and MD tags
+ uint32_t full_len:20, nm:12;
+ char *md;
+} bwa_seq_t;
+
+#define BWA_MODE_GAPE 0x01
+#define BWA_MODE_COMPREAD 0x02
+#define BWA_MODE_LOGGAP 0x04
+#define BWA_MODE_CFY 0x08
+#define BWA_MODE_NONSTOP 0x10
+#define BWA_MODE_BAM 0x20
+#define BWA_MODE_BAM_SE 0x40
+#define BWA_MODE_BAM_READ1 0x80
+#define BWA_MODE_BAM_READ2 0x100
+#define BWA_MODE_IL13 0x200
+
+typedef struct {
+ int s_mm, s_gapo, s_gape;
+ int mode; // bit 24-31 are the barcode length
+ int indel_end_skip, max_del_occ, max_entries;
+ float fnr;
+ int max_diff, max_gapo, max_gape;
+ int max_seed_diff, seed_len;
+ int n_threads;
+ int max_top2;
+ int trim_qual;
+} gap_opt_t;
+
+#define BWA_PET_STD 1
+
+typedef struct {
+ int max_isize, force_isize;
+ int max_occ;
+ int n_multi, N_multi;
+ int type, is_sw, is_preload;
+ double ap_prior;
+} pe_opt_t;
+
+struct __bwa_seqio_t;
+typedef struct __bwa_seqio_t bwa_seqio_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ gap_opt_t *gap_init_opt();
+ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt);
+
+ bwa_seqio_t *bwa_seq_open(const char *fn);
+ bwa_seqio_t *bwa_bam_open(const char *fn, int which);
+ void bwa_seq_close(bwa_seqio_t *bs);
+ void seq_reverse(int len, ubyte_t *seq, int is_comp);
+ bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual);
+ void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs);
+
+ int bwa_cal_maxdiff(int l, double err, double thres);
+ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt);
+
+ void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/src/bwa/bwtgap.c b/ext/src/bwa/bwtgap.c
new file mode 100644
index 0000000..08bc1f4
--- /dev/null
+++ b/ext/src/bwa/bwtgap.c
@@ -0,0 +1,264 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bwtgap.h"
+#include "bwtaln.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#define STATE_M 0
+#define STATE_I 1
+#define STATE_D 2
+
+#define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape)
+
+gap_stack_t *gap_init_stack2(int max_score)
+{
+ gap_stack_t *stack;
+ stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t));
+ stack->n_stacks = max_score;
+ stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t));
+ return stack;
+}
+
+gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt)
+{
+ return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt));
+}
+
+void gap_destroy_stack(gap_stack_t *stack)
+{
+ int i;
+ for (i = 0; i != stack->n_stacks; ++i) free(stack->stacks[i].stack);
+ free(stack->stacks);
+ free(stack);
+}
+
+static void gap_reset_stack(gap_stack_t *stack)
+{
+ int i;
+ for (i = 0; i != stack->n_stacks; ++i)
+ stack->stacks[i].n_entries = 0;
+ stack->best = stack->n_stacks;
+ stack->n_entries = 0;
+}
+
+static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del,
+ int state, int is_diff, const gap_opt_t *opt)
+{
+ int score;
+ gap_entry_t *p;
+ gap_stack1_t *q;
+ score = aln_score(n_mm, n_gapo, n_gape, opt);
+ q = stack->stacks + score;
+ if (q->n_entries == q->m_entries) {
+ q->m_entries = q->m_entries? q->m_entries<<1 : 4;
+ q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries);
+ }
+ p = q->stack + q->n_entries;
+ p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l;
+ p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape;
+ p->n_ins = n_ins; p->n_del = n_del;
+ p->state = state;
+ p->last_diff_pos = is_diff? i : 0;
+ ++(q->n_entries);
+ ++(stack->n_entries);
+ if (stack->best > score) stack->best = score;
+}
+
+static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e)
+{
+ gap_stack1_t *q;
+ q = stack->stacks + stack->best;
+ *e = q->stack[q->n_entries - 1];
+ --(q->n_entries);
+ --(stack->n_entries);
+ if (q->n_entries == 0 && stack->n_entries) { // reset best
+ int i;
+ for (i = stack->best + 1; i < stack->n_stacks; ++i)
+ if (stack->stacks[i].n_entries != 0) break;
+ stack->best = i;
+ } else if (stack->n_entries == 0) stack->best = stack->n_stacks;
+}
+
+static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w)
+{
+ int i, j;
+ for (i = j = 0; i < last_diff_pos; ++i) {
+ if (w[i].w > x) w[i].w -= x;
+ else if (w[i].w == x) {
+ w[i].bid = 1;
+ w[i].w = max - (++j);
+ } // else should not happen
+ }
+}
+
+static inline int int_log2(uint32_t v)
+{
+ int c = 0;
+ if (v & 0xffff0000u) { v >>= 16; c |= 16; }
+ if (v & 0xff00) { v >>= 8; c |= 8; }
+ if (v & 0xf0) { v >>= 4; c |= 4; }
+ if (v & 0xc) { v >>= 2; c |= 2; }
+ if (v & 0x2) c |= 1;
+ return c;
+}
+
+bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width,
+ bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack)
+{ // $seq is the reverse complement of the input read
+ int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt);
+ int best_diff = opt->max_diff + 1, max_diff = opt->max_diff;
+ int best_cnt = 0;
+ int max_entries = 0, j, _j, n_aln, m_aln;
+ bwt_aln1_t *aln;
+
+ m_aln = 4; n_aln = 0;
+ aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t));
+
+ // check whether there are too many N
+ for (j = _j = 0; j < len; ++j)
+ if (seq[j] > 3) ++_j;
+ if (_j > max_diff) {
+ *_n_aln = n_aln;
+ return aln;
+ }
+
+ //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w);
+ gap_reset_stack(stack); // reset stack
+ gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt);
+
+ while (stack->n_entries) {
+ gap_entry_t e;
+ int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp;
+ bwtint_t k, l, cnt_k[4], cnt_l[4], occ;
+
+ if (max_entries < stack->n_entries) max_entries = stack->n_entries;
+ if (stack->n_entries > opt->max_entries) break;
+ gap_pop(stack, &e); // get the best entry
+ k = e.k; l = e.l; // SA interval
+ i = e.info&0xffff; // length
+ if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed
+
+ m = max_diff - (e.n_mm + e.n_gapo);
+ if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape;
+ if (m < 0) continue;
+ if (seed_width) { // apply seeding
+ m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo);
+ if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape;
+ }
+ //printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos);
+ if (i > 0 && m < width[i-1].bid) continue;
+
+ // check whether a hit is found
+ hit_found = 0;
+ if (i == 0) hit_found = 1;
+ else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed
+ if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1;
+ else continue; // no hit, skip
+ }
+
+ if (hit_found) { // action for found hits
+ int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt);
+ int do_add = 1;
+ //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l);
+ if (n_aln == 0) {
+ best_score = score;
+ best_diff = e.n_mm + e.n_gapo;
+ if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape;
+ if (!(opt->mode & BWA_MODE_NONSTOP))
+ max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour
+ }
+ if (score == best_score) best_cnt += l - k + 1;
+ else if (best_cnt > opt->max_top2) break; // top2b behaviour
+ if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat
+ for (j = 0; j != n_aln; ++j)
+ if (aln[j].k == k && aln[j].l == l) break;
+ if (j < n_aln) do_add = 0;
+ }
+ if (do_add) { // append
+ bwt_aln1_t *p;
+ gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width);
+ if (n_aln == m_aln) {
+ m_aln <<= 1;
+ aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t));
+ memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t));
+ }
+ p = aln + n_aln;
+ p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape;
+ p->n_ins = e.n_ins; p->n_del = e.n_del;
+ p->k = k; p->l = l;
+ p->score = score;
+ //fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del);
+ ++n_aln;
+ }
+ continue;
+ }
+
+ --i;
+ bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values
+ occ = l - k + 1;
+ // test whether diff is allowed
+ allow_diff = allow_M = 1;
+ if (i > 0) {
+ int ii = i - (len - opt->seed_len);
+ if (width[i-1].bid > m-1) allow_diff = 0;
+ else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0;
+ if (seed_width && ii > 0) {
+ if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0;
+ else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1
+ && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0;
+ }
+ }
+ // indels
+ tmp = (opt->mode & BWA_MODE_LOGGAP)? int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape;
+ if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) {
+ if (e.state == STATE_M) { // gap open
+ if (e.n_gapo < opt->max_gapo) { // gap open is allowed
+ // insertion
+ gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt);
+ // deletion
+ for (j = 0; j != 4; ++j) {
+ k = bwt->L2[j] + cnt_k[j] + 1;
+ l = bwt->L2[j] + cnt_l[j];
+ if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt);
+ }
+ }
+ } else if (e.state == STATE_I) { // extension of an insertion
+ if (e.n_gape < opt->max_gape) // gap extension is allowed
+ gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt);
+ } else if (e.state == STATE_D) { // extension of a deletion
+ if (e.n_gape < opt->max_gape) { // gap extension is allowed
+ if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) {
+ for (j = 0; j != 4; ++j) {
+ k = bwt->L2[j] + cnt_k[j] + 1;
+ l = bwt->L2[j] + cnt_l[j];
+ if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt);
+ }
+ }
+ }
+ }
+ }
+ // mismatches
+ if (allow_diff && allow_M) { // mismatch is allowed
+ for (j = 1; j <= 4; ++j) {
+ int c = (seq[i] + j) & 3;
+ int is_mm = (j != 4 || seq[i] > 3);
+ k = bwt->L2[c] + cnt_k[c] + 1;
+ l = bwt->L2[c] + cnt_l[c];
+ if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt);
+ }
+ } else if (seq[i] < 4) { // try exact match only
+ int c = seq[i] & 3;
+ k = bwt->L2[c] + cnt_k[c] + 1;
+ l = bwt->L2[c] + cnt_l[c];
+ if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt);
+ }
+ }
+
+ *_n_aln = n_aln;
+ //fprintf(stderr, "max_entries = %d\n", max_entries);
+ return aln;
+}
diff --git a/ext/src/bwa/bwtgap.h b/ext/src/bwa/bwtgap.h
new file mode 100644
index 0000000..7dd6165
--- /dev/null
+++ b/ext/src/bwa/bwtgap.h
@@ -0,0 +1,40 @@
+#ifndef BWTGAP_H_
+#define BWTGAP_H_
+
+#include "bwt.h"
+#include "bwtaln.h"
+
+typedef struct { // recursion stack
+ u_int32_t info; // score<<21 | i
+ u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6;
+ u_int32_t n_ins:16, n_del:16;
+ int last_diff_pos;
+ bwtint_t k, l; // (k,l) is the SA region of [i,n-1]
+} gap_entry_t;
+
+typedef struct {
+ int n_entries, m_entries;
+ gap_entry_t *stack;
+} gap_stack1_t;
+
+typedef struct {
+ int n_stacks, best, n_entries;
+ gap_stack1_t *stacks;
+} gap_stack_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ gap_stack_t *gap_init_stack2(int max_score);
+ gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt);
+ void gap_destroy_stack(gap_stack_t *stack);
+ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w,
+ bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack);
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
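For reference, gap_entry_t above packs the current score and the remaining query length into a single 32-bit info word as score<<21 | i; bwt_match_gap() in bwtgap.c unpacks it with e.info>>21 and e.info&0xffff. Below is a minimal, self-contained sketch of the same bit layout; pack_info() is an illustrative helper name, not part of BWA.

#include <stdint.h>
#include <stdio.h>

/* Same layout as gap_entry_t.info above: score in the bits above 21,
 * remaining query length in the low 16 bits. Illustrative helper only. */
static uint32_t pack_info(uint32_t score, uint32_t i) { return score << 21 | i; }

int main(void)
{
    uint32_t info = pack_info(37, 51);
    printf("score=%u i=%u\n", info >> 21, info & 0xffff); /* prints: score=37 i=51 */
    return 0;
}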
diff --git a/ext/src/bwa/bwtindex.c b/ext/src/bwa/bwtindex.c
new file mode 100644
index 0000000..23a3085
--- /dev/null
+++ b/ext/src/bwa/bwtindex.c
@@ -0,0 +1,304 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+#include <zlib.h>
+#include "bwa/bntseq.h"
+#include "bwa/bwt.h"
+#include "bwa/utils.h"
+#include "rle.h"
+#include "rope.h"
+
+#ifdef _DIVBWT
+#include "divsufsort.h"
+#endif
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+
+int is_bwt(ubyte_t *T, int n);
+
+int64_t bwa_seq_len(const char *fn_pac)
+{
+ FILE *fp;
+ int64_t pac_len;
+ ubyte_t c;
+ fp = xopen(fn_pac, "rb");
+ err_fseek(fp, -1, SEEK_END);
+ pac_len = err_ftell(fp);
+ err_fread_noeof(&c, 1, 1, fp);
+ err_fclose(fp);
+ return (pac_len - 1) * 4 + (int)c;
+}
+
+bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
+{
+ bwt_t *bwt;
+ ubyte_t *buf, *buf2;
+ int64_t i, pac_size;
+ FILE *fp;
+
+ // initialization
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+ bwt->seq_len = bwa_seq_len(fn_pac);
+ bwt->bwt_size = (bwt->seq_len + 15) >> 4;
+ fp = xopen(fn_pac, "rb");
+
+ // prepare sequence
+ pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
+ buf2 = (ubyte_t*)calloc(pac_size, 1);
+ err_fread_noeof(buf2, 1, pac_size, fp);
+ err_fclose(fp);
+ memset(bwt->L2, 0, 5 * 4);
+ buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
+ for (i = 0; i < bwt->seq_len; ++i) {
+ buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
+ ++bwt->L2[1+buf[i]];
+ }
+ for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
+ free(buf2);
+
+ // Burrows-Wheeler Transform
+ if (use_is) {
+ bwt->primary = is_bwt(buf, bwt->seq_len);
+ } else {
+ rope_t *r;
+ int64_t x;
+ rpitr_t itr;
+ const uint8_t *blk;
+
+ r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);
+ for (i = bwt->seq_len - 1, x = 0; i >= 0; --i) {
+ int c = buf[i] + 1;
+ x = rope_insert_run(r, x, c, 1, 0) + 1;
+ while (--c >= 0) x += r->c[c];
+ }
+ bwt->primary = x;
+ rope_itr_first(r, &itr);
+ x = 0;
+ while ((blk = rope_itr_next_block(&itr)) != 0) {
+ const uint8_t *q = blk + 2, *end = blk + 2 + *rle_nptr(blk);
+ while (q < end) {
+ int c = 0;
+ int64_t l;
+ rle_dec1(q, c, l);
+ for (i = 0; i < l; ++i)
+ buf[x++] = c - 1;
+ }
+ }
+ rope_destroy(r);
+ }
+ bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
+ for (i = 0; i < bwt->seq_len; ++i)
+ bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
+ free(buf);
+ return bwt;
+}
+
+int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
+{
+ bwt_t *bwt;
+ int c, use_is = 1;
+ while ((c = getopt(argc, argv, "d")) >= 0) {
+ switch (c) {
+ case 'd': use_is = 0; break;
+ default: return 1;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
+ return 1;
+ }
+ bwt = bwt_pac2bwt(argv[optind], use_is);
+ bwt_dump_bwt(argv[optind+1], bwt);
+ bwt_destroy(bwt);
+ return 0;
+}
+
+#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
+
+void bwt_bwtupdate_core(bwt_t *bwt)
+{
+ bwtint_t i, k, c[4], n_occ;
+ uint32_t *buf;
+
+ n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
+ bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
+ buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
+ c[0] = c[1] = c[2] = c[3] = 0;
+ for (i = k = 0; i < bwt->seq_len; ++i) {
+ if (i % OCC_INTERVAL == 0) {
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+ k += sizeof(bwtint_t); // k indexes uint32_t words; the four bwtint_t counters occupy sizeof(bwtint_t) uint32_t words
+ }
+ if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 bases are packed into one uint32_t (2 bits each)
+ ++c[bwt_B00(bwt, i)];
+ }
+ // the last element
+ memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+ xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
+ // update bwt
+ free(bwt->bwt); bwt->bwt = buf;
+}
+
+int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
+{
+ bwt_t *bwt;
+ if (argc < 2) {
+ fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
+ return 1;
+ }
+ bwt = bwt_restore_bwt(argv[1]);
+ bwt_bwtupdate_core(bwt);
+ bwt_dump_bwt(argv[1], bwt);
+ bwt_destroy(bwt);
+ return 0;
+}
+
+int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
+{
+ bwt_t *bwt;
+ int c, sa_intv = 32;
+ while ((c = getopt(argc, argv, "i:")) >= 0) {
+ switch (c) {
+ case 'i': sa_intv = atoi(optarg); break;
+ default: return 1;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
+ return 1;
+ }
+ bwt = bwt_restore_bwt(argv[optind]);
+ bwt_cal_sa(bwt, sa_intv);
+ bwt_dump_sa(argv[optind+1], bwt);
+ bwt_destroy(bwt);
+ return 0;
+}
+
+int bwa_index(int argc, char *argv[]) // the "index" command
+{
+ extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
+
+ char *prefix = 0, *str, *str2, *str3;
+ int c, algo_type = 0, is_64 = 0;
+ clock_t t;
+ int64_t l_pac;
+
+ while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
+ switch (c) {
+ case 'a': // if -a is not set, algo_type will be determined later
+ if (strcmp(optarg, "rb2") == 0) algo_type = 1;
+ else if (strcmp(optarg, "is") == 0) algo_type = 3;
+ else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
+ break;
+ case 'p': prefix = strdup(optarg); break;
+ case '6': is_64 = 1; break;
+ default: return 1;
+ }
+ }
+
+ if (optind + 1 > argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa index [options] <in.fasta>\n\n");
+ fprintf(stderr, "Options: -a STR BWT construction algorithm: is or rb2 [auto]\n");
+ fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n");
+ fprintf(stderr, " -6 index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
+ fprintf(stderr, " `-a div' do not work for long genomes.\n\n");
+ return 1;
+ }
+ if (prefix == 0) {
+ prefix = malloc(strlen(argv[optind]) + 4);
+ strcpy(prefix, argv[optind]);
+ if (is_64) strcat(prefix, ".64");
+ }
+ str = (char*)calloc(strlen(prefix) + 10, 1);
+ str2 = (char*)calloc(strlen(prefix) + 10, 1);
+ str3 = (char*)calloc(strlen(prefix) + 10, 1);
+
+ { // nucleotide indexing
+ gzFile fp = xzopen(argv[optind], "r");
+ t = clock();
+ fprintf(stderr, "[bwa_index] Pack FASTA... ");
+ l_pac = bns_fasta2bntseq(fp, prefix, 0);
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+ err_gzclose(fp);
+ }
+ if (algo_type == 0) algo_type = l_pac > 50000000? 1 : 3; // set the algorithm for generating BWT
+ {
+ bwt_t *bwt;
+ strcpy(str, prefix); strcat(str, ".pac");
+ strcpy(str2, prefix); strcat(str2, ".bwt");
+ t = clock();
+ fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
+ bwt = bwt_pac2bwt(str, algo_type == 3);
+ bwt_dump_bwt(str2, bwt);
+ bwt_destroy(bwt);
+ fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+ }
+ {
+ bwt_t *bwt;
+ strcpy(str, prefix); strcat(str, ".bwt");
+ t = clock();
+ fprintf(stderr, "[bwa_index] Update BWT... ");
+ bwt = bwt_restore_bwt(str);
+ bwt_bwtupdate_core(bwt);
+ bwt_dump_bwt(str, bwt);
+ bwt_destroy(bwt);
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+ }
+ {
+ gzFile fp = xzopen(argv[optind], "r");
+ t = clock();
+ fprintf(stderr, "[bwa_index] Pack forward-only FASTA... ");
+ l_pac = bns_fasta2bntseq(fp, prefix, 1);
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+ err_gzclose(fp);
+ }
+ {
+ bwt_t *bwt;
+ strcpy(str, prefix); strcat(str, ".bwt");
+ strcpy(str3, prefix); strcat(str3, ".sa");
+ t = clock();
+ fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... ");
+ bwt = bwt_restore_bwt(str);
+ bwt_cal_sa(bwt, 32);
+ bwt_dump_sa(str3, bwt);
+ bwt_destroy(bwt);
+ fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+ }
+ free(str3); free(str2); free(str); free(prefix);
+ return 0;
+}
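The loop in bwt_pac2bwt() above unpacks the .pac file, in which each byte holds four bases at two bits each, most significant base first. Below is a small stand-alone sketch of the same packing and unpacking arithmetic, assuming that layout; pac_set() and pac_get() are illustrative helpers, not BWA functions.

#include <stdint.h>
#include <stdio.h>

typedef unsigned char ubyte_t;

/* Write base b (0..3) at position i of a 2-bit packed buffer (4 bases per byte, MSB first). */
static void pac_set(ubyte_t *pac, int64_t i, int b) { pac[i>>2] |= b << ((3 - (i&3)) << 1); }
/* Read the base at position i -- the same expression used in bwt_pac2bwt(). */
static int pac_get(const ubyte_t *pac, int64_t i) { return pac[i>>2] >> ((3 - (i&3)) << 1) & 3; }

int main(void)
{
    ubyte_t pac[2] = {0, 0};
    int in[5] = {2, 0, 3, 1, 2}, i; /* G A T C G in the 0..3 encoding */
    for (i = 0; i < 5; ++i) pac_set(pac, i, in[i]);
    for (i = 0; i < 5; ++i) printf("%d ", pac_get(pac, i)); /* prints: 2 0 3 1 2 */
    putchar('\n');
    return 0;
}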
diff --git a/ext/src/bwa/bwtsw2.h b/ext/src/bwa/bwtsw2.h
new file mode 100644
index 0000000..0ec9676
--- /dev/null
+++ b/ext/src/bwa/bwtsw2.h
@@ -0,0 +1,69 @@
+#ifndef LH3_BWTSW2_H
+#define LH3_BWTSW2_H
+
+#include <stdint.h>
+#include "bntseq.h"
+#include "bwt_lite.h"
+#include "bwt.h"
+
+#define BSW2_FLAG_MATESW 0x100
+#define BSW2_FLAG_TANDEM 0x200
+#define BSW2_FLAG_MOVED 0x400
+#define BSW2_FLAG_RESCUED 0x800
+
+typedef struct {
+ int skip_sw:8, cpy_cmt:8, hard_clip:16;
+ int a, b, q, r, t, qr, bw, max_ins, max_chain_gap;
+ int z, is, t_seeds, multi_2nd;
+ float mask_level, coef;
+ int n_threads, chunk_size;
+} bsw2opt_t;
+
+typedef struct {
+ bwtint_t k, l;
+ uint32_t flag:18, n_seeds:13, is_rev:1;
+ int len, G, G2;
+ int beg, end;
+} bsw2hit_t;
+
+typedef struct {
+ int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm;
+ uint32_t *cigar;
+} bsw2aux_t;
+
+typedef struct {
+ int n, max;
+ bsw2hit_t *hits;
+ bsw2aux_t *aux;
+} bwtsw2_t;
+
+typedef struct {
+ void *stack;
+ int max_l;
+ uint8_t *aln_mem;
+} bsw2global_t;
+
+typedef struct {
+ int l, tid;
+ char *name, *seq, *qual, *sam, *comment;
+} bsw2seq1_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bsw2opt_t *bsw2_init_opt();
+ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool);
+ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2);
+ void bsw2_destroy(bwtsw2_t *b);
+
+ bsw2global_t *bsw2_global_init();
+ void bsw2_global_destroy(bsw2global_t *_pool);
+
+ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/src/bwa/bwtsw2_aux.c b/ext/src/bwa/bwtsw2_aux.c
new file mode 100644
index 0000000..d225187
--- /dev/null
+++ b/ext/src/bwa/bwtsw2_aux.c
@@ -0,0 +1,776 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+#include "bntseq.h"
+#include "bwt_lite.h"
+#include "utils.h"
+#include "bwtsw2.h"
+#include "kstring.h"
+#include "bwa.h"
+#include "ksw.h"
+
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+#include "ksort.h"
+#define __left_lt(a, b) ((a).end > (b).end)
+KSORT_INIT(hit, bsw2hit_t, __left_lt)
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+
+extern unsigned char nst_nt4_table[256];
+
+unsigned char nt_comp_table[256] = {
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
+ 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
+ 'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n',
+ 'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
+ 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
+};
+
+extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
+extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level);
+
+bsw2opt_t *bsw2_init_opt()
+{
+ bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t));
+ o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30;
+ o->bw = 50;
+ o->max_ins = 20000;
+ o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0;
+ o->mask_level = 0.50f; o->coef = 5.5f;
+ o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000;
+ o->max_chain_gap = 10000;
+ o->cpy_cmt = 0;
+ return o;
+}
+
+void bsw2_destroy(bwtsw2_t *b)
+{
+ int i;
+ if (b == 0) return;
+ if (b->aux)
+ for (i = 0; i < b->n; ++i) free(b->aux[i].cigar);
+ free(b->aux); free(b->hits);
+ free(b);
+}
+
+bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b)
+{
+ bwtsw2_t *p;
+ p = calloc(1, sizeof(bwtsw2_t));
+ p->max = p->n = b->n;
+ if (b->n) {
+ kroundup32(p->max);
+ p->hits = calloc(p->max, sizeof(bsw2hit_t));
+ memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t));
+ }
+ return p;
+}
+
+#define __gen_ap(par, opt) do { \
+ int i; \
+ for (i = 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \
+ for (i = 0; i < 4; ++i) (par).matrix[i*5+i] = (opt)->a; \
+ (par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \
+ (par).gap_end = (opt)->r; \
+ (par).row = 5; (par).band_width = opt->bw; \
+ } while (0)
+
+void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
+{
+ int i;
+ bwtint_t k;
+ uint8_t *target = 0, *query;
+ int8_t mat[25];
+
+ bwa_fill_scmat(opt->a, opt->b, mat);
+ query = calloc(lq, 1);
+ // sort according to the descending order of query end
+ ks_introsort(hit, b->n, b->hits);
+ target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
+ // reverse _query
+ for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i];
+ // core loop
+ for (i = 0; i < b->n; ++i) {
+ bsw2hit_t *p = b->hits + i;
+ int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
+ int score, j, qle, tle;
+ p->n_seeds = 1;
+ if (p->l || p->k == 0) continue;
+ for (j = score = 0; j < i; ++j) {
+ bsw2hit_t *q = b->hits + j;
+ if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) {
+ if (q->n_seeds < (1<<13) - 2) ++q->n_seeds;
+ ++score;
+ }
+ }
+ if (score) continue;
+ if (lt > p->k) lt = p->k;
+ for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered!
+ target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
+ lt = j;
+ score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0);
+ if (score > p->G) { // extensible
+ p->G = score;
+ p->k -= tle;
+ p->len += tle;
+ p->beg -= qle;
+ }
+ }
+ free(query); free(target);
+}
+
+void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem)
+{
+ int i;
+ bwtint_t k;
+ uint8_t *target;
+ int8_t mat[25];
+
+ bwa_fill_scmat(opt->a, opt->b, mat);
+ target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1);
+ for (i = 0; i < b->n; ++i) {
+ bsw2hit_t *p = b->hits + i;
+ int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq;
+ int j, score, qle, tle;
+ if (p->l) continue;
+ for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k)
+ target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3;
+ lt = j;
+ score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1;
+// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G);
+ if (score >= p->G) {
+ p->G = score;
+ p->len = tle;
+ p->end = p->beg + qle;
+ }
+ }
+ free(target);
+}
+
+/* generate CIGAR array(s), stored in b->aux[*].cigar */
+static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name)
+{
+ int i;
+ int8_t mat[25];
+
+ bwa_fill_scmat(opt->a, opt->b, mat);
+ for (i = 0; i < b->n; ++i) {
+ bsw2hit_t *p = b->hits + i;
+ bsw2aux_t *q = b->aux + i;
+ uint8_t *query;
+ int beg, end, score;
+ if (p->l) continue;
+ beg = (p->flag & 0x10)? lq - p->end : p->beg;
+ end = (p->flag & 0x10)? lq - p->beg : p->end;
+ query = seq[(p->flag & 0x10)? 1 : 0] + beg;
+ q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm);
+#if 0
+ if (name && score != p->G) { // debugging only
+ int j, glen = 0;
+ for (j = 0; j < q->n_cigar; ++j)
+ if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2)
+ glen += q->cigar[j]>>4;
+ fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n",
+ __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw);
+ }
+#endif
+ if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping
+ q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2));
+ if (beg != 0) {
+ memmove(q->cigar + 1, q->cigar, q->n_cigar * 4);
+ q->cigar[0] = beg<<4 | 4;
+ ++q->n_cigar;
+ }
+ if (end < lq) {
+ q->cigar[q->n_cigar] = (lq - end)<<4 | 4;
+ ++q->n_cigar;
+ }
+ }
+ }
+}
+
+/* for debugging purposes only */
+void bsw2_debug_hits(const bwtsw2_t *b)
+{
+ int i;
+ printf("# raw hits: %d\n", b->n);
+ for (i = 0; i < b->n; ++i) {
+ bsw2hit_t *p = b->hits + i;
+ if (p->G > 0)
+ printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev);
+ }
+}
+
+static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse)
+{
+ int i;
+ if (b[0]->n + b[1]->n > b[0]->max) {
+ b[0]->max = b[0]->n + b[1]->n;
+ b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t));
+ }
+ for (i = 0; i < b[1]->n; ++i) {
+ bsw2hit_t *p = b[0]->hits + b[0]->n + i;
+ *p = b[1]->hits[i];
+ if (is_reverse) {
+ int x = p->beg;
+ p->beg = l - p->end;
+ p->end = l - x;
+ p->flag |= 0x10;
+ }
+ }
+ b[0]->n += b[1]->n;
+ bsw2_destroy(b[1]);
+ b[1] = 0;
+}
+/* seq[0] is the forward sequence and seq[1] is the reverse complement. */
+static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target,
+ int l, uint8_t *seq[2], bsw2global_t *pool)
+{
+ extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]);
+ bwtsw2_t *b[2], **bb[2], **_b, *p;
+ int k, j;
+ bwtl_t *query;
+ query = bwtl_seq2bwtl(l, seq[0]);
+ _b = bsw2_core(bns, opt, query, target, pool);
+ bwtl_destroy(query);
+ for (k = 0; k < 2; ++k) {
+ bb[k] = calloc(2, sizeof(void*));
+ bb[k][0] = calloc(1, sizeof(bwtsw2_t));
+ bb[k][1] = calloc(1, sizeof(bwtsw2_t));
+ }
+ for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand
+ for (j = 0; j < _b[k]->n; ++j) {
+ bsw2hit_t *q;
+ p = bb[_b[k]->hits[j].is_rev][k];
+ if (p->n == p->max) {
+ p->max = p->max? p->max<<1 : 8;
+ p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t));
+ }
+ q = &p->hits[p->n++];
+ *q = _b[k]->hits[j];
+ if (_b[k]->hits[j].is_rev) {
+ int x = q->beg;
+ q->beg = l - q->end;
+ q->end = l - x;
+ }
+ }
+ }
+ b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
+ bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained
+ for (k = 0; k < 2; ++k) {
+ bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem);
+ merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
+ bsw2_resolve_duphits(0, 0, bb[k][0], 0);
+ bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem);
+ bsw2_resolve_duphits(0, 0, bb[k][0], 0);
+ b[k] = bb[k][0];
+ free(bb[k]);
+ }
+ merge_hits(b, l, 1); // again, b[1] is merged to b[0]
+ bsw2_resolve_query_overlaps(b[0], opt->mask_level);
+ bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b);
+ return b[0];
+}
+
+/* set ->flag to record the origin of the hit (forward BWT or reverse BWT) */
+static void flag_fr(bwtsw2_t *b[2])
+{
+ int i, j;
+ for (i = 0; i < b[0]->n; ++i) {
+ bsw2hit_t *p = b[0]->hits + i;
+ p->flag |= 0x10000;
+ }
+ for (i = 0; i < b[1]->n; ++i) {
+ bsw2hit_t *p = b[1]->hits + i;
+ p->flag |= 0x20000;
+ }
+ for (i = 0; i < b[0]->n; ++i) {
+ bsw2hit_t *p = b[0]->hits + i;
+ for (j = 0; j < b[1]->n; ++j) {
+ bsw2hit_t *q = b[1]->hits + j;
+ if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) {
+ q->flag |= 0x30000; p->flag |= 0x30000;
+ break;
+ }
+ }
+ }
+}
+
+typedef struct {
+ int n, max;
+ bsw2seq1_t *seq;
+} bsw2seq_t;
+
+static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar)
+{
+ // FIXME: this routine does not work if the query bridges three reference sequences
+ int32_t coor, refl, lq;
+ int x, y, i, seqid;
+ bns_cnt_ambi(bns, p->k, p->len, &seqid);
+ coor = p->k - bns->anns[seqid].offset;
+ refl = bns->anns[seqid].len;
+ x = coor; y = 0;
+ // test if the alignment goes beyond the boundary
+ for (i = 0; i < n_cigar; ++i) {
+ int op = cigar[i]&0xf, ln = cigar[i]>>4;
+ if (op == 1 || op == 4 || op == 5) y += ln;
+ else if (op == 2) x += ln;
+ else x += ln, y += ln;
+ }
+ lq = y; // length of the query sequence
+ if (x > refl) { // then fix it
+ int j, nc, mq[2], nlen[2];
+ uint32_t *cn;
+ bwtint_t kk = 0;
+ nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0;
+ cn = calloc(n_cigar + 3, 4);
+ x = coor; y = 0;
+ for (i = j = 0; i < n_cigar; ++i) {
+ int op = cigar[i]&0xf, ln = cigar[i]>>4;
+ if (op == 4 || op == 5 || op == 1) { // ins or clipping
+ y += ln;
+ cn[j++] = cigar[i];
+ } else if (op == 2) { // del
+ if (x + ln >= refl && nc == 0) {
+ cn[j++] = (uint32_t)(lq - y)<<4 | 4;
+ nc = j;
+ cn[j++] = (uint32_t)y<<4 | 4;
+ kk = p->k + (x + ln - refl);
+ nlen[0] = x - coor;
+ nlen[1] = p->len - nlen[0] - ln;
+ } else cn[j++] = cigar[i];
+ x += ln;
+ } else if (op == 0) { // match
+ if (x + ln >= refl && nc == 0) {
+ // FIXME: does not consider the special case where the split falls right between an M and an I
+ cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M
+ cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S
+ nc = j;
+ mq[0] += refl - x;
+ cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4;
+ if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0;
+ mq[1] += x + ln - refl;
+ kk = bns->anns[seqid].offset + refl;
+ nlen[0] = refl - coor;
+ nlen[1] = p->len - nlen[0];
+ } else {
+ cn[j++] = cigar[i];
+ mq[nc?1:0] += ln;
+ }
+ x += ln; y += ln;
+ }
+ }
+ if (mq[0] > mq[1]) { // then take the first alignment
+ n_cigar = nc;
+ memcpy(cigar, cn, 4 * nc);
+ p->len = nlen[0];
+ } else {
+ p->k = kk; p->len = nlen[1];
+ n_cigar = j - nc;
+ memcpy(cigar, cn + nc, 4 * (j - nc));
+ }
+ free(cn);
+ }
+ return n_cigar;
+}
+
+static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name)
+{
+ int i;
+ // allocate for b->aux
+ if (b->n<<1 < b->max) {
+ b->max = b->n;
+ kroundup32(b->max);
+ b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t));
+ }
+ b->aux = calloc(b->n, sizeof(bsw2aux_t));
+ // generate CIGAR
+ gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name);
+ // fix CIGAR, generate mapQ, and write chromosomal position
+ for (i = 0; i < b->n; ++i) {
+ bsw2hit_t *p = &b->hits[i];
+ bsw2aux_t *q = &b->aux[i];
+ q->flag = p->flag & 0xfe;
+ q->isize = 0;
+ if (p->l == 0) { // unique hit
+ float c = 1.0;
+ int subo;
+ // fix out-of-boundary CIGAR
+ q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar);
+ // compute mapQ
+ subo = p->G2 > opt->t? p->G2 : opt->t;
+ if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5;
+ if (p->n_seeds < 2) c *= .2;
+ q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499);
+ if (q->qual > 250) q->qual = 250;
+ if (q->qual < 0) q->qual = 0;
+ if (p->flag&1) q->qual = 0; // this is a random hit
+ q->pqual = q->qual; // set the paired qual as qual
+ // get the chromosomal position
+ q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr);
+ q->pos = p->k - bns->anns[q->chr].offset;
+ } else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0;
+ }
+}
+
+static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m)
+{
+ int i;
+ if (m == 0) return;
+ // update flag, mchr and mpos
+ for (i = 0; i < b->n; ++i) {
+ bsw2aux_t *q = &b->aux[i];
+ q->flag |= 1; // paired
+ if (m->n == 0) q->flag |= 8; // mate unmapped
+ if (m->n == 1) {
+ q->mchr = m->aux[0].chr;
+ q->mpos = m->aux[0].pos;
+ if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand
+ if (q->chr == q->mchr) { // set insert size
+ if (q->mpos + m->hits[0].len > q->pos)
+ q->isize = q->mpos + m->hits[0].len - q->pos;
+ else q->isize = q->mpos - q->pos - b->hits[0].len;
+ } else q->isize = 0;
+ } else q->mchr = q->mpos = -1;
+ }
+ // update mapping quality
+ if (b->n == 1 && m->n == 1) {
+ bsw2hit_t *p = &b->hits[0];
+ if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman
+ if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20)
+ b->aux[0].pqual = 20;
+ if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
+ } else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired
+ if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual]
+ b->aux[0].pqual += 20;
+ if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual;
+ if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual;
+ }
+ }
+ }
+}
+
+/* generate SAM lines for a sequence in ks with alignment stored in
+ * b. ks->name and ks->seq will be freed and set to NULL in the end. */
+static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate)
+{
+ int i, k;
+ kstring_t str;
+ memset(&str, 0, sizeof(kstring_t));
+ if (b == 0 || b->n == 0) { // no hits
+ ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name);
+ for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str);
+ if (ks->qual) {
+ kputc('\t', &str);
+ for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str);
+ } else kputs("\t*", &str);
+ kputc('\n', &str);
+ }
+ for (i = 0; b && i < b->n; ++i) {
+ bsw2hit_t *p = b->hits + i;
+ bsw2aux_t *q = b->aux + i;
+ int j, beg, end, type = 0;
+ // print mandatory fields before SEQ
+ if (q->cigar == 0) q->flag |= 0x4;
+ ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0));
+ ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1);
+ if (p->l == 0 && q->cigar) { // not a repetitive hit
+ ksprintf(&str, "\t%d\t", q->pqual);
+ for (k = 0; k < q->n_cigar; ++k)
+ ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]);
+ } else ksprintf(&str, "\t0\t*");
+ if (!is_pe) kputs("\t*\t0\t0\t", &str);
+ else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize);
+ // get the sequence begin and end
+ beg = 0; end = ks->l;
+ if (opt->hard_clip && q->cigar) {
+ if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4;
+ if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4;
+ }
+ for (j = beg; j < end; ++j) {
+ if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str);
+ else kputc(ks->seq[j], &str);
+ }
+ // print base quality if present
+ if (ks->qual) {
+ kputc('\t', &str);
+ for (j = beg; j < end; ++j) {
+ if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
+ else kputc(ks->qual[j], &str);
+ }
+ } else kputs("\t*", &str);
+ // print optional tags
+ ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm);
+ if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn);
+ if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1);
+ if (p->flag&BSW2_FLAG_MATESW) type |= 1;
+ if (p->flag&BSW2_FLAG_TANDEM) type |= 2;
+ if (type) ksprintf(&str, "\tXT:i:%d", type);
+ if (opt->cpy_cmt && ks->comment) {
+ int l = strlen(ks->comment);
+ if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') {
+ kputc('\t', &str); kputs(ks->comment, &str);
+ }
+ }
+ kputc('\n', &str);
+ }
+ ks->sam = str.s;
+ free(ks->seq); ks->seq = 0;
+ free(ks->qual); ks->qual = 0;
+ free(ks->name); ks->name = 0;
+}
+
+static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen)
+{
+ double ll = log(qlen);
+ int i, k;
+ *dst = *src;
+ if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499);
+ // set band width: the query length sets a boundary on the maximum band width
+ k = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a);
+ i = (qlen * dst->a - dst->a - dst->t) / dst->r;
+ if (k > i) k = i;
+ if (k < 1) k = 1; // I do not know if k==0 causes troubles
+ dst->bw = src->bw < k? src->bw : k;
+}
+
+/* Core routine to align reads in _seq. It is separated from
+ * process_seqs() to support multi-threading */
+static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
+{
+ int x;
+ bsw2opt_t opt;
+ bsw2global_t *pool = bsw2_global_init();
+ bwtsw2_t **buf;
+ buf = calloc(_seq->n, sizeof(void*));
+ for (x = 0; x < _seq->n; ++x) {
+ bsw2seq1_t *p = _seq->seq + x;
+ uint8_t *seq[2], *rseq[2];
+ int i, l, k;
+ bwtsw2_t *b[2];
+ l = p->l;
+ update_opt(&opt, _opt, p->l);
+ if (pool->max_l < l) { // then enlarge working space for aln_extend_core()
+ int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l;
+ pool->max_l = l;
+ pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24);
+ }
+ // set seq[2] and rseq[2]
+ seq[0] = calloc(l * 4, 1);
+ seq[1] = seq[0] + l;
+ rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l;
+ // convert sequences to 2-bit representation
+ for (i = k = 0; i < l; ++i) {
+ int c = nst_nt4_table[(int)p->seq[i]];
+ if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled
+ seq[0][i] = c;
+ seq[1][l-1-i] = 3 - c;
+ rseq[0][l-1-i] = 3 - c;
+ rseq[1][i] = c;
+ }
+ if (l - k < opt.t) { // too few unambiguous bases
+ buf[x] = calloc(1, sizeof(bwtsw2_t));
+ free(seq[0]); continue;
+ }
+ // alignment
+ b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool);
+ for (k = 0; k < b[0]->n; ++k)
+ if (b[0]->hits[k].n_seeds < opt.t_seeds) break;
+ if (k < b[0]->n) {
+ b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool);
+ for (i = 0; i < b[1]->n; ++i) {
+ bsw2hit_t *p = &b[1]->hits[i];
+ int x = p->beg;
+ p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand
+ p->beg = l - p->end;
+ p->end = l - x;
+ }
+ flag_fr(b);
+ merge_hits(b, l, 0);
+ bsw2_resolve_duphits(0, 0, b[0], 0);
+ bsw2_resolve_query_overlaps(b[0], opt.mask_level);
+ } else b[1] = 0;
+ // generate CIGAR and print SAM
+ buf[x] = bsw2_dup_no_cigar(b[0]);
+ // free
+ free(seq[0]);
+ bsw2_destroy(b[0]);
+ }
+ if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf);
+ for (x = 0; x < _seq->n; ++x) {
+ bsw2seq1_t *p = _seq->seq + x;
+ uint8_t *seq[2];
+ int i;
+ seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l;
+ for (i = 0; i < p->l; ++i) {
+ int c = nst_nt4_table[(int)p->seq[i]];
+ if (c >= 4) c = (int)(drand48() * 4);
+ seq[0][i] = c;
+ seq[1][p->l-1-i] = 3 - c;
+ }
+ update_opt(&opt, _opt, p->l);
+ write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name);
+ free(seq[0]);
+ }
+ for (x = 0; x < _seq->n; ++x) {
+ if (is_pe) update_mate_aux(buf[x], buf[x^1]);
+ print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]);
+ }
+ for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]);
+ free(buf);
+ bsw2_global_destroy(pool);
+}
+
+#ifdef HAVE_PTHREAD
+typedef struct {
+ int tid, is_pe;
+ bsw2seq_t *_seq;
+ const bsw2opt_t *_opt;
+ const bntseq_t *bns;
+ uint8_t *pac;
+ const bwt_t *target;
+} thread_aux_t;
+
+/* another interface to bsw2_aln_core() to facilitate pthread_create() */
+static void *worker(void *data)
+{
+ thread_aux_t *p = (thread_aux_t*)data;
+ bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe);
+ return 0;
+}
+#endif
+
+/* process sequences stored in _seq, generate SAM lines for these
+ * sequences and reset _seq afterwards. */
+static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe)
+{
+ int i;
+ is_pe = is_pe? 1 : 0;
+
+#ifdef HAVE_PTHREAD
+ if (opt->n_threads <= 1) {
+ bsw2_aln_core(_seq, opt, bns, pac, target, is_pe);
+ } else {
+ pthread_t *tid;
+ pthread_attr_t attr;
+ thread_aux_t *data;
+ int j;
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+ data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t));
+ tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
+ for (j = 0; j < opt->n_threads; ++j) {
+ thread_aux_t *p = data + j;
+ p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe;
+ p->pac = pac; p->target = target;
+ p->_seq = calloc(1, sizeof(bsw2seq_t));
+ p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1;
+ p->_seq->n = 0;
+ p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t));
+ }
+ for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread
+ bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq;
+ p->seq[p->n++] = _seq->seq[i];
+ }
+ for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]);
+ for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0);
+ for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0;
+ for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back
+ bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq;
+ _seq->seq[i] = p->seq[p->n++];
+ }
+ for (j = 0; j < opt->n_threads; ++j) {
+ thread_aux_t *p = data + j;
+ free(p->_seq->seq);
+ free(p->_seq);
+ }
+ free(data); free(tid);
+ }
+#else
+ bsw2_aln_core(_seq, opt, bns, pac, target, is_pe);
+#endif
+
+ // print and reset
+ for (i = 0; i < _seq->n; ++i) {
+ bsw2seq1_t *p = _seq->seq + i;
+ if (p->sam) err_printf("%s", p->sam);
+ free(p->name); free(p->seq); free(p->qual); free(p->sam);
+ p->tid = -1; p->l = 0;
+ p->name = p->seq = p->qual = p->sam = 0;
+ }
+ err_fflush(stdout);
+ _seq->n = 0;
+}
+
+void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
+{
+ gzFile fp, fp2;
+ kseq_t *ks, *ks2;
+ int l, is_pe = 0, i, n;
+ uint8_t *pac;
+ bsw2seq_t *_seq;
+ bseq1_t *bseq;
+
+ pac = calloc(bns->l_pac/4+1, 1);
+ for (l = 0; l < bns->n_seqs; ++l)
+ err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len);
+ err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
+ fp = xzopen(fn, "r");
+ ks = kseq_init(fp);
+ _seq = calloc(1, sizeof(bsw2seq_t));
+ if (fn2) {
+ fp2 = xzopen(fn2, "r");
+ ks2 = kseq_init(fp2);
+ is_pe = 1;
+ } else fp2 = 0, ks2 = 0, is_pe = 0;
+ while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
+ int size = 0;
+ if (n > _seq->max) {
+ _seq->max = n;
+ kroundup32(_seq->max);
+ _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
+ }
+ _seq->n = n;
+ for (i = 0; i < n; ++i) {
+ bseq1_t *b = &bseq[i];
+ bsw2seq1_t *p = &_seq->seq[i];
+ p->tid = -1; p->l = b->l_seq;
+ p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0;
+ size += p->l;
+ }
+ fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
+ free(bseq);
+ process_seqs(_seq, opt, bns, pac, target, is_pe);
+ }
+ // free
+ free(pac);
+ free(_seq->seq); free(_seq);
+ kseq_destroy(ks);
+ err_gzclose(fp);
+ if (fn2) {
+ kseq_destroy(ks2);
+ err_gzclose(fp2);
+ }
+}
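gen_cigar() and print_hits() above store each CIGAR operation in one uint32_t as length<<4 | op, with op indexing the string "MIDNSHP" (soft clips are op 4, prepended or appended in the same encoding). Below is a minimal sketch of decoding such an array under that assumption; print_cigar() is an illustrative helper, not part of BWA.

#include <stdint.h>
#include <stdio.h>

/* Print a packed CIGAR array in the len<<4|op encoding used above. */
static void print_cigar(const uint32_t *cigar, int n_cigar)
{
    int k;
    for (k = 0; k < n_cigar; ++k)
        printf("%d%c", cigar[k] >> 4, "MIDNSHP"[cigar[k] & 0xf]);
    putchar('\n');
}

int main(void)
{
    /* 5S 60M 2I 30M : soft clip, match, insertion, match */
    uint32_t cig[4] = { 5u<<4 | 4, 60u<<4 | 0, 2u<<4 | 1, 30u<<4 | 0 };
    print_cigar(cig, 4); /* prints: 5S60M2I30M */
    return 0;
}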
diff --git a/ext/src/bwa/bwtsw2_chain.c b/ext/src/bwa/bwtsw2_chain.c
new file mode 100644
index 0000000..ade77e7
--- /dev/null
+++ b/ext/src/bwa/bwtsw2_chain.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include "bwtsw2.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+typedef struct {
+ uint32_t tbeg, tend;
+ int qbeg, qend;
+ uint32_t flag:1, idx:31;
+ int chain; // also reuse as a counter
+} hsaip_t;
+
+#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg)
+
+#include "ksort.h"
+KSORT_INIT(hsaip, hsaip_t, _hsaip_lt)
+
+static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain)
+{
+ int j, k, m = 0;
+ ks_introsort(hsaip, n, z);
+ for (j = 0; j < n; ++j) {
+ hsaip_t *p = z + j;
+ for (k = m - 1; k >= 0; --k) {
+ hsaip_t *q = chain + k;
+ int x = p->qbeg - q->qbeg; // always positive
+ int y = p->tbeg - q->tbeg;
+ if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained
+ if (p->qend > q->qend) q->qend = p->qend;
+ if (p->tend > q->tend) q->tend = p->tend;
+ ++q->chain;
+ p->chain = shift + k;
+ break;
+ } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains
+ }
+ if (k < 0) { // not added to any previous chains
+ chain[m] = *p;
+ chain[m].chain = 1;
+ chain[m].idx = p->chain = shift + m;
+ ++m;
+ }
+ }
+ return m;
+}
+
+void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
+{
+ hsaip_t *z[2], *chain[2];
+ int i, j, k, n[2], m[2], thres = opt->t_seeds * 2;
+ char *flag;
+ // initialization
+ n[0] = b[0]->n; n[1] = b[1]->n;
+ z[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
+ z[1] = z[0] + n[0];
+ chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t));
+ for (k = j = 0; k < 2; ++k) {
+ for (i = 0; i < b[k]->n; ++i) {
+ bsw2hit_t *p = b[k]->hits + i;
+ hsaip_t *q = z[k] + i;
+ q->flag = k; q->idx = i;
+ q->tbeg = p->k; q->tend = p->k + p->len;
+ q->chain = -1;
+ q->qbeg = p->beg; q->qend = p->end;
+ }
+ }
+ // chaining
+ m[0] = chaining(opt, 0, n[0], z[0], chain[0]);
+ chain[1] = chain[0] + m[0];
+ m[1] = chaining(opt, m[0], n[1], z[1], chain[1]);
+ // change query coordinate on the reverse strand
+ for (k = 0; k < m[1]; ++k) {
+ hsaip_t *p = chain[1] + k;
+ int tmp = p->qbeg;
+ p->qbeg = len - p->qend; p->qend = len - tmp;
+ }
+ //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend);
+ // filtering
+ flag = calloc(m[0] + m[1], 1);
+ ks_introsort(hsaip, m[0] + m[1], chain[0]);
+ for (k = 1; k < m[0] + m[1]; ++k) {
+ hsaip_t *p = chain[0] + k;
+ for (j = 0; j < k; ++j) {
+ hsaip_t *q = chain[0] + j;
+ if (flag[q->idx]) continue;
+ if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) {
+ flag[p->idx] = 1;
+ break;
+ }
+ }
+ }
+ for (k = 0; k < n[0] + n[1]; ++k) {
+ hsaip_t *p = z[0] + k;
+ if (flag[p->chain])
+ b[p->flag]->hits[p->idx].G = 0;
+ }
+ free(flag);
+ // squeeze out filtered elements in b[2]
+ for (k = 0; k < 2; ++k) {
+ for (j = i = 0; j < n[k]; ++j) {
+ bsw2hit_t *p = b[k]->hits + j;
+ if (p->G) {
+ if (i != j) b[k]->hits[i++] = *p;
+ else ++i;
+ }
+ }
+ b[k]->n = i;
+ }
+ // free
+ free(z[0]); free(chain[0]);
+}
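chaining() above links a seed to an earlier chain when the target gap y is positive, both gaps are below max_chain_gap, and the query gap x and y differ by at most the band width bw (x is positive by construction, since seeds are sorted by qbeg). Below is a distilled sketch of that band test under the same assumptions; chainable() is an illustrative helper, not part of BWA.

#include <stdio.h>

/* The diagonal-band test used in chaining() above: both gaps positive, both shorter
 * than max_chain_gap, and |x - y| within the band width bw. */
static int chainable(int x, int y, int max_chain_gap, int bw)
{
    return x > 0 && y > 0 && x < max_chain_gap && y < max_chain_gap
        && x - y <= bw && y - x <= bw;
}

int main(void)
{
    /* a 20 bp diagonal shift fits in a band of 50; a 300 bp shift does not */
    printf("%d %d\n", chainable(100, 120, 10000, 50), chainable(100, 400, 10000, 50)); /* prints: 1 0 */
    return 0;
}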
diff --git a/ext/src/bwa/bwtsw2_core.c b/ext/src/bwa/bwtsw2_core.c
new file mode 100644
index 0000000..1119601
--- /dev/null
+++ b/ext/src/bwa/bwtsw2_core.c
@@ -0,0 +1,619 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/resource.h>
+#include <assert.h>
+#include "bwt_lite.h"
+#include "bwtsw2.h"
+#include "bwt.h"
+#include "kvec.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+typedef struct {
+ bwtint_t k, l;
+} qintv_t;
+
+#define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l)
+#define qintv_hash(a) ((a).k>>7^(a).l<<17)
+
+#include "khash.h"
+KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq)
+KHASH_MAP_INIT_INT64(64, uint64_t)
+
+#define MINUS_INF -0x3fffffff
+#define MASK_LEVEL 0.90f
+
+struct __mempool_t;
+static void mp_destroy(struct __mempool_t*);
+typedef struct {
+ bwtint_t qk, ql;
+ int I, D, G;
+ uint32_t pj:2, qlen:30;
+ int tlen;
+ int ppos, upos;
+ int cpos[4];
+} bsw2cell_t;
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(int)
+#define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2))
+KSORT_INIT(hitG, bsw2hit_t, __hitG_lt)
+
+static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} };
+
+typedef struct {
+ int n, max;
+ uint32_t tk, tl; // this is fine
+ bsw2cell_t *array;
+} bsw2entry_t, *bsw2entry_p;
+
+/* --- BEGIN: Stack operations --- */
+typedef struct {
+ int n_pending;
+ kvec_t(bsw2entry_p) stack0, pending;
+ struct __mempool_t *pool;
+} bsw2stack_t;
+
+#define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0)
+static void stack_destroy(bsw2stack_t *s) { mp_destroy(s->pool); kv_destroy(s->stack0); kv_destroy(s->pending); free(s); }
+inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e) { kv_push(bsw2entry_p, s->stack0, e); }
+inline static bsw2entry_p stack_pop(bsw2stack_t *s)
+{
+ assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0));
+ return kv_pop(s->stack0);
+}
+/* --- END: Stack operations --- */
+
+/* --- BEGIN: memory pool --- */
+typedef struct __mempool_t {
+ int cnt; // if cnt!=0, then there must be a memory leak
+ kvec_t(bsw2entry_p) pool;
+} mempool_t;
+inline static bsw2entry_p mp_alloc(mempool_t *mp)
+{
+ ++mp->cnt;
+ if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t));
+ else return kv_pop(mp->pool);
+}
+inline static void mp_free(mempool_t *mp, bsw2entry_p e)
+{
+ --mp->cnt; e->n = 0;
+ kv_push(bsw2entry_p, mp->pool, e);
+}
+static void mp_destroy(struct __mempool_t *mp)
+{
+ int i;
+ for (i = 0; i != kv_size(mp->pool); ++i) {
+ free(kv_A(mp->pool, i)->array);
+ free(kv_A(mp->pool, i));
+ }
+ kv_destroy(mp->pool);
+ free(mp);
+}
+/* --- END: memory pool --- */
+
+/* --- BEGIN: utilities --- */
+static khash_t(64) *bsw2_connectivity(const bwtl_t *b)
+{
+ khash_t(64) *h;
+ uint32_t k, l, cntk[4], cntl[4]; // this is fine
+ uint64_t x;
+ khiter_t iter;
+ int j, ret;
+ kvec_t(uint64_t) stack;
+
+ kv_init(stack);
+ h = kh_init(64);
+ kh_resize(64, h, b->seq_len * 4);
+ x = b->seq_len;
+ kv_push(uint64_t, stack, x);
+ while (kv_size(stack)) {
+ x = kv_pop(stack);
+ k = x>>32; l = (uint32_t)x;
+ bwtl_2occ4(b, k-1, l, cntk, cntl);
+ for (j = 0; j != 4; ++j) {
+ k = b->L2[j] + cntk[j] + 1;
+ l = b->L2[j] + cntl[j];
+ if (k > l) continue;
+ x = (uint64_t)k << 32 | l;
+ iter = kh_put(64, h, x, &ret);
+ if (ret) { // if not present
+ kh_value(h, iter) = 1;
+ kv_push(uint64_t, stack, x);
+ } else ++kh_value(h, iter);
+ }
+ }
+ kv_destroy(stack);
+ //fprintf(stderr, "[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h));
+ return h;
+}
+// pick up top T matches at a node
+static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux)
+{
+ int i, *a, n, x;
+ if (u->n <= T) return;
+ if (aux->max < u->n) {
+ aux->max = u->n;
+ aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t));
+ }
+ a = (int*)aux->array;
+ for (i = n = 0; i != u->n; ++i)
+ if (u->array[i].ql && u->array[i].G > 0)
+ a[n++] = -u->array[i].G;
+ if (n <= T) return;
+ x = -ks_ksmall(int, n, a, T);
+ n = 0;
+ for (i = 0; i < u->n; ++i) {
+ bsw2cell_t *p = u->array + i;
+ if (p->G == x) ++n;
+ if (p->G < x || (p->G == x && n >= T)) {
+ p->qk = p->ql = 0; p->G = 0;
+ if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1;
+ }
+ }
+}
+// remove duplicated cells
+static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash)
+{
+ int i, ret, j;
+ khiter_t k;
+ qintv_t key;
+ kh_clear(qintv, hash);
+ for (i = 0; i != u->n; ++i) {
+ bsw2cell_t *p = u->array + i;
+ if (p->ql == 0) continue;
+ key.k = p->qk; key.l = p->ql;
+ k = kh_put(qintv, hash, key, &ret);
+ j = -1;
+ if (ret == 0) {
+ if ((uint32_t)kh_value(hash, k) >= p->G) j = i;
+ else {
+ j = kh_value(hash, k)>>32;
+ kh_value(hash, k) = (uint64_t)i<<32 | p->G;
+ }
+ } else kh_value(hash, k) = (uint64_t)i<<32 | p->G;
+ if (j >= 0) {
+ p = u->array + j;
+ p->qk = p->ql = 0; p->G = 0;
+ if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
+ }
+ }
+}
+// merge two entries
+static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b)
+{
+ int i;
+ if (u->n + v->n >= u->max) {
+ u->max = u->n + v->n;
+ u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t));
+ }
+ for (i = 0; i != v->n; ++i) {
+ bsw2cell_t *p = v->array + i;
+ if (p->ppos >= 0) p->ppos += u->n;
+ if (p->cpos[0] >= 0) p->cpos[0] += u->n;
+ if (p->cpos[1] >= 0) p->cpos[1] += u->n;
+ if (p->cpos[2] >= 0) p->cpos[2] += u->n;
+ if (p->cpos[3] >= 0) p->cpos[3] += u->n;
+ }
+ memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t));
+ u->n += v->n;
+}
+
+static inline bsw2cell_t *push_array_p(bsw2entry_t *e)
+{
+ if (e->n == e->max) {
+ e->max = e->max? e->max<<1 : 256;
+ e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max);
+ }
+ return e->array + e->n;
+}
+
+static inline double time_elapse(const struct rusage *curr, const struct rusage *last)
+{
+ long t1 = (curr->ru_utime.tv_sec - last->ru_utime.tv_sec) + (curr->ru_stime.tv_sec - last->ru_stime.tv_sec);
+ long t2 = (curr->ru_utime.tv_usec - last->ru_utime.tv_usec) + (curr->ru_stime.tv_usec - last->ru_stime.tv_usec);
+ return (double)t1 + t2 * 1e-6;
+}
+/* --- END: utilities --- */
+
+/* --- BEGIN: processing partial hits --- */
+static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u)
+{
+ int i;
+ uint32_t k; // this is fine
+ for (i = 0; i < u->n; ++i) {
+ bsw2cell_t *p = u->array + i;
+ if (p->G < thres) continue;
+ for (k = u->tk; k <= u->tl; ++k) {
+ int beg, end;
+ bsw2hit_t *q = 0;
+ beg = bwt->sa[k]; end = beg + p->tlen;
+ if (p->G > hits[beg*2].G) {
+ hits[beg*2+1] = hits[beg*2];
+ q = hits + beg * 2;
+ } else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1;
+ if (q) {
+ q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G;
+ q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G;
+ q->flag = q->n_seeds = 0;
+ }
+ }
+ }
+}
+/* "narrow hits" are node-to-node hits that have a high score and
+ * are not so repetitive (|SA interval|<=IS). */
+static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS)
+{
+ int i;
+ for (i = 0; i < u->n; ++i) {
+ bsw2hit_t *q;
+ bsw2cell_t *p = u->array + i;
+ if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit
+ if (b1->max == b1->n) {
+ b1->max = b1->max? b1->max<<1 : 4;
+ b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t));
+ }
+ q = &b1->hits[b1->n++];
+ q->k = p->qk; q->l = p->ql;
+ q->len = p->qlen;
+ q->G = p->G; q->G2 = 0;
+ q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen;
+ q->flag = 0;
+ // delete p
+ p->qk = p->ql = 0; p->G = 0;
+ if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3;
+ }
+ }
+}
+/* after this, "narrow SA hits" will be expanded and the coordinates
+ * will be obtained and stored in b->hits[*].k. */
+int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS)
+{
+ int i, j, n, is_rev;
+ if (b->n == 0) return 0;
+ if (bwt && bns) { // convert to chromosomal coordinates if requested
+ int old_n = b->n;
+ bsw2hit_t *old_hits = b->hits;
+ for (i = n = 0; i < b->n; ++i) { // compute the memory to be allocated
+ bsw2hit_t *p = old_hits + i;
+ if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1;
+ else if (p->G > 0) ++n;
+ }
+ b->n = b->max = n;
+ b->hits = calloc(b->max, sizeof(bsw2hit_t));
+ for (i = j = 0; i < old_n; ++i) {
+ bsw2hit_t *p = old_hits + i;
+ if (p->l - p->k + 1 <= IS) { // the hit is not so repetitive
+ bwtint_t k;
+ if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue;
+ for (k = p->k; k <= p->l; ++k) {
+ b->hits[j] = *p;
+ b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev);
+ b->hits[j].l = 0;
+ b->hits[j].is_rev = is_rev;
+ if (is_rev) b->hits[j].k -= p->len - 1;
+ ++j;
+ }
+ } else if (p->G > 0) {
+ b->hits[j] = *p;
+ b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev);
+ b->hits[j].l = 0;
+ b->hits[j].flag |= 1;
+ b->hits[j].is_rev = is_rev;
+ if (is_rev) b->hits[j].k -= p->len - 1;
+ ++j;
+ }
+ }
+ free(old_hits);
+ }
+ for (i = j = 0; i < b->n; ++i) // squeeze out empty elements
+ if (b->hits[i].G) b->hits[j++] = b->hits[i];
+ b->n = j;
+ ks_introsort(hitG, b->n, b->hits);
+ for (i = 1; i < b->n; ++i) {
+ bsw2hit_t *p = b->hits + i;
+ for (j = 0; j < i; ++j) {
+ bsw2hit_t *q = b->hits + j;
+ int compatible = 1;
+ if (p->is_rev != q->is_rev) continue; // hits from opposite strands are not duplicates
+ if (p->l == 0 && q->l == 0) {
+ int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap
+ if (qol < 0) qol = 0;
+ if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) {
+ int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
+ - (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap
+ if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL)
+ compatible = 0;
+ }
+ }
+ if (!compatible) {
+ p->G = 0;
+ if (q->G2 < p->G2) q->G2 = p->G2;
+ break;
+ }
+ }
+ }
+ n = i;
+ for (i = j = 0; i < n; ++i) {
+ if (b->hits[i].G == 0) continue;
+ if (i != j) b->hits[j++] = b->hits[i];
+ else ++j;
+ }
+ b->n = j;
+ return b->n;
+}
+
+int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level)
+{
+ int i, j, n;
+ if (b->n == 0) return 0;
+ ks_introsort(hitG, b->n, b->hits);
+ { // choose a random one
+ int G0 = b->hits[0].G;
+ for (i = 1; i < b->n; ++i)
+ if (b->hits[i].G != G0) break;
+ j = (int)(i * drand48());
+ if (j) {
+ bsw2hit_t tmp;
+ tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp;
+ }
+ }
+ for (i = 1; i < b->n; ++i) {
+ bsw2hit_t *p = b->hits + i;
+ int all_compatible = 1;
+ if (p->G == 0) break;
+ for (j = 0; j < i; ++j) {
+ bsw2hit_t *q = b->hits + j;
+ int64_t tol = 0;
+ int qol, compatible = 0;
+ float fol;
+ if (q->G == 0) continue;
+ qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg);
+ if (qol < 0) qol = 0;
+ if (p->l == 0 && q->l == 0) {
+ tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len)
+ - (p->k > q->k? p->k : q->k);
+ if (tol < 0) tol = 0;
+ }
+ fol = (float)qol / (p->end - p->beg < q->end - q->beg? p->end - p->beg : q->end - q->beg);
+ if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1;
+ if (!compatible) {
+ if (q->G2 < p->G) q->G2 = p->G;
+ all_compatible = 0;
+ }
+ }
+ if (!all_compatible) p->G = 0;
+ }
+ n = i;
+ for (i = j = 0; i < n; ++i) {
+ if (b->hits[i].G == 0) continue;
+ if (i != j) b->hits[j++] = b->hits[i];
+ else ++j;
+ }
+ b->n = j;
+ return j;
+}
+/* --- END: processing partial hits --- */
+
+/* --- BEGIN: global mem pool --- */
+bsw2global_t *bsw2_global_init()
+{
+ bsw2global_t *pool;
+ bsw2stack_t *stack;
+ pool = calloc(1, sizeof(bsw2global_t));
+ stack = calloc(1, sizeof(bsw2stack_t));
+ stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+ pool->stack = (void*)stack;
+ return pool;
+}
+
+void bsw2_global_destroy(bsw2global_t *pool)
+{
+ stack_destroy((bsw2stack_t*)pool->stack);
+ free(pool->aln_mem);
+ free(pool);
+}
+/* --- END: global mem pool --- */
+
+static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4])
+{
+ int G = c[3]? c[3]->G + match_score : MINUS_INF;
+ if (c[1]) {
+ c[0]->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr;
+ if (c[0]->I > G) G = c[0]->I;
+ } else c[0]->I = MINUS_INF;
+ if (c[2]) {
+ c[0]->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr;
+ if (c[0]->D > G) G = c[0]->D;
+ } else c[0]->D = MINUS_INF;
+ return(c[0]->G = G);
+}
+
+static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s)
+{
+ bsw2entry_t *u;
+ bsw2cell_t *x;
+
+ u = mp_alloc(s->pool);
+ u->tk = 0; u->tl = target->seq_len;
+ x = push_array_p(u);
+ *x = g_default_cell;
+ x->G = 0; x->qk = 0; x->ql = query->seq_len;
+ u->n++;
+ stack_push0(s, u);
+}
+/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) */
+bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool)
+{
+ bsw2stack_t *stack = (bsw2stack_t*)pool->stack;
+ bwtsw2_t *b, *b1, **b_ret;
+ int i, j, score_mat[16], *heap, heap_size, n_tot = 0;
+ struct rusage curr, last;
+ khash_t(qintv) *rhash;
+ khash_t(64) *chash;
+
+ // initialize connectivity hash (chash)
+ chash = bsw2_connectivity(target);
+ // calculate score matrix
+ for (i = 0; i != 4; ++i)
+ for (j = 0; j != 4; ++j)
+ score_mat[i<<2|j] = (i == j)? opt->a : -opt->b;
+ // initialize other variables
+ rhash = kh_init(qintv);
+ init_bwtsw2(target, query, stack);
+ heap_size = opt->z;
+ heap = calloc(heap_size, sizeof(int));
+ // initialize the return struct
+ b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
+ b->n = b->max = target->seq_len * 2;
+ b->hits = calloc(b->max, sizeof(bsw2hit_t));
+ b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t));
+ b_ret = calloc(2, sizeof(void*));
+ b_ret[0] = b; b_ret[1] = b1;
+ // initialize timer
+ getrusage(0, &last);
+ // the main loop: traversal of the DAG
+ while (!stack_isempty(stack)) {
+ int old_n, tj;
+ bsw2entry_t *v;
+ uint32_t tcntk[4], tcntl[4];
+ bwtint_t k, l;
+
+ v = stack_pop(stack); old_n = v->n;
+ n_tot += v->n;
+
+ for (i = 0; i < v->n; ++i) { // test max depth and band width
+ bsw2cell_t *p = v->array + i;
+ if (p->ql == 0) continue;
+ if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) {
+ p->qk = p->ql = 0;
+ if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5;
+ }
+ }
+
+ // get Occ for the DAG
+ bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl);
+ for (tj = 0; tj != 4; ++tj) { // descend to the children
+ bwtint_t qcntk[4], qcntl[4];
+ int qj, *curr_score_mat = score_mat + tj * 4;
+ khiter_t iter;
+ bsw2entry_t *u;
+
+ k = target->L2[tj] + tcntk[tj] + 1;
+ l = target->L2[tj] + tcntl[tj];
+ if (k > l) continue;
+ // update counter
+ iter = kh_get(64, chash, (uint64_t)k<<32 | l);
+ --kh_value(chash, iter);
+ // initialization
+ u = mp_alloc(stack->pool);
+ u->tk = k; u->tl = l;
+ memset(heap, 0, sizeof(int) * opt->z);
+ // loop through all the nodes in v
+ for (i = 0; i < v->n; ++i) {
+ bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G
+ int is_added = 0;
+ if (p->ql == 0) continue; // deleted node
+ c[0] = x = push_array_p(u);
+ x->G = MINUS_INF;
+ p->upos = x->upos = -1;
+ if (p->ppos >= 0) { // parent has been visited
+ c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0;
+ c[3] = v->array + p->ppos; c[2] = p;
+ if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x
+ x->ppos = v->array[p->ppos].upos; // the parent pos in u
+ p->upos = u->n++; // the current pos in u
+ if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u
+ is_added = 1;
+ }
+ } else {
+ x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr;
+ if (x->D > 0) {
+ x->G = x->D;
+ x->I = MINUS_INF; x->ppos = -1;
+ p->upos = u->n++;
+ is_added = 1;
+ }
+ }
+ if (is_added) { // x has been added to u->array. fill the remaining variables
+ x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
+ x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1;
+ if (x->G > -heap[0]) {
+ heap[0] = -x->G;
+ ks_heapadjust(int, 0, heap_size, heap);
+ }
+ }
+ if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v
+ if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) {
+ bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl);
+ for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie
+ if (p->cpos[qj] != -1) continue; // this node will be visited later
+ k = query->L2[qj] + qcntk[qj] + 1;
+ l = query->L2[qj] + qcntl[qj];
+ if (k > l) { p->cpos[qj] = -2; continue; }
+ x = push_array_p(v);
+ p = v->array + i; // p may not point to the correct position after realloc
+ x->G = x->I = x->D = MINUS_INF;
+ x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen;
+ x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1;
+ p->cpos[qj] = v->n++;
+ } // ~for(qj)
+ } // ~if(p->cpos[])
+ } // ~if
+ } // ~for(i)
+ if (u->n) save_hits(target, opt->t, b->hits, u);
+ { // push u to the stack (or to the pending array)
+ uint32_t cnt, pos;
+ cnt = (uint32_t)kh_value(chash, iter);
+ pos = kh_value(chash, iter)>>32;
+ if (pos) { // something in the pending array, then merge
+ bsw2entry_t *w = kv_A(stack->pending, pos-1);
+ if (u->n) {
+ if (w->n < u->n) { // swap
+ w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w;
+ }
+ merge_entry(opt, w, u, b);
+ }
+ if (cnt == 0) { // move from pending to stack0
+ remove_duplicate(w, rhash);
+ save_narrow_hits(target, w, b1, opt->t, opt->is);
+ cut_tail(w, opt->z, u);
+ stack_push0(stack, w);
+ kv_A(stack->pending, pos-1) = 0;
+ --stack->n_pending;
+ }
+ mp_free(stack->pool, u);
+ } else if (cnt) { // the first time
+ if (u->n) { // push to the pending queue
+ ++stack->n_pending;
+ kv_push(bsw2entry_p, stack->pending, u);
+ kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt;
+ } else mp_free(stack->pool, u);
+ } else { // cnt == 0, then push to the stack
+ bsw2entry_t *w = mp_alloc(stack->pool);
+ save_narrow_hits(target, u, b1, opt->t, opt->is);
+ cut_tail(u, opt->z, w);
+ mp_free(stack->pool, w);
+ stack_push0(stack, u);
+ }
+ }
+ } // ~for(tj)
+ mp_free(stack->pool, v);
+ } // while(top)
+ getrusage(0, &curr);
+ for (i = 0; i < 2; ++i)
+ for (j = 0; j < b_ret[i]->n; ++j)
+ b_ret[i]->hits[j].n_seeds = 0;
+ bsw2_resolve_duphits(bns, query, b, opt->is);
+ bsw2_resolve_duphits(bns, query, b1, opt->is);
+ //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot);
+ // free
+ free(heap);
+ kh_destroy(qintv, rhash);
+ kh_destroy(64, chash);
+ stack->pending.n = stack->stack0.n = 0;
+ return b_ret;
+}
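
For reference, the Z-best filter inside the main loop above keeps only the opt->z highest-scoring cells: `heap` is a max-heap of negated scores, so -heap[0] is always the smallest score still retained, and a cell survives only if it reaches that cutoff. A minimal, self-contained sketch of the same idea (it does not use klib's ks_heapadjust; the scores are made up):

    #include <stdio.h>

    #define Z 3  /* keep the best Z scores, like opt->z */

    static void sift_down(int *heap, int n)
    {
        /* restore the max-heap property after heap[0] was replaced */
        int i = 0;
        for (;;) {
            int l = 2 * i + 1, r = l + 1, m = i, tmp;
            if (l < n && heap[l] > heap[m]) m = l;
            if (r < n && heap[r] > heap[m]) m = r;
            if (m == i) break;
            tmp = heap[i]; heap[i] = heap[m]; heap[m] = tmp;
            i = m;
        }
    }

    int main(void)
    {
        int heap[Z] = { 0, 0, 0 };            /* negated scores; zeros = nothing kept yet */
        int scores[] = { 5, 2, 9, 7, 1, 8 }, i;
        for (i = 0; i < 6; ++i) {
            if (scores[i] > -heap[0]) {       /* beats the worst retained score */
                heap[0] = -scores[i];         /* replace it and re-heapify */
                sift_down(heap, Z);
            }
        }
        printf("Z-best cutoff: %d\n", -heap[0]);  /* 7: the smallest of the top 3 (9, 8, 7) */
        return 0;
    }
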
diff --git a/ext/src/bwa/bwtsw2_main.c b/ext/src/bwa/bwtsw2_main.c
new file mode 100644
index 0000000..40a9e0a
--- /dev/null
+++ b/ext/src/bwa/bwtsw2_main.c
@@ -0,0 +1,89 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include "bwt.h"
+#include "bwtsw2.h"
+#include "utils.h"
+#include "bwa.h"
+
+int bwa_bwtsw2(int argc, char *argv[])
+{
+ bsw2opt_t *opt;
+ bwaidx_t *idx;
+ int c;
+
+ opt = bsw2_init_opt();
+ srand48(11);
+ while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) {
+ switch (c) {
+ case 'q': opt->q = atoi(optarg); break;
+ case 'r': opt->r = atoi(optarg); break;
+ case 'a': opt->a = atoi(optarg); break;
+ case 'b': opt->b = atoi(optarg); break;
+ case 'w': opt->bw = atoi(optarg); break;
+ case 'T': opt->t = atoi(optarg); break;
+ case 't': opt->n_threads = atoi(optarg); break;
+ case 'z': opt->z = atoi(optarg); break;
+ case 's': opt->is = atoi(optarg); break;
+ case 'm': opt->mask_level = atof(optarg); break;
+ case 'c': opt->coef = atof(optarg); break;
+ case 'N': opt->t_seeds = atoi(optarg); break;
+ case 'M': opt->multi_2nd = 1; break;
+ case 'H': opt->hard_clip = 1; break;
+ case 'f': xreopen(optarg, "w", stdout); break;
+ case 'I': opt->max_ins = atoi(optarg); break;
+ case 'S': opt->skip_sw = 1; break;
+ case 'C': opt->cpy_cmt = 1; break;
+ case 'G': opt->max_chain_gap = atoi(optarg); break;
+ default: return 1;
+ }
+ }
+ opt->qr = opt->q + opt->r;
+
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa bwasw [options] <target.prefix> <query.fa> [query2.fa]\n\n");
+ fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a);
+ fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b);
+ fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q);
+ fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r);
+ fprintf(stderr, " -w INT band width [%d]\n", opt->bw);
+ fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level);
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
+ fprintf(stderr, " -f FILE file to output results to instead of stdout\n");
+ fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n");
+ fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n");
+ fprintf(stderr, " -M mark multi-part alignments as secondary\n");
+ fprintf(stderr, " -S skip Smith-Waterman read pairing\n");
+ fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins);
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t);
+ fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
+ fprintf(stderr, " -z INT Z-best [%d]\n", opt->z);
+ fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is);
+ fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds);
+ fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n");
+ fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n");
+ fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n");
+ fprintf(stderr, " increase '-z' for better sensitivity.\n");
+ fprintf(stderr, "\n");
+
+ return 1;
+ }
+
+ // adjust opt for opt->a
+ opt->t *= opt->a;
+ opt->coef *= opt->a;
+
+ if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
+ bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
+ bwa_idx_destroy(idx);
+ free(opt);
+
+ return 0;
+}
diff --git a/ext/src/bwa/bwtsw2_pair.c b/ext/src/bwa/bwtsw2_pair.c
new file mode 100644
index 0000000..24905df
--- /dev/null
+++ b/ext/src/bwa/bwtsw2_pair.c
@@ -0,0 +1,268 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utils.h"
+#include "bwt.h"
+#include "bntseq.h"
+#include "bwtsw2.h"
+#include "kstring.h"
+#include "ksw.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#define MIN_RATIO 0.8
+#define OUTLIER_BOUND 2.0
+#define MAX_STDDEV 4.0
+#define EXT_STDDEV 4.0
+
+typedef struct {
+ int low, high, failed;
+ double avg, std;
+} bsw2pestat_t;
+
+bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
+{
+ int i, k, x, p25, p50, p75, tmp, max_len = 0;
+ uint64_t *isize;
+ bsw2pestat_t r;
+
+ memset(&r, 0, sizeof(bsw2pestat_t));
+ isize = calloc(n, 8);
+ for (i = k = 0; i < n; i += 2) {
+ bsw2hit_t *t[2];
+ int l;
+ if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits
+ t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0];
+ if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough
+ if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough
+ l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len;
+ if (l >= max_ins) continue; // skip pairs with excessively large insert
+ max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg;
+ max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
+ isize[k++] = l;
+ }
+ ks_introsort_64(k, isize);
+ p25 = isize[(int)(.25 * k + .499)];
+ p50 = isize[(int)(.50 * k + .499)];
+ p75 = isize[(int)(.75 * k + .499)];
+ ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k);
+ if (k < 8) {
+ ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__);
+ free(isize);
+ r.failed = 1;
+ return r;
+ }
+ tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
+ r.low = tmp > max_len? tmp : max_len;
+ if (r.low < 1) r.low = 1;
+ r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
+ if (r.low > r.high) {
+ ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__);
+ free(isize);
+ r.failed = 1;
+ return r;
+ }
+ ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
+ ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high);
+ for (i = x = 0, r.avg = 0; i < k; ++i)
+ if (isize[i] >= r.low && isize[i] <= r.high)
+ r.avg += isize[i], ++x;
+ r.avg /= x;
+ for (i = 0, r.std = 0; i < k; ++i)
+ if (isize[i] >= r.low && isize[i] <= r.high)
+ r.std += (isize[i] - r.avg) * (isize[i] - r.avg);
+ r.std = sqrt(r.std / x);
+ ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std);
+ tmp = (int)(p25 - 3. * (p75 - p25) + .499);
+ r.low = tmp > max_len? tmp : max_len;
+ if (r.low < 1) r.low = 1;
+ r.high = (int)(p75 + 3. * (p75 - p25) + .499);
+ if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499);
+ r.low = tmp > max_len? tmp : max_len;
+ if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499);
+ ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
+ free(isize);
+ return r;
+}
+
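
bsw2_stat() above estimates the insert-size distribution from unambiguously mapped pairs: it sorts the observed insert sizes, takes the quartiles, and discards outliers beyond OUTLIER_BOUND interquartile ranges before computing the mean and standard deviation. A small sketch of the bound arithmetic with hypothetical quartiles:

    #include <stdio.h>

    int main(void)
    {
        int p25 = 300, p75 = 400;       /* hypothetical 25th/75th percentiles */
        double bound = 2.0;             /* OUTLIER_BOUND */
        int max_len = 100;              /* longest read among the pairs (assumed) */
        int low  = (int)(p25 - bound * (p75 - p25) + .499);
        int high = (int)(p75 + bound * (p75 - p25) + .499);
        if (low < max_len) low = max_len;   /* bsw2_stat also clamps to the max read length */
        if (low < 1) low = 1;
        printf("inserts kept for mean/std.dev: [%d, %d]\n", low, high);  /* [100, 600] */
        return 0;
    }
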
+typedef struct {
+ int n_cigar, beg, end, len;
+ int64_t pos;
+ uint32_t *cigar;
+} pairaux_t;
+
+extern unsigned char nst_nt4_table[256];
+
+void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25])
+{
+ extern void seq_reverse(int len, ubyte_t *seq, int is_comp);
+ int64_t k, beg, end;
+ uint8_t *seq, *ref;
+ int i;
+ // compute the region start and end
+ a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7
+ if (h->is_rev == 0) {
+ beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499);
+ if (beg < h->k) beg = h->k;
+ end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499);
+ a->is_rev = 1; a->flag |= 16;
+ } else {
+ beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499);
+ end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499);
+ if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg);
+ a->is_rev = 0;
+ }
+ if (beg < 1) beg = 1;
+ if (end > l_pac) end = l_pac;
+ if (end - beg < l_mseq) return;
+ // generate the sequence
+ seq = malloc(l_mseq + (end - beg));
+ ref = seq + l_mseq;
+ for (k = beg; k < end; ++k)
+ ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3;
+ if (h->is_rev == 0) {
+ for (i = 0; i < l_mseq; ++i) { // on the reverse strand
+ int c = nst_nt4_table[(int)mseq[i]];
+ seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c;
+ }
+ } else {
+ for (i = 0; i < l_mseq; ++i) // on the forward strand
+ seq[i] = nst_nt4_table[(int)mseq[i]];
+ }
+ {
+ int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
+ kswr_t aln;
+ aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
+ a->G = aln.score;
+ a->G2 = aln.score2;
+ if (a->G < opt->t) a->G = 0;
+ if (a->G2 < opt->t) a->G2 = 0;
+ if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
+ a->k = beg + aln.tb;
+ a->len = aln.te - aln.tb + 1;
+ a->beg = aln.qb;
+ a->end = aln.qe + 1;
+ /*
+ printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n');
+ printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n');
+ printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
+ */
+ }
+ if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i;
+ free(seq);
+}
+
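
bsw2_pair1() above rescues a mate by realigning it against a reference window placed roughly one insert size away from the mapped end, extended by EXT_STDDEV standard deviations of the fitted distribution. A sketch of the window arithmetic for a forward-strand hit; all numbers are hypothetical and the real code goes on to run ksw_align() inside this window:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t k = 100000, l_pac = 3000000; /* hit position and reference length (assumed) */
        double avg = 350.0, std = 30.0;      /* fitted insert-size mean and std.dev (assumed) */
        int l_mseq = 100;                    /* mate length */
        int64_t beg = (int64_t)(k + avg - 4.0 * std - l_mseq + .499);
        int64_t end = (int64_t)(k + avg + 4.0 * std + .499);
        if (beg < k) beg = k;                /* window never starts before the anchoring hit */
        if (beg < 1) beg = 1;
        if (end > l_pac) end = l_pac;        /* and never runs off the reference */
        printf("rescue window: [%lld, %lld)\n", (long long)beg, (long long)end);
        return 0;
    }
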
+void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits)
+{
+ extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS);
+ bsw2pestat_t pes;
+ int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0;
+ int8_t g_mat[25];
+ kstring_t msg;
+ memset(&msg, 0, sizeof(kstring_t));
+ pes = bsw2_stat(n, hits, &msg, opt->max_ins);
+ for (i = k = 0; i < 5; ++i) {
+ for (j = 0; j < 4; ++j)
+ g_mat[k++] = i == j? opt->a : -opt->b;
+ g_mat[k++] = 0;
+ }
+ for (i = 0; i < n; i += 2) {
+ bsw2hit_t a[2];
+ memset(&a, 0, sizeof(bsw2hit_t) * 2);
+ a[0].flag = 1<<6; a[1].flag = 1<<7;
+ for (j = 0; j < 2; ++j) { // set the read1/2 flag
+ if (hits[i+j] == 0) continue;
+ for (k = 0; k < hits[i+j]->n; ++k) {
+ bsw2hit_t *p = &hits[i+j]->hits[k];
+ p->flag |= 1<<(6+j);
+ }
+ }
+ if (pes.failed) continue;
+ if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N
+ if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit
+ if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit
+ if (!opt->skip_sw) {
+ if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat);
+ if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat);
+ } // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0
+ // the following enumerates all possibilities. It is tedious but necessary...
+ if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not;
+ bwtsw2_t *p[2];
+ int which;
+ if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1;
+ else p[0] = hits[i+1], p[1] = hits[i], which = 0;
+ if (a[which].G == 0) continue;
+ a[which].flag |= BSW2_FLAG_RESCUED;
+ if (p[1]->max == 0) {
+ p[1]->max = 1;
+ p[1]->hits = malloc(sizeof(bsw2hit_t));
+ }
+ p[1]->hits[0] = a[which];
+ p[1]->n = 1;
+ p[0]->hits[0].flag |= 2;
+ p[1]->hits[0].flag |= 2;
+ ++n_rescued;
+ } else { // then both ends mapped
+ int is_fixed = 0;
+ //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end);
+ for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score
+ bsw2hit_t *p = &hits[i+j]->hits[0];
+ if (p->G < a[j].G) { // the original mapping is suboptimal
+ a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM?
+ *p = a[j];
+ ++n_fixed;
+ is_fixed = 1;
+ } else if (p->k != a[j].k && p->G2 < a[j].G) {
+ p->G2 = a[j].G;
+ } else if (p->k == a[j].k && p->G2 < a[j].G2) {
+ p->G2 = a[j].G2;
+ }
+ }
+ if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved
+ for (j = 0; j < 2; ++j)
+ hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM);
+ } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match
+ for (j = 0; j < 2; ++j) {
+ hits[i+j]->hits[0].flag |= 2;
+ if (hits[i+j]->hits[0].k != a[j].k)
+ hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM;
+ }
+ } else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end
+ if (a[0].G && a[1].G) { // now we have two "proper pairs"
+ int G[2];
+ double diff;
+ G[0] = hits[i]->hits[0].G + a[1].G;
+ G[1] = hits[i+1]->hits[0].G + a[0].G;
+ diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.);
+ if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0;
+ }
+ if (a[0].G == 0 || a[1].G == 0) { // one proper pair only
+ bsw2hit_t *p[2]; // p[0] points to the unchanged hit; p[1] to the hit to be moved
+ int which, isize;
+ double dev, diff;
+ if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0;
+ else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1;
+ isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k;
+ dev = fabs(isize - pes.avg) / pes.std;
+ diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0;
+ if (diff < dev * 2.) { // then move (heuristic)
+ a[which].G2 = a[which].G;
+ p[1][0] = a[which];
+ p[1]->flag |= BSW2_FLAG_MOVED | 2;
+ p[0]->flag |= 2;
+ ++n_moved;
+ }
+ }
+ } else if (is_fixed) {
+ hits[i+0]->hits[0].flag |= 2;
+ hits[i+1]->hits[0].flag |= 2;
+ }
+ }
+ }
+ ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved);
+ fputs(msg.s, stderr);
+ free(msg.s);
+}
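
The "move one end" branch above only relocates a hit when the score lost by moving it is small compared with how far the current pair deviates from the expected insert size. A worked example with made-up numbers (the match/mismatch scores below are illustrative values, not read from this file):

    #include <stdio.h>
    #include <math.h>

    int main(void)
    {
        int a = 1, b = 3;                /* match score and mismatch penalty (illustrative) */
        int G_old = 180, G_new = 160;    /* score of the original hit vs the rescued hit */
        int aln_len = 100;               /* p[1]->end - p[1]->beg */
        double pes_avg = 350., pes_std = 30.;
        int isize = 650;                 /* insert size implied by the original pair */
        double dev  = fabs(isize - pes_avg) / pes_std;                    /* 10 std.dev away */
        double diff = (double)(G_old - G_new) / (a + b) / aln_len * 100.; /* 5.0 */
        printf("move the end? %s (diff=%.1f, 2*dev=%.1f)\n",
               diff < dev * 2. ? "yes" : "no", diff, dev * 2.);           /* yes */
        return 0;
    }
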
diff --git a/ext/src/bwa/example.c b/ext/src/bwa/example.c
new file mode 100644
index 0000000..4e8494d
--- /dev/null
+++ b/ext/src/bwa/example.c
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <zlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include "bwamem.h"
+#include "kseq.h" // for the FASTA/Q parser
+KSEQ_DECLARE(gzFile)
+
+int main(int argc, char *argv[])
+{
+ bwaidx_t *idx;
+ gzFile fp;
+ kseq_t *ks;
+ mem_opt_t *opt;
+
+ if (argc < 3) {
+ fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
+ return 1;
+ }
+
+ idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
+ if (NULL == idx) {
+ fprintf(stderr, "Index load failed.\n");
+ exit(EXIT_FAILURE);
+ }
+ fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+ if (NULL == fp) {
+ fprintf(stderr, "Couldn't open %s : %s\n",
+ strcmp(argv[2], "-") ? argv[2] : "stdin",
+ errno ? strerror(errno) : "Out of memory");
+ exit(EXIT_FAILURE);
+ }
+ ks = kseq_init(fp); // initialize the FASTA/Q parser
+ opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
+
+ while (kseq_read(ks) >= 0) { // read one sequence
+ mem_alnreg_v ar;
+ int i, k;
+ ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
+ for (i = 0; i < ar.n; ++i) { // traverse each hit
+ mem_aln_t a;
+ if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
+ a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
+ // print alignment
+ printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
+ for (k = 0; k < a.n_cigar; ++k) // print CIGAR
+ printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
+ printf("\t%d\n", a.NM); // print edit distance
+ free(a.cigar); // don't forget to deallocate CIGAR
+ }
+ free(ar.a); // and deallocate the hit list
+ }
+
+ free(opt);
+ kseq_destroy(ks);
+ gzclose(fp);
+ bwa_idx_destroy(idx);
+ return 0;
+}
diff --git a/ext/src/bwa/fastmap.c b/ext/src/bwa/fastmap.c
new file mode 100644
index 0000000..1e49ccb
--- /dev/null
+++ b/ext/src/bwa/fastmap.c
@@ -0,0 +1,441 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <ctype.h>
+#include <math.h>
+#include "bwa.h"
+#include "bwamem.h"
+#include "kvec.h"
+#include "utils.h"
+#include "bntseq.h"
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+extern unsigned char nst_nt4_table[256];
+
+void *kopen(const char *fn, int *_fd);
+int kclose(void *a);
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
+
+typedef struct {
+ kseq_t *ks, *ks2;
+ mem_opt_t *opt;
+ mem_pestat_t *pes0;
+ int64_t n_processed;
+ int copy_comment, actual_chunk_size;
+ bwaidx_t *idx;
+} ktp_aux_t;
+
+typedef struct {
+ ktp_aux_t *aux;
+ int n_seqs;
+ bseq1_t *seqs;
+} ktp_data_t;
+
+static void *process(void *shared, int step, void *_data)
+{
+ ktp_aux_t *aux = (ktp_aux_t*)shared;
+ ktp_data_t *data = (ktp_data_t*)_data;
+ int i;
+ if (step == 0) {
+ ktp_data_t *ret;
+ int64_t size = 0;
+ ret = calloc(1, sizeof(ktp_data_t));
+ ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
+ if (ret->seqs == 0) {
+ free(ret);
+ return 0;
+ }
+ if (!aux->copy_comment)
+ for (i = 0; i < ret->n_seqs; ++i) {
+ free(ret->seqs[i].comment);
+ ret->seqs[i].comment = 0;
+ }
+ for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq;
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
+ return ret;
+ } else if (step == 1) {
+ const mem_opt_t *opt = aux->opt;
+ const bwaidx_t *idx = aux->idx;
+ if (opt->flag & MEM_F_SMARTPE) {
+ bseq1_t *sep[2];
+ int n_sep[2];
+ mem_opt_t tmp_opt = *opt;
+ bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
+ if (n_sep[0]) {
+ tmp_opt.flag &= ~MEM_F_PE;
+ mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
+ for (i = 0; i < n_sep[0]; ++i)
+ data->seqs[sep[0][i].id].sam = sep[0][i].sam;
+ }
+ if (n_sep[1]) {
+ tmp_opt.flag |= MEM_F_PE;
+ mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
+ for (i = 0; i < n_sep[1]; ++i)
+ data->seqs[sep[1][i].id].sam = sep[1][i].sam;
+ }
+ free(sep[0]); free(sep[1]);
+ } else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
+ aux->n_processed += data->n_seqs;
+ return data;
+ } else if (step == 2) {
+ for (i = 0; i < data->n_seqs; ++i) {
+ if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
+ free(data->seqs[i].name); free(data->seqs[i].comment);
+ free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam);
+ }
+ free(data->seqs); free(data);
+ return 0;
+ }
+ return 0;
+}
+
+static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
+{
+ if (opt0->a) { // matching score is changed
+ if (!opt0->b) opt->b *= opt->a;
+ if (!opt0->T) opt->T *= opt->a;
+ if (!opt0->o_del) opt->o_del *= opt->a;
+ if (!opt0->e_del) opt->e_del *= opt->a;
+ if (!opt0->o_ins) opt->o_ins *= opt->a;
+ if (!opt0->e_ins) opt->e_ins *= opt->a;
+ if (!opt0->zdrop) opt->zdrop *= opt->a;
+ if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a;
+ if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a;
+ if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a;
+ }
+}
+
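
update_a() above rescales every penalty the user left at its default whenever -A changes the match score, so the scoring scheme keeps its relative shape. A small illustration; the default values below are placeholders for demonstration, not necessarily bwa mem's actual defaults:

    #include <stdio.h>

    int main(void)
    {
        int a = 2;                     /* user passed -A 2 */
        int b = 4, T = 30, o_del = 6;  /* defaults (placeholder values) */
        int user_set_b = 0;            /* opt0.b == 0: -B was not given explicitly */
        if (!user_set_b) b *= a;
        T *= a; o_del *= a;            /* update_a() does the same for the other penalties */
        printf("a=%d b=%d T=%d o_del=%d\n", a, b, T, o_del);  /* a=2 b=8 T=60 o_del=12 */
        return 0;
    }
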
+int main_mem(int argc, char *argv[])
+{
+ mem_opt_t *opt, opt0;
+ int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
+ int fixed_chunk_size = -1;
+ gzFile fp, fp2 = 0;
+ char *p, *rg_line = 0, *hdr_line = 0;
+ const char *mode = 0;
+ void *ko = 0, *ko2 = 0;
+ mem_pestat_t pes[4];
+ ktp_aux_t aux;
+
+ memset(&aux, 0, sizeof(ktp_aux_t));
+ memset(pes, 0, 4 * sizeof(mem_pestat_t));
+ for (i = 0; i < 4; ++i) pes[i].failed = 1;
+
+ aux.opt = opt = mem_opt_init();
+ memset(&opt0, 0, sizeof(mem_opt_t));
+ while ((c = getopt(argc, argv, "1paMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
+ if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
+ else if (c == '1') no_mt_io = 1;
+ else if (c == 'x') mode = optarg;
+ else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
+ else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
+ else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1;
+ else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1;
+ else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1;
+ else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
+ else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
+ else if (c == 'a') opt->flag |= MEM_F_ALL;
+ else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
+ else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
+ else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
+ else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
+ else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
+ else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
+ else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
+ else if (c == 'v') bwa_verbose = atoi(optarg);
+ else if (c == 'j') ignore_alt = 1;
+ else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
+ else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
+ else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
+ else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
+ else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
+ else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
+ else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
+ else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
+ else if (c == 'C') aux.copy_comment = 1;
+ else if (c == 'K') fixed_chunk_size = atoi(optarg);
+ else if (c == 'X') opt->mask_level = atof(optarg);
+ else if (c == 'h') {
+ opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
+ opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ opt->max_XA_hits_alt = strtol(p+1, &p, 10);
+ }
+ else if (c == 'Q') {
+ opt0.mapQ_coef_len = 1;
+ opt->mapQ_coef_len = atoi(optarg);
+ opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0;
+ } else if (c == 'O') {
+ opt0.o_del = opt0.o_ins = 1;
+ opt->o_del = opt->o_ins = strtol(optarg, &p, 10);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ opt->o_ins = strtol(p+1, &p, 10);
+ } else if (c == 'E') {
+ opt0.e_del = opt0.e_ins = 1;
+ opt->e_del = opt->e_ins = strtol(optarg, &p, 10);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ opt->e_ins = strtol(p+1, &p, 10);
+ } else if (c == 'L') {
+ opt0.pen_clip5 = opt0.pen_clip3 = 1;
+ opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ opt->pen_clip3 = strtol(p+1, &p, 10);
+ } else if (c == 'R') {
+ if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
+ } else if (c == 'H') {
+ if (optarg[0] != '@') {
+ FILE *fp;
+ if ((fp = fopen(optarg, "r")) != 0) {
+ char *buf;
+ buf = calloc(1, 0x10000);
+ while (fgets(buf, 0xffff, fp)) {
+ i = strlen(buf);
+ assert(buf[i-1] == '\n'); // a long line
+ buf[i-1] = 0;
+ hdr_line = bwa_insert_header(buf, hdr_line);
+ }
+ free(buf);
+ fclose(fp);
+ }
+ } else hdr_line = bwa_insert_header(optarg, hdr_line);
+ } else if (c == 'I') { // specify the insert size distribution
+ aux.pes0 = pes;
+ pes[1].failed = 0;
+ pes[1].avg = strtod(optarg, &p);
+ pes[1].std = pes[1].avg * .1;
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ pes[1].std = strtod(p+1, &p);
+ pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499);
+ pes[1].low = (int)(pes[1].avg - 4. * pes[1].std + .499);
+ if (pes[1].low < 1) pes[1].low = 1;
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ pes[1].high = (int)(strtod(p+1, &p) + .499);
+ if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+ pes[1].low = (int)(strtod(p+1, &p) + .499);
+ if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
+ __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
+ }
+ else return 1;
+ }
+
+ if (rg_line) {
+ hdr_line = bwa_insert_header(rg_line, hdr_line);
+ free(rg_line);
+ }
+
+ if (opt->n_threads < 1) opt->n_threads = 1;
+ if (optind + 1 >= argc || optind + 3 < argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
+ fprintf(stderr, "Algorithm options:\n\n");
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
+ fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len);
+ fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w);
+ fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop);
+ fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+ fprintf(stderr, " -y INT seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
+// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
+ fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
+ fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
+ fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n");
+ fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
+ fprintf(stderr, " -S skip mate rescue\n");
+ fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
+ fprintf(stderr, "\nScoring options:\n\n");
+ fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
+ fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
+ fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
+ fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
+ fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
+ fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
+ fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n");
+ fprintf(stderr, " pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0 (PacBio reads to ref)\n");
+ fprintf(stderr, " ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0 (Oxford Nanopore 2D-reads to ref)\n");
+ fprintf(stderr, " intractg: -B9 -O16 -L5 (intra-species contigs to ref)\n");
+ fprintf(stderr, "\nInput/output options:\n\n");
+ fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n");
+ fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
+ fprintf(stderr, " -H STR/FILE insert STR to header if it starts with @; or insert lines in FILE [null]\n");
+ fprintf(stderr, " -j treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
+ fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T);
+ fprintf(stderr, " -h INT[,INT] if there are <INT hits with score >80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt);
+ fprintf(stderr, " -a output all alignments for SE or unpaired PE\n");
+ fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n");
+ fprintf(stderr, " -V output the reference FASTA header in the XR tag\n");
+ fprintf(stderr, " -Y use soft clipping for supplementary alignments\n");
+ fprintf(stderr, " -M mark shorter split hits as secondary\n\n");
+ fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n");
+ fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n");
+ fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n");
+ fprintf(stderr, " FR orientation only. [inferred]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n");
+ fprintf(stderr, "\n");
+ free(opt);
+ return 1;
+ }
+
+ if (mode) {
+ if (strcmp(mode, "intractg") == 0) {
+ if (!opt0.o_del) opt->o_del = 16;
+ if (!opt0.o_ins) opt->o_ins = 16;
+ if (!opt0.b) opt->b = 9;
+ if (!opt0.pen_clip5) opt->pen_clip5 = 5;
+ if (!opt0.pen_clip3) opt->pen_clip3 = 5;
+ } else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "ont2d") == 0) {
+ if (!opt0.o_del) opt->o_del = 1;
+ if (!opt0.e_del) opt->e_del = 1;
+ if (!opt0.o_ins) opt->o_ins = 1;
+ if (!opt0.e_ins) opt->e_ins = 1;
+ if (!opt0.b) opt->b = 1;
+ if (opt0.split_factor == 0.) opt->split_factor = 10.;
+ if (strcmp(mode, "ont2d") == 0) {
+ if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
+ if (!opt0.min_seed_len) opt->min_seed_len = 14;
+ if (!opt0.pen_clip5) opt->pen_clip5 = 0;
+ if (!opt0.pen_clip3) opt->pen_clip3 = 0;
+ } else {
+ if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
+ if (!opt0.min_seed_len) opt->min_seed_len = 17;
+ if (!opt0.pen_clip5) opt->pen_clip5 = 0;
+ if (!opt0.pen_clip3) opt->pen_clip3 = 0;
+ }
+ } else {
+ fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode);
+ return 1; // FIXME memory leak
+ }
+ } else update_a(opt, &opt0);
+ bwa_fill_scmat(opt->a, opt->b, opt->mat);
+
+ aux.idx = bwa_idx_load_from_shm(argv[optind]);
+ if (aux.idx == 0) {
+ if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+ } else if (bwa_verbose >= 3)
+ fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
+ if (ignore_alt)
+ for (i = 0; i < aux.idx->bns->n_seqs; ++i)
+ aux.idx->bns->anns[i].is_alt = 0;
+
+ ko = kopen(argv[optind + 1], &fd);
+ if (ko == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]);
+ return 1;
+ }
+ fp = gzdopen(fd, "r");
+ aux.ks = kseq_init(fp);
+ if (optind + 2 < argc) {
+ if (opt->flag&MEM_F_PE) {
+ if (bwa_verbose >= 2)
+ fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
+ } else {
+ ko2 = kopen(argv[optind + 2], &fd2);
+ if (ko2 == 0) {
+ if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]);
+ return 1;
+ }
+ fp2 = gzdopen(fd2, "r");
+ aux.ks2 = kseq_init(fp2);
+ opt->flag |= MEM_F_PE;
+ }
+ }
+ bwa_print_sam_hdr(aux.idx->bns, hdr_line);
+ aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
+ kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
+ free(hdr_line);
+ free(opt);
+ bwa_idx_destroy(aux.idx);
+ kseq_destroy(aux.ks);
+ err_gzclose(fp); kclose(ko);
+ if (aux.ks2) {
+ kseq_destroy(aux.ks2);
+ err_gzclose(fp2); kclose(ko2);
+ }
+ return 0;
+}
+
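
main_mem() above drives mapping through kt_pipeline() with three steps: step 0 reads a chunk of sequences, step 1 aligns it, step 2 prints the SAM records and frees the chunk; returning NULL from step 0 ends the pipeline. A toy sketch of the same protocol, assuming it is linked against kthread.c; the step function and its payload are hypothetical:

    #include <stdio.h>
    #include <stdlib.h>

    void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);

    typedef struct { int next, limit; } toy_shared_t;

    static void *toy_step(void *shared, int step, void *in)
    {
        toy_shared_t *s = (toy_shared_t*)shared;
        if (step == 0) {                       /* produce the next work item */
            int *x;
            if (s->next >= s->limit) return 0; /* no more input: stop the pipeline */
            x = (int*)malloc(sizeof(int));
            *x = s->next++;
            return x;
        } else if (step == 1) {                /* transform the item */
            int *x = (int*)in;
            *x *= *x;
            return x;
        } else {                               /* consume: print and free */
            printf("%d\n", *(int*)in);
            free(in);
            return 0;
        }
    }

    int main(void)
    {
        toy_shared_t s = { 0, 5 };
        kt_pipeline(2, toy_step, &s, 3);       /* 2 threads, 3 steps, as in main_mem */
        return 0;
    }
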
+int main_fastmap(int argc, char *argv[])
+{
+ int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
+ uint64_t max_intv = 0;
+ kseq_t *seq;
+ bwtint_t k;
+ gzFile fp;
+ smem_i *itr;
+ const bwtintv_v *a;
+ bwaidx_t *idx;
+
+ while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
+ switch (c) {
+ case 'p': print_seq = 1; break;
+ case 'w': min_iwidth = atoi(optarg); break;
+ case 'l': min_len = atoi(optarg); break;
+ case 'i': min_intv = atoi(optarg); break;
+ case 'I': max_intv = atol(optarg); break;
+ case 'L': max_len = atoi(optarg); break;
+ default: return 1;
+ }
+ }
+ if (optind + 1 >= argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa fastmap [options] <idxbase> <in.fq>\n\n");
+ fprintf(stderr, "Options: -l INT min SMEM length to output [%d]\n", min_len);
+ fprintf(stderr, " -w INT max interval size to find coordiantes [%d]\n", min_iwidth);
+ fprintf(stderr, " -i INT min SMEM interval size [%d]\n", min_intv);
+ fprintf(stderr, " -L INT max MEM length [%d]\n", max_len);
+ fprintf(stderr, " -I INT stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
+ fprintf(stderr, "\n");
+ return 1;
+ }
+
+ fp = xzopen(argv[optind + 1], "r");
+ seq = kseq_init(fp);
+ if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
+ itr = smem_itr_init(idx->bwt);
+ smem_config(itr, min_intv, max_len, max_intv);
+ while (kseq_read(seq) >= 0) {
+ err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
+ if (print_seq) {
+ err_putchar('\t');
+ err_puts(seq->seq.s);
+ } else err_putchar('\n');
+ for (i = 0; i < seq->seq.l; ++i)
+ seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
+ smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
+ while ((a = smem_next(itr)) != 0) {
+ for (i = 0; i < a->n; ++i) {
+ bwtintv_t *p = &a->a[i];
+ if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
+ err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
+ if (p->x[2] <= min_iwidth) {
+ for (k = 0; k < p->x[2]; ++k) {
+ bwtint_t pos;
+ int len, is_rev, ref_id;
+ len = (uint32_t)p->info - (p->info>>32);
+ pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
+ if (is_rev) pos -= len - 1;
+ bns_cnt_ambi(idx->bns, pos, len, &ref_id);
+ err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
+ }
+ } else err_puts("\t*");
+ err_putchar('\n');
+ }
+ }
+ err_puts("//");
+ }
+
+ smem_itr_destroy(itr);
+ bwa_idx_destroy(idx);
+ kseq_destroy(seq);
+ err_gzclose(fp);
+ return 0;
+}
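
In the fastmap output above, each SMEM packs its query coordinates into the 64-bit info field: the upper 32 bits hold the start and the lower 32 bits the end (exclusive), while x[2] is the number of occurrences in the index. A minimal decoding sketch with a made-up interval:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t info = ((uint64_t)17 << 32) | 59;   /* hypothetical SMEM: query [17, 59) */
        uint32_t qbeg = (uint32_t)(info >> 32);
        uint32_t qend = (uint32_t)info;
        printf("SMEM length %u, query [%u, %u)\n", qend - qbeg, qbeg, qend);
        return 0;
    }
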
diff --git a/ext/src/bwa/is.c b/ext/src/bwa/is.c
new file mode 100644
index 0000000..46f1772
--- /dev/null
+++ b/ext/src/bwa/is.c
@@ -0,0 +1,223 @@
+/*
+ * sais.c for sais-lite
+ * Copyright (c) 2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+typedef unsigned char ubyte_t;
+#define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i])
+
+/* find the start or end of each bucket */
+static void getCounts(const unsigned char *T, int *C, int n, int k, int cs)
+{
+ int i;
+ for (i = 0; i < k; ++i) C[i] = 0;
+ for (i = 0; i < n; ++i) ++C[chr(i)];
+}
+static void getBuckets(const int *C, int *B, int k, int end)
+{
+ int i, sum = 0;
+ if (end) {
+ for (i = 0; i < k; ++i) {
+ sum += C[i];
+ B[i] = sum;
+ }
+ } else {
+ for (i = 0; i < k; ++i) {
+ sum += C[i];
+ B[i] = sum - C[i];
+ }
+ }
+}
+
+/* compute SA */
+static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs)
+{
+ int *b, i, j;
+ int c0, c1;
+ /* compute SAl */
+ if (C == B) getCounts(T, C, n, k, cs);
+ getBuckets(C, B, k, 0); /* find starts of buckets */
+ j = n - 1;
+ b = SA + B[c1 = chr(j)];
+ *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
+ for (i = 0; i < n; ++i) {
+ j = SA[i], SA[i] = ~j;
+ if (0 < j) {
+ --j;
+ if ((c0 = chr(j)) != c1) {
+ B[c1] = b - SA;
+ b = SA + B[c1 = c0];
+ }
+ *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j;
+ }
+ }
+ /* compute SAs */
+ if (C == B) getCounts(T, C, n, k, cs);
+ getBuckets(C, B, k, 1); /* find ends of buckets */
+ for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
+ if (0 < (j = SA[i])) {
+ --j;
+ if ((c0 = chr(j)) != c1) {
+ B[c1] = b - SA;
+ b = SA + B[c1 = c0];
+ }
+ *--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j;
+ } else SA[i] = ~j;
+ }
+}
+
+/*
+ * find the suffix array SA of T[0..n-1] in {0..k-1}^n, using a working
+ * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
+ */
+static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs)
+{
+ int *C, *B, *RA;
+ int i, j, c, m, p, q, plen, qlen, name;
+ int c0, c1;
+ int diff;
+
+ /* stage 1: reduce the problem by at least 1/2 sort all the
+ * S-substrings */
+ if (k <= fs) {
+ C = SA + n;
+ B = (k <= (fs - k)) ? C + k : C;
+ } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
+ getCounts(T, C, n, k, cs);
+ getBuckets(C, B, k, 1); /* find ends of buckets */
+ for (i = 0; i < n; ++i) SA[i] = 0;
+ for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+ if ((c0 = chr(i)) < (c1 + c)) c = 1;
+ else if (c != 0) SA[--B[c1]] = i + 1, c = 0;
+ }
+ induceSA(T, SA, C, B, n, k, cs);
+ if (fs < k) free(C);
+ /* compact all the sorted substrings into the first m items of SA
+ * 2*m must not be larger than n (provable) */
+ for (i = 0, m = 0; i < n; ++i) {
+ p = SA[i];
+ if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) {
+ for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j);
+ if ((j < n) && (c0 < c1)) SA[m++] = p;
+ }
+ }
+ for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */
+ /* store the length of all substrings */
+ for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+ if ((c0 = chr(i)) < (c1 + c)) c = 1;
+ else if (c != 0) {
+ SA[m + ((i + 1) >> 1)] = j - i - 1;
+ j = i + 1;
+ c = 0;
+ }
+ }
+ /* find the lexicographic names of all substrings */
+ for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
+ p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
+ if (plen == qlen) {
+ for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++);
+ if (j == plen) diff = 0;
+ }
+ if (diff != 0) ++name, q = p, qlen = plen;
+ SA[m + (p >> 1)] = name;
+ }
+
+ /* stage 2: solve the reduced problem recurse if names are not yet
+ * unique */
+ if (name < m) {
+ RA = SA + n + fs - m;
+ for (i = n - 1, j = m - 1; m <= i; --i) {
+ if (SA[i] != 0) RA[j--] = SA[i] - 1;
+ }
+ if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2;
+ for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+ if ((c0 = chr(i)) < (c1 + c)) c = 1;
+ else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */
+ }
+ for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */
+ }
+ /* stage 3: induce the result for the original problem */
+ if (k <= fs) {
+ C = SA + n;
+ B = (k <= (fs - k)) ? C + k : C;
+ } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2;
+ /* put all left-most S characters into their buckets */
+ getCounts(T, C, n, k, cs);
+ getBuckets(C, B, k, 1); /* find ends of buckets */
+ for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
+ for (i = m - 1; 0 <= i; --i) {
+ j = SA[i], SA[i] = 0;
+ SA[--B[chr(j)]] = j;
+ }
+ induceSA(T, SA, C, B, n, k, cs);
+ if (fs < k) free(C);
+ return 0;
+}
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n] The output array of suffixes.
+ * @param n The length of the given string.
+ * @return 0 if no error occurred
+ */
+int is_sa(const ubyte_t *T, int *SA, int n)
+{
+ if ((T == NULL) || (SA == NULL) || (n < 0)) return -1;
+ SA[0] = n;
+ if (n <= 1) {
+ if (n == 1) SA[1] = 0;
+ return 0;
+ }
+ return sais_main(T, SA+1, 0, n, 256, 1);
+}
+
+/**
+ * Constructs the Burrows-Wheeler transformed string of a given string.
+ * @param T[0..n-1] The input string.
+ * @param n The length of the given string.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+int is_bwt(ubyte_t *T, int n)
+{
+ int *SA, i, primary = 0;
+ SA = (int*)calloc(n+1, sizeof(int));
+
+ if (is_sa(T, SA, n)) return -1;
+
+ for (i = 0; i <= n; ++i) {
+ if (SA[i] == 0) primary = i;
+ else SA[i] = T[SA[i] - 1];
+ }
+ for (i = 0; i < primary; ++i) T[i] = SA[i];
+ for (; i < n; ++i) T[i] = SA[i + 1];
+ free(SA);
+ return primary;
+}
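
A minimal usage sketch for is_sa()/is_bwt() above: the BWT is built in place over T[0..n-1] and the return value is the primary index (the row where the implicit sentinel would sit). Illustration only; it assumes linking against is.c:

    #include <stdio.h>
    #include <string.h>

    typedef unsigned char ubyte_t;
    int is_bwt(ubyte_t *T, int n);   /* from is.c */

    int main(void)
    {
        ubyte_t s[] = "banana";
        int n = (int)strlen((char*)s);
        int primary = is_bwt(s, n);  /* s now holds the BWT (sentinel row removed) */
        printf("primary index: %d, BWT: %.*s\n", primary, n, s);
        return 0;
    }
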
diff --git a/ext/src/bwa/kbtree.h b/ext/src/bwa/kbtree.h
new file mode 100644
index 0000000..0da101d
--- /dev/null
+++ b/ext/src/bwa/kbtree.h
@@ -0,0 +1,388 @@
+/*-
+ * Copyright 1997-1999, 2001, John-Mark Gurney.
+ * 2008-2009, Attractive Chaos <attractor at live.co.uk>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __AC_KBTREE_H
+#define __AC_KBTREE_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+typedef struct {
+ int32_t is_internal:1, n:31;
+} kbnode_t;
+
+#define __KB_KEY(type, x) ((type*)((char*)x + 4))
+#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr))
+
+#define __KB_TREE_T(name) \
+ typedef struct { \
+ kbnode_t *root; \
+ int off_key, off_ptr, ilen, elen; \
+ int n, t; \
+ int n_keys, n_nodes; \
+ } kbtree_##name##_t;
+
+#define __KB_INIT(name, key_t) \
+ kbtree_##name##_t *kb_init_##name(int size) \
+ { \
+ kbtree_##name##_t *b; \
+ b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \
+ b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
+ if (b->t < 2) { \
+ free(b); return 0; \
+ } \
+ b->n = 2 * b->t - 1; \
+ b->off_ptr = 4 + b->n * sizeof(key_t); \
+ b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
+ b->elen = (b->off_ptr + 3) >> 2 << 2; \
+ b->root = (kbnode_t*)calloc(1, b->ilen); \
+ ++b->n_nodes; \
+ return b; \
+ }
+
+#define __kb_destroy(b) do { \
+ int i, max = 8; \
+ kbnode_t *x, **top, **stack = 0; \
+ if (b) { \
+ top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \
+ *top++ = (b)->root; \
+ while (top != stack) { \
+ x = *--top; \
+ if (x == 0 || x->is_internal == 0) { free(x); continue; } \
+ for (i = 0; i <= x->n; ++i) \
+ if (__KB_PTR(b, x)[i]) { \
+ if (top - stack == max) { \
+ max <<= 1; \
+ stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \
+ top = stack + (max>>1); \
+ } \
+ *top++ = __KB_PTR(b, x)[i]; \
+ } \
+ free(x); \
+ } \
+ } \
+ free(b); free(stack); \
+ } while (0)
+
+#define __kb_get_first(key_t, b, ret) do { \
+ kbnode_t *__x = (b)->root; \
+ while (__KB_PTR(b, __x)[0] != 0) \
+ __x = __KB_PTR(b, __x)[0]; \
+ (ret) = __KB_KEY(key_t, __x)[0]; \
+ } while (0)
+
+#define __KB_GET_AUX0(name, key_t, __cmp) \
+ static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+ { \
+ int tr, *rr, begin, end, n = x->n >> 1; \
+ if (x->n == 0) return -1; \
+ if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \
+ begin = 0; end = n; \
+ } else { begin = n; end = x->n - 1; } \
+ rr = r? r : &tr; \
+ n = end; \
+ while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \
+ return n; \
+ }
+
+#define __KB_GET_AUX1(name, key_t, __cmp) \
+ static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+ { \
+ int tr, *rr, begin = 0, end = x->n; \
+ if (x->n == 0) return -1; \
+ rr = r? r : &tr; \
+ while (begin < end) { \
+ int mid = (begin + end) >> 1; \
+ if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \
+ else end = mid; \
+ } \
+ if (begin == x->n) { *rr = 1; return x->n - 1; } \
+ if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \
+ return begin; \
+ }
+
+#define __KB_GET(name, key_t) \
+ static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+ { \
+ int i, r = 0; \
+ kbnode_t *x = b->root; \
+ while (x) { \
+ i = __kb_getp_aux_##name(x, k, &r); \
+ if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \
+ if (x->is_internal == 0) return 0; \
+ x = __KB_PTR(b, x)[i + 1]; \
+ } \
+ return 0; \
+ } \
+ static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
+ { \
+ return kb_getp_##name(b, &k); \
+ }
+
+#define __KB_INTERVAL(name, key_t) \
+ static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \
+ { \
+ int i, r = 0; \
+ kbnode_t *x = b->root; \
+ *lower = *upper = 0; \
+ while (x) { \
+ i = __kb_getp_aux_##name(x, k, &r); \
+ if (i >= 0 && r == 0) { \
+ *lower = *upper = &__KB_KEY(key_t, x)[i]; \
+ return; \
+ } \
+ if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \
+ if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \
+ if (x->is_internal == 0) return; \
+ x = __KB_PTR(b, x)[i + 1]; \
+ } \
+ } \
+ static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
+ { \
+ kb_intervalp_##name(b, &k, lower, upper); \
+ }
+
+#define __KB_PUT(name, key_t, __cmp) \
+ /* x must be an internal node */ \
+ static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
+ { \
+ kbnode_t *z; \
+ z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \
+ ++b->n_nodes; \
+ z->is_internal = y->is_internal; \
+ z->n = b->t - 1; \
+ memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \
+ if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
+ y->n = b->t - 1; \
+ memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
+ __KB_PTR(b, x)[i + 1] = z; \
+ memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
+ __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \
+ ++x->n; \
+ } \
+ static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
+ { \
+ int i = x->n - 1; \
+ if (x->is_internal == 0) { \
+ i = __kb_getp_aux_##name(x, k, 0); \
+ if (i != x->n - 1) \
+ memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ __KB_KEY(key_t, x)[i + 1] = *k; \
+ ++x->n; \
+ } else { \
+ i = __kb_getp_aux_##name(x, k, 0) + 1; \
+ if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \
+ __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \
+ if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \
+ } \
+ __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \
+ } \
+ } \
+ static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+ { \
+ kbnode_t *r, *s; \
+ ++b->n_keys; \
+ r = b->root; \
+ if (r->n == 2 * b->t - 1) { \
+ ++b->n_nodes; \
+ s = (kbnode_t*)calloc(1, b->ilen); \
+ b->root = s; s->is_internal = 1; s->n = 0; \
+ __KB_PTR(b, s)[0] = r; \
+ __kb_split_##name(b, s, 0, r); \
+ r = s; \
+ } \
+ __kb_putp_aux_##name(b, r, k); \
+ } \
+ static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
+ { \
+ kb_putp_##name(b, &k); \
+ }
+
+
+#define __KB_DEL(name, key_t) \
+ static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
+ { \
+ int yn, zn, i, r = 0; \
+ kbnode_t *xp, *y, *z; \
+ key_t kp; \
+ if (x == 0) return *k; \
+ if (s) { /* s can only be 0, 1 or 2 */ \
+ r = x->is_internal == 0? 0 : s == 1? 1 : -1; \
+ i = s == 1? x->n - 1 : -1; \
+ } else i = __kb_getp_aux_##name(x, k, &r); \
+ if (x->is_internal == 0) { \
+ if (s == 2) ++i; \
+ kp = __KB_KEY(key_t, x)[i]; \
+ memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ --x->n; \
+ return kp; \
+ } \
+ if (r == 0) { \
+ if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \
+ xp = __KB_PTR(b, x)[i]; \
+ kp = __KB_KEY(key_t, x)[i]; \
+ __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
+ return kp; \
+ } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \
+ xp = __KB_PTR(b, x)[i + 1]; \
+ kp = __KB_KEY(key_t, x)[i]; \
+ __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
+ return kp; \
+ } else if (yn == b->t - 1 && zn == b->t - 1) { \
+ y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \
+ __KB_KEY(key_t, y)[y->n++] = *k; \
+ memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
+ if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
+ y->n += z->n; \
+ memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+ --x->n; \
+ free(z); \
+ return __kb_delp_aux_##name(b, y, k, s); \
+ } \
+ } \
+ ++i; \
+ if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \
+ if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \
+ memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
+ if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+ __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \
+ __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
+ if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
+ --y->n; ++xp->n; \
+ } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \
+ __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
+ __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \
+ if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
+ --y->n; \
+ memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
+ if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
+ } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \
+ __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \
+ memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
+ if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+ y->n += xp->n; \
+ memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
+ memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
+ --x->n; \
+ free(xp); \
+ xp = y; \
+ } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \
+ __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \
+ memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \
+ if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
+ xp->n += y->n; \
+ memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+ memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+ --x->n; \
+ free(y); \
+ } \
+ } \
+ return __kb_delp_aux_##name(b, xp, k, s); \
+ } \
+ static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+ { \
+ kbnode_t *x; \
+ key_t ret; \
+ ret = __kb_delp_aux_##name(b, b->root, k, 0); \
+ --b->n_keys; \
+ if (b->root->n == 0 && b->root->is_internal) { \
+ --b->n_nodes; \
+ x = b->root; \
+ b->root = __KB_PTR(b, x)[0]; \
+ free(x); \
+ } \
+ return ret; \
+ } \
+ static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
+ { \
+ return kb_delp_##name(b, &k); \
+ }
+
+typedef struct {
+ kbnode_t *x;
+ int i;
+} __kbstack_t;
+
+#define __kb_traverse(key_t, b, __func) do { \
+ int __kmax = 8; \
+ __kbstack_t *__kstack, *__kp; \
+ __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
+ __kp->x = (b)->root; __kp->i = 0; \
+ for (;;) { \
+ while (__kp->x && __kp->i <= __kp->x->n) { \
+ if (__kp - __kstack == __kmax - 1) { \
+ __kmax <<= 1; \
+ __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \
+ __kp = __kstack + (__kmax>>1) - 1; \
+ } \
+ (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
+ ++__kp; \
+ } \
+ --__kp; \
+ if (__kp >= __kstack) { \
+ if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
+ ++__kp->i; \
+ } else break; \
+ } \
+ free(__kstack); \
+ } while (0)
+
+#define KBTREE_INIT(name, key_t, __cmp) \
+ __KB_TREE_T(name) \
+ __KB_INIT(name, key_t) \
+ __KB_GET_AUX1(name, key_t, __cmp) \
+ __KB_GET(name, key_t) \
+ __KB_INTERVAL(name, key_t) \
+ __KB_PUT(name, key_t, __cmp) \
+ __KB_DEL(name, key_t)
+
+#define KB_DEFAULT_SIZE 512
+
+#define kbtree_t(name) kbtree_##name##_t
+#define kb_init(name, s) kb_init_##name(s)
+#define kb_destroy(name, b) __kb_destroy(b)
+#define kb_get(name, b, k) kb_get_##name(b, k)
+#define kb_put(name, b, k) kb_put_##name(b, k)
+#define kb_del(name, b, k) kb_del_##name(b, k)
+#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
+#define kb_getp(name, b, k) kb_getp_##name(b, k)
+#define kb_putp(name, b, k) kb_putp_##name(b, k)
+#define kb_delp(name, b, k) kb_delp_##name(b, k)
+#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
+
+#define kb_size(b) ((b)->n_keys)
+
+#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
+#define kb_str_cmp(a, b) strcmp(a, b)
+
+#endif
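
A minimal usage sketch of the kb_* wrappers defined above (this assumes kb_get() returns a pointer to the stored key, or NULL when absent, as in the stock kbtree.h; the type name "ui" and the inserted value are illustrative):

#include <stdio.h>
#include "kbtree.h"

KBTREE_INIT(ui, unsigned, kb_generic_cmp)      /* B-tree keyed by unsigned ints */

int main(void)
{
    kbtree_t(ui) *t = kb_init(ui, KB_DEFAULT_SIZE);  /* create an empty tree */
    unsigned x = 42, *p;
    kb_put(ui, t, x);                  /* insert a key */
    p = kb_get(ui, t, x);              /* look it up; assumed to return NULL if absent */
    if (p) printf("found %u, %d key(s) stored\n", *p, (int)kb_size(t));
    kb_del(ui, t, x);                  /* remove it again */
    kb_destroy(ui, t);                 /* free all nodes */
    return 0;
}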
diff --git a/ext/src/bwa/khash.h b/ext/src/bwa/khash.h
new file mode 100644
index 0000000..12e5542
--- /dev/null
+++ b/ext/src/bwa/khash.h
@@ -0,0 +1,614 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2011-12-29 (0.2.7):
+
+ * Minor code clean up; no actual effect.
+
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thanks to Zilong Tan for the suggestion. Reference:
+
+ - http://code.google.com/p/ulib/
+ - http://nothings.org/computer/judy/
+
+ * Optionally allow linear probing, which usually has better
+ performance for random input. Double hashing is still the default, as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow declaring global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which had not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.6"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+/* compiler-specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#ifdef KHASH_LINEAR
+#define __ac_inc(k, m) 1
+#else
+#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m)
+#endif
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
+ extern kh_##name##_t *kh_init_##name(void); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ kfree((void *)h->keys); kfree(h->flags); \
+ kfree((void *)h->vals); \
+ kfree(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t inc, k, i, last, mask; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + inc) & mask; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (!new_flags) return -1; \
+ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (!new_keys) return -1; \
+ h->keys = new_keys; \
+ if (kh_is_map) { \
+ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ if (!new_vals) return -1; \
+ h->vals = new_vals; \
+ } \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khint_t inc, k, i; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ inc = __ac_inc(k, new_mask); \
+ while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ kfree(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ return 0; \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) { \
+ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } /* TODO: implement automatic shrinking; resize() already supports shrinking */ \
+ { \
+ khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+ else { \
+ inc = __ac_inc(k, mask); last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ i = (i + inc) & mask; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+ khint_t h = (khint_t)*s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+ return h;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in a segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Iterate over the entries in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param kvar Variable to which key will be assigned
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (kvar) = kh_key(h,__i); \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/*! @function
+ @abstract Iterate over the values in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash set containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash set containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
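
A short sketch complementing the example embedded in the header comment, showing the kh_put() return codes documented above together with kh_foreach(); the map name "str2int" and the key/value literals are illustrative:

#include <stdio.h>
#include "khash.h"

KHASH_MAP_INIT_STR(str2int, int)       /* const char* keys, int values */

int main(void)
{
    int ret, val;
    const char *key;
    khash_t(str2int) *h = kh_init(str2int);
    khiter_t k = kh_put(str2int, h, "spades", &ret);
    /* ret: 0 = key already present, 1 = empty bucket used, 2 = deleted slot reused */
    if (ret != 0) kh_value(h, k) = 1;
    kh_foreach(h, key, val, printf("%s -> %d\n", key, val));
    kh_destroy(str2int, h);
    return 0;
}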
diff --git a/ext/src/bwa/kopen.c b/ext/src/bwa/kopen.c
new file mode 100644
index 0000000..d238226
--- /dev/null
+++ b/ext/src/bwa/kopen.c
@@ -0,0 +1,374 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#ifdef _WIN32
+#define _KO_NO_NET
+#endif
+
+#ifndef _KO_NO_NET
+static int socket_wait(int fd, int is_read)
+{
+ fd_set fds, *fdr = 0, *fdw = 0;
+ struct timeval tv;
+ int ret;
+ tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
+ FD_ZERO(&fds);
+ FD_SET(fd, &fds);
+ if (is_read) fdr = &fds;
+ else fdw = &fds;
+ ret = select(fd+1, fdr, fdw, 0, &tv);
+ if (ret == -1) perror("select");
+ return ret;
+}
+
+static int socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
+
+ int on = 1, fd;
+ struct linger lng = { 0, 0 };
+ struct addrinfo hints, *res = 0;
+ memset(&hints, 0, sizeof(struct addrinfo));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
+ if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+ if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+ if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+ freeaddrinfo(res);
+ return fd;
+#undef __err_connect
+}
+
+static int write_bytes(int fd, const char *buf, size_t len)
+{
+ ssize_t bytes;
+ do {
+ bytes = write(fd, buf, len);
+ if (bytes >= 0) {
+ len -= bytes;
+ } else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
+ return -1;
+ }
+ } while (len > 0);
+
+ return 0;
+}
+
+static int http_open(const char *fn)
+{
+ char *p, *proxy, *q, *http_host, *host, *port, *path, *buf;
+ int fd, ret, l;
+ ssize_t bytes = 0, bufsz = 0x10000;
+
+ /* parse URL; adapted from khttp_parse_url() in knetfile.c */
+ if (strstr(fn, "http://") != fn) return 0;
+ // set ->http_host
+ for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+ l = p - fn - 7;
+ http_host = calloc(l + 1, 1);
+ strncpy(http_host, fn + 7, l);
+ http_host[l] = 0;
+ for (q = http_host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0;
+ // get http_proxy
+ proxy = getenv("http_proxy");
+ // set host, port and path
+ if (proxy == 0) {
+ host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
+ port = strdup(*q? q : "80");
+ path = strdup(*p? p : "/");
+ } else {
+ host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+ for (q = host; *q && *q != ':'; ++q);
+ if (*q == ':') *q++ = 0;
+ port = strdup(*q? q : "80");
+ path = strdup(fn);
+ }
+
+ /* connect; adapted from khttp_connect() in knetfile.c */
+ l = 0;
+ fd = socket_connect(host, port);
+ buf = calloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
+ l += snprintf(buf + l, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n",
+ path, http_host);
+ if (write_bytes(fd, buf, l) != 0) {
+ close(fd);
+ fd = -1;
+ goto out;
+ }
+ l = 0;
+ retry:
+ while (l < bufsz && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency
+ if (buf[l] == '\n' && l >= 3)
+ if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+ ++l;
+ }
+ if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry;
+
+ buf[l] = 0;
+ if (bytes < 0 || l < 14) { // premature header
+ close(fd);
+ fd = -1;
+ goto out;
+ }
+ ret = strtol(buf + 8, &p, 0); // HTTP return code
+ if (ret != 200) {
+ close(fd);
+ fd = -1;
+ }
+ out:
+ free(buf); free(http_host); free(host); free(port); free(path);
+ return fd;
+}
+
+typedef struct {
+ int max_response, ctrl_fd;
+ char *response;
+} ftpaux_t;
+
+static int kftp_get_response(ftpaux_t *aux)
+{
+ unsigned char c;
+ int n = 0;
+ char *p;
+ if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
+ while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
+ if (n >= aux->max_response) {
+ aux->max_response = aux->max_response? aux->max_response<<1 : 256;
+ aux->response = realloc(aux->response, aux->max_response);
+ }
+ aux->response[n++] = c;
+ if (c == '\n') {
+ if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2])
+ && aux->response[3] != '-') break;
+ n = 0;
+ continue;
+ }
+ }
+ if (n < 2) return -1;
+ aux->response[n-2] = 0;
+ return strtol(aux->response, &p, 0);
+}
+
+static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
+{
+ if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+ if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1;
+ return is_get? kftp_get_response(aux) : 0;
+}
+
+static int ftp_open(const char *fn)
+{
+ char *p, *host = 0, *port = 0, *retr = 0;
+ char host2[80], port2[10];
+ int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4];
+ ftpaux_t aux;
+
+ /* parse URL */
+ if (strstr(fn, "ftp://") != fn) return 0;
+ for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+ if (*p != '/') return 0;
+ l = p - fn - 6;
+ port = strdup("21");
+ host = calloc(l + 1, 1);
+ strncpy(host, fn + 6, l);
+ retr = calloc(strlen(p) + 8, 1);
+ sprintf(retr, "RETR %s\r\n", p);
+
+ /* connect to ctrl */
+ memset(&aux, 0, sizeof(ftpaux_t));
+ aux.ctrl_fd = socket_connect(host, port);
+ if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
+
+ /* connect to the data stream */
+ kftp_get_response(&aux);
+ kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
+ kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
+ kftp_send_cmd(&aux, "TYPE I\r\n", 1);
+ kftp_send_cmd(&aux, "PASV\r\n", 1);
+ for (p = aux.response; *p && *p != '('; ++p);
+ if (*p != '(') goto ftp_open_end;
+ ++p;
+ sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
+ memcpy(pasv_ip, v, 4 * sizeof(int));
+ pasv_port = (v[4]<<8&0xff00) + v[5];
+ kftp_send_cmd(&aux, retr, 0);
+ sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
+ sprintf(port2, "%d", pasv_port);
+ fd = socket_connect(host2, port2);
+ if (fd == -1) goto ftp_open_end;
+ ret = kftp_get_response(&aux);
+ if (ret != 150) {
+ close(fd);
+ fd = -1;
+ }
+ close(aux.ctrl_fd);
+
+ftp_open_end:
+ free(host); free(port); free(retr); free(aux.response);
+ return fd;
+}
+#endif /* !defined(_KO_NO_NET) */
+
+static char **cmd2argv(const char *cmd)
+{
+ int i, beg, end, argc;
+ char **argv, *str;
+ end = strlen(cmd);
+ for (i = end - 1; i >= 0; --i)
+ if (!isspace(cmd[i])) break;
+ end = i + 1;
+ for (beg = 0; beg < end; ++beg)
+ if (!isspace(cmd[beg])) break;
+ if (beg == end) return 0;
+ for (i = beg + 1, argc = 0; i < end; ++i)
+ if (isspace(cmd[i]) && !isspace(cmd[i-1]))
+ ++argc;
+ argv = (char**)calloc(argc + 2, sizeof(void*));
+ argv[0] = str = (char*)calloc(end - beg + 1, 1);
+ strncpy(argv[0], cmd + beg, end - beg);
+ for (i = argc = 1; i < end - beg; ++i)
+ if (isspace(str[i])) str[i] = 0;
+ else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
+ return argv;
+}
+
+#define KO_STDIN 1
+#define KO_FILE 2
+#define KO_PIPE 3
+#define KO_HTTP 4
+#define KO_FTP 5
+
+typedef struct {
+ int type, fd;
+ pid_t pid;
+} koaux_t;
+
+void *kopen(const char *fn, int *_fd)
+{
+ koaux_t *aux = 0;
+ *_fd = -1;
+ if (strstr(fn, "http://") == fn) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_HTTP;
+ aux->fd = http_open(fn);
+ } else if (strstr(fn, "ftp://") == fn) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_FTP;
+ aux->fd = ftp_open(fn);
+ } else if (strcmp(fn, "-") == 0) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_STDIN;
+ aux->fd = STDIN_FILENO;
+ } else {
+ const char *p, *q;
+ for (p = fn; *p; ++p)
+ if (!isspace(*p)) break;
+ if (*p == '<') { // pipe open
+ int need_shell, pfd[2];
+ pid_t pid;
+ // a simple check to see if we need to invoke a shell; not always working
+ for (q = p + 1; *q; ++q)
+ if (ispunct(*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
+ break;
+ need_shell = (*q != 0);
+ if (pipe(pfd) != 0) return 0;
+ pid = vfork();
+ if (pid == -1) { /* vfork() error */
+ close(pfd[0]); close(pfd[1]);
+ return 0;
+ }
+ if (pid == 0) { /* the child process */
+ char **argv; /* FIXME: I do not know if this will lead to a memory leak */
+ close(pfd[0]);
+ dup2(pfd[1], STDOUT_FILENO);
+ close(pfd[1]);
+ if (!need_shell) {
+ argv = cmd2argv(p + 1);
+ execvp(argv[0], argv);
+ free(argv[0]); free(argv);
+ } else execl("/bin/sh", "sh", "-c", p + 1, NULL);
+ exit(1);
+ } else { /* parent process */
+ close(pfd[1]);
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_PIPE;
+ aux->fd = pfd[0];
+ aux->pid = pid;
+ }
+ } else {
+#ifdef _WIN32
+ *_fd = open(fn, O_RDONLY | O_BINARY);
+#else
+ *_fd = open(fn, O_RDONLY);
+#endif
+ if (*_fd >= 0) {
+ aux = calloc(1, sizeof(koaux_t));
+ aux->type = KO_FILE;
+ aux->fd = *_fd;
+ }
+ }
+ }
+ if (aux) *_fd = aux->fd;
+ return aux;
+}
+
+int kclose(void *a)
+{
+ koaux_t *aux = (koaux_t*)a;
+ if (aux->type == KO_PIPE) {
+ int status;
+ pid_t pid;
+ pid = waitpid(aux->pid, &status, WNOHANG);
+ if (pid != aux->pid) kill(aux->pid, 15);
+ }
+ free(aux);
+ return 0;
+}
+
+#ifdef _KO_MAIN
+#define BUF_SIZE 0x10000
+int main(int argc, char *argv[])
+{
+ void *x;
+ int l, fd;
+ unsigned char buf[BUF_SIZE];
+ FILE *fp;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: kopen <file>\n");
+ return 1;
+ }
+ x = kopen(argv[1], &fd);
+ fp = fdopen(fd, "r");
+ if (fp == 0) {
+ fprintf(stderr, "ERROR: fail to open the input\n");
+ return 1;
+ }
+ do {
+ if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0)
+ fwrite(buf, 1, l, stdout);
+ } while (l == BUF_SIZE);
+ fclose(fp);
+ kclose(x);
+ return 0;
+}
+#endif
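
kopen() dispatches on its argument: a leading '<' spawns a command and reads its stdout through a pipe, "http://" and "ftp://" open network streams, "-" maps to stdin, and anything else is opened as a regular file. A hedged usage sketch (the command string is illustrative and error handling is minimal):

#include <stdio.h>
#include <unistd.h>

void *kopen(const char *fn, int *_fd);   /* defined in kopen.c above */
int kclose(void *a);

int main(void)
{
    int fd;
    char buf[4096];
    ssize_t n;
    void *ko = kopen("<gzip -dc reads.fq.gz", &fd);  /* pipe from a command; string is illustrative */
    if (ko == 0 || fd < 0) return 1;
    while ((n = read(fd, buf, sizeof buf)) > 0)      /* copy the piped stream to stdout */
        fwrite(buf, 1, (size_t)n, stdout);
    kclose(ko);
    return 0;
}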
diff --git a/ext/src/bwa/kseq.h b/ext/src/bwa/kseq.h
new file mode 100644
index 0000000..f3862c6
--- /dev/null
+++ b/ext/src/bwa/kseq.h
@@ -0,0 +1,239 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB 1 // isspace() && !' '
+#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX 2
+
+#define __KS_TYPE(type_t) \
+ typedef struct __kstream_t { \
+ unsigned char *buf; \
+ int begin, end, is_eof; \
+ type_t f; \
+ } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize) \
+ static inline kstream_t *ks_init(type_t f) \
+ { \
+ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+ ks->f = f; \
+ ks->buf = (unsigned char*)malloc(__bufsize); \
+ return ks; \
+ } \
+ static inline void ks_destroy(kstream_t *ks) \
+ { \
+ if (ks) { \
+ free(ks->buf); \
+ free(ks); \
+ } \
+ }
+
+#define __KS_GETC(__read, __bufsize) \
+ static inline int ks_getc(kstream_t *ks) \
+ { \
+ if (ks->is_eof && ks->begin >= ks->end) return -1; \
+ if (ks->begin >= ks->end) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end == 0) { ks->is_eof = 1; return -1;} \
+ } \
+ return (int)ks->buf[ks->begin++]; \
+ }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize) \
+ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+ { \
+ int gotany = 0; \
+ if (dret) *dret = 0; \
+ str->l = append? str->l : 0; \
+ for (;;) { \
+ int i; \
+ if (ks->begin >= ks->end) { \
+ if (!ks->is_eof) { \
+ ks->begin = 0; \
+ ks->end = __read(ks->f, ks->buf, __bufsize); \
+ if (ks->end == 0) { ks->is_eof = 1; break; } \
+ } else break; \
+ } \
+ if (delimiter == KS_SEP_LINE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == '\n') break; \
+ } else if (delimiter > KS_SEP_MAX) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (ks->buf[i] == delimiter) break; \
+ } else if (delimiter == KS_SEP_SPACE) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i])) break; \
+ } else if (delimiter == KS_SEP_TAB) { \
+ for (i = ks->begin; i < ks->end; ++i) \
+ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+ } else i = 0; /* never come to here! */ \
+ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
+ str->m = str->l + (i - ks->begin) + 1; \
+ kroundup32(str->m); \
+ str->s = (char*)realloc(str->s, str->m); \
+ } \
+ gotany = 1; \
+ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+ str->l = str->l + (i - ks->begin); \
+ ks->begin = i + 1; \
+ if (i < ks->end) { \
+ if (dret) *dret = ks->buf[i]; \
+ break; \
+ } \
+ } \
+ if (!gotany && ks_eof(ks)) return -1; \
+ if (str->s == 0) { \
+ str->m = 1; \
+ str->s = (char*)calloc(1, 1); \
+ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
+ str->s[str->l] = '\0'; \
+ return str->l; \
+ } \
+ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+ { return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+ __KS_TYPE(type_t) \
+ __KS_BASIC(type_t, __bufsize) \
+ __KS_GETC(__read, __bufsize) \
+ __KS_GETUNTIL(__read, __bufsize)
+
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) \
+ SCOPE kseq_t *kseq_init(type_t fd) \
+ { \
+ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
+ s->f = ks_init(fd); \
+ return s; \
+ } \
+ SCOPE void kseq_destroy(kseq_t *ks) \
+ { \
+ if (!ks) return; \
+ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+ ks_destroy(ks->f); \
+ free(ks); \
+ }
+
+/* Return value:
+ >=0 length of the sequence (normal)
+ -1 end-of-file
+ -2 truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+ SCOPE int kseq_read(kseq_t *seq) \
+ { \
+ int c; \
+ kstream_t *ks = seq->f; \
+ if (seq->last_char == 0) { /* then jump to the next header line */ \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+ if (c == -1) return -1; /* end of file */ \
+ seq->last_char = c; \
+ } /* else: the first header char has been read in the previous call */ \
+ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+ seq->seq.m = 256; \
+ seq->seq.s = (char*)malloc(seq->seq.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+ if (c == '\n') continue; /* skip empty lines */ \
+ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+ } \
+ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+ seq->seq.m = seq->seq.l + 2; \
+ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+ } \
+ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
+ if (c != '+') return seq->seq.l; /* FASTA */ \
+ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+ seq->qual.m = seq->seq.m; \
+ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+ } \
+ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+ if (c == -1) return -2; /* error: no quality string */ \
+ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+ seq->last_char = 0; /* we have not come to the next header line */ \
+ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+ return seq->seq.l; \
+ }
+
+#define __KSEQ_TYPE(type_t) \
+ typedef struct { \
+ kstring_t name, comment, seq, qual; \
+ int last_char; \
+ kstream_t *f; \
+ } kseq_t;
+
+#define KSEQ_INIT2(SCOPE, type_t, __read) \
+ KSTREAM_INIT(type_t, __read, 16384) \
+ __KSEQ_TYPE(type_t) \
+ __KSEQ_BASIC(SCOPE, type_t) \
+ __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+ __KS_TYPE(type_t) \
+ __KSEQ_TYPE(type_t) \
+ extern kseq_t *kseq_init(type_t fd); \
+ void kseq_destroy(kseq_t *ks); \
+ int kseq_read(kseq_t *seq);
+
+#endif
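
KSEQ_INIT() above expands into a complete FASTA/FASTQ parser for a given stream type and read function. A typical instantiation, sketched here under the assumption that zlib's gzFile/gzread/gzopen are available (error handling is minimal; the usage string is illustrative):

#include <stdio.h>
#include <zlib.h>
#include "kseq.h"

KSEQ_INIT(gzFile, gzread)              /* instantiate the parser for gzipped (or plain) streams */

int main(int argc, char *argv[])
{
    if (argc < 2) { fprintf(stderr, "Usage: kseq_demo <in.fq[.gz]>\n"); return 1; }
    gzFile fp = gzopen(argv[1], "r");
    kseq_t *seq = kseq_init(fp);
    int l;
    while ((l = kseq_read(seq)) >= 0)  /* >=0: sequence length; -1: EOF; -2: truncated quality */
        printf("%s\t%d\n", seq->name.s, l);
    kseq_destroy(seq);
    gzclose(fp);
    return 0;
}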
diff --git a/ext/src/bwa/ksort.h b/ext/src/bwa/ksort.h
new file mode 100644
index 0000000..5851b0d
--- /dev/null
+++ b/ext/src/bwa/ksort.h
@@ -0,0 +1,273 @@
+/* The MIT License
+
+ Copyright (c) 2008, by Attractive Chaos <attractivechaos at aol.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ 2008-11-16 (0.1.4):
+
+ * Fixed a bug in introsort() that happens in rare cases.
+
+ 2008-11-05 (0.1.3):
+
+ * Fixed a bug in introsort() for complex comparisons.
+
+ * Fixed a bug in mergesort(). The previous version is not stable.
+
+ 2008-09-15 (0.1.2):
+
+ * Accelerated introsort. On my Mac (not on another Linux machine),
+ my implementation is as fast as std::sort on random input.
+
+ * Added combsort; in introsort, switch to combsort if the
+ recursion is too deep.
+
+ 2008-09-13 (0.1.1):
+
+ * Added k-small algorithm
+
+ 2008-09-05 (0.1.0):
+
+ * Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+typedef struct {
+ void *left, *right;
+ int depth;
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; }
+
+#define KSORT_INIT(name, type_t, __sort_lt) \
+ void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
+ { \
+ type_t *a2[2], *a, *b; \
+ int curr, shift; \
+ \
+ a2[0] = array; \
+ a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
+ for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { \
+ a = a2[curr]; b = a2[1-curr]; \
+ if (shift == 0) { \
+ type_t *p = b, *i, *eb = a + n; \
+ for (i = a; i < eb; i += 2) { \
+ if (i == eb - 1) *p++ = *i; \
+ else { \
+ if (__sort_lt(*(i+1), *i)) { \
+ *p++ = *(i+1); *p++ = *i; \
+ } else { \
+ *p++ = *i; *p++ = *(i+1); \
+ } \
+ } \
+ } \
+ } else { \
+ size_t i, step = 1ul<<shift; \
+ for (i = 0; i < n; i += step<<1) { \
+ type_t *p, *j, *k, *ea, *eb; \
+ if (n < i + step) { \
+ ea = a + n; eb = a; \
+ } else { \
+ ea = a + i + step; \
+ eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+ } \
+ j = a + i; k = a + i + step; p = b + i; \
+ while (j < ea && k < eb) { \
+ if (__sort_lt(*k, *j)) *p++ = *k++; \
+ else *p++ = *j++; \
+ } \
+ while (j < ea) *p++ = *j++; \
+ while (k < eb) *p++ = *k++; \
+ } \
+ } \
+ curr = 1 - curr; \
+ } \
+ if (curr == 1) { \
+ type_t *p = a2[0], *i = a2[1], *eb = array + n; \
+ for (; p < eb; ++i) *p++ = *i; \
+ } \
+ if (temp == 0) free(a2[1]); \
+ } \
+ void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
+ { \
+ size_t k = i; \
+ type_t tmp = l[i]; \
+ while ((k = (k << 1) + 1) < n) { \
+ if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
+ if (__sort_lt(l[k], tmp)) break; \
+ l[i] = l[k]; i = k; \
+ } \
+ l[i] = tmp; \
+ } \
+ void ks_heapmake_##name(size_t lsize, type_t l[]) \
+ { \
+ size_t i; \
+ for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
+ ks_heapadjust_##name(i, lsize, l); \
+ } \
+ void ks_heapsort_##name(size_t lsize, type_t l[]) \
+ { \
+ size_t i; \
+ for (i = lsize - 1; i > 0; --i) { \
+ type_t tmp; \
+ tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
+ } \
+ } \
+ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
+ { \
+ type_t *i, *j, swap_tmp; \
+ for (i = s + 1; i < t; ++i) \
+ for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
+ swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
+ } \
+ } \
+ void ks_combsort_##name(size_t n, type_t a[]) \
+ { \
+ const double shrink_factor = 1.2473309501039786540366528676643; \
+ int do_swap; \
+ size_t gap = n; \
+ type_t tmp, *i, *j; \
+ do { \
+ if (gap > 2) { \
+ gap = (size_t)(gap / shrink_factor); \
+ if (gap == 9 || gap == 10) gap = 11; \
+ } \
+ do_swap = 0; \
+ for (i = a; i < a + n - gap; ++i) { \
+ j = i + gap; \
+ if (__sort_lt(*j, *i)) { \
+ tmp = *i; *i = *j; *j = tmp; \
+ do_swap = 1; \
+ } \
+ } \
+ } while (do_swap || gap > 2); \
+ if (gap != 1) __ks_insertsort_##name(a, a + n); \
+ } \
+ void ks_introsort_##name(size_t n, type_t a[]) \
+ { \
+ int d; \
+ ks_isort_stack_t *top, *stack; \
+ type_t rp, swap_tmp; \
+ type_t *s, *t, *i, *j, *k; \
+ \
+ if (n < 1) return; \
+ else if (n == 2) { \
+ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+ return; \
+ } \
+ for (d = 2; 1ul<<d < n; ++d); \
+ stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+ top = stack; s = a; t = a + (n-1); d <<= 1; \
+ while (1) { \
+ if (s < t) { \
+ if (--d == 0) { \
+ ks_combsort_##name(t - s + 1, s); \
+ t = s; \
+ continue; \
+ } \
+ i = s; j = t; k = i + ((j-i)>>1) + 1; \
+ if (__sort_lt(*k, *i)) { \
+ if (__sort_lt(*k, *j)) k = j; \
+ } else k = __sort_lt(*j, *i)? i : j; \
+ rp = *k; \
+ if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
+ for (;;) { \
+ do ++i; while (__sort_lt(*i, rp)); \
+ do --j; while (i <= j && __sort_lt(rp, *j)); \
+ if (j <= i) break; \
+ swap_tmp = *i; *i = *j; *j = swap_tmp; \
+ } \
+ swap_tmp = *i; *i = *t; *t = swap_tmp; \
+ if (i-s > t-i) { \
+ if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+ s = t-i > 16? i+1 : t; \
+ } else { \
+ if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+ t = i-s > 16? i-1 : s; \
+ } \
+ } else { \
+ if (top == stack) { \
+ free(stack); \
+ __ks_insertsort_##name(a, a+n); \
+ return; \
+ } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+ } \
+ } \
+ } \
+ /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+ /* 0 <= kk < n */ \
+ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
+ { \
+ type_t *low, *high, *k, *ll, *hh, *mid; \
+ low = arr; high = arr + n - 1; k = arr + kk; \
+ for (;;) { \
+ if (high <= low) return *k; \
+ if (high == low + 1) { \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ return *k; \
+ } \
+ mid = low + (high - low) / 2; \
+ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
+ KSORT_SWAP(type_t, *mid, *(low+1)); \
+ ll = low + 1; hh = high; \
+ for (;;) { \
+ do ++ll; while (__sort_lt(*ll, *low)); \
+ do --hh; while (__sort_lt(*low, *hh)); \
+ if (hh < ll) break; \
+ KSORT_SWAP(type_t, *ll, *hh); \
+ } \
+ KSORT_SWAP(type_t, *low, *hh); \
+ if (hh <= k) low = ll; \
+ if (hh >= k) high = hh - 1; \
+ } \
+ }
+
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
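
KSORT_INIT_GENERIC() instantiates the sorters above for a plain comparable type; ks_introsort() then sorts in place and ks_ksmall() selects the k-th smallest element while partially reordering the array. A minimal sketch (type and data are illustrative):

#include <stdio.h>
#include "ksort.h"

KSORT_INIT_GENERIC(int)                /* generates ks_introsort_int(), ks_ksmall_int(), ... */

int main(void)
{
    int a[] = { 5, 2, 9, 1, 7, 3 };
    size_t n = sizeof(a) / sizeof(a[0]);
    int kth = ks_ksmall(int, n, a, n / 2);   /* k-th smallest; partially reorders a[] */
    ks_introsort(int, n, a);                 /* full in-place sort */
    printf("k-th smallest: %d, minimum: %d\n", kth, a[0]);
    return 0;
}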
diff --git a/ext/src/bwa/kstring.c b/ext/src/bwa/kstring.c
new file mode 100644
index 0000000..bc1d688
--- /dev/null
+++ b/ext/src/bwa/kstring.c
@@ -0,0 +1,20 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include "kstring.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+int main()
+{
+ kstring_t *s;
+ s = (kstring_t*)calloc(1, sizeof(kstring_t));
+ ksprintf(s, "abcdefg: %d", 100);
+ printf("%s\n", s->s);
+ free(s);
+ return 0;
+}
+#endif
diff --git a/ext/src/bwa/kstring.h b/ext/src/bwa/kstring.h
new file mode 100644
index 0000000..3ab4de0
--- /dev/null
+++ b/ext/src/bwa/kstring.h
@@ -0,0 +1,134 @@
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+static inline void ks_resize(kstring_t *s, size_t size)
+{
+ if (s->m < size) {
+ s->m = size;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+}
+
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ memcpy(s->s + s->l, p, l);
+ s->l += l;
+ s->s[s->l] = 0;
+ return l;
+}
+
+static inline int kputs(const char *p, kstring_t *s)
+{
+ return kputsn(p, strlen(p), s);
+}
+
+static inline int kputc(int c, kstring_t *s)
+{
+ if (s->l + 1 >= s->m) {
+ s->m = s->l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ s->s[s->l++] = c;
+ s->s[s->l] = 0;
+ return c;
+}
+
+static inline int kputw(int c, kstring_t *s)
+{
+ char buf[16];
+ int l, x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (c < 0) buf[l++] = '-';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+ s->s[s->l] = 0;
+ return 0;
+}
+
+static inline int kputuw(unsigned c, kstring_t *s)
+{
+ char buf[16];
+ int l, i;
+ unsigned x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+ s->s[s->l] = 0;
+ return 0;
+}
+
+static inline int kputl(long c, kstring_t *s)
+{
+ char buf[32];
+ long l, x;
+ if (c == 0) return kputc('0', s);
+ for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (c < 0) buf[l++] = '-';
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+ s->s[s->l] = 0;
+ return 0;
+}
+
+static inline int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+ va_list ap;
+ int l;
+ va_start(ap, fmt);
+ l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+ va_end(ap);
+ if (l + 1 > s->m - s->l) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ va_start(ap, fmt);
+ l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+ }
+ va_end(ap);
+ s->l += l;
+ return l;
+}
+
+#endif
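
The kput*()/ksprintf() helpers above grow a kstring_t on demand via realloc(), so a zero-initialized struct is a valid empty string. A minimal sketch:

#include <stdio.h>
#include <stdlib.h>
#include "kstring.h"

int main(void)
{
    kstring_t s = { 0, 0, 0 };         /* l = m = 0, s = NULL: grows on demand */
    kputs("contig_", &s);              /* append a C string */
    kputw(42, &s);                     /* append a signed integer as text */
    kputc('\n', &s);                   /* append a single character */
    ksprintf(&s, "length=%d\n", 1024); /* printf-style append */
    fputs(s.s, stdout);
    free(s.s);                         /* buffer is allocated by the kput*() helpers */
    return 0;
}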
diff --git a/ext/src/bwa/ksw.c b/ext/src/bwa/ksw.c
new file mode 100644
index 0000000..9793e5e
--- /dev/null
+++ b/ext/src/bwa/ksw.c
@@ -0,0 +1,713 @@
+/* The MIT License
+
+ Copyright (c) 2011 by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <emmintrin.h>
+#include "ksw.h"
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
+
+struct _kswq_t {
+ int qlen, slen;
+ uint8_t shift, mdiff, max, size;
+ __m128i *qp, *H0, *H1, *E, *Hmax;
+};
+
+/**
+ * Initialize the query data structure
+ *
+ * @param size Number of bytes used to store a score; valid values are 1 or 2
+ * @param qlen Length of the query sequence
+ * @param query Query sequence
+ * @param m Size of the alphabet
+ * @param mat Scoring matrix in a one-dimensional array
+ *
+ * @return Query data structure
+ */
+kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
+{
+ kswq_t *q;
+ int slen, a, tmp, p;
+
+ size = size > 1? 2 : 1;
+ p = 8 * (3 - size); // # values per __m128i
+ slen = (qlen + p - 1) / p; // segmented length
+ q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+ q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
+ q->H0 = q->qp + slen * m;
+ q->H1 = q->H0 + slen;
+ q->E = q->H1 + slen;
+ q->Hmax = q->E + slen;
+ q->slen = slen; q->qlen = qlen; q->size = size;
+ // compute shift
+ tmp = m * m;
+ for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
+ if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
+ if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
+ }
+ q->max = q->mdiff;
+ q->shift = 256 - q->shift; // NB: q->shift is uint8_t
+ q->mdiff += q->shift; // this is the difference between the min and max scores
+ // An example: p=8, qlen=19, slen=3 and segmentation:
+ // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
+ if (size == 1) {
+ int8_t *t = (int8_t*)q->qp;
+ for (a = 0; a < m; ++a) {
+ int i, k, nlen = slen * p;
+ const int8_t *ma = mat + a * m;
+ for (i = 0; i < slen; ++i)
+ for (k = i; k < nlen; k += slen) // p iterations
+ *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
+ }
+ } else {
+ int16_t *t = (int16_t*)q->qp;
+ for (a = 0; a < m; ++a) {
+ int i, k, nlen = slen * p;
+ const int8_t *ma = mat + a * m;
+ for (i = 0; i < slen; ++i)
+ for (k = i; k < nlen; k += slen) // p iterations
+ *t++ = (k >= qlen? 0 : ma[query[k]]);
+ }
+ }
+ return q;
+}
+
+kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
+{
+ int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
+ uint64_t *b;
+ __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
+ kswr_t r;
+
+#define __max_16(ret, xx) do { \
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
+ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
+ (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
+ } while (0)
+
+ // initialization
+ r = g_defr;
+ minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+ endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
+ m_b = n_b = 0; b = 0;
+ zero = _mm_set1_epi32(0);
+ oe_del = _mm_set1_epi8(_o_del + _e_del);
+ e_del = _mm_set1_epi8(_e_del);
+ oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
+ e_ins = _mm_set1_epi8(_e_ins);
+ shift = _mm_set1_epi8(q->shift);
+ H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+ slen = q->slen;
+ for (i = 0; i < slen; ++i) {
+ _mm_store_si128(E + i, zero);
+ _mm_store_si128(H0 + i, zero);
+ _mm_store_si128(Hmax + i, zero);
+ }
+ // the core loop
+ for (i = 0; i < tlen; ++i) {
+ int j, k, cmp, imax;
+ __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+ h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+ h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+ for (j = 0; LIKELY(j < slen); ++j) {
+ /* SW cells are computed in the following order:
+ * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
+ * E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
+ * F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
+ */
+ // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
+ h = _mm_adds_epu8(h, _mm_load_si128(S + j));
+ h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
+ e = _mm_load_si128(E + j); // e=E'(i,j)
+ h = _mm_max_epu8(h, e);
+ h = _mm_max_epu8(h, f); // h=H'(i,j)
+ max = _mm_max_epu8(max, h); // set max
+ _mm_store_si128(H1 + j, h); // save to H'(i,j)
+ // now compute E'(i+1,j)
+ e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
+ t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
+ e = _mm_max_epu8(e, t); // e=E'(i+1,j)
+ _mm_store_si128(E + j, e); // save to E'(i+1,j)
+ // now compute F'(i,j+1)
+ f = _mm_subs_epu8(f, e_ins);
+ t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
+ f = _mm_max_epu8(f, t);
+ // get H'(i-1,j) and prepare for the next j
+ h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
+ }
+ // NB: we do not need to set E(i,j) as we disallow adjacent insertion and then deletion
+ for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
+ f = _mm_slli_si128(f, 1);
+ for (j = 0; LIKELY(j < slen); ++j) {
+ h = _mm_load_si128(H1 + j);
+ h = _mm_max_epu8(h, f); // h=H'(i,j)
+ _mm_store_si128(H1 + j, h);
+ h = _mm_subs_epu8(h, oe_ins);
+ f = _mm_subs_epu8(f, e_ins);
+ cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
+ if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
+ }
+ }
+end_loop16:
+ //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
+ __max_16(imax, max); // imax is the maximum number in max
+ if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
+ if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
+ if (n_b == m_b) {
+ m_b = m_b? m_b<<1 : 8;
+ b = (uint64_t*)realloc(b, 8 * m_b);
+ }
+ b[n_b++] = (uint64_t)imax<<32 | i;
+ } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
+ }
+ if (imax > gmax) {
+ gmax = imax; te = i; // te is the end position on the target
+ for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
+ _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+ if (gmax + q->shift >= 255 || gmax >= endsc) break;
+ }
+ S = H1; H1 = H0; H0 = S; // swap H0 and H1
+ }
+ r.score = gmax + q->shift < 255? gmax : 255;
+ r.te = te;
+ if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
+ int max = -1, tmp, low, high, qlen = slen * 16;
+ uint8_t *t = (uint8_t*)Hmax;
+ for (i = 0; i < qlen; ++i, ++t)
+ if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
+ else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
+ //printf("%d,%d\n", max, gmax);
+ if (b) {
+ i = (r.score + q->max - 1) / q->max;
+ low = te - i; high = te + i;
+ for (i = 0; i < n_b; ++i) {
+ int e = (int32_t)b[i];
+ if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+ r.score2 = b[i]>>32, r.te2 = e;
+ }
+ }
+ }
+ free(b);
+ return r;
+}
+
+kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
+{
+ int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
+ uint64_t *b;
+ __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
+ kswr_t r;
+
+#define __max_8(ret, xx) do { \
+ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+ (ret) = _mm_extract_epi16((xx), 0); \
+ } while (0)
+
+ // initialization
+ r = g_defr;
+ minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+ endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
+ m_b = n_b = 0; b = 0;
+ zero = _mm_set1_epi32(0);
+ oe_del = _mm_set1_epi16(_o_del + _e_del);
+ e_del = _mm_set1_epi16(_e_del);
+ oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
+ e_ins = _mm_set1_epi16(_e_ins);
+ H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+ slen = q->slen;
+ for (i = 0; i < slen; ++i) {
+ _mm_store_si128(E + i, zero);
+ _mm_store_si128(H0 + i, zero);
+ _mm_store_si128(Hmax + i, zero);
+ }
+ // the core loop
+ for (i = 0; i < tlen; ++i) {
+ int j, k, imax;
+ __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+ h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+ h = _mm_slli_si128(h, 2);
+ for (j = 0; LIKELY(j < slen); ++j) {
+ h = _mm_adds_epi16(h, *S++);
+ e = _mm_load_si128(E + j);
+ h = _mm_max_epi16(h, e);
+ h = _mm_max_epi16(h, f);
+ max = _mm_max_epi16(max, h);
+ _mm_store_si128(H1 + j, h);
+ e = _mm_subs_epu16(e, e_del);
+ t = _mm_subs_epu16(h, oe_del);
+ e = _mm_max_epi16(e, t);
+ _mm_store_si128(E + j, e);
+ f = _mm_subs_epu16(f, e_ins);
+ t = _mm_subs_epu16(h, oe_ins);
+ f = _mm_max_epi16(f, t);
+ h = _mm_load_si128(H0 + j);
+ }
+ for (k = 0; LIKELY(k < 16); ++k) {
+ f = _mm_slli_si128(f, 2);
+ for (j = 0; LIKELY(j < slen); ++j) {
+ h = _mm_load_si128(H1 + j);
+ h = _mm_max_epi16(h, f);
+ _mm_store_si128(H1 + j, h);
+ h = _mm_subs_epu16(h, oe_ins);
+ f = _mm_subs_epu16(f, e_ins);
+ if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
+ }
+ }
+end_loop8:
+ __max_8(imax, max);
+ if (imax >= minsc) {
+ if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
+ if (n_b == m_b) {
+ m_b = m_b? m_b<<1 : 8;
+ b = (uint64_t*)realloc(b, 8 * m_b);
+ }
+ b[n_b++] = (uint64_t)imax<<32 | i;
+ } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
+ }
+ if (imax > gmax) {
+ gmax = imax; te = i;
+ for (j = 0; LIKELY(j < slen); ++j)
+ _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+ if (gmax >= endsc) break;
+ }
+ S = H1; H1 = H0; H0 = S;
+ }
+ r.score = gmax; r.te = te;
+ {
+ int max = -1, tmp, low, high, qlen = slen * 8;
+ uint16_t *t = (uint16_t*)Hmax;
+ for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
+ if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
+ else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
+ if (b) {
+ i = (r.score + q->max - 1) / q->max;
+ low = te - i; high = te + i;
+ for (i = 0; i < n_b; ++i) {
+ int e = (int32_t)b[i];
+ if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+ r.score2 = b[i]>>32, r.te2 = e;
+ }
+ }
+ }
+ free(b);
+ return r;
+}
+
+static inline void revseq(int l, uint8_t *s)
+{
+ int i, t;
+ for (i = 0; i < l>>1; ++i)
+ t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
+}
+
+kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry)
+{
+ int size;
+ kswq_t *q;
+ kswr_t r, rr;
+ kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int);
+
+ q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
+ if (qry && *qry == 0) *qry = q;
+ func = q->size == 2? ksw_i16 : ksw_u8;
+ size = q->size;
+ r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra);
+ if (qry == 0) free(q);
+ if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
+ revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te point to the exact end, not the position after the end
+ q = ksw_qinit(size, r.qe + 1, query, m, mat);
+ rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score);
+ revseq(r.qe + 1, query); revseq(r.te + 1, target);
+ free(q);
+ if (r.score == rr.score)
+ r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
+ return r;
+}
+
+kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+{
+ return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry);
+}
+
+/********************
+ *** SW extension ***
+ ********************/
+
+typedef struct {
+ int32_t h, e;
+} eh_t;
+
+int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off)
+{
+ eh_t *eh; // score array
+ int8_t *qp; // query profile
+ int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off;
+ assert(h0 > 0);
+ // allocate memory
+ qp = malloc(qlen * m);
+ eh = calloc(qlen + 1, 8);
+ // generate the query profile
+ for (k = i = 0; k < m; ++k) {
+ const int8_t *p = &mat[k * m];
+ for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+ }
+ // fill the first row
+ eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0;
+ for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j)
+ eh[j].h = eh[j-1].h - e_ins;
+ // adjust $w if it is too large
+ k = m * m;
+ for (i = 0, max = 0; i < k; ++i) // get the max score
+ max = max > mat[i]? max : mat[i];
+ max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.);
+ max_ins = max_ins > 1? max_ins : 1;
+ w = w < max_ins? w : max_ins;
+ max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.);
+ max_del = max_del > 1? max_del : 1;
+ w = w < max_del? w : max_del; // TODO: is this necessary?
+ // DP loop
+ max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
+ max_off = 0;
+ beg = 0, end = qlen;
+ for (i = 0; LIKELY(i < tlen); ++i) {
+ int t, f = 0, h1, m = 0, mj = -1;
+ int8_t *q = &qp[target[i] * qlen];
+ // apply the band and the constraint (if provided)
+ if (beg < i - w) beg = i - w;
+ if (end > i + w + 1) end = i + w + 1;
+ if (end > qlen) end = qlen;
+ // compute the first column
+ if (beg == 0) {
+ h1 = h0 - (o_del + e_del * (i + 1));
+ if (h1 < 0) h1 = 0;
+ } else h1 = 0;
+ for (j = beg; LIKELY(j < end); ++j) {
+ // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
+ // Similar to SSE2-SW, cells are computed in the following order:
+ // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
+ // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
+ // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
+ eh_t *p = &eh[j];
+ int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
+ p->h = h1; // set H(i,j-1) for the next row
+ M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M"
+ h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0
+ h = h > f? h : f;
+ h1 = h; // save H(i,j) to h1 for the next column
+ mj = m > h? mj : j; // record the position where max score is achieved
+ m = m > h? m : h; // m is stored at eh[mj+1]
+ t = M - oe_del;
+ t = t > 0? t : 0;
+ e -= e_del;
+ e = e > t? e : t; // computed E(i+1,j)
+ p->e = e; // save E(i+1,j) for the next row
+ t = M - oe_ins;
+ t = t > 0? t : 0;
+ f -= e_ins;
+ f = f > t? f : t; // computed F(i,j+1)
+ }
+ eh[end].h = h1; eh[end].e = 0;
+ if (j == qlen) {
+ max_ie = gscore > h1? max_ie : i;
+ gscore = gscore > h1? gscore : h1;
+ }
+ if (m == 0) break;
+ if (m > max) {
+ max = m, max_i = i, max_j = mj;
+ max_off = max_off > abs(mj - i)? max_off : abs(mj - i);
+ } else if (zdrop > 0) {
+ if (i - max_i > mj - max_j) {
+ if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break;
+ } else {
+ if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break;
+ }
+ }
+ // update beg and end for the next round
+ for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j);
+ beg = j;
+ for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j);
+ end = j + 2 < qlen? j + 2 : qlen;
+ //beg = 0; end = qlen; // uncomment this line for debugging
+ }
+ free(eh); free(qp);
+ if (_qle) *_qle = max_j + 1;
+ if (_tle) *_tle = max_i + 1;
+ if (_gtle) *_gtle = max_ie + 1;
+ if (_gscore) *_gscore = gscore;
+ if (_max_off) *_max_off = max_off;
+ return max;
+}
+
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off)
+{
+ return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off);
+}
+
+/********************
+ * Global alignment *
+ ********************/
+
+#define MINUS_INF -0x40000000
+
+static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len)
+{
+ if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) {
+ if (*n_cigar == *m_cigar) {
+ *m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
+ cigar = realloc(cigar, (*m_cigar) << 2);
+ }
+ cigar[(*n_cigar)++] = len<<4 | op;
+ } else cigar[(*n_cigar)-1] += len<<4;
+ return cigar;
+}
+
+int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_)
+{
+ eh_t *eh;
+ int8_t *qp; // query profile
+ int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col;
+ uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
+ if (n_cigar_) *n_cigar_ = 0;
+ // allocate memory
+ n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
+ z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0;
+ qp = malloc(qlen * m);
+ eh = calloc(qlen + 1, 8);
+ // generate the query profile
+ for (k = i = 0; k < m; ++k) {
+ const int8_t *p = &mat[k * m];
+ for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+ }
+ // fill the first row
+ eh[0].h = 0; eh[0].e = MINUS_INF;
+ for (j = 1; j <= qlen && j <= w; ++j)
+ eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF;
+ for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
+ // DP loop
+ for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
+ int32_t f = MINUS_INF, h1, beg, end, t;
+ int8_t *q = &qp[target[i] * qlen];
+ beg = i > w? i - w : 0;
+ end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
+ h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF;
+ if (n_cigar_ && cigar_) {
+ uint8_t *zi = &z[(long)i * n_col];
+ for (j = beg; LIKELY(j < end); ++j) {
+ // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
+ // Cells are computed in the following order:
+ // M(i,j) = H(i-1,j-1) + S(i,j)
+ // H(i,j) = max{M(i,j), E(i,j), F(i,j)}
+ // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape
+ // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape
+ // We have to separate M(i,j); otherwise the direction may not be recorded correctly.
+ // However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global().
+ // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k.
+ // In practice, this should happen very rarely given a reasonable scoring system.
+ eh_t *p = &eh[j];
+ int32_t h, m = p->h, e = p->e;
+ uint8_t d; // direction
+ p->h = h1;
+ m += q[j];
+ d = m >= e? 0 : 1;
+ h = m >= e? m : e;
+ d = h >= f? d : 2;
+ h = h >= f? h : f;
+ h1 = h;
+ t = m - oe_del;
+ e -= e_del;
+ d |= e > t? 1<<2 : 0;
+ e = e > t? e : t;
+ p->e = e;
+ t = m - oe_ins;
+ f -= e_ins;
+ d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
+ f = f > t? f : t;
+ zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
+ }
+ } else {
+ for (j = beg; LIKELY(j < end); ++j) {
+ eh_t *p = &eh[j];
+ int32_t h, m = p->h, e = p->e;
+ p->h = h1;
+ m += q[j];
+ h = m >= e? m : e;
+ h = h >= f? h : f;
+ h1 = h;
+ t = m - oe_del;
+ e -= e_del;
+ e = e > t? e : t;
+ p->e = e;
+ t = m - oe_ins;
+ f -= e_ins;
+ f = f > t? f : t;
+ }
+ }
+ eh[end].h = h1; eh[end].e = MINUS_INF;
+ }
+ score = eh[qlen].h;
+ if (n_cigar_ && cigar_) { // backtrack
+ int n_cigar = 0, m_cigar = 0, which = 0;
+ uint32_t *cigar = 0, tmp;
+ i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+ while (i >= 0 && k >= 0) {
+ which = z[(long)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
+ if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
+ else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
+ else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
+ }
+ if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
+ if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
+ for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
+ tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
+ *n_cigar_ = n_cigar, *cigar_ = cigar;
+ }
+ free(eh); free(qp); free(z);
+ return score;
+}
+
+int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_)
+{
+ return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_);
+}
+
+/*******************************************
+ * Main function (not compiled by default) *
+ *******************************************/
+
+#ifdef _KSW_MAIN
+
+#include <unistd.h>
+#include <stdio.h>
+#include <zlib.h>
+#include "kseq.h"
+KSEQ_INIT(gzFile, err_gzread)
+
+unsigned char seq_nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+int main(int argc, char *argv[])
+{
+ int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
+ int8_t mat[25];
+ int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
+ uint8_t *rseq = 0;
+ gzFile fpt, fpq;
+ kseq_t *kst, *ksq;
+
+ // parse command line
+ while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
+ switch (c) {
+ case 'a': sa = atoi(optarg); break;
+ case 'b': sb = atoi(optarg); break;
+ case 'q': gapo = atoi(optarg); break;
+ case 'r': gape = atoi(optarg); break;
+ case 't': minsc = atoi(optarg); break;
+ case 'f': forward_only = 1; break;
+ case '1': xtra |= KSW_XBYTE; break;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
+ return 1;
+ }
+ if (minsc > 0xffff) minsc = 0xffff;
+ xtra |= KSW_XSUBO | minsc;
+ // initialize scoring matrix
+ for (i = k = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ mat[k++] = i == j? sa : -sb;
+ mat[k++] = 0; // ambiguous base
+ }
+ for (j = 0; j < 5; ++j) mat[k++] = 0;
+ // open file
+ fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt);
+ fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
+ // all-pair alignment
+ while (kseq_read(ksq) > 0) {
+ kswq_t *q[2] = {0, 0};
+ kswr_t r;
+ for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
+ if (!forward_only) { // reverse
+ if ((int)ksq->seq.m > max_rseq) {
+ max_rseq = ksq->seq.m;
+ rseq = (uint8_t*)realloc(rseq, max_rseq);
+ }
+ for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
+ rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
+ }
+ gzrewind(fpt); kseq_rewind(kst);
+ while (kseq_read(kst) > 0) {
+ for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
+ r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
+ if (r.score >= minsc)
+ err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
+ if (rseq) {
+ r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
+ if (r.score >= minsc)
+ err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
+ }
+ }
+ free(q[0]); free(q[1]);
+ }
+ free(rseq);
+ kseq_destroy(kst); err_gzclose(fpt);
+ kseq_destroy(ksq); err_gzclose(fpq);
+ return 0;
+}
+#endif
diff --git a/ext/src/bwa/ksw.h b/ext/src/bwa/ksw.h
new file mode 100644
index 0000000..5d45a67
--- /dev/null
+++ b/ext/src/bwa/ksw.h
@@ -0,0 +1,114 @@
+#ifndef __AC_KSW_H
+#define __AC_KSW_H
+
+#include <stdint.h>
+
+#define KSW_XBYTE 0x10000
+#define KSW_XSTOP 0x20000
+#define KSW_XSUBO 0x40000
+#define KSW_XSTART 0x80000
+
+struct _kswq_t;
+typedef struct _kswq_t kswq_t;
+
+typedef struct {
+ int score; // best score
+ int te, qe; // target end and query end
+ int score2, te2; // second best score and ending position on the target
+ int tb, qb; // target start and query start
+} kswr_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /**
+ * Aligning two sequences
+ *
+ * @param qlen length of the query sequence (typically <tlen)
+ * @param query query sequence with 0 <= query[i] < m
+ * @param tlen length of the target sequence
+ * @param target target sequence
+ * @param m number of residue types
+ * @param mat m*m scoring matrix in a one-dimensional array
+ * @param gapo gap open penalty; a gap of length l costs "-(gapo+l*gape)"
+ * @param gape gap extension penalty
+ * @param xtra extra information (see below)
+ * @param qry query profile (see below)
+ *
+ * @return alignment information in a struct; fields that are not computed are set to -1
+ *
+ * When xtra==0, ksw_align() uses a signed two-byte integer to store a
+ * score and only finds the best score and the end positions. The 2nd best
+ * score or the start positions are not computed. The default behavior can
+ * be tuned by setting KSW_X* flags:
+ *
+ * KSW_XBYTE: use an unsigned byte to store a score. If overflow occurs,
+ * kswr_t::score will be set to 255
+ *
+ * KSW_XSUBO: track the 2nd best score and the ending position on the
+ * target if the 2nd best is higher than (xtra&0xffff)
+ *
+ * KSW_XSTOP: stop if the maximum score is above (xtra&0xffff)
+ *
+ * KSW_XSTART: find the start positions
+ *
+ * When *qry==NULL, ksw_align() will compute and allocate the query profile
+ * and when the function returns, *qry will point to the profile, which can
+ * be deallocated simply by free(). If one query is aligned against multiple
+ * target sequences, *qry should be set to NULL during the first call and
+ * freed after the last call. Note that qry can equal 0. In this case, the
+ * query profile will be deallocated in ksw_align().
+ */
+ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
+ kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry);
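+
+ /* A minimal usage sketch for the profile-reuse pattern described above: one
+ * query aligned against several targets, with start positions requested via
+ * KSW_XSTART. The 5x5 matrix, the gap penalties and the names n_targets,
+ * target[] and tlen[] are placeholders, not part of this API.
+ *
+ * kswq_t *prof = 0; // profile is built on the first call
+ * for (i = 0; i < n_targets; ++i) {
+ * kswr_t r = ksw_align(qlen, query, tlen[i], target[i], 5, mat, 5, 2, KSW_XSTART, &prof);
+ * // r.score is the best score; r.tb/r.te and r.qb/r.qe the target/query start and end
+ * }
+ * free(prof); // deallocate the profile after the last call
+ */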
+
+ /**
+ * Banded global alignment
+ *
+ * @param qlen query length
+ * @param query query sequence with 0 <= query[i] < m
+ * @param tlen target length
+ * @param target target sequence with 0 <= target[i] < m
+ * @param m number of residue types
+ * @param mat m*m scoring matrix in a one-dimensional array
+ * @param gapo gap open penalty; a gap of length l costs "-(gapo+l*gape)"
+ * @param gape gap extension penalty
+ * @param w band width
+ * @param n_cigar (out) number of CIGAR elements
+ * @param cigar (out) BAM-encoded CIGAR; the caller needs to deallocate it with free()
+ *
+ * @return score of the alignment
+ */
+ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
+ int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar);
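+
+ /* Sketch of decoding the CIGAR written by ksw_global()/ksw_global2(): each
+ * uint32_t packs the run length in the upper 28 bits and the operation in
+ * the lower 4 bits, with 0=M, 1=I and 2=D as produced by the backtrack in
+ * ksw.c. The penalties and band width below are placeholders.
+ *
+ * int k, n_cigar; uint32_t *cigar;
+ * int score = ksw_global(qlen, query, tlen, target, 5, mat, 5, 2, 50, &n_cigar, &cigar);
+ * for (k = 0; k < n_cigar; ++k)
+ * printf("%d%c", cigar[k]>>4, "MID"[cigar[k]&0xf]);
+ * free(cigar);
+ */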
+
+ /**
+ * Extend alignment
+ *
+ * The routine aligns $query and $target, assuming their upstream sequences,
+ * which are not provided, have been aligned with score $h0. In return,
+ * region [0,*qle) on the query and [0,*tle) on the target sequences are
+ * aligned together. If *gscore>=0, *gscore keeps the best score such that
+ * the entire query sequence is aligned; *gtle keeps the position on the
+ * target where *gscore is achieved. Returning *gscore and *gtle helps the
+ * caller to decide whether an end-to-end hit or a partial hit is preferred.
+ *
+ * The first 9 parameters are identical to those in ksw_global()
+ *
+ * @param h0 alignment score of upstream sequences
+ * @param _qle (out) length of the query in the alignment
+ * @param _tle (out) length of the target in the alignment
+ * @param _gtle (out) length of the target if query is fully aligned
+ * @param _gscore (out) score of the best end-to-end alignment; negative if not found
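+ * @param _max_off (out) maximum distance of the best-scoring cell from the band diagonal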
+ *
+ * @return best semi-local alignment score
+ */
+ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
+ int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off);
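+
+ /* Sketch of extending to the right of an already-aligned seed: the seed
+ * score goes in as h0, (*qle,*tle) describe the best local extension and
+ * (*gscore,*gtle) the best extension covering the whole query, so the
+ * caller can choose between a partial and an end-to-end hit. The band
+ * width, end bonus, z-drop and h0 below are placeholders.
+ *
+ * int qle, tle, gtle, gscore, max_off;
+ * int sc = ksw_extend(qlen, query, tlen, target, 5, mat, 5, 2,
+ * 100, 0, 100, h0, &qle, &tle, &gtle, &gscore, &max_off);
+ * // use (sc, qle, tle) for a partial hit, or (gscore, qlen, gtle) when gscore >= 0
+ */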
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/src/bwa/kthread.c b/ext/src/bwa/kthread.c
new file mode 100644
index 0000000..a5e4b4a
--- /dev/null
+++ b/ext/src/bwa/kthread.c
@@ -0,0 +1,152 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <limits.h>
+
+/************
+ * kt_for() *
+ ************/
+
+struct kt_for_t;
+
+typedef struct {
+ struct kt_for_t *t;
+ long i;
+} ktf_worker_t;
+
+typedef struct kt_for_t {
+ int n_threads;
+ long n;
+ ktf_worker_t *w;
+ void (*func)(void*,long,int);
+ void *data;
+} kt_for_t;
+
+static inline long steal_work(kt_for_t *t)
+{
+ int i, min_i = -1;
+ long k, min = LONG_MAX;
+ for (i = 0; i < t->n_threads; ++i)
+ if (min > t->w[i].i) min = t->w[i].i, min_i = i;
+ k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
+ return k >= t->n? -1 : k;
+}
+
+static void *ktf_worker(void *data)
+{
+ ktf_worker_t *w = (ktf_worker_t*)data;
+ long i;
+ for (;;) {
+ i = __sync_fetch_and_add(&w->i, w->t->n_threads);
+ if (i >= w->t->n) break;
+ w->t->func(w->t->data, i, w - w->t->w);
+ }
+ while ((i = steal_work(w->t)) >= 0)
+ w->t->func(w->t->data, i, w - w->t->w);
+ pthread_exit(0);
+}
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
+{
+
+ if (n_threads > 1) {
+ int i;
+ kt_for_t t;
+ pthread_t *tid;
+ t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
+ t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t));
+ tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
+ for (i = 0; i < n_threads; ++i)
+ t.w[i].t = &t, t.w[i].i = i;
+ for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
+ for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+ } else {
+ long j;
+ for (j = 0; j < n; ++j) func(data, j, 0);
+ }
+}
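+
+/* Usage sketch for kt_for(): the callback is invoked once for every index in
+ * [0,n) and receives the shared data pointer, the item index and the id of
+ * the calling worker. The names my_data_t and process_one are placeholders.
+ *
+ * static void process_one(void *data, long i, int tid)
+ * {
+ * my_data_t *d = (my_data_t*)data;
+ * // work on item i; tid identifies the worker thread
+ * }
+ * ...
+ * kt_for(n_threads, process_one, &d, n_items);
+ */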
+
+/*****************
+ * kt_pipeline() *
+ *****************/
+
+struct ktp_t;
+
+typedef struct {
+ struct ktp_t *pl;
+ int64_t index;
+ int step;
+ void *data;
+} ktp_worker_t;
+
+typedef struct ktp_t {
+ void *shared;
+ void *(*func)(void*, int, void*);
+ int64_t index;
+ int n_workers, n_steps;
+ ktp_worker_t *workers;
+ pthread_mutex_t mutex;
+ pthread_cond_t cv;
+} ktp_t;
+
+static void *ktp_worker(void *data)
+{
+ ktp_worker_t *w = (ktp_worker_t*)data;
+ ktp_t *p = w->pl;
+ while (w->step < p->n_steps) {
+ // test whether we can kick off the job with this worker
+ pthread_mutex_lock(&p->mutex);
+ for (;;) {
+ int i;
+ // test whether another worker is doing the same step
+ for (i = 0; i < p->n_workers; ++i) {
+ if (w == &p->workers[i]) continue; // ignore itself
+ if (p->workers[i].step <= w->step && p->workers[i].index < w->index)
+ break;
+ }
+ if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps
+ pthread_cond_wait(&p->cv, &p->mutex);
+ }
+ pthread_mutex_unlock(&p->mutex);
+
+ // working on w->step
+ w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
+
+ // update step and let other workers know
+ pthread_mutex_lock(&p->mutex);
+ w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
+ if (w->step == 0) w->index = p->index++;
+ pthread_cond_broadcast(&p->cv);
+ pthread_mutex_unlock(&p->mutex);
+ }
+ pthread_exit(0);
+}
+
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
+{
+ ktp_t aux;
+ pthread_t *tid;
+ int i;
+
+ if (n_threads < 1) n_threads = 1;
+ aux.n_workers = n_threads;
+ aux.n_steps = n_steps;
+ aux.func = func;
+ aux.shared = shared_data;
+ aux.index = 0;
+ pthread_mutex_init(&aux.mutex, 0);
+ pthread_cond_init(&aux.cv, 0);
+
+ aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t));
+ for (i = 0; i < n_threads; ++i) {
+ ktp_worker_t *w = &aux.workers[i];
+ w->step = 0; w->pl = &aux; w->data = 0;
+ w->index = aux.index++;
+ }
+
+ tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
+ for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
+ for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+
+ pthread_mutex_destroy(&aux.mutex);
+ pthread_cond_destroy(&aux.cv);
+}
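+
+/* Usage sketch for kt_pipeline(): each invocation is func(shared, step, in),
+ * where "in" is the value returned by the previous step (NULL for step 0).
+ * Returning NULL from a non-final step retires the worker, so step 0 usually
+ * returns NULL once its input is exhausted. A common three-step layout, with
+ * read_batch/process_batch/write_batch as placeholder names:
+ *
+ * static void *pipeline(void *shared, int step, void *in)
+ * {
+ * if (step == 0) return read_batch(shared); // NULL when no more input
+ * if (step == 1) return process_batch(shared, in);
+ * write_batch(shared, in); return 0; // last step
+ * }
+ * ...
+ * kt_pipeline(n_threads, pipeline, &shared_data, 3);
+ */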
diff --git a/ext/src/bwa/kvec.h b/ext/src/bwa/kvec.h
new file mode 100644
index 0000000..83ad483
--- /dev/null
+++ b/ext/src/bwa/kvec.h
@@ -0,0 +1,94 @@
+/* The MIT License
+
+ Copyright (c) 2008, by Attractive Chaos <attractor at live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "kvec.h"
+int main() {
+ kvec_t(int) array;
+ kv_init(array);
+ kv_push(int, array, 10); // append
+ kv_a(int, array, 20) = 5; // dynamic
+ kv_A(array, 20) = 4; // static
+ kv_destroy(array);
+ return 0;
+}
+*/
+
+/*
+ 2008-09-22 (0.1.0):
+
+ * The initial version.
+
+*/
+
+#ifndef AC_KVEC_H
+#define AC_KVEC_H
+
+#include <stdlib.h>
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+
+#define kvec_t(type) struct { size_t n, m; type *a; }
+#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
+#define kv_destroy(v) free((v).a)
+#define kv_A(v, i) ((v).a[(i)])
+#define kv_pop(v) ((v).a[--(v).n])
+#define kv_size(v) ((v).n)
+#define kv_max(v) ((v).m)
+
+#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
+
+#define kv_copy(type, v1, v0) do { \
+ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
+ (v1).n = (v0).n; \
+ memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
+ } while (0) \
+
+#define kv_push(type, v, x) do { \
+ if ((v).n == (v).m) { \
+ (v).m = (v).m? (v).m<<1 : 2; \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
+ } \
+ (v).a[(v).n++] = (x); \
+ } while (0)
+
+#define kv_pushp(type, v) ((((v).n == (v).m)? \
+ ((v).m = ((v).m? (v).m<<1 : 2), \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+ : 0), &(v).a[(v).n++])
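+
+/* kv_pushp() grows the vector like kv_push() but, instead of copying a value,
+ * returns a pointer to the newly appended slot, e.g. (sketch):
+ * my_t *p = kv_pushp(my_t, vec); p->field = 1;
+ */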
+
+#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
+ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
+ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+ : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
+ : 0), (v).a[(i)])
+
+#endif
diff --git a/ext/src/bwa/main.c b/ext/src/bwa/main.c
new file mode 100644
index 0000000..2f43017
--- /dev/null
+++ b/ext/src/bwa/main.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <string.h>
+#include "kstring.h"
+#include "utils.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "0.7.12-r1044"
+#endif
+
+int bwa_fa2pac(int argc, char *argv[]);
+int bwa_pac2bwt(int argc, char *argv[]);
+int bwa_bwtupdate(int argc, char *argv[]);
+int bwa_bwt2sa(int argc, char *argv[]);
+int bwa_index(int argc, char *argv[]);
+
+int bwa_aln(int argc, char *argv[]);
+int bwa_sai2sam_se(int argc, char *argv[]);
+int bwa_sai2sam_pe(int argc, char *argv[]);
+
+int bwa_bwtsw2(int argc, char *argv[]);
+
+int main_fastmap(int argc, char *argv[]);
+int main_mem(int argc, char *argv[]);
+int main_shm(int argc, char *argv[]);
+
+int main_pemerge(int argc, char *argv[]);
+int main_maxk(int argc, char *argv[]);
+
+static int usage()
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n");
+ fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+ fprintf(stderr, "Contact: Heng Li <lh3 at sanger.ac.uk>\n\n");
+ fprintf(stderr, "Usage: bwa <command> [options]\n\n");
+ fprintf(stderr, "Command: index index sequences in the FASTA format\n");
+ fprintf(stderr, " mem BWA-MEM algorithm\n");
+ fprintf(stderr, " fastmap identify super-maximal exact matches\n");
+ fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n");
+ fprintf(stderr, " aln gapped/ungapped alignment\n");
+ fprintf(stderr, " samse generate alignment (single ended)\n");
+ fprintf(stderr, " sampe generate alignment (paired ended)\n");
+ fprintf(stderr, " bwasw BWA-SW for long queries\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " shm manage indices in shared memory\n");
+ fprintf(stderr, " fa2pac convert FASTA to PAC format\n");
+ fprintf(stderr, " pac2bwt generate BWT from PAC\n");
+ fprintf(stderr, " bwtupdate update .bwt to the new format\n");
+ fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr,
+"Note: To use BWA, you need to first index the genome with `bwa index'.\n"
+" There are three alignment algorithms in BWA: `mem', `bwasw', and\n"
+" `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n"
+" first. Please `man ./bwa.1' for the manual.\n\n");
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ extern char *bwa_pg;
+ int i, ret;
+ double t_real;
+ kstring_t pg = {0,0,0};
+ t_real = realtime();
+ ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]);
+ for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]);
+ bwa_pg = pg.s;
+ if (argc < 2) return usage();
+ if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1);
+ else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1);
+ else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1);
+ else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
+ else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
+ else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
+ else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
+ else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
+ else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
+ else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
+ else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
+ else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
+ else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
+ else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
+ else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
+ else if (strcmp(argv[1], "maxk") == 0) ret = main_maxk(argc-1, argv+1);
+ else {
+ fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
+ return 1;
+ }
+ err_fflush(stdout);
+ err_fclose(stdout);
+ if (ret == 0) {
+ fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION);
+ fprintf(stderr, "[%s] CMD:", __func__);
+ for (i = 0; i < argc; ++i)
+ fprintf(stderr, " %s", argv[i]);
+ fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime());
+ }
+ free(bwa_pg);
+ return ret;
+}
diff --git a/ext/src/bwa/malloc_wrap.c b/ext/src/bwa/malloc_wrap.c
new file mode 100644
index 0000000..100b8cb
--- /dev/null
+++ b/ext/src/bwa/malloc_wrap.c
@@ -0,0 +1,57 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#ifdef USE_MALLOC_WRAPPERS
+/* Don't wrap ourselves */
+# undef USE_MALLOC_WRAPPERS
+#endif
+#include "malloc_wrap.h"
+
+void *wrap_calloc(size_t nmemb, size_t size,
+ const char *file, unsigned int line, const char *func) {
+ void *p = calloc(nmemb, size);
+ if (NULL == p) {
+ fprintf(stderr,
+ "[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+ func, nmemb * size, file, line, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
+
+void *wrap_malloc(size_t size,
+ const char *file, unsigned int line, const char *func) {
+ void *p = malloc(size);
+ if (NULL == p) {
+ fprintf(stderr,
+ "[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+ func, size, file, line, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
+
+void *wrap_realloc(void *ptr, size_t size,
+ const char *file, unsigned int line, const char *func) {
+ void *p = realloc(ptr, size);
+ if (NULL == p) {
+ fprintf(stderr,
+ "[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+ func, size, file, line, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
+
+char *wrap_strdup(const char *s,
+ const char *file, unsigned int line, const char *func) {
+ char *p = strdup(s);
+ if (NULL == p) {
+ fprintf(stderr,
+ "[%s] Failed to allocate %zd bytes at %s line %u: %s\n",
+ func, strlen(s), file, line, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ return p;
+}
diff --git a/ext/src/bwa/malloc_wrap.h b/ext/src/bwa/malloc_wrap.h
new file mode 100644
index 0000000..a55876a
--- /dev/null
+++ b/ext/src/bwa/malloc_wrap.h
@@ -0,0 +1,47 @@
+#ifndef MALLOC_WRAP_H
+#define MALLOC_WRAP_H
+
+#include <stdlib.h> /* Avoid breaking the usual definitions */
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ void *wrap_calloc(size_t nmemb, size_t size,
+ const char *file, unsigned int line, const char *func);
+ void *wrap_malloc(size_t size,
+ const char *file, unsigned int line, const char *func);
+ void *wrap_realloc(void *ptr, size_t size,
+ const char *file, unsigned int line, const char *func);
+ char *wrap_strdup(const char *s,
+ const char *file, unsigned int line, const char *func);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef USE_MALLOC_WRAPPERS
+# ifdef calloc
+# undef calloc
+# endif
+# define calloc(n, s) wrap_calloc( (n), (s), __FILE__, __LINE__, __func__)
+
+# ifdef malloc
+# undef malloc
+# endif
+# define malloc(s) wrap_malloc( (s), __FILE__, __LINE__, __func__)
+
+# ifdef realloc
+# undef realloc
+# endif
+# define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__)
+
+# ifdef strdup
+# undef strdup
+# endif
+# define strdup(s) wrap_strdup( (s), __FILE__, __LINE__, __func__)
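+
+/* With USE_MALLOC_WRAPPERS defined, an ordinary call such as
+ * p = malloc(len);
+ * expands to
+ * p = wrap_malloc((len), __FILE__, __LINE__, __func__);
+ * so a failed allocation is reported with the file, line and function before
+ * the program exits (see malloc_wrap.c). */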
+
+#endif /* USE_MALLOC_WRAPPERS */
+
+#endif /* MALLOC_WRAP_H */
diff --git a/ext/src/bwa/maxk.c b/ext/src/bwa/maxk.c
new file mode 100644
index 0000000..fee5765
--- /dev/null
+++ b/ext/src/bwa/maxk.c
@@ -0,0 +1,67 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+#include "bwa.h"
+#include "bwamem.h"
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
+
+int main_maxk(int argc, char *argv[])
+{
+ int i, c, self = 0, max_len = 0;
+ uint8_t *cnt = 0;
+ uint64_t hist[256];
+ bwt_t *bwt;
+ kseq_t *ks;
+ smem_i *itr;
+ gzFile fp;
+
+ while ((c = getopt(argc, argv, "s")) >= 0) {
+ if (c == 's') self = 1;
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: bwa maxk [-s] <index.prefix> <seq.fa>\n");
+ return 1;
+ }
+ fp = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "rb") : gzdopen(fileno(stdin), "rb");
+ ks = kseq_init(fp);
+ bwt = bwt_restore_bwt(argv[optind]);
+ itr = smem_itr_init(bwt);
+ if (self) smem_config(itr, 2, INT_MAX, 0);
+ memset(hist, 0, 8 * 256);
+
+ while (kseq_read(ks) >= 0) {
+ const bwtintv_v *a;
+ if (ks->seq.l > max_len) {
+ max_len = ks->seq.l;
+ kroundup32(max_len);
+ cnt = realloc(cnt, max_len);
+ }
+ memset(cnt, 0, ks->seq.l);
+ for (i = 0; i < ks->seq.l; ++i)
+ ks->seq.s[i] = nst_nt4_table[(int)ks->seq.s[i]];
+ smem_set_query(itr, ks->seq.l, (uint8_t*)ks->seq.s);
+ while ((a = smem_next(itr)) != 0) {
+ for (i = 0; i < a->n; ++i) {
+ bwtintv_t *p = &a->a[i];
+ int j, l, start = p->info>>32, end = (uint32_t)p->info;
+ l = end - start < 255? end - start : 255;
+ for (j = start; j < end; ++j)
+ cnt[j] = cnt[j] > l? cnt[j] : l;
+ }
+ }
+ for (i = 0; i < ks->seq.l; ++i) ++hist[cnt[i]];
+ }
+ for (i = 0; i < 256; ++i)
+ printf("%d\t%lld\n", i, (long long)hist[i]);
+ free(cnt);
+
+ smem_itr_destroy(itr);
+ bwt_destroy(bwt);
+ kseq_destroy(ks);
+ gzclose(fp);
+ return 0;
+}
diff --git a/ext/src/bwa/pemerge.c b/ext/src/bwa/pemerge.c
new file mode 100644
index 0000000..725885f
--- /dev/null
+++ b/ext/src/bwa/pemerge.c
@@ -0,0 +1,291 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <zlib.h>
+#include <pthread.h>
+#include <errno.h>
+#include "ksw.h"
+#include "kseq.h"
+#include "kstring.h"
+#include "bwa.h"
+#include "utils.h"
+KSEQ_DECLARE(gzFile)
+
+#ifdef USE_MALLOC_WRAPPERS
+# include "malloc_wrap.h"
+#endif
+
+#define MAX_SCORE_RATIO 0.9f
+#define MAX_ERR 8
+
+static const char *err_msg[MAX_ERR+1] = {
+ "successful merges",
+ "low-scoring pairs",
+ "pairs where the best SW alignment is not an overlap (long left end)",
+ "pairs where the best SW alignment is not an overlap (long right end)",
+ "pairs with large 2nd best SW score",
+ "pairs with gapped overlap",
+ "pairs where the end-to-end alignment is inconsistent with SW",
+ "pairs potentially with tandem overlaps",
+ "pairs with high sum of errors"
+};
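+
+// bwa_pemerge() below returns 0 on success or -k on failure, so that
+// err_msg[-ret] (equivalently err_msg[k]) describes why a pair was not merged.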
+
+typedef struct {
+ int a, b, q, r, w;
+ int q_def, q_thres;
+ int T;
+ int chunk_size;
+ int n_threads;
+ int flag; // bit 1: print merged; 2: print unmerged
+ int8_t mat[25];
+} pem_opt_t;
+
+pem_opt_t *pem_opt_init()
+{
+ pem_opt_t *opt;
+ opt = calloc(1, sizeof(pem_opt_t));
+ opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20;
+ opt->T = opt->a * 10;
+ opt->q_def = 20;
+ opt->q_thres = 70;
+ opt->chunk_size = 10000000;
+ opt->n_threads = 1;
+ opt->flag = 3;
+ bwa_fill_scmat(opt->a, opt->b, opt->mat);
+ return opt;
+}
+
+int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2])
+{
+ uint8_t *s[2], *q[2], *seq, *qual;
+ int i, xtra, l, l_seq, sum_q, ret = 0;
+ kswr_t r;
+
+ s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq);
+ s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq);
+ for (i = 0; i < x[0].l_seq; ++i) {
+ int c = x[0].seq[i];
+ s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c];
+ q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def;
+ }
+ for (i = 0; i < x[1].l_seq; ++i) {
+ int c = x[1].seq[x[1].l_seq - 1 - i];
+ c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c];
+ s[1][i] = c < 4? 3 - c : 4;
+ q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def;
+ }
+
+ xtra = KSW_XSTART | KSW_XSUBO;
+ r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0);
+ ++r.qe; ++r.te; // change to the half-close-half-open coordinates
+
+ if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment
+ if (r.tb < r.qb) { ret = -2; goto pem_ret; } // not enough space for the left end
+ if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // not enough space for the right end
+ if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large
+ if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps
+
+ { // test tandem match; O(n^2)
+ int max_m, max_m2, min_l, max_l, max_l2;
+ max_m = max_m2 = 0; max_l = max_l2 = 0;
+ min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq;
+ for (l = 1; l < min_l; ++l) {
+ int m = 0, o = x[0].l_seq - l;
+ uint8_t *s0o = &s[0][o], *s1 = s[1];
+ for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck!
+ m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i]
+ if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l;
+ else if (m > max_m2) max_m2 = m, max_l2 = l;
+ }
+ if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; }
+ if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) {
+ ret = -7; goto pem_ret;
+ }
+ if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; }
+ }
+
+ l = x[0].l_seq - (r.tb - r.qb); // length to merge
+ l_seq = x[0].l_seq + x[1].l_seq - l;
+ seq = malloc(l_seq + 1);
+ qual = malloc(l_seq + 1);
+ memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l);
+ memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l);
+ for (i = 0, sum_q = 0; i < l; ++i) {
+ int k = x[0].l_seq - l + i;
+ if (s[0][k] == 4) { // ambiguous
+ seq[k] = s[1][i];
+ qual[k] = q[1][i];
+ } else if (s[1][i] == 4) { // do nothing
+ } else if (s[0][k] == s[1][i]) {
+ qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i];
+ } else { // s[0][k] != s[1][i] and neither is N
+ int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i];
+ sum_q += qq >= 3? qq<<1 : 1;
+ seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i];
+ qual[k] = abs((int)q[0][k] - (int)q[1][i]);
+ }
+ }
+ if (sum_q>>1 > opt->q_thres) { // too many mismatches
+ free(seq); free(qual);
+ ret = -8; goto pem_ret;
+ }
+
+ for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33;
+ seq[l_seq] = qual[l_seq] = 0;
+
+ free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment);
+ memset(&x[1], 0, sizeof(bseq1_t));
+ free(x[0].seq); free(x[0].qual);
+ x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual;
+
+pem_ret:
+ free(s[0]); free(s[1]); free(q[0]); free(q[1]);
+ return ret;
+}
+
+static inline void print_bseq(const bseq1_t *s, int rn)
+{
+ err_putchar(s->qual? '@' : '>');
+ err_fputs(s->name, stdout);
+ if (rn == 1 || rn == 2) {
+ err_putchar('/'); err_putchar('0' + rn); err_putchar('\n');
+ } else err_puts(" merged");
+ err_puts(s->seq);
+ if (s->qual) {
+ err_puts("+"); err_puts(s->qual);
+ }
+}
+
+typedef struct {
+ int n, start;
+ bseq1_t *seqs;
+ int64_t cnt[MAX_ERR+1];
+ const pem_opt_t *opt;
+} worker_t;
+
+void *worker(void *data)
+{
+ worker_t *w = (worker_t*)data;
+ int i;
+ for (i = w->start; i < w->n>>1; i += w->opt->n_threads)
+ ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])];
+ return 0;
+}
+
+static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1])
+{
+ int i, j, n = n_>>1<<1;
+ worker_t *w;
+
+ w = calloc(opt->n_threads, sizeof(worker_t));
+ for (i = 0; i < opt->n_threads; ++i) {
+ worker_t *p = &w[i];
+ p->start = i; p->n = n;
+ p->opt = opt;
+ p->seqs = seqs;
+ }
+ if (opt->n_threads == 1) {
+ worker(w);
+ } else {
+ pthread_t *tid;
+ tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
+ for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]);
+ for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
+ free(tid);
+ }
+ for (i = 0; i < opt->n_threads; ++i) {
+ worker_t *p = &w[i];
+ for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j];
+ }
+ free(w);
+ for (i = 0; i < n>>1; ++i) {
+ if (seqs[i<<1|1].l_seq != 0) {
+ if (opt->flag&2) {
+ print_bseq(&seqs[i<<1|0], 1);
+ print_bseq(&seqs[i<<1|1], 2);
+ }
+ } else if (opt->flag&1)
+ print_bseq(&seqs[i<<1|0], 0);
+ }
+ for (i = 0; i < n; ++i) {
+ bseq1_t *s = &seqs[i];
+ free(s->name); free(s->seq); free(s->qual); free(s->comment);
+ }
+}
+
+int main_pemerge(int argc, char *argv[])
+{
+ int c, flag = 0, i, n, min_ovlp = 10;
+ int64_t cnt[MAX_ERR+1];
+ bseq1_t *bseq;
+ gzFile fp, fp2 = 0;
+ kseq_t *ks, *ks2 = 0;
+ pem_opt_t *opt;
+
+ opt = pem_opt_init();
+ while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) {
+ if (c == 'm') flag |= 1;
+ else if (c == 'u') flag |= 2;
+ else if (c == 'Q') opt->q_thres = atoi(optarg);
+ else if (c == 't') opt->n_threads = atoi(optarg);
+ else if (c == 'T') min_ovlp = atoi(optarg);
+ else return 1;
+ }
+ if (flag == 0) flag = 3;
+ opt->flag = flag;
+ opt->T = opt->a * min_ovlp;
+
+ if (optind == argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n");
+ fprintf(stderr, "Options: -m output merged reads only\n");
+ fprintf(stderr, " -u output unmerged reads only\n");
+ fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads);
+ fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp);
+ fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres);
+ fprintf(stderr, "\n");
+ free(opt);
+ return 1;
+ }
+
+ fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+ if (NULL == fp) {
+ fprintf(stderr, "Couldn't open %s : %s\n",
+ strcmp(argv[optind], "-") ? argv[optind] : "stdin",
+ errno ? strerror(errno) : "Out of memory");
+ exit(EXIT_FAILURE);
+ }
+ ks = kseq_init(fp);
+ if (optind + 1 < argc) {
+ fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r");
+ if (NULL == fp2) {
+ fprintf(stderr, "Couldn't open %s : %s\n",
+ strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin",
+ errno ? strerror(errno) : "Out of memory");
+ exit(EXIT_FAILURE);
+ }
+ ks2 = kseq_init(fp2);
+ }
+
+ memset(cnt, 0, 8 * (MAX_ERR+1));
+ while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) {
+ process_seqs(opt, n, bseq, cnt);
+ free(bseq);
+ }
+
+ fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]);
+ for (i = 1; i <= MAX_ERR; ++i)
+ fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]);
+ kseq_destroy(ks);
+ err_gzclose(fp);
+ if (ks2) {
+ kseq_destroy(ks2);
+ err_gzclose(fp2);
+ }
+ free(opt);
+
+ err_fflush(stdout);
+
+ return 0;
+}
diff --git a/ext/src/bwa/qualfa2fq.pl b/ext/src/bwa/qualfa2fq.pl
new file mode 100755
index 0000000..31e1974
--- /dev/null
+++ b/ext/src/bwa/qualfa2fq.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+die("Usage: qualfa2fq.pl <in.fasta> <in.qual>\n") if (@ARGV != 2);
+
+my ($fhs, $fhq, $q);
+open($fhs, ($ARGV[0] =~ /\.gz$/)? "gzip -dc $ARGV[0] |" : $ARGV[0]) || die;
+open($fhq, ($ARGV[1] =~ /\.gz$/)? "gzip -dc $ARGV[1] |" : $ARGV[1]) || die;
+
+$/ = ">"; <$fhs>; <$fhq>; $/ = "\n";
+while (<$fhs>) {
+ $q = <$fhq>;
+ print "\@$_";
+ $/ = ">";
+ $_ = <$fhs>; $q = <$fhq>;
+ chomp; chomp($q);
+ $q =~ s/\s*(\d+)\s*/chr($1+33)/eg;
+ print $_, "+\n";
+ for (my $i = 0; $i < length($q); $i += 60) {
+ print substr($q, $i, 60), "\n";
+ }
+ $/ = "\n";
+}
+
+close($fhs); close($fhq);
diff --git a/ext/src/bwa/rle.c b/ext/src/bwa/rle.c
new file mode 100644
index 0000000..221e1cd
--- /dev/null
+++ b/ext/src/bwa/rle.c
@@ -0,0 +1,191 @@
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "rle.h"
+
+const uint8_t rle_auxtab[8] = { 0x01, 0x11, 0x21, 0x31, 0x03, 0x13, 0x07, 0x17 };
+
+// insert symbol $a after $x symbols in $str; marginal counts added to $cnt; returns the size increase
+int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6])
+{
+ uint16_t *nptr = (uint16_t*)block;
+ int diff;
+
+ block += 2; // skip the first 2 counting bytes
+ if (*nptr == 0) {
+ memset(cnt, 0, 48);
+ diff = rle_enc1(block, a, rl);
+ } else {
+ uint8_t *p, *end = block + *nptr, *q;
+ int64_t pre, z, l = 0, tot, beg_l;
+ int c = -1, n_bytes = 0, n_bytes2, t = 0;
+ uint8_t tmp[24];
+ beg_l = bc[0] + bc[1] + bc[2] + bc[3] + bc[4] + bc[5];
+ tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
+ if (x < beg_l) {
+ beg_l = 0, *beg = 0;
+ memset(bc, 0, 48);
+ }
+ if (x == beg_l) {
+ p = q = block + (*beg); z = beg_l;
+ memcpy(cnt, bc, 48);
+ } else if (x - beg_l <= ((tot-beg_l)>>1) + ((tot-beg_l)>>3)) { // forward
+ z = beg_l; p = block + (*beg);
+ memcpy(cnt, bc, 48);
+ while (z < x) {
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ for (q = p - 1; *q>>6 == 2; --q);
+ } else { // backward
+ memcpy(cnt, ec, 48);
+ z = tot; p = end;
+ while (z >= x) {
+ --p;
+ if (*p>>6 != 2) {
+ l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3;
+ z -= l; cnt[*p&7] -= l;
+ l = 0; t = 0;
+ } else {
+ l |= (*p&0x3fL) << t;
+ t += 6;
+ }
+ }
+ q = p;
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ *beg = q - block;
+ memcpy(bc, cnt, 48);
+ bc[c] -= l;
+ n_bytes = p - q;
+ if (x == z && a != c && p < end) { // then try the next run
+ int tc;
+ int64_t tl;
+ q = p;
+ rle_dec1(q, tc, tl);
+ if (a == tc)
+ c = tc, n_bytes = q - p, l = tl, z += l, p = q, cnt[tc] += tl;
+ }
+ if (z != x) cnt[c] -= z - x;
+ pre = x - (z - l); p -= n_bytes;
+ if (a == c) { // insert to the same run
+ n_bytes2 = rle_enc1(tmp, c, l + rl);
+ } else if (x == z) { // at the end; append to the existing run
+ p += n_bytes; n_bytes = 0;
+ n_bytes2 = rle_enc1(tmp, a, rl);
+ } else { // break the current run
+ n_bytes2 = rle_enc1(tmp, c, pre);
+ n_bytes2 += rle_enc1(tmp + n_bytes2, a, rl);
+ n_bytes2 += rle_enc1(tmp + n_bytes2, c, l - pre);
+ }
+ if (n_bytes != n_bytes2 && end != p + n_bytes) // size changed
+ memmove(p + n_bytes2, p + n_bytes, end - p - n_bytes);
+ memcpy(p, tmp, n_bytes2);
+ diff = n_bytes2 - n_bytes;
+ }
+ return (*nptr += diff);
+}
+
+int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6])
+{
+ int beg = 0;
+ int64_t bc[6];
+ memset(bc, 0, 48);
+ return rle_insert_cached(block, x, a, rl, cnt, ec, &beg, bc);
+}
+
+void rle_split(uint8_t *block, uint8_t *new_block)
+{
+ int n = *(uint16_t*)block;
+ uint8_t *end = block + 2 + n, *q = block + 2 + (n>>1);
+ while (*q>>6 == 2) --q;
+ memcpy(new_block + 2, q, end - q);
+ *(uint16_t*)new_block = end - q;
+ *(uint16_t*)block = q - block - 2;
+}
+
+void rle_count(const uint8_t *block, int64_t cnt[6])
+{
+ const uint8_t *q = block + 2, *end = q + *(uint16_t*)block;
+ while (q < end) {
+ int c;
+ int64_t l;
+ rle_dec1(q, c, l);
+ cnt[c] += l;
+ }
+}
+
+void rle_print(const uint8_t *block, int expand)
+{
+ const uint16_t *p = (const uint16_t*)block;
+ const uint8_t *q = block + 2, *end = block + 2 + *p;
+ while (q < end) {
+ int c;
+ int64_t l, x;
+ rle_dec1(q, c, l);
+ if (expand) for (x = 0; x < l; ++x) putchar("$ACGTN"[c]);
+ else printf("%c%ld", "$ACGTN"[c], (long)l);
+ }
+ putchar('\n');
+}
+
+void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6])
+{
+ int a;
+ int64_t tot, cnt[6];
+ const uint8_t *p;
+
+ y = y >= x? y : x;
+ tot = ec[0] + ec[1] + ec[2] + ec[3] + ec[4] + ec[5];
+ if (tot == 0) return;
+ if (x <= (tot - y) + (tot>>3)) {
+ int c = 0;
+ int64_t l, z = 0;
+ memset(cnt, 0, 48);
+ p = block + 2;
+ while (z < x) {
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ for (a = 0; a != 6; ++a) cx[a] += cnt[a];
+ cx[c] -= z - x;
+ if (cy) {
+ while (z < y) {
+ rle_dec1(p, c, l);
+ z += l; cnt[c] += l;
+ }
+ for (a = 0; a != 6; ++a) cy[a] += cnt[a];
+ cy[c] -= z - y;
+ }
+ } else {
+#define move_backward(_x) \
+ while (z >= (_x)) { \
+ --p; \
+ if (*p>>6 != 2) { \
+ l |= *p>>7? (int64_t)rle_auxtab[*p>>3&7]>>4 << t : *p>>3; \
+ z -= l; cnt[*p&7] -= l; \
+ l = 0; t = 0; \
+ } else { \
+ l |= (*p&0x3fL) << t; \
+ t += 6; \
+ } \
+ } \
+
+ int t = 0;
+ int64_t l = 0, z = tot;
+ memcpy(cnt, ec, 48);
+ p = block + 2 + *(const uint16_t*)block;
+ if (cy) {
+ move_backward(y)
+ for (a = 0; a != 6; ++a) cy[a] += cnt[a];
+ cy[*p&7] += y - z;
+ }
+ move_backward(x)
+ for (a = 0; a != 6; ++a) cx[a] += cnt[a];
+ cx[*p&7] += x - z;
+
+#undef move_backward
+ }
+}
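
For reference, the rle_* entry points above compose as follows: rle_insert() splices a run into a block, rle_count() recovers the per-symbol totals, and rle_rank2a() uses exactly those totals as its end counts. A minimal sketch under those assumptions (the buffer size, demo string and function name are illustrative):

#include <string.h>
#include <stdint.h>
#include "rle.h"

/* Build the string "AAACCG" ($=0, A=1, C=2, G=3, T=4, N=5) in one empty
 * block, then query symbol counts strictly before position 4. */
static void rle_block_demo(void)  /* illustrative only */
{
    uint8_t block[512];
    int64_t cnt[6], ec[6], cx[6];

    memset(block, 0, sizeof(block)); /* first two bytes = 0: empty block */
    memset(ec, 0, sizeof(ec));       /* running end counts of the block */

    rle_insert(block, 0, 1, 3, cnt, ec); ec[1] += 3; /* A x3 at position 0 */
    rle_insert(block, 3, 2, 2, cnt, ec); ec[2] += 2; /* C x2 at position 3 */
    rle_insert(block, 5, 3, 1, cnt, ec); ec[3] += 1; /* G x1 at position 5 */

    memset(cnt, 0, sizeof(cnt));
    rle_count(block, cnt);           /* cnt now equals ec */

    memset(cx, 0, sizeof(cx));
    rle_rank1a(block, 4, cx, ec);    /* expect cx[1] == 3, cx[2] == 1 */
}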
diff --git a/ext/src/bwa/rle.h b/ext/src/bwa/rle.h
new file mode 100644
index 0000000..0d59484
--- /dev/null
+++ b/ext/src/bwa/rle.h
@@ -0,0 +1,77 @@
+#ifndef RLE6_H_
+#define RLE6_H_
+
+#include <stdint.h>
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#else
+#define LIKELY(x) (x)
+#endif
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+ int rle_insert_cached(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t ec[6], int *beg, int64_t bc[6]);
+ int rle_insert(uint8_t *block, int64_t x, int a, int64_t rl, int64_t cnt[6], const int64_t end_cnt[6]);
+ void rle_split(uint8_t *block, uint8_t *new_block);
+ void rle_count(const uint8_t *block, int64_t cnt[6]);
+ void rle_rank2a(const uint8_t *block, int64_t x, int64_t y, int64_t *cx, int64_t *cy, const int64_t ec[6]);
+ #define rle_rank1a(block, x, cx, ec) rle_rank2a(block, x, -1, cx, 0, ec)
+
+ void rle_print(const uint8_t *block, int expand);
+
+#ifdef __cplusplus
+}
+#endif
+
+/******************
+ *** 43+3 codec ***
+ ******************/
+
+extern const uint8_t rle_auxtab[8];
+
+#define RLE_MIN_SPACE 18
+#define rle_nptr(block) ((uint16_t*)(block))
+
+// decode one run (c,l) and move the pointer p
+#define rle_dec1(p, c, l) do { \
+ (c) = *(p) & 7; \
+ if (LIKELY((*(p)&0x80) == 0)) { \
+ (l) = *(p)++ >> 3; \
+ } else if (LIKELY(*(p)>>5 == 6)) { \
+ (l) = (*(p)&0x18L)<<3L | ((p)[1]&0x3fL); \
+ (p) += 2; \
+ } else { \
+ int n = ((*(p)&0x10) >> 2) + 4; \
+ (l) = *(p)++ >> 3 & 1; \
+ while (--n) (l) = ((l)<<6) | (*(p)++&0x3fL); \
+ } \
+ } while (0)
+
+static inline int rle_enc1(uint8_t *p, int c, int64_t l)
+{
+ if (l < 1LL<<4) {
+ *p = l << 3 | c;
+ return 1;
+ } else if (l < 1LL<<8) {
+ *p = 0xC0 | l >> 6 << 3 | c;
+ p[1] = 0x80 | (l & 0x3f);
+ return 2;
+ } else if (l < 1LL<<19) {
+ *p = 0xE0 | l >> 18 << 3 | c;
+ p[1] = 0x80 | (l >> 12 & 0x3f);
+ p[2] = 0x80 | (l >> 6 & 0x3f);
+ p[3] = 0x80 | (l & 0x3f);
+ return 4;
+ } else {
+ int i, shift = 36;
+ *p = 0xF0 | l >> 42 << 3 | c;
+ for (i = 1; i < 8; ++i, shift -= 6)
+ p[i] = 0x80 | (l>>shift & 0x3f);
+ return 8;
+ }
+}
+
+#endif
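
To make the 43+3 encoding above concrete, here is a small round-trip check built only on rle_enc1() and the rle_dec1 macro; depending on the run length a run occupies 1, 2, 4 or 8 bytes (a sketch for illustration):

#include <assert.h>
#include <stdint.h>
#include "rle.h"

/* Encode one run (symbol c, length l), decode it back, and check that the
 * decoder consumed exactly the bytes the encoder produced. */
static void rle_codec_roundtrip(int c, int64_t l)  /* illustrative only */
{
    uint8_t buf[8];
    const uint8_t *p = buf;
    int dc, n;
    int64_t dl;

    n = rle_enc1(buf, c, l);  /* 1 byte for l < 16, up to 8 bytes for long runs */
    rle_dec1(p, dc, dl);      /* advances p past the encoded run */
    assert(dc == c && dl == l && p == buf + n);
}

For example, rle_codec_roundtrip(2, 5) exercises the 1-byte form and rle_codec_roundtrip(2, 1000) the 4-byte form.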
diff --git a/ext/src/bwa/rope.c b/ext/src/bwa/rope.c
new file mode 100644
index 0000000..3d85981
--- /dev/null
+++ b/ext/src/bwa/rope.c
@@ -0,0 +1,318 @@
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdio.h>
+#include <zlib.h>
+#include "rle.h"
+#include "rope.h"
+
+/*******************
+ *** Memory Pool ***
+ *******************/
+
+#define MP_CHUNK_SIZE 0x100000 // 1MB per chunk
+
+typedef struct { // memory pool for fast and compact memory allocation (no free)
+ int size, i, n_elems;
+ int64_t top, max;
+ uint8_t **mem;
+} mempool_t;
+
+static mempool_t *mp_init(int size)
+{
+ mempool_t *mp;
+ mp = calloc(1, sizeof(mempool_t));
+ mp->size = size;
+ mp->i = mp->n_elems = MP_CHUNK_SIZE / size;
+ mp->top = -1;
+ return mp;
+}
+
+static void mp_destroy(mempool_t *mp)
+{
+ int64_t i;
+ for (i = 0; i <= mp->top; ++i) free(mp->mem[i]);
+ free(mp->mem); free(mp);
+}
+
+static inline void *mp_alloc(mempool_t *mp)
+{
+ if (mp->i == mp->n_elems) {
+ if (++mp->top == mp->max) {
+ mp->max = mp->max? mp->max<<1 : 1;
+ mp->mem = realloc(mp->mem, sizeof(void*) * mp->max);
+ }
+ mp->mem[mp->top] = calloc(mp->n_elems, mp->size);
+ mp->i = 0;
+ }
+ return mp->mem[mp->top] + (mp->i++) * mp->size;
+}
+
+/***************
+ *** B+ rope ***
+ ***************/
+
+rope_t *rope_init(int max_nodes, int block_len)
+{
+ rope_t *rope;
+ rope = calloc(1, sizeof(rope_t));
+ if (block_len < 32) block_len = 32;
+ rope->max_nodes = (max_nodes + 1) >> 1 << 1;
+ rope->block_len = (block_len + 7) >> 3 << 3;
+ rope->node = mp_init(sizeof(rpnode_t) * rope->max_nodes);
+ rope->leaf = mp_init(rope->block_len);
+ rope->root = mp_alloc(rope->node);
+ rope->root->n = 1;
+ rope->root->is_bottom = 1;
+ rope->root->p = mp_alloc(rope->leaf);
+ return rope;
+}
+
+void rope_destroy(rope_t *rope)
+{
+ mp_destroy(rope->node);
+ mp_destroy(rope->leaf);
+ free(rope);
+}
+
+static inline rpnode_t *split_node(rope_t *rope, rpnode_t *u, rpnode_t *v)
+{ // split $v's child. $u is the first node in the bucket. $v and $u are in the same bucket. IMPORTANT: there is always enough room in $u
+ int j, i = v - u;
+ rpnode_t *w; // $w is the sibling of $v
+ if (u == 0) { // only happens at the root; add a new root
+ u = v = mp_alloc(rope->node);
+ v->n = 1; v->p = rope->root; // the new root has the old root as the only child
+ memcpy(v->c, rope->c, 48);
+ for (j = 0; j < 6; ++j) v->l += v->c[j];
+ rope->root = v;
+ }
+ if (i != u->n - 1) // then make room for a new node
+ memmove(v + 2, v + 1, sizeof(rpnode_t) * (u->n - i - 1));
+ ++u->n; w = v + 1;
+ memset(w, 0, sizeof(rpnode_t));
+ w->p = mp_alloc(u->is_bottom? rope->leaf : rope->node);
+ if (u->is_bottom) { // we are at the bottom level; $v->p is a string instead of a node
+ uint8_t *p = (uint8_t*)v->p, *q = (uint8_t*)w->p;
+ rle_split(p, q);
+ rle_count(q, w->c);
+ } else { // $v->p is a node, not a string
+ rpnode_t *p = v->p, *q = w->p; // $v and $w are siblings and thus $p and $q are cousins
+ p->n -= rope->max_nodes>>1;
+ memcpy(q, p + p->n, sizeof(rpnode_t) * (rope->max_nodes>>1));
+ q->n = rope->max_nodes>>1; // NB: this line must come after memcpy(), as $q->n and $q->is_bottom are overwritten by memcpy()
+ q->is_bottom = p->is_bottom;
+ for (i = 0; i < q->n; ++i)
+ for (j = 0; j < 6; ++j)
+ w->c[j] += q[i].c[j];
+ }
+ for (j = 0; j < 6; ++j) // compute $w->l and update $v->c
+ w->l += w->c[j], v->c[j] -= w->c[j];
+ v->l -= w->l; // update $v->l
+ return v;
+}
+
+int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache)
+{ // insert $a after $x symbols in $rope and return rank(a, x)
+ rpnode_t *u = 0, *v = 0, *p = rope->root; // $v is the parent of $p; $u and $v are at the same level and $u is the first node in the bucket
+ int64_t y = 0, z = 0, cnt[6];
+ int n_runs;
+ do { // top-down update. Searching and node splitting are done together in one pass.
+ if (p->n == rope->max_nodes) { // node is full; split
+ v = split_node(rope, u, v); // $v points to the parent of $p; when a new root is added, $v points to the root
+ if (y + v->l < x) // if $v is not long enough after the split, we need to move both $p and its parent $v
+ y += v->l, z += v->c[a], ++v, p = v->p;
+ }
+ u = p;
+ if (v && x - y > v->l>>1) { // then search backwardly for the right node to descend
+ p += p->n - 1; y += v->l; z += v->c[a];
+ for (; y >= x; --p) y -= p->l, z -= p->c[a];
+ ++p;
+ } else for (; y + p->l < x; ++p) y += p->l, z += p->c[a]; // then search forwardly
+ assert(p - u < u->n);
+ if (v) v->c[a] += rl, v->l += rl; // we should not change p->c[a] because this may cause troubles when p's child is split
+ v = p; p = p->p; // descend
+ } while (!u->is_bottom);
+ rope->c[a] += rl; // $rope->c should be updated after the loop as adding a new root needs the old $rope->c counts
+ if (cache) {
+ if (cache->p != (uint8_t*)p) memset(cache, 0, sizeof(rpcache_t));
+ n_runs = rle_insert_cached((uint8_t*)p, x - y, a, rl, cnt, v->c, &cache->beg, cache->bc);
+ cache->p = (uint8_t*)p;
+ } else n_runs = rle_insert((uint8_t*)p, x - y, a, rl, cnt, v->c);
+ z += cnt[a];
+ v->c[a] += rl; v->l += rl; // this should be after rle_insert(); otherwise rle_insert() won't work
+ if (n_runs + RLE_MIN_SPACE > rope->block_len) {
+ split_node(rope, u, v);
+ if (cache) memset(cache, 0, sizeof(rpcache_t));
+ }
+ return z;
+}
+
+static rpnode_t *rope_count_to_leaf(const rope_t *rope, int64_t x, int64_t cx[6], int64_t *rest)
+{
+ rpnode_t *u, *v = 0, *p = rope->root;
+ int64_t y = 0;
+ int a;
+
+ memset(cx, 0, 48);
+ do {
+ u = p;
+ if (v && x - y > v->l>>1) {
+ p += p->n - 1; y += v->l;
+ for (a = 0; a != 6; ++a) cx[a] += v->c[a];
+ for (; y >= x; --p) {
+ y -= p->l;
+ for (a = 0; a != 6; ++a) cx[a] -= p->c[a];
+ }
+ ++p;
+ } else {
+ for (; y + p->l < x; ++p) {
+ y += p->l;
+ for (a = 0; a != 6; ++a) cx[a] += p->c[a];
+ }
+ }
+ v = p; p = p->p;
+ } while (!u->is_bottom);
+ *rest = x - y;
+ return v;
+}
+
+void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy)
+{
+ rpnode_t *v;
+ int64_t rest;
+ v = rope_count_to_leaf(rope, x, cx, &rest);
+ if (y < x || cy == 0) {
+ rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
+ } else if (rest + (y - x) <= v->l) {
+ memcpy(cy, cx, 48);
+ rle_rank2a((const uint8_t*)v->p, rest, rest + (y - x), cx, cy, v->c);
+ } else {
+ rle_rank1a((const uint8_t*)v->p, rest, cx, v->c);
+ v = rope_count_to_leaf(rope, y, cy, &rest);
+ rle_rank1a((const uint8_t*)v->p, rest, cy, v->c);
+ }
+}
+
+/*********************
+ *** Rope iterator ***
+ *********************/
+
+void rope_itr_first(const rope_t *rope, rpitr_t *i)
+{
+ memset(i, 0, sizeof(rpitr_t));
+ i->rope = rope;
+ for (i->pa[i->d] = rope->root; !i->pa[i->d]->is_bottom;) // descend to the leftmost leaf
+ ++i->d, i->pa[i->d] = i->pa[i->d - 1]->p;
+}
+
+const uint8_t *rope_itr_next_block(rpitr_t *i)
+{
+ const uint8_t *ret;
+ assert(i->d < ROPE_MAX_DEPTH); // a B+ tree should not be that tall
+ if (i->d < 0) return 0;
+ ret = (uint8_t*)i->pa[i->d][i->ia[i->d]].p;
+ while (i->d >= 0 && ++i->ia[i->d] == i->pa[i->d]->n) i->ia[i->d--] = 0; // backtracking
+ if (i->d >= 0)
+ while (!i->pa[i->d]->is_bottom) // descend to the leftmost leaf
+ ++i->d, i->pa[i->d] = i->pa[i->d - 1][i->ia[i->d - 1]].p;
+ return ret;
+}
+
+/***********
+ *** I/O ***
+ ***********/
+
+void rope_print_node(const rpnode_t *p)
+{
+ if (p->is_bottom) {
+ int i;
+ putchar('(');
+ for (i = 0; i < p->n; ++i) {
+ uint8_t *block = (uint8_t*)p[i].p;
+ const uint8_t *q = block + 2, *end = block + 2 + *rle_nptr(block);
+ if (i) putchar(',');
+ while (q < end) {
+ int c = 0;
+ int64_t j, l;
+ rle_dec1(q, c, l);
+ for (j = 0; j < l; ++j) putchar("$ACGTN"[c]);
+ }
+ }
+ putchar(')');
+ } else {
+ int i;
+ putchar('(');
+ for (i = 0; i < p->n; ++i) {
+ if (i) putchar(',');
+ rope_print_node(p[i].p);
+ }
+ putchar(')');
+ }
+}
+
+void rope_dump_node(const rpnode_t *p, FILE *fp)
+{
+ int16_t i, n = p->n;
+ uint8_t is_bottom = p->is_bottom;
+ fwrite(&is_bottom, 1, 1, fp);
+ fwrite(&n, 2, 1, fp);
+ if (is_bottom) {
+ for (i = 0; i < n; ++i) {
+ fwrite(p[i].c, 8, 6, fp);
+ fwrite(p[i].p, 1, *rle_nptr(p[i].p) + 2, fp);
+ }
+ } else {
+ for (i = 0; i < p->n; ++i)
+ rope_dump_node(p[i].p, fp);
+ }
+}
+
+void rope_dump(const rope_t *r, FILE *fp)
+{
+ fwrite(&r->max_nodes, 4, 1, fp);
+ fwrite(&r->block_len, 4, 1, fp);
+ rope_dump_node(r->root, fp);
+}
+
+rpnode_t *rope_restore_node(const rope_t *r, FILE *fp, int64_t c[6])
+{
+ uint8_t is_bottom, a;
+ int16_t i, n;
+ rpnode_t *p;
+ fread(&is_bottom, 1, 1, fp);
+ fread(&n, 2, 1, fp);
+ p = mp_alloc(r->node);
+ p->is_bottom = is_bottom, p->n = n;
+ if (is_bottom) {
+ for (i = 0; i < n; ++i) {
+ uint16_t *q;
+ p[i].p = mp_alloc(r->leaf);
+ q = rle_nptr(p[i].p);
+ fread(p[i].c, 8, 6, fp);
+ fread(q, 2, 1, fp);
+ fread(q + 1, 1, *q, fp);
+ }
+ } else {
+ for (i = 0; i < n; ++i)
+ p[i].p = rope_restore_node(r, fp, p[i].c);
+ }
+ memset(c, 0, 48);
+ for (i = 0; i < n; ++i) {
+ p[i].l = 0;
+ for (a = 0; a < 6; ++a)
+ c[a] += p[i].c[a], p[i].l += p[i].c[a];
+ }
+ return p;
+}
+
+rope_t *rope_restore(FILE *fp)
+{
+ rope_t *r;
+ r = calloc(1, sizeof(rope_t));
+ fread(&r->max_nodes, 4, 1, fp);
+ fread(&r->block_len, 4, 1, fp);
+ r->node = mp_init(sizeof(rpnode_t) * r->max_nodes);
+ r->leaf = mp_init(r->block_len);
+ r->root = rope_restore_node(r, fp, r->c);
+ return r;
+}
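
The iterator above yields leaf blocks left to right, so combining it with rle_dec1() is enough to stream the stored string back out, the same pattern rope_print_node() uses per node. A minimal sketch over a populated rope (the function name is illustrative):

#include <stdio.h>
#include <stdint.h>
#include "rle.h"
#include "rope.h"

/* Print every symbol stored in the rope by walking its leaf blocks in order. */
static void rope_print_symbols(const rope_t *r)  /* illustrative only */
{
    rpitr_t itr;
    const uint8_t *block;

    rope_itr_first(r, &itr);
    while ((block = rope_itr_next_block(&itr)) != 0) {
        const uint8_t *q = block + 2, *end = block + 2 + *(const uint16_t*)block;
        while (q < end) {
            int c;
            int64_t i, l;
            rle_dec1(q, c, l);                      /* one run: symbol c, length l */
            for (i = 0; i < l; ++i) putchar("$ACGTN"[c]);
        }
    }
    putchar('\n');
}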
diff --git a/ext/src/bwa/rope.h b/ext/src/bwa/rope.h
new file mode 100644
index 0000000..843a408
--- /dev/null
+++ b/ext/src/bwa/rope.h
@@ -0,0 +1,58 @@
+#ifndef ROPE_H_
+#define ROPE_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+#define ROPE_MAX_DEPTH 80
+#define ROPE_DEF_MAX_NODES 64
+#define ROPE_DEF_BLOCK_LEN 512
+
+typedef struct rpnode_s {
+ struct rpnode_s *p; // child; at the bottom level, $p points to a string with the first 2 bytes giving the number of runs (#runs)
+ uint64_t l:54, n:9, is_bottom:1; // $n and $is_bottom are only set for the first node in a bucket
+ int64_t c[6]; // marginal counts
+} rpnode_t;
+
+typedef struct {
+ int32_t max_nodes, block_len; // both MUST BE even numbers
+ int64_t c[6]; // marginal counts
+ rpnode_t *root;
+ void *node, *leaf; // memory pool
+} rope_t;
+
+typedef struct {
+ const rope_t *rope; // the rope
+ const rpnode_t *pa[ROPE_MAX_DEPTH]; // parent nodes
+ int ia[ROPE_MAX_DEPTH]; // index in the parent nodes
+ int d; // the current depth in the B+-tree
+} rpitr_t;
+
+typedef struct {
+ int beg;
+ int64_t bc[6];
+ uint8_t *p;
+} rpcache_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ rope_t *rope_init(int max_nodes, int block_len);
+ void rope_destroy(rope_t *rope);
+ int64_t rope_insert_run(rope_t *rope, int64_t x, int a, int64_t rl, rpcache_t *cache);
+ void rope_rank2a(const rope_t *rope, int64_t x, int64_t y, int64_t *cx, int64_t *cy);
+ #define rope_rank1a(rope, x, cx) rope_rank2a(rope, x, -1, cx, 0)
+
+ void rope_itr_first(const rope_t *rope, rpitr_t *i);
+ const uint8_t *rope_itr_next_block(rpitr_t *i);
+
+ void rope_print_node(const rpnode_t *p);
+ void rope_dump(const rope_t *r, FILE *fp);
+ rope_t *rope_restore(FILE *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
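
A minimal end-to-end sketch of the public API declared above: runs are appended with rope_insert_run() and per-symbol ranks are queried with rope_rank1a(); the symbol codes ($=0, A=1, C=2, G=3, T=4, N=5) follow the sources above, the demo values are illustrative:

#include <string.h>
#include <stdint.h>
#include "rope.h"

int main(void)  /* illustrative only */
{
    int64_t cx[6];
    rpcache_t cache;
    rope_t *r = rope_init(ROPE_DEF_MAX_NODES, ROPE_DEF_BLOCK_LEN);

    memset(&cache, 0, sizeof(cache));     /* the cache may also be passed as NULL */
    /* Append "AAACCG": each run is inserted after all symbols stored so far. */
    rope_insert_run(r, 0, 1, 3, &cache);  /* A x3 at position 0 */
    rope_insert_run(r, 3, 2, 2, &cache);  /* C x2 at position 3 */
    rope_insert_run(r, 5, 3, 1, &cache);  /* G x1 at position 5 */

    rope_rank1a(r, 4, cx);                /* symbol counts strictly before position 4 */
    /* expected: cx[1] == 3 (A), cx[2] == 1 (C), all other entries 0 */

    rope_destroy(r);
    return 0;
}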
diff --git a/ext/src/bwa/utils.c b/ext/src/bwa/utils.c
new file mode 100644
index 0000000..7bf8fb7
--- /dev/null
+++ b/ext/src/bwa/utils.c
@@ -0,0 +1,295 @@
+/* The MIT License
+
+ Copyright (c) 2008 Genome Research Ltd (GRL).
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3 at sanger.ac.uk> */
+#define FSYNC_ON_FLUSH
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <errno.h>
+#ifdef FSYNC_ON_FLUSH
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+#include <sys/resource.h>
+#include <sys/time.h>
+#include "bwa/utils.h"
+
+#include "ksort.h"
+#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
+KSORT_INIT(128, pair64_t, pair64_lt)
+KSORT_INIT(64, uint64_t, ks_lt_generic)
+
+#include "kseq.h"
+KSEQ_INIT2(, gzFile, err_gzread)
+
+/********************
+ * System utilities *
+ ********************/
+
+FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
+{
+ FILE *fp = 0;
+ if (strcmp(fn, "-") == 0)
+ return (strstr(mode, "r"))? stdin : stdout;
+ if ((fp = fopen(fn, mode)) == 0) {
+ err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
+ }
+ return fp;
+}
+
+FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
+{
+ if (freopen(fn, mode, fp) == 0) {
+ err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno));
+ }
+ return fp;
+}
+
+gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
+{
+ gzFile fp;
+ if (strcmp(fn, "-") == 0) {
+ fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode);
+ /* According to zlib.h, this is the only reason gzdopen can fail */
+ if (!fp) err_fatal(func, "Out of memory");
+ return fp;
+ }
+ if ((fp = gzopen(fn, mode)) == 0) {
+ err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory");
+ }
+ return fp;
+}
+
+void err_fatal(const char *header, const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "[%s] ", header);
+ vfprintf(stderr, fmt, args);
+ fprintf(stderr, "\n");
+ va_end(args);
+ exit(EXIT_FAILURE);
+}
+
+void err_fatal_core(const char *header, const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "[%s] ", header);
+ vfprintf(stderr, fmt, args);
+ fprintf(stderr, " Abort!\n");
+ va_end(args);
+ abort();
+}
+
+void _err_fatal_simple(const char *func, const char *msg)
+{
+ fprintf(stderr, "[%s] %s\n", func, msg);
+ exit(EXIT_FAILURE);
+}
+
+void _err_fatal_simple_core(const char *func, const char *msg)
+{
+ fprintf(stderr, "[%s] %s Abort!\n", func, msg);
+ abort();
+}
+
+size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+ size_t ret = fwrite(ptr, size, nmemb, stream);
+ if (ret != nmemb)
+ _err_fatal_simple("fwrite", strerror(errno));
+ return ret;
+}
+
+size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+ size_t ret = fread(ptr, size, nmemb, stream);
+ if (ret != nmemb)
+ {
+ _err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file");
+ }
+ return ret;
+}
+
+int err_gzread(gzFile file, void *ptr, unsigned int len)
+{
+ int ret = gzread(file, ptr, len);
+
+ if (ret < 0)
+ {
+ int errnum = 0;
+ const char *msg = gzerror(file, &errnum);
+ _err_fatal_simple("gzread", Z_ERRNO == errnum ? strerror(errno) : msg);
+ }
+
+ return ret;
+}
+
+int err_fseek(FILE *stream, long offset, int whence)
+{
+ int ret = fseek(stream, offset, whence);
+ if (0 != ret)
+ {
+ _err_fatal_simple("fseek", strerror(errno));
+ }
+ return ret;
+}
+
+long err_ftell(FILE *stream)
+{
+ long ret = ftell(stream);
+ if (-1 == ret)
+ {
+ _err_fatal_simple("ftell", strerror(errno));
+ }
+ return ret;
+}
+
+int err_printf(const char *format, ...)
+{
+ va_list arg;
+ int done;
+ va_start(arg, format);
+ done = vfprintf(stdout, format, arg);
+ int saveErrno = errno;
+ va_end(arg);
+ if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno));
+ return done;
+}
+
+int err_fprintf(FILE *stream, const char *format, ...)
+{
+ va_list arg;
+ int done;
+ va_start(arg, format);
+ done = vfprintf(stream, format, arg);
+ int saveErrno = errno;
+ va_end(arg);
+ if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno));
+ return done;
+}
+
+int err_fputc(int c, FILE *stream)
+{
+ int ret = putc(c, stream);
+ if (EOF == ret)
+ {
+ _err_fatal_simple("fputc", strerror(errno));
+ }
+
+ return ret;
+}
+
+int err_fputs(const char *s, FILE *stream)
+{
+ int ret = fputs(s, stream);
+ if (EOF == ret)
+ {
+ _err_fatal_simple("fputs", strerror(errno));
+ }
+
+ return ret;
+}
+
+int err_puts(const char *s)
+{
+ int ret = puts(s);
+ if (EOF == ret)
+ {
+ _err_fatal_simple("puts", strerror(errno));
+ }
+
+ return ret;
+}
+
+int err_fflush(FILE *stream)
+{
+ int ret = fflush(stream);
+ if (ret != 0) _err_fatal_simple("fflush", strerror(errno));
+
+#ifdef FSYNC_ON_FLUSH
+ /* Calling fflush() ensures that all the data has made it to the
+ kernel buffers, but this may not be sufficient for remote filesystems
+ (e.g. NFS, lustre) as an error may still occur while the kernel
+ is copying the buffered data to the file server. To be sure of
+ catching these errors, we need to call fsync() on the file
+ descriptor, but only if it is a regular file. */
+ {
+ struct stat sbuf;
+ if (0 != fstat(fileno(stream), &sbuf))
+ _err_fatal_simple("fstat", strerror(errno));
+
+ if (S_ISREG(sbuf.st_mode))
+ {
+ if (0 != fsync(fileno(stream)))
+ _err_fatal_simple("fsync", strerror(errno));
+ }
+ }
+#endif
+ return ret;
+}
+
+int err_fclose(FILE *stream)
+{
+ int ret = fclose(stream);
+ if (ret != 0) _err_fatal_simple("fclose", strerror(errno));
+ return ret;
+}
+
+int err_gzclose(gzFile file)
+{
+ int ret = gzclose(file);
+ if (Z_OK != ret)
+ {
+ _err_fatal_simple("gzclose", Z_ERRNO == ret ? strerror(errno) : zError(ret));
+ }
+
+ return ret;
+}
+
+/*********
+ * Timer *
+ *********/
+
+double cputime()
+{
+ struct rusage r;
+ getrusage(RUSAGE_SELF, &r);
+ return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
+}
+
+double realtime()
+{
+ struct timeval tp;
+ struct timezone tzp;
+ gettimeofday(&tp, &tzp);
+ return tp.tv_sec + tp.tv_usec * 1e-6;
+}
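
The err_* wrappers above turn silent I/O failures into immediate fatal errors naming the failing call, so callers can skip per-call checks, and cputime()/realtime() are the timers used for progress reports. A small sketch combining a few of them (the file name and function name are illustrative):

#include <stdio.h>
#include "bwa/utils.h"

/* Write a short message to a file with checked I/O and report elapsed time. */
static void write_checked(const char *fn)  /* illustrative only */
{
    double t_cpu = cputime(), t_real = realtime();
    FILE *fp = err_xopen_core(__func__, fn, "w");

    err_fputs("hello\n", fp);
    err_fflush(fp);   /* fflush plus fsync when fp refers to a regular file */
    err_fclose(fp);
    err_fprintf(stderr, "[%s] %.3f CPU sec, %.3f real sec\n",
                __func__, cputime() - t_cpu, realtime() - t_real);
}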
diff --git a/ext/src/bwa/xa2multi.pl b/ext/src/bwa/xa2multi.pl
new file mode 100755
index 0000000..fff50fd
--- /dev/null
+++ b/ext/src/bwa/xa2multi.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+while (<>) {
+ if (/\tXA:Z:(\S+)/) {
+ my $l = $1;
+ print;
+ my @t = split("\t");
+ while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) {
+ my $mchr = ($t[6] eq $1)? '=' : $t[6]; # FIXME: TLEN/ISIZE is not calculated!
+ my $seq = $t[9];
+ my $phred = $t[10];
+ # if alternative alignment has other orientation than primary,
+ # then print the reverse (complement) of sequence and phred string
+ if ((($t[1]&0x10)>0) xor ($2<0)) {
+ $seq = reverse $seq;
+ $seq =~ tr/ACGTacgt/TGCAtgca/;
+ $phred = reverse $phred;
+ }
+ print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, $seq, $phred, "NM:i:$4"), "\n");
+ }
+ } else { print; }
+}
diff --git a/ext/src/getopt_pp/CMakeLists.txt b/ext/src/getopt_pp/CMakeLists.txt
new file mode 100644
index 0000000..e594e19
--- /dev/null
+++ b/ext/src/getopt_pp/CMakeLists.txt
@@ -0,0 +1,5 @@
+project(getopt_pp CXX)
+
+add_library(getopt_pp STATIC
+ getopt_pp.cpp)
+
diff --git a/ext/src/getopt_pp/getopt_pp.cpp b/ext/src/getopt_pp/getopt_pp.cpp
new file mode 100644
index 0000000..485b53a
--- /dev/null
+++ b/ext/src/getopt_pp/getopt_pp.cpp
@@ -0,0 +1,206 @@
+/*
+GetOpt_pp: Yet another C++ version of getopt.
+ Copyright (C) 2007, 2008 Daniel Gutson, FuDePAN
+
+ This file is part of GetOpt_pp.
+
+ GetOpt_pp is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ GetOpt_pp is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <unistd.h>
+#include "getopt_pp/getopt_pp.h"
+
+#if __APPLE__
+extern char** environ;
+#endif
+
+namespace GetOpt {
+
+const char GetOpt_pp::EMPTY_OPTION = 0;
+
+GETOPT_INLINE void GetOpt_pp::_init_flags()
+{
+ std::stringstream ss;
+ _flags = ss.flags();
+}
+
+GETOPT_INLINE void GetOpt_pp::_parse(int argc, char* argv[])
+{
+ OptionData* currentData = NULL;
+ _app_name = argv[0];
+
+ // parse arguments by their '-' or '--':
+ // (this will be a state machine soon)
+ for(int i=1; i < argc; i++)
+ {
+ const char current = argv[i][0];
+ const char next = argv[i][1];
+
+ if (current == '-' && (isalpha(next) || next == '-' ) )
+ {
+ // see what's next, differentiate whether it's short or long:
+ if (next == '-' && argv[i][2] != 0)
+ {
+ // long option
+ currentData = &_longOps[&argv[i][2]];
+ }
+ else
+ {
+ // short option
+ // iterate over all of them, keeping the last one in currentData
+ // (so the intermediates will generate 'existent' arguments, as of '-abc')
+ size_t j=1;
+ do
+ {
+ currentData = &_shortOps[argv[i][j]];
+ j++;
+ }
+ while (argv[i][j] != 0);
+ }
+ }
+ else
+ {
+ // save value!
+ if (currentData == NULL)
+ currentData = &_shortOps[EMPTY_OPTION];
+
+ currentData->args.push_back(argv[i]);
+ }
+ }
+
+ _last = _Option::OK; // TODO: IMPROVE!!
+}
+
+GETOPT_INLINE void GetOpt_pp::_parse_env()
+{
+ // this will be optimized in version 3
+ std::string var_name;
+ std::string var_value;
+ size_t var=0;
+ std::string::size_type pos;
+ OptionData* data;
+
+ while (environ[var] != NULL)
+ {
+ var_name = environ[var];
+ pos = var_name.find('=');
+
+ if (pos != std::string::npos)
+ {
+ var_value = var_name.substr(pos+1);
+ var_name = var_name.substr(0, pos);
+
+ if (_longOps.find(var_name) == _longOps.end())
+ {
+ data = &_longOps[var_name];
+ data->args.push_back(var_value);
+ data->flags = OptionData::Envir;
+ }
+ }
+ else
+ (data = &_longOps[var_name])->flags = OptionData::Envir;
+
+ var++;
+ }
+}
+
+GETOPT_INLINE GetOpt_pp::GetOpt_pp(int argc, char* argv[])
+ : _exc(std::ios_base::goodbit)
+{
+ _init_flags();
+ _parse(argc, argv);
+}
+
+GETOPT_INLINE GetOpt_pp::GetOpt_pp(int argc, char* argv[], _EnvTag)
+{
+ _init_flags();
+ _parse(argc, argv);
+ _parse_env();
+}
+
+GETOPT_INLINE GetOpt_pp& GetOpt_pp::operator >> (const _Option& opt) throw (GetOptEx)
+{
+ if (_last != _Option::ParsingError)
+ {
+ _last = opt(_shortOps, _longOps, _flags);
+
+ switch(_last)
+ {
+ case _Option::OK:
+ break;
+
+ case _Option::OptionNotFound:
+ if (_exc & std::ios_base::eofbit )
+ throw OptionNotFoundEx();
+ break;
+
+ case _Option::BadType:
+ if (_exc & std::ios_base::failbit )
+ throw InvalidFormatEx();
+ break;
+
+ case _Option::NoArgs:
+ if (_exc & std::ios_base::eofbit )
+ throw ArgumentNotFoundEx();
+ break;
+
+ case _Option::TooManyArgs:
+ if (_exc & std::ios_base::failbit )
+ throw TooManyArgumentsEx();
+ break;
+
+ case _Option::OptionNotFound_NoEx:
+ break; // Ok, it will be read by casting to bool
+
+ case _Option::ParsingError: break; // just to disable warning
+ }
+ }
+ else if (_exc & std::ios_base::failbit )
+ throw ParsingErrorEx();
+
+ return *this;
+}
+
+GETOPT_INLINE GetOpt_pp& GetOpt_pp::operator >> (std::ios_base& (*iomanip)(std::ios_base&))
+{
+ std::stringstream ss;
+ ss.flags(_flags);
+ _flags = (ss << iomanip).flags();
+ return *this;
+}
+
+GETOPT_INLINE bool GetOpt_pp::options_remain() const
+{
+ bool remain = false;
+ ShortOptions::const_iterator it = _shortOps.begin();
+ while (it != _shortOps.end() && !remain)
+ {
+ remain = (it->second.flags == OptionData::CmdLine_NotExtracted);
+ ++it;
+ }
+
+ if (!remain)
+ {
+ LongOptions::const_iterator it = _longOps.begin();
+ while (it != _longOps.end() && !remain)
+ {
+ remain = (it->second.flags == OptionData::CmdLine_NotExtracted);
+ ++it;
+ }
+ }
+
+ return remain;
+}
+
+}
diff --git a/ext/src/llvm/CMakeLists.txt b/ext/src/llvm/CMakeLists.txt
index 6993c91..f34f99a 100644
--- a/ext/src/llvm/CMakeLists.txt
+++ b/ext/src/llvm/CMakeLists.txt
@@ -19,6 +19,9 @@ check_symbol_exists(backtrace "execinfo.h" HAVE_BACKTRACE)
find_library(DL_LIB NAMES "dl")
+add_definitions(-D__STDC_CONSTANT_MACROS)
+add_definitions(-D__STDC_LIMIT_MACROS)
+
# FIXME: Signal handler return type, currently hardcoded to 'void'
set(RETSIGTYPE void)
diff --git a/manual.html b/manual.html
index 6b02feb..130127b 100644
--- a/manual.html
+++ b/manual.html
@@ -1,6 +1,6 @@
<html>
<head>
- <title>SPAdes 3.9.1 Manual</title>
+ <title>SPAdes 3.10.0 Manual</title>
<style type="text/css">
.code {
background-color: lightgray;
@@ -8,7 +8,7 @@
</style>
</head>
<body>
-<h1>SPAdes 3.9.1 Manual</h1>
+<h1>SPAdes 3.10.0 Manual</h1>
1. <a href="#sec1">About SPAdes</a><br>
1.1. <a href="#sec1.1">Supported data types</a><br>
@@ -34,17 +34,17 @@
<a name="sec1"></a>
<h2>1. About SPAdes</h2>
<p>
- SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. This manual will help you to install and run SPAdes.
-SPAdes version 3.9.1 was released under GPLv2 on December 4, 2016 and can be downloaded from <a href="http://cab.spbu.ru/software/spades/" target="_blank">http://cab.spbu.ru/software/spades/</a>.
+ SPAdes – St. Petersburg genome assembler – is an assembly toolkit containing various assembly pipelines. This manual will help you to install and run SPAdes.
+SPAdes version 3.10.0 was released under GPLv2 on January 27, 2017 and can be downloaded from <a href="http://cab.spbu.ru/software/spades/" target="_blank">http://cab.spbu.ru/software/spades/</a>.
<a name="sec1.1"></a>
<h3>1.1 Supported data types</h3>
<p>
The current version of SPAdes works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. You can also provide additional contigs that will be used as long reads.
<p>
- Version 3.9.1 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
+ Version 3.10.0 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note that SPAdes was initially designed for small genomes. It was tested on bacterial (both single-cell MDA and standard isolates), fungal and other small genomes. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
<p>
- SPAdes 3.9.1 includes the following additional pipelines:
+ SPAdes 3.10.0 includes the following additional pipelines:
<ul>
<li>dipSPAdes – a module for assembling highly polymorphic diploid genomes (see <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a>).</li>
<li>metaSPAdes – a pipeline for metagenomic data sets (see <a href="#meta">metaSPAdes options</a>). </li>
@@ -76,7 +76,7 @@ SPAdes comes in several separate modules:
<li> <a href="http://spades.bioinf.spbau.ru/spades_test_datasets/ecoli_sc/" target="_blank">MDA single-cell <i>E. coli</i></a>; 6.3 Gb, 29M reads, 2x100bp, insert size ~ 270bp </li>
</ul>
<p>
- We ran SPAdes with default parameters using 16 threads on a server with Intel Xeon 2.27GHz processors. BayesHammer runs in approximately 30-40 minutes and takes up to 8Gb of RAM to perform read error correction on each data set. Assembly takes about 15 minutes for the <i>E. coli</i> isolate data set and 30 minutes for the <i>E. coli</i> single-cell data set. Both data sets require about 9Gb of RAM (see notes below). MismatchCorrector runs for about 25 minutes on both data sets, and r [...]
+ We ran SPAdes with default parameters using 16 threads on a server with Intel Xeon 2.27GHz processors and SSD hard drive. BayesHammer runs in approximately half an hour and takes up to 8Gb of RAM to perform read error correction on each data set. Assembly takes about 10 minutes for the <i>E. coli</i> isolate data set and 20 minutes for the <i>E. coli</i> single-cell data set. Both data sets require about 8Gb of RAM (see notes below). MismatchCorrector runs for about 15 minutes on bot [...]
<p>
<table border="1" cellpadding="4" cellspacing="0">
@@ -98,42 +98,42 @@ SPAdes comes in several separate modules:
<tr>
<td> BayesHammer </td>
+ <td align="center"> 29m </td>
+ <td align="center"> 7.1 </td>
+ <td align="center"> 11 </td>
<td align="center"> 34m </td>
- <td align="center"> 7.7 </td>
- <td align="center"> 8.4 </td>
- <td align="center"> 40m </td>
- <td align="center"> 7.5 </td>
+ <td align="center"> 7.6 </td>
<td align="center"> 8.8 </td>
</tr>
<tr>
<td> SPAdes </td>
- <td align="center"> 16m </td>
- <td align="center"> 8.6 </td>
+ <td align="center"> 11m </td>
+ <td align="center"> 8.4 </td>
<td align="center"> 1.6 </td>
- <td align="center"> 28m </td>
- <td align="center"> 8.6 </td>
- <td align="center"> 2.7 </td>
+ <td align="center"> 17m </td>
+ <td align="center"> 8 </td>
+ <td align="center"> 3.0 </td>
</tr>
<tr>
<td> MismatchCorrector </td>
- <td align="center"> 22m </td>
+ <td align="center"> 13m </td>
<td align="center"> 1.8 </td>
- <td align="center"> 21.8 </td>
- <td align="center"> 26m </td>
+ <td align="center"> 27.1 </td>
+ <td align="center"> 16m </td>
<td align="center"> 1.8 </td>
- <td align="center"> 22.9 </td>
+ <td align="center"> 25.5 </td>
</tr>
<tr>
<td> Whole pipeline </td>
- <td align="center"> 1h 12m </td>
- <td align="center"> 8.6 </td>
- <td align="center"> 24.2 </td>
- <td align="center"> 1h 34m </td>
- <td align="center"> 8.6 </td>
- <td align="center"> 25.5 </td>
+ <td align="center"> 53m </td>
+ <td align="center"> 8.4 </td>
+ <td align="center"> 29.6 </td>
+ <td align="center"> 1h 7m </td>
+ <td align="center"> 8 </td>
+ <td align="center"> 28.3 </td>
</tr>
</table>
@@ -143,7 +143,7 @@ SPAdes comes in several separate modules:
<li> Running SPAdes without preliminary read error correction (e.g. without BayesHammer or IonHammer) will likely require more time and memory. </li>
<li> Each module removes its temporary files as soon as it finishes. </li>
<li> SPAdes uses 512 Mb per thread for buffers, which results in higher memory consumption. If you set memory limit manually, SPAdes will use smaller buffers and thus less RAM. </li>
- <li> Performance statistics is given for SPAdes version 3.9.1. </li>
+ <li> Performance statistics are given for SPAdes version 3.10.0. </li>
</ul>
@@ -157,13 +157,13 @@ SPAdes comes in several separate modules:
<h3>2.1 Downloading SPAdes Linux binaries</h3>
<p>
- To download <a href="http://cab.spbu.ru/files/release3.9.1/SPAdes-3.9.1-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
+ To download <a href="http://cab.spbu.ru/files/release3.10.0/SPAdes-3.10.0-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- wget http://cab.spbu.ru/files/release3.9.1/SPAdes-3.9.1-Linux.tar.gz
- tar -xzf SPAdes-3.9.1-Linux.tar.gz
- cd SPAdes-3.9.1-Linux/bin/
+ wget http://cab.spbu.ru/files/release3.10.0/SPAdes-3.10.0-Linux.tar.gz
+ tar -xzf SPAdes-3.10.0-Linux.tar.gz
+ cd SPAdes-3.10.0-Linux/bin/
</code>
</pre>
@@ -192,13 +192,13 @@ SPAdes comes in several separate modules:
<h3>2.2 Downloading SPAdes binaries for Mac</h3>
<p>
- To obtain <a href="http://cab.spbu.ru/files/release3.9.1/SPAdes-3.9.1-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
+ To obtain <a href="http://cab.spbu.ru/files/release3.10.0/SPAdes-3.10.0-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- curl http://cab.spbu.ru/files/release3.9.1/SPAdes-3.9.1-Darwin.tar.gz -o SPAdes-3.9.1-Darwin.tar.gz
- tar -zxf SPAdes-3.9.1-Darwin.tar.gz
- cd SPAdes-3.9.1-Darwin/bin/
+ curl http://cab.spbu.ru/files/release3.10.0/SPAdes-3.10.0-Darwin.tar.gz -o SPAdes-3.10.0-Darwin.tar.gz
+ tar -zxf SPAdes-3.10.0-Darwin.tar.gz
+ cd SPAdes-3.10.0-Darwin/bin/
</code>
</pre>
@@ -229,20 +229,20 @@ SPAdes comes in several separate modules:
<p>
If you wish to compile SPAdes by yourself you will need the following libraries to be pre-installed:
<ul>
- <li>g++ (version 4.7 or higher)</li>
+ <li>g++ (version 4.8.2 or higher)</li>
<li>cmake (version 2.8.12 or higher)</li>
<li>zlib</li>
<li>libbz2</li>
</ul>
<p>
- If you meet these requirements, you can download the <a href="http://cab.spbu.ru/files/release3.9.1/SPAdes-3.9.1.tar.gz">SPAdes source code</a>:
+ If you meet these requirements, you can download the <a href="http://cab.spbu.ru/files/release3.10.0/SPAdes-3.10.0.tar.gz">SPAdes source code</a>:
<pre class="code">
<code>
- wget http://cab.spbu.ru/files/release3.9.1/SPAdes-3.9.1.tar.gz
- tar -xzf SPAdes-3.9.1.tar.gz
- cd SPAdes-3.9.1
+ wget http://cab.spbu.ru/files/release3.10.0/SPAdes-3.10.0.tar.gz
+ tar -xzf SPAdes-3.10.0.tar.gz
+ cd SPAdes-3.10.0
</code>
</pre>
@@ -325,18 +325,23 @@ SPAdes comes in several separate modules:
<pre class="code">
<code>
-===== Assembling finished.
+===== Assembling finished. Used k-mer sizes: 21, 33, 55
* Corrected reads are in spades_test/corrected/
* Assembled contigs are in spades_test/contigs.fasta
* Assembled scaffolds are in spades_test/scaffolds.fasta
+ * Assembly graph is in spades_test/assembly_graph.fastg
+ * Assembly graph in GFA format is in spades_test/assembly_graph.gfa
+ * Paths in the assembly graph corresponding to the contigs are in spades_test/contigs.paths
+ * Paths in the assembly graph corresponding to the scaffolds are in spades_test/scaffolds.paths
======= SPAdes pipeline finished.
-SPAdes log can be found here: /home/andrey/ablab/algorithmic-biology/assembler/spades_test/spades.log
+========= TEST PASSED CORRECTLY.
-Thank you for using SPAdes!
+SPAdes log can be found here: spades_test/spades.log
+Thank you for using SPAdes!
</code>
</pre>
@@ -349,7 +354,7 @@ Thank you for using SPAdes!
SPAdes takes as input paired-end reads, mate-pairs and single (unpaired) reads in FASTA and FASTQ. For IonTorrent data SPAdes also supports unpaired reads in unmapped BAM format (like the one produced by Torrent Server). However, in order to run read error correction, reads should be in FASTQ or BAM format. Sanger, Oxford Nanopore and PacBio CLR reads can be provided in both formats since SPAdes does not run error correction for these types of data.
<p>
- To run SPAdes 3.9.1 you need at least one library of the following types:
+ To run SPAdes 3.10.0 you need at least one library of the following types:
<ul>
<li>Illumina paired-end/high-quality mate-pairs/unpaired reads</li>
<li>IonTorrent paired-end/high-quality mate-pairs/unpaired reads</li>
@@ -620,7 +625,7 @@ In comparison to the <code>--continue</code> option, you can change some of the
</p>
<p>
- <code>--pe<b><#></b>-<b><or></b> <file_name> </code><br>
+ <code>--pe<b><#></b>-<b><or></b> </code><br>
Orientation of reads for paired-end library number <code><b><#></b></code> (<code><b><#></b></code> = 1,2,..,9; <code><b><or></b></code> = "fr","rf","ff"). <br>
The default orientation for paired-end libraries is forward-reverse. For example, to specify reverse-forward orientation for the second paired-end library, you should use the flag:
<code>--pe2-rf </code><br>
@@ -642,7 +647,7 @@ In comparison to the <code>--continue</code> option, you can change some of the
File with right reads for mate-pair library number <code><b><#></b></code> (<code><b><#></b></code> = 1,2,..,9).
</p>
<p>
- <code>--mp<b><#></b>-<b><or></b> <file_name> </code><br>
+ <code>--mp<b><#></b>-<b><or></b> </code><br>
Orientation of reads for mate-pair library number <code><b><#></b></code> (<code><b><#></b></code> = 1,2,..,9; <code><b><or></b></code> = "fr","rf","ff"). <br>
The default orientation for mate-pair libraries is reverse-forward. For example, to specify forward-forward orientation for the first mate-pair library, you should use the flag:
<code>--mp1-ff </code><br>
@@ -671,7 +676,7 @@ In comparison to the <code>--continue</code> option, you can change some of the
</p>
<p>
- <code>--hqmp<b><#></b>-<b><or></b> <file_name> </code><br>
+ <code>--hqmp<b><#></b>-<b><or></b> </code><br>
Orientation of reads for high-quality mate-pair library number <code><b><#></b></code> (<code><b><#></b></code> = 1,2,..,9; <code><b><or></b></code> = "fr","rf","ff"). <br>
The default orientation for high-quality mate-pair libraries is forward-reverse. For example, to specify reverse-forward orientation for the first high-quality mate-pair library, you should use the flag:
<code>--hqmp1-rf </code><br>
@@ -1098,6 +1103,7 @@ However, it might be tricky to fully utilize the advantages of long reads you h
<li><code><output_dir>/corrected/</code> directory contains reads corrected by BayesHammer in <code>*.fastq.gz</code> files; if compression is disabled, reads are stored in uncompressed <code>*.fastq</code> files</li>
<li><code><output_dir>/scaffolds.fasta</code> contains resulting scaffolds (recommended for use as resulting sequences)</li>
<li><code><output_dir>/contigs.fasta</code> contains resulting contigs</li>
+ <li><code><output_dir>/assembly_graph.gfa</code> contains SPAdes assembly graph and scaffolds paths in <a href="https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md" target="_blank">GFA 1.0 format</a></li>
<li><code><output_dir>/assembly_graph.fastg</code> contains SPAdes assembly graph in <a href="http://fastg.sourceforge.net/FASTG_Spec_v1.00.pdf" target="_blank">FASTG format</a></li>
<li><code><output_dir>/contigs.paths</code> contains paths in the assembly graph corresponding to contigs.fasta (see details below)</li>
<li><code><output_dir>/scaffolds.paths</code> contains paths in the assembly graph corresponding to scaffolds.fasta (see details below)</li>
@@ -1106,8 +1112,11 @@ However, it might be tricky to fully utilize the advantages of long reads you h
<p>
Contigs/scaffolds names in SPAdes output FASTA files have the following format: <br><code>>NODE_3_length_237403_cov_243.207_ID_45</code><br> Here <code>3</code> is the number of the contig/scaffold, <code>237403</code> is the sequence length in nucleotides and <code>243.207</code> is the k-mer coverage for the last (largest) k value used. Note that the k-mer coverage is always lower than the read (per-base) coverage.
+<p>
+ In general, SPAdes uses two techniques for joining contigs into scaffolds. The first one relies on read pairs and tries to estimate the size of the gap separating the contigs. The second one relies on the assembly graph: e.g. if two contigs are separated by a complex tandem repeat that cannot be resolved exactly, the contigs are joined into a scaffold with a fixed gap size of 100 bp. Contigs produced by SPAdes do not contain N symbols.
+
<p>
- To view FASTG files we recommend to use <a href="http://rrwick.github.io/Bandage/" target="_blank">Bandage visualization tool</a>. Note that sequences stored in <code>assembly_graph.fastg</code> correspond to contigs before repeat resolution (edges of the assembly graph). Paths corresponding to contigs after repeat resolution (scaffolding) are stored in <code>contigs.paths</code> (<code>scaffolds.paths</code>) in the format accepted by Bandage (see <a href="https://github.com/rrwick/B [...]
+ To view FASTG and GFA files we recommend to use <a href="http://rrwick.github.io/Bandage/" target="_blank">Bandage visualization tool</a>. Note that sequences stored in <code>assembly_graph.fastg</code> correspond to contigs before repeat resolution (edges of the assembly graph). Paths corresponding to contigs after repeat resolution (scaffolding) are stored in <code>contigs.paths</code> (<code>scaffolds.paths</code>) in the format accepted by Bandage (see <a href="https://github.com/ [...]
<p> Let the contig with the name <code>NODE_5_length_100000_cov_215.651_ID_5</code> consist of the following edges of the assembly graph:
<pre>
diff --git a/metaspades.py b/metaspades.py
index c19e2fb..ff31c92 100755
--- a/metaspades.py
+++ b/metaspades.py
@@ -186,10 +186,15 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
len(options_storage.SHORT_READS_TYPES.keys()) +
len(options_storage.LONG_READS_TYPES))] # "[{}]*num" doesn't work here!
+ # auto detecting SPAdes mode (rna, meta, etc) if it is not a rerun (--continue or --restart-from)
+ if secondary_filling or not options_storage.will_rerun(options):
+ mode = options_storage.get_mode()
+ if mode is not None:
+ options.append(('--' + mode, ''))
+
# for parsing options from "previous run command"
options_storage.continue_mode = False
options_storage.k_mers = None
-
for opt, arg in options:
if opt == '-o':
if not skip_output_dir:
@@ -197,13 +202,17 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.error('-o option was specified at least twice')
options_storage.output_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.output_dir
+ support.check_path_is_ascii(options_storage.output_dir, 'output directory')
elif opt == "--tmp-dir":
options_storage.tmp_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.tmp_dir
+ support.check_path_is_ascii(options_storage.tmp_dir, 'directory for temporary files')
elif opt == "--configs-dir":
options_storage.configs_dir = support.check_dir_existence(arg)
elif opt == "--reference":
options_storage.reference = support.check_file_existence(arg, 'reference', log)
+ elif opt == "--series-analysis":
+ options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log)
elif opt == "--dataset":
options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)
@@ -225,16 +234,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
elif opt == "--sc":
options_storage.single_cell = True
elif opt == "--meta":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.meta = True
elif opt == "--large-genome":
options_storage.large_genome = True
elif opt == "--plasmid":
options_storage.plasmid = True
elif opt == "--rna":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.rna = True
elif opt == "--iontorrent":
options_storage.iontorrent = True
@@ -327,9 +332,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
show_usage(0, show_hidden=True)
elif opt == "--test":
- options_storage.set_test_options()
- support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
- support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+ options_storage.set_test_options()
#break
elif opt == "--diploid":
options_storage.diploid_mode = True
@@ -338,6 +341,14 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
raise ValueError
+ if options_storage.test_mode:
+ if options_storage.plasmid:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset_plasmid/pl1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset_plasmid/pl2.fq.gz"), dataset_data)
+ else:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+
if not options_storage.output_dir:
support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
if not os.path.isdir(options_storage.output_dir):
@@ -372,7 +383,6 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
existing_dataset_data = None
if existing_dataset_data is not None:
dataset_data = existing_dataset_data
- options_storage.dataset_yaml_filename = processed_dataset_fpath
else:
if options_storage.dataset_yaml_filename:
try:
@@ -384,8 +394,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
dataset_data = support.correct_dataset(dataset_data)
dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
- options_storage.dataset_yaml_filename = processed_dataset_fpath
- pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
@@ -397,6 +406,9 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
+ if existing_dataset_data is None:
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+
options_storage.set_default_values()
### FILLING cfg
cfg["common"] = empty_config()
@@ -412,6 +424,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["common"].__dict__["max_threads"] = options_storage.threads
cfg["common"].__dict__["max_memory"] = options_storage.memory
cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode
+ if options_storage.series_analysis:
+ cfg["common"].__dict__["series_analysis"] = options_storage.series_analysis
# dataset section
cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
@@ -430,6 +444,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent
if options_storage.meta or options_storage.large_genome:
cfg["error_correction"].__dict__["count_filter_singletons"] = 1
+ if options_storage.read_buffer_size:
+ cfg["error_correction"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
# assembly
if not options_storage.only_error_correction:
@@ -449,9 +465,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.read_buffer_size:
cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
cfg["assembly"].__dict__["correct_scaffolds"] = options_storage.correct_scaffolds
- if options_storage.large_genome:
- cfg["assembly"].__dict__["bwa_paired"] = True
- cfg["assembly"].__dict__["scaffolding_mode"] = "old_pe_2015"
+
#corrector can work only if contigs exist (not only error correction)
if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
cfg["mismatch_corrector"] = empty_config()
@@ -500,9 +514,11 @@ def check_cfg_for_partial_run(cfg, type='restart-from'): # restart-from ot stop
support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
-def get_options_from_params(params_filename, spades_py_name=None):
+def get_options_from_params(params_filename, running_script):
+ cmd_line = None
+ options = None
if not os.path.isfile(params_filename):
- return None, None
+ return cmd_line, options, "failed to parse command line of the previous run (%s not found)!" % params_filename
params = open(params_filename, 'r')
cmd_line = params.readline().strip()
spades_prev_version = None
@@ -512,20 +528,22 @@ def get_options_from_params(params_filename, spades_py_name=None):
break
params.close()
if spades_prev_version is None:
- support.error("failed to parse SPAdes version of the previous run! "
- "Please restart from the beginning or specify another output directory.")
+ return cmd_line, options, "failed to parse SPAdes version of the previous run!"
if spades_prev_version.strip() != spades_version.strip():
- support.error("SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! "
- "Please restart from the beginning or specify another output directory."
- % (spades_prev_version.strip(), spades_version.strip()))
- if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1:
- spades_py_name = 'spades.py' # try default name
- else:
- spades_py_name = os.path.basename(spades_py_name)
- spades_py_pos = cmd_line.find(spades_py_name)
- if spades_py_pos == -1:
- return None, None
- return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split('\t')
+ return cmd_line, options, "SPAdes version of the previous run (%s) is not equal " \
+ "to the current version of SPAdes (%s)!" \
+ % (spades_prev_version.strip(), spades_version.strip())
+ if 'Command line: ' not in cmd_line or '\t' not in cmd_line:
+ return cmd_line, options, "failed to parse executable script of the previous run!"
+ options = cmd_line.split('\t')[1:]
+ prev_running_script = cmd_line.split('\t')[0][len('Command line: '):]
+ # we cannot restart/continue spades.py run with metaspades.py/rnaspades.py/etc and vice versa
+ if os.path.basename(prev_running_script) != os.path.basename(running_script):
+ return cmd_line, options, "executable script of the previous run (%s) is not equal " \
+ "to the current executable script (%s)!" \
+ % (os.path.basename(prev_running_script),
+ os.path.basename(running_script))
+ return cmd_line, options, ""
def show_version():
@@ -554,19 +572,14 @@ def main(args):
support.check_binaries(bin_home, log)
- # auto detecting SPAdes mode (rna, meta, etc)
- mode = options_storage.get_mode()
- if mode is not None:
- args.append('--' + mode)
-
 # parse options and save all parameters to cfg
options = args
cfg, dataset_data = fill_cfg(options, log)
if options_storage.continue_mode:
- cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
- if not options:
- support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
+ cmd_line, options, err_msg = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
+ if err_msg:
+ support.error(err_msg + " Please restart from the beginning or specify another output directory.")
cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
if options_storage.restart_from:
check_cfg_for_partial_run(cfg, type='restart-from')
@@ -699,6 +712,7 @@ def main(args):
result_contigs_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_name)
result_scaffolds_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_name)
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
+ result_assembly_graph_filename_gfa = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name_gfa)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
@@ -715,6 +729,7 @@ def main(args):
spades_cfg.__dict__["result_contigs"] = result_contigs_filename
spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
+ spades_cfg.__dict__["result_graph_gfa"] = result_assembly_graph_filename_gfa
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
@@ -844,7 +859,9 @@ def main(args):
if options_storage.continue_mode and os.path.isfile(corrected):
log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
continue
-
+ if not os.path.isfile(assembled) or os.path.getsize(assembled) == 0:
+ log.info("\n== Skipping processing of " + assembly_type + " (empty file)\n")
+ continue
support.continue_from_here(log)
log.info("\n== Processing of " + assembly_type + "\n")
@@ -855,7 +872,6 @@ def main(args):
corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])
result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
-
corrector_logic.run_corrector( tmp_configs_dir, bin_home, corr_cfg,
ext_python_modules_home, log, assembled, result_corrected_filename)
@@ -893,6 +909,9 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa):
+ message = " * Assembly graph in GFA format is in " + support.process_spaces(result_assembly_graph_filename_gfa)
+ log.info(message)
if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
message = " * Paths in the assembly graph corresponding to the contigs are in " + \
support.process_spaces(result_contigs_paths_filename)
@@ -933,7 +952,10 @@ def main(args):
result_fasta = list(support.read_fasta(result_filename))
# correctness check: should be one contig of length 1000 bp
correct_number = 1
- correct_length = 1000
+ if options_storage.plasmid:
+ correct_length = 9667
+ else:
+ correct_length = 1000
if not len(result_fasta):
support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
elif len(result_fasta) > correct_number:
diff --git a/plasmidspades.py b/plasmidspades.py
index c19e2fb..ff31c92 100755
--- a/plasmidspades.py
+++ b/plasmidspades.py
@@ -186,10 +186,15 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
len(options_storage.SHORT_READS_TYPES.keys()) +
len(options_storage.LONG_READS_TYPES))] # "[{}]*num" doesn't work here!
+ # auto detecting SPAdes mode (rna, meta, etc) if it is not a rerun (--continue or --restart-from)
+ if secondary_filling or not options_storage.will_rerun(options):
+ mode = options_storage.get_mode()
+ if mode is not None:
+ options.append(('--' + mode, ''))
+
# for parsing options from "previous run command"
options_storage.continue_mode = False
options_storage.k_mers = None
-
for opt, arg in options:
if opt == '-o':
if not skip_output_dir:
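The relocated block above appends the detected pipeline mode as an ordinary parsed option, and only when this is not the first pass of a --continue/--restart-from run, so options recovered from params.txt are not decorated twice. A rough Python sketch of the idea follows; options_storage.get_mode and will_rerun are not shown in this diff, so the script-name mapping below is an assumption made purely for illustration.

    import os
    import sys

    # Hypothetical launcher-to-mode mapping; the real options_storage.get_mode
    # may detect the mode differently.
    _SCRIPT_TO_MODE = {
        "metaspades.py": "meta",
        "rnaspades.py": "rna",
        "plasmidspades.py": "plasmid",
    }

    def get_mode(script_name=None):
        script = os.path.basename(script_name or sys.argv[0])
        return _SCRIPT_TO_MODE.get(script)

    def append_mode(options, secondary_filling, will_rerun):
        # Mirrors the inserted block: skip auto-detection on the first pass of
        # a rerun so that the stored options are parsed untouched.
        if secondary_filling or not will_rerun:
            mode = get_mode()
            if mode is not None:
                options.append(("--" + mode, ""))
        return options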
@@ -197,13 +202,17 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.error('-o option was specified at least twice')
options_storage.output_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.output_dir
+ support.check_path_is_ascii(options_storage.output_dir, 'output directory')
elif opt == "--tmp-dir":
options_storage.tmp_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.tmp_dir
+ support.check_path_is_ascii(options_storage.tmp_dir, 'directory for temporary files')
elif opt == "--configs-dir":
options_storage.configs_dir = support.check_dir_existence(arg)
elif opt == "--reference":
options_storage.reference = support.check_file_existence(arg, 'reference', log)
+ elif opt == "--series-analysis":
+ options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log)
elif opt == "--dataset":
options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)
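The new support.check_path_is_ascii calls reject output and temporary directories whose paths contain non-ASCII characters. The helper itself is not part of this diff; a plausible minimal version with the same intent could look like the following (hypothetical implementation, shown only to illustrate the check).

    def check_path_is_ascii(path, message):
        # Refuse paths with non-ASCII characters; the real helper presumably
        # reports this through support.error() using `message` for context.
        if not all(ord(ch) < 128 for ch in path):
            raise ValueError("path to %s (%s) must not contain non-ASCII characters"
                             % (message, path))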
@@ -225,16 +234,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
elif opt == "--sc":
options_storage.single_cell = True
elif opt == "--meta":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.meta = True
elif opt == "--large-genome":
options_storage.large_genome = True
elif opt == "--plasmid":
options_storage.plasmid = True
elif opt == "--rna":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.rna = True
elif opt == "--iontorrent":
options_storage.iontorrent = True
@@ -327,9 +332,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
show_usage(0, show_hidden=True)
elif opt == "--test":
- options_storage.set_test_options()
- support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
- support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+ options_storage.set_test_options()
#break
elif opt == "--diploid":
options_storage.diploid_mode = True
@@ -338,6 +341,14 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
raise ValueError
+ if options_storage.test_mode:
+ if options_storage.plasmid:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset_plasmid/pl1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset_plasmid/pl2.fq.gz"), dataset_data)
+ else:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+
if not options_storage.output_dir:
support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
if not os.path.isdir(options_storage.output_dir):
@@ -372,7 +383,6 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
existing_dataset_data = None
if existing_dataset_data is not None:
dataset_data = existing_dataset_data
- options_storage.dataset_yaml_filename = processed_dataset_fpath
else:
if options_storage.dataset_yaml_filename:
try:
@@ -384,8 +394,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
dataset_data = support.correct_dataset(dataset_data)
dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
- options_storage.dataset_yaml_filename = processed_dataset_fpath
- pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
@@ -397,6 +406,9 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
+ if existing_dataset_data is None:
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+
options_storage.set_default_values()
### FILLING cfg
cfg["common"] = empty_config()
@@ -412,6 +424,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["common"].__dict__["max_threads"] = options_storage.threads
cfg["common"].__dict__["max_memory"] = options_storage.memory
cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode
+ if options_storage.series_analysis:
+ cfg["common"].__dict__["series_analysis"] = options_storage.series_analysis
# dataset section
cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
@@ -430,6 +444,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent
if options_storage.meta or options_storage.large_genome:
cfg["error_correction"].__dict__["count_filter_singletons"] = 1
+ if options_storage.read_buffer_size:
+ cfg["error_correction"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
# assembly
if not options_storage.only_error_correction:
@@ -449,9 +465,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.read_buffer_size:
cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
cfg["assembly"].__dict__["correct_scaffolds"] = options_storage.correct_scaffolds
- if options_storage.large_genome:
- cfg["assembly"].__dict__["bwa_paired"] = True
- cfg["assembly"].__dict__["scaffolding_mode"] = "old_pe_2015"
+
#corrector can work only if contigs exist (not only error correction)
if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
cfg["mismatch_corrector"] = empty_config()
@@ -500,9 +514,11 @@ def check_cfg_for_partial_run(cfg, type='restart-from'): # restart-from ot stop
support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
-def get_options_from_params(params_filename, spades_py_name=None):
+def get_options_from_params(params_filename, running_script):
+ cmd_line = None
+ options = None
if not os.path.isfile(params_filename):
- return None, None
+ return cmd_line, options, "failed to parse command line of the previous run (%s not found)!" % params_filename
params = open(params_filename, 'r')
cmd_line = params.readline().strip()
spades_prev_version = None
@@ -512,20 +528,22 @@ def get_options_from_params(params_filename, spades_py_name=None):
break
params.close()
if spades_prev_version is None:
- support.error("failed to parse SPAdes version of the previous run! "
- "Please restart from the beginning or specify another output directory.")
+ return cmd_line, options, "failed to parse SPAdes version of the previous run!"
if spades_prev_version.strip() != spades_version.strip():
- support.error("SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! "
- "Please restart from the beginning or specify another output directory."
- % (spades_prev_version.strip(), spades_version.strip()))
- if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1:
- spades_py_name = 'spades.py' # try default name
- else:
- spades_py_name = os.path.basename(spades_py_name)
- spades_py_pos = cmd_line.find(spades_py_name)
- if spades_py_pos == -1:
- return None, None
- return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split('\t')
+ return cmd_line, options, "SPAdes version of the previous run (%s) is not equal " \
+ "to the current version of SPAdes (%s)!" \
+ % (spades_prev_version.strip(), spades_version.strip())
+ if 'Command line: ' not in cmd_line or '\t' not in cmd_line:
+ return cmd_line, options, "failed to parse executable script of the previous run!"
+ options = cmd_line.split('\t')[1:]
+ prev_running_script = cmd_line.split('\t')[0][len('Command line: '):]
+ # we cannot restart/continue spades.py run with metaspades.py/rnaspades.py/etc and vice versa
+ if os.path.basename(prev_running_script) != os.path.basename(running_script):
+ return cmd_line, options, "executable script of the previous run (%s) is not equal " \
+ "to the current executable script (%s)!" \
+ % (os.path.basename(prev_running_script),
+ os.path.basename(running_script))
+ return cmd_line, options, ""
def show_version():
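The rewritten get_options_from_params above no longer aborts via support.error: it returns an error message alongside the recovered command line and options, and it reconstructs the options by splitting the recorded "Command line:" record on tabs, where field 0 is the launcher script and the remaining fields are the stored options. A standalone sketch of just that parsing step, mirroring the assumptions visible in the code above:

    import os

    def parse_params_first_line(first_line, running_script):
        # Splits the "Command line: ..." record written as the first line of
        # params.txt. Returns (options, err_msg); err_msg is empty on success.
        prefix = "Command line: "
        if prefix not in first_line or "\t" not in first_line:
            return None, "failed to parse executable script of the previous run!"
        fields = first_line.split("\t")
        prev_script = fields[0][len(prefix):]
        options = fields[1:]
        # A run started with spades.py cannot be continued with rnaspades.py, etc.
        if os.path.basename(prev_script) != os.path.basename(running_script):
            return None, ("executable script of the previous run (%s) is not equal "
                          "to the current executable script (%s)!"
                          % (os.path.basename(prev_script),
                             os.path.basename(running_script)))
        return options, ""

    # Example with a made-up path:
    #   parse_params_first_line("Command line: /opt/spades/spades.py\t-o\tout\t--test",
    #                           "spades.py")
    #   returns (['-o', 'out', '--test'], "")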
@@ -554,19 +572,14 @@ def main(args):
support.check_binaries(bin_home, log)
- # auto detecting SPAdes mode (rna, meta, etc)
- mode = options_storage.get_mode()
- if mode is not None:
- args.append('--' + mode)
-
# parse options and save all parameters to cfg
options = args
cfg, dataset_data = fill_cfg(options, log)
if options_storage.continue_mode:
- cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
- if not options:
- support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
+ cmd_line, options, err_msg = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
+ if err_msg:
+ support.error(err_msg + " Please restart from the beginning or specify another output directory.")
cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
if options_storage.restart_from:
check_cfg_for_partial_run(cfg, type='restart-from')
@@ -699,6 +712,7 @@ def main(args):
result_contigs_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_name)
result_scaffolds_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_name)
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
+ result_assembly_graph_filename_gfa = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name_gfa)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
@@ -715,6 +729,7 @@ def main(args):
spades_cfg.__dict__["result_contigs"] = result_contigs_filename
spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
+ spades_cfg.__dict__["result_graph_gfa"] = result_assembly_graph_filename_gfa
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
@@ -844,7 +859,9 @@ def main(args):
if options_storage.continue_mode and os.path.isfile(corrected):
log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
continue
-
+ if not os.path.isfile(assembled) or os.path.getsize(assembled) == 0:
+ log.info("\n== Skipping processing of " + assembly_type + " (empty file)\n")
+ continue
support.continue_from_here(log)
log.info("\n== Processing of " + assembly_type + "\n")
@@ -855,7 +872,6 @@ def main(args):
corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])
result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
-
corrector_logic.run_corrector( tmp_configs_dir, bin_home, corr_cfg,
ext_python_modules_home, log, assembled, result_corrected_filename)
@@ -893,6 +909,9 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa):
+ message = " * Assembly graph in GFA format is in " + support.process_spaces(result_assembly_graph_filename_gfa)
+ log.info(message)
if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
message = " * Paths in the assembly graph corresponding to the contigs are in " + \
support.process_spaces(result_contigs_paths_filename)
@@ -933,7 +952,10 @@ def main(args):
result_fasta = list(support.read_fasta(result_filename))
# correctness check: should be one contig of length 1000 bp
correct_number = 1
- correct_length = 1000
+ if options_storage.plasmid:
+ correct_length = 9667
+ else:
+ correct_length = 1000
if not len(result_fasta):
support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
elif len(result_fasta) > correct_number:
diff --git a/rnaspades.py b/rnaspades.py
index c19e2fb..ff31c92 100755
--- a/rnaspades.py
+++ b/rnaspades.py
@@ -186,10 +186,15 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
len(options_storage.SHORT_READS_TYPES.keys()) +
len(options_storage.LONG_READS_TYPES))] # "[{}]*num" doesn't work here!
+ # auto detecting SPAdes mode (rna, meta, etc) if it is not a rerun (--continue or --restart-from)
+ if secondary_filling or not options_storage.will_rerun(options):
+ mode = options_storage.get_mode()
+ if mode is not None:
+ options.append(('--' + mode, ''))
+
# for parsing options from "previous run command"
options_storage.continue_mode = False
options_storage.k_mers = None
-
for opt, arg in options:
if opt == '-o':
if not skip_output_dir:
@@ -197,13 +202,17 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.error('-o option was specified at least twice')
options_storage.output_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.output_dir
+ support.check_path_is_ascii(options_storage.output_dir, 'output directory')
elif opt == "--tmp-dir":
options_storage.tmp_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.tmp_dir
+ support.check_path_is_ascii(options_storage.tmp_dir, 'directory for temporary files')
elif opt == "--configs-dir":
options_storage.configs_dir = support.check_dir_existence(arg)
elif opt == "--reference":
options_storage.reference = support.check_file_existence(arg, 'reference', log)
+ elif opt == "--series-analysis":
+ options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log)
elif opt == "--dataset":
options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)
@@ -225,16 +234,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
elif opt == "--sc":
options_storage.single_cell = True
elif opt == "--meta":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.meta = True
elif opt == "--large-genome":
options_storage.large_genome = True
elif opt == "--plasmid":
options_storage.plasmid = True
elif opt == "--rna":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.rna = True
elif opt == "--iontorrent":
options_storage.iontorrent = True
@@ -327,9 +332,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
show_usage(0, show_hidden=True)
elif opt == "--test":
- options_storage.set_test_options()
- support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
- support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+ options_storage.set_test_options()
#break
elif opt == "--diploid":
options_storage.diploid_mode = True
@@ -338,6 +341,14 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
raise ValueError
+ if options_storage.test_mode:
+ if options_storage.plasmid:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset_plasmid/pl1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset_plasmid/pl2.fq.gz"), dataset_data)
+ else:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+
if not options_storage.output_dir:
support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
if not os.path.isdir(options_storage.output_dir):
@@ -372,7 +383,6 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
existing_dataset_data = None
if existing_dataset_data is not None:
dataset_data = existing_dataset_data
- options_storage.dataset_yaml_filename = processed_dataset_fpath
else:
if options_storage.dataset_yaml_filename:
try:
@@ -384,8 +394,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
dataset_data = support.correct_dataset(dataset_data)
dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
- options_storage.dataset_yaml_filename = processed_dataset_fpath
- pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
@@ -397,6 +406,9 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
+ if existing_dataset_data is None:
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+
options_storage.set_default_values()
### FILLING cfg
cfg["common"] = empty_config()
@@ -412,6 +424,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["common"].__dict__["max_threads"] = options_storage.threads
cfg["common"].__dict__["max_memory"] = options_storage.memory
cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode
+ if options_storage.series_analysis:
+ cfg["common"].__dict__["series_analysis"] = options_storage.series_analysis
# dataset section
cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
@@ -430,6 +444,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent
if options_storage.meta or options_storage.large_genome:
cfg["error_correction"].__dict__["count_filter_singletons"] = 1
+ if options_storage.read_buffer_size:
+ cfg["error_correction"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
# assembly
if not options_storage.only_error_correction:
@@ -449,9 +465,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.read_buffer_size:
cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
cfg["assembly"].__dict__["correct_scaffolds"] = options_storage.correct_scaffolds
- if options_storage.large_genome:
- cfg["assembly"].__dict__["bwa_paired"] = True
- cfg["assembly"].__dict__["scaffolding_mode"] = "old_pe_2015"
+
#corrector can work only if contigs exist (not only error correction)
if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
cfg["mismatch_corrector"] = empty_config()
@@ -500,9 +514,11 @@ def check_cfg_for_partial_run(cfg, type='restart-from'): # restart-from ot stop
support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
-def get_options_from_params(params_filename, spades_py_name=None):
+def get_options_from_params(params_filename, running_script):
+ cmd_line = None
+ options = None
if not os.path.isfile(params_filename):
- return None, None
+ return cmd_line, options, "failed to parse command line of the previous run (%s not found)!" % params_filename
params = open(params_filename, 'r')
cmd_line = params.readline().strip()
spades_prev_version = None
@@ -512,20 +528,22 @@ def get_options_from_params(params_filename, spades_py_name=None):
break
params.close()
if spades_prev_version is None:
- support.error("failed to parse SPAdes version of the previous run! "
- "Please restart from the beginning or specify another output directory.")
+ return cmd_line, options, "failed to parse SPAdes version of the previous run!"
if spades_prev_version.strip() != spades_version.strip():
- support.error("SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! "
- "Please restart from the beginning or specify another output directory."
- % (spades_prev_version.strip(), spades_version.strip()))
- if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1:
- spades_py_name = 'spades.py' # try default name
- else:
- spades_py_name = os.path.basename(spades_py_name)
- spades_py_pos = cmd_line.find(spades_py_name)
- if spades_py_pos == -1:
- return None, None
- return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split('\t')
+ return cmd_line, options, "SPAdes version of the previous run (%s) is not equal " \
+ "to the current version of SPAdes (%s)!" \
+ % (spades_prev_version.strip(), spades_version.strip())
+ if 'Command line: ' not in cmd_line or '\t' not in cmd_line:
+ return cmd_line, options, "failed to parse executable script of the previous run!"
+ options = cmd_line.split('\t')[1:]
+ prev_running_script = cmd_line.split('\t')[0][len('Command line: '):]
+ # we cannot restart/continue spades.py run with metaspades.py/rnaspades.py/etc and vice versa
+ if os.path.basename(prev_running_script) != os.path.basename(running_script):
+ return cmd_line, options, "executable script of the previous run (%s) is not equal " \
+ "to the current executable script (%s)!" \
+ % (os.path.basename(prev_running_script),
+ os.path.basename(running_script))
+ return cmd_line, options, ""
def show_version():
@@ -554,19 +572,14 @@ def main(args):
support.check_binaries(bin_home, log)
- # auto detecting SPAdes mode (rna, meta, etc)
- mode = options_storage.get_mode()
- if mode is not None:
- args.append('--' + mode)
-
# parse options and save all parameters to cfg
options = args
cfg, dataset_data = fill_cfg(options, log)
if options_storage.continue_mode:
- cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
- if not options:
- support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
+ cmd_line, options, err_msg = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
+ if err_msg:
+ support.error(err_msg + " Please restart from the beginning or specify another output directory.")
cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
if options_storage.restart_from:
check_cfg_for_partial_run(cfg, type='restart-from')
@@ -699,6 +712,7 @@ def main(args):
result_contigs_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_name)
result_scaffolds_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_name)
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
+ result_assembly_graph_filename_gfa = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name_gfa)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
@@ -715,6 +729,7 @@ def main(args):
spades_cfg.__dict__["result_contigs"] = result_contigs_filename
spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
+ spades_cfg.__dict__["result_graph_gfa"] = result_assembly_graph_filename_gfa
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
@@ -844,7 +859,9 @@ def main(args):
if options_storage.continue_mode and os.path.isfile(corrected):
log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
continue
-
+ if not os.path.isfile(assembled) or os.path.getsize(assembled) == 0:
+ log.info("\n== Skipping processing of " + assembly_type + " (empty file)\n")
+ continue
support.continue_from_here(log)
log.info("\n== Processing of " + assembly_type + "\n")
@@ -855,7 +872,6 @@ def main(args):
corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])
result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
-
corrector_logic.run_corrector( tmp_configs_dir, bin_home, corr_cfg,
ext_python_modules_home, log, assembled, result_corrected_filename)
@@ -893,6 +909,9 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa):
+ message = " * Assembly graph in GFA format is in " + support.process_spaces(result_assembly_graph_filename_gfa)
+ log.info(message)
if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
message = " * Paths in the assembly graph corresponding to the contigs are in " + \
support.process_spaces(result_contigs_paths_filename)
@@ -933,7 +952,10 @@ def main(args):
result_fasta = list(support.read_fasta(result_filename))
# correctness check: should be one contig of length 1000 bp
correct_number = 1
- correct_length = 1000
+ if options_storage.plasmid:
+ correct_length = 9667
+ else:
+ correct_length = 1000
if not len(result_fasta):
support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
elif len(result_fasta) > correct_number:
diff --git a/rnaspades_manual.html b/rnaspades_manual.html
index 7b5199b..5a23b1e 100644
--- a/rnaspades_manual.html
+++ b/rnaspades_manual.html
@@ -1,6 +1,6 @@
<html>
<head>
- <title>rnaSPAdes 1.0.0 Manual</title>
+ <title>rnaSPAdes manual</title>
<style type="text/css">
.code {
background-color: lightgray;
@@ -8,10 +8,12 @@
</style>
</head>
<body>
-<h1>rnaSPAdes 1.0.0 Manual</h1>
+<h1>rnaSPAdes manual</h1>
1. <a href="#sec1">About rnaSPAdes</a><br>
2. <a href="#sec2">rnaSPAdes specifics</a><br>
+ 2.1. <a href="#sec2.1">Running rnaSPAdes</a><br>
+ 2.2. <a href="#sec2.2">rnaSPAdes output</a><br>
3. <a href="#sec3">Assembly evaluation</a><br>
4. <a href="#sec4">Citation</a><br>
5. <a href="#sec5">Feedback and bug reports</a><br>
@@ -24,6 +26,8 @@
<a name="sec2"></a>
<h2>2 rnaSPAdes specifics</h2>
+<a name="sec2.1"></a>
+<h3>2.1 Running rnaSPAdes</h3>
<p>
To run rnaSPAdes use
@@ -43,16 +47,21 @@ or
Note that we assume that SPAdes installation directory is added to the <code>PATH</code> variable (provide full path to rnaSPAdes executable otherwise: <code><rnaspades installation dir>/rnaspades.py</code>).
-
-<p>Here are the main differences of rnaSPAdes:
+<p>Here are several notes regarding the options:
<ul>
- <li>rnaSPAdes outputs only one FASTA file named <code>transcripts.fasta</code>. The corresponding file with paths in the <code>assembly_graph.fastg</code> is <code>transcripts.paths</code>.</li>
<li>rnaSPAdes can take as an input only one paired-end library and multiple single-end libraries.</li>
<li>rnaSPAdes does not support <code>--careful</code> and <code>--cov-cutoff</code> options.</li>
<li>rnaSPAdes is not compatible with other pipeline options such as <code>--meta</code>, <code>--sc</code> and <code>--plasmid</code>.</li>
- <li>rnaSPAdes works using only a single k-mer size (55 by the default). We strongly recommend no to change this parameter. In case your RNA-Seq data set contains long Illumina reads (150 bp and longer) you may try to use longer k-mer size (approximately half of the read length). In case you have any doubts about your run, do not hesitate to contact us using e-mail given below.</li>
+ <li>rnaSPAdes works using only a single k-mer size (55 by default). We strongly recommend not changing this parameter. If your RNA-Seq data set contains long Illumina reads (150 bp and longer), you may try a longer k-mer size (approximately half of the read length). If you have any doubts about your run, do not hesitate to contact us using the e-mail given below.</li>
</ul>
+<a name="sec2.2"></a>
+<h3>2.2 rnaSPAdes output</h3>
+<p>
+rnaSPAdes outputs only one FASTA file named <code>transcripts.fasta</code>. The corresponding file with paths in the <code>assembly_graph.fastg</code> is <code>transcripts.paths</code>.
+
+<p>
+ Contig/scaffold names in rnaSPAdes output FASTA files have the following format: <br><code>>NODE_97_length_6237_cov_11.9819_g8_i2</code><br> Similarly to SPAdes, <code>97</code> is the number of the transcript, <code>6237</code> is its sequence length in nucleotides and <code>11.9819</code> is the k-mer coverage. Note that the k-mer coverage is always lower than the read (per-base) coverage. <code>g8_i2</code> corresponds to gene number 8 and isoform number 2 within this gene. Tr [...]
<a name="sec3">
<h2>3 Assembly evaluation</h2>
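The naming scheme documented in the new section 2.2 can be split mechanically. Below is a small Python sketch that parses headers of the documented form NODE_<id>_length_<len>_cov_<cov>_g<gene>_i<isoform>; headers produced by other SPAdes modes (without the _g/_i suffix) simply will not match.

    import re

    # Pattern for rnaSPAdes FASTA headers such as
    #   NODE_97_length_6237_cov_11.9819_g8_i2
    _RNA_NODE_RE = re.compile(
        r"NODE_(?P<node>\d+)_length_(?P<length>\d+)_cov_(?P<cov>[\d.]+)"
        r"_g(?P<gene>\d+)_i(?P<isoform>\d+)"
    )

    def parse_rna_header(header):
        m = _RNA_NODE_RE.match(header.lstrip(">"))
        if m is None:
            return None
        return {
            "node": int(m.group("node")),
            "length": int(m.group("length")),
            "kmer_cov": float(m.group("cov")),   # k-mer coverage, lower than per-base coverage
            "gene": int(m.group("gene")),
            "isoform": int(m.group("isoform")),
        }

    # parse_rna_header(">NODE_97_length_6237_cov_11.9819_g8_i2")
    # -> {'node': 97, 'length': 6237, 'kmer_cov': 11.9819, 'gene': 8, 'isoform': 2}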
diff --git a/spades.py b/spades.py
index c19e2fb..ff31c92 100755
--- a/spades.py
+++ b/spades.py
@@ -186,10 +186,15 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
len(options_storage.SHORT_READS_TYPES.keys()) +
len(options_storage.LONG_READS_TYPES))] # "[{}]*num" doesn't work here!
+ # auto detecting SPAdes mode (rna, meta, etc) if it is not a rerun (--continue or --restart-from)
+ if secondary_filling or not options_storage.will_rerun(options):
+ mode = options_storage.get_mode()
+ if mode is not None:
+ options.append(('--' + mode, ''))
+
# for parsing options from "previous run command"
options_storage.continue_mode = False
options_storage.k_mers = None
-
for opt, arg in options:
if opt == '-o':
if not skip_output_dir:
@@ -197,13 +202,17 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
support.error('-o option was specified at least twice')
options_storage.output_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.output_dir
+ support.check_path_is_ascii(options_storage.output_dir, 'output directory')
elif opt == "--tmp-dir":
options_storage.tmp_dir = abspath(expanduser(arg))
options_storage.dict_of_rel2abs[arg] = options_storage.tmp_dir
+ support.check_path_is_ascii(options_storage.tmp_dir, 'directory for temporary files')
elif opt == "--configs-dir":
options_storage.configs_dir = support.check_dir_existence(arg)
elif opt == "--reference":
options_storage.reference = support.check_file_existence(arg, 'reference', log)
+ elif opt == "--series-analysis":
+ options_storage.series_analysis = support.check_file_existence(arg, 'series-analysis', log)
elif opt == "--dataset":
options_storage.dataset_yaml_filename = support.check_file_existence(arg, 'dataset', log)
@@ -225,16 +234,12 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
elif opt == "--sc":
options_storage.single_cell = True
elif opt == "--meta":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.meta = True
elif opt == "--large-genome":
options_storage.large_genome = True
elif opt == "--plasmid":
options_storage.plasmid = True
elif opt == "--rna":
- #FIXME temporary solution
- options_storage.single_cell = True
options_storage.rna = True
elif opt == "--iontorrent":
options_storage.iontorrent = True
@@ -327,9 +332,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
show_usage(0, show_hidden=True)
elif opt == "--test":
- options_storage.set_test_options()
- support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
- support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+ options_storage.set_test_options()
#break
elif opt == "--diploid":
options_storage.diploid_mode = True
@@ -338,6 +341,14 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
raise ValueError
+ if options_storage.test_mode:
+ if options_storage.plasmid:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset_plasmid/pl1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset_plasmid/pl2.fq.gz"), dataset_data)
+ else:
+ support.add_to_dataset('-1', os.path.join(spades_home, "test_dataset/ecoli_1K_1.fq.gz"), dataset_data)
+ support.add_to_dataset('-2', os.path.join(spades_home, "test_dataset/ecoli_1K_2.fq.gz"), dataset_data)
+
if not options_storage.output_dir:
support.error("the output_dir is not set! It is a mandatory parameter (-o output_dir).", log)
if not os.path.isdir(options_storage.output_dir):
@@ -372,7 +383,6 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
existing_dataset_data = None
if existing_dataset_data is not None:
dataset_data = existing_dataset_data
- options_storage.dataset_yaml_filename = processed_dataset_fpath
else:
if options_storage.dataset_yaml_filename:
try:
@@ -384,8 +394,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
else:
dataset_data = support.correct_dataset(dataset_data)
dataset_data = support.relative2abs_paths(dataset_data, os.getcwd())
- options_storage.dataset_yaml_filename = processed_dataset_fpath
- pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+ options_storage.dataset_yaml_filename = processed_dataset_fpath
support.check_dataset_reads(dataset_data, options_storage.only_assembler, log)
if not support.get_lib_ids_by_type(dataset_data, spades_logic.READS_TYPES_USED_IN_CONSTRUCTION):
@@ -397,6 +406,9 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if len(support.get_lib_ids_by_type(dataset_data, 'paired-end')) > 1:
support.error('you cannot specify more than one paired-end library in RNA-Seq mode!')
+ if existing_dataset_data is None:
+ pyyaml.dump(dataset_data, open(options_storage.dataset_yaml_filename, 'w'))
+
options_storage.set_default_values()
### FILLING cfg
cfg["common"] = empty_config()
@@ -412,6 +424,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["common"].__dict__["max_threads"] = options_storage.threads
cfg["common"].__dict__["max_memory"] = options_storage.memory
cfg["common"].__dict__["developer_mode"] = options_storage.developer_mode
+ if options_storage.series_analysis:
+ cfg["common"].__dict__["series_analysis"] = options_storage.series_analysis
# dataset section
cfg["dataset"].__dict__["yaml_filename"] = options_storage.dataset_yaml_filename
@@ -430,6 +444,8 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
cfg["error_correction"].__dict__["iontorrent"] = options_storage.iontorrent
if options_storage.meta or options_storage.large_genome:
cfg["error_correction"].__dict__["count_filter_singletons"] = 1
+ if options_storage.read_buffer_size:
+ cfg["error_correction"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
# assembly
if not options_storage.only_error_correction:
@@ -449,9 +465,7 @@ def fill_cfg(options_to_parse, log, secondary_filling=False):
if options_storage.read_buffer_size:
cfg["assembly"].__dict__["read_buffer_size"] = options_storage.read_buffer_size
cfg["assembly"].__dict__["correct_scaffolds"] = options_storage.correct_scaffolds
- if options_storage.large_genome:
- cfg["assembly"].__dict__["bwa_paired"] = True
- cfg["assembly"].__dict__["scaffolding_mode"] = "old_pe_2015"
+
#corrector can work only if contigs exist (not only error correction)
if (not options_storage.only_error_correction) and options_storage.mismatch_corrector:
cfg["mismatch_corrector"] = empty_config()
@@ -500,9 +514,11 @@ def check_cfg_for_partial_run(cfg, type='restart-from'): # restart-from ot stop
support.error("failed to " + action + " K=%s because this K " % k_str + verb + " not specified!")
-def get_options_from_params(params_filename, spades_py_name=None):
+def get_options_from_params(params_filename, running_script):
+ cmd_line = None
+ options = None
if not os.path.isfile(params_filename):
- return None, None
+ return cmd_line, options, "failed to parse command line of the previous run (%s not found)!" % params_filename
params = open(params_filename, 'r')
cmd_line = params.readline().strip()
spades_prev_version = None
@@ -512,20 +528,22 @@ def get_options_from_params(params_filename, spades_py_name=None):
break
params.close()
if spades_prev_version is None:
- support.error("failed to parse SPAdes version of the previous run! "
- "Please restart from the beginning or specify another output directory.")
+ return cmd_line, options, "failed to parse SPAdes version of the previous run!"
if spades_prev_version.strip() != spades_version.strip():
- support.error("SPAdes version of the previous run (%s) is not equal to the current version of SPAdes (%s)! "
- "Please restart from the beginning or specify another output directory."
- % (spades_prev_version.strip(), spades_version.strip()))
- if spades_py_name is None or cmd_line.find(os.path.basename(spades_py_name)) == -1:
- spades_py_name = 'spades.py' # try default name
- else:
- spades_py_name = os.path.basename(spades_py_name)
- spades_py_pos = cmd_line.find(spades_py_name)
- if spades_py_pos == -1:
- return None, None
- return cmd_line, cmd_line[spades_py_pos + len(spades_py_name):].split('\t')
+ return cmd_line, options, "SPAdes version of the previous run (%s) is not equal " \
+ "to the current version of SPAdes (%s)!" \
+ % (spades_prev_version.strip(), spades_version.strip())
+ if 'Command line: ' not in cmd_line or '\t' not in cmd_line:
+ return cmd_line, options, "failed to parse executable script of the previous run!"
+ options = cmd_line.split('\t')[1:]
+ prev_running_script = cmd_line.split('\t')[0][len('Command line: '):]
+ # we cannot restart/continue spades.py run with metaspades.py/rnaspades.py/etc and vice versa
+ if os.path.basename(prev_running_script) != os.path.basename(running_script):
+ return cmd_line, options, "executable script of the previous run (%s) is not equal " \
+ "to the current executable script (%s)!" \
+ % (os.path.basename(prev_running_script),
+ os.path.basename(running_script))
+ return cmd_line, options, ""
def show_version():
@@ -554,19 +572,14 @@ def main(args):
support.check_binaries(bin_home, log)
- # auto detecting SPAdes mode (rna, meta, etc)
- mode = options_storage.get_mode()
- if mode is not None:
- args.append('--' + mode)
-
# parse options and save all parameters to cfg
options = args
cfg, dataset_data = fill_cfg(options, log)
if options_storage.continue_mode:
- cmd_line, options = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
- if not options:
- support.error("failed to parse command line of the previous run! Please restart from the beginning or specify another output directory.")
+ cmd_line, options, err_msg = get_options_from_params(os.path.join(options_storage.output_dir, "params.txt"), args[0])
+ if err_msg:
+ support.error(err_msg + " Please restart from the beginning or specify another output directory.")
cfg, dataset_data = fill_cfg(options, log, secondary_filling=True)
if options_storage.restart_from:
check_cfg_for_partial_run(cfg, type='restart-from')
@@ -699,6 +712,7 @@ def main(args):
result_contigs_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_name)
result_scaffolds_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_name)
result_assembly_graph_filename = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name)
+ result_assembly_graph_filename_gfa = os.path.join(cfg["common"].output_dir, options_storage.assembly_graph_name_gfa)
result_contigs_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.contigs_paths)
result_scaffolds_paths_filename = os.path.join(cfg["common"].output_dir, options_storage.scaffolds_paths)
result_transcripts_filename = os.path.join(cfg["common"].output_dir, options_storage.transcripts_name)
@@ -715,6 +729,7 @@ def main(args):
spades_cfg.__dict__["result_contigs"] = result_contigs_filename
spades_cfg.__dict__["result_scaffolds"] = result_scaffolds_filename
spades_cfg.__dict__["result_graph"] = result_assembly_graph_filename
+ spades_cfg.__dict__["result_graph_gfa"] = result_assembly_graph_filename_gfa
spades_cfg.__dict__["result_contigs_paths"] = result_contigs_paths_filename
spades_cfg.__dict__["result_scaffolds_paths"] = result_scaffolds_paths_filename
spades_cfg.__dict__["result_transcripts"] = result_transcripts_filename
@@ -844,7 +859,9 @@ def main(args):
if options_storage.continue_mode and os.path.isfile(corrected):
log.info("\n== Skipping processing of " + assembly_type + " (already processed)\n")
continue
-
+ if not os.path.isfile(assembled) or os.path.getsize(assembled) == 0:
+ log.info("\n== Skipping processing of " + assembly_type + " (empty file)\n")
+ continue
support.continue_from_here(log)
log.info("\n== Processing of " + assembly_type + "\n")
@@ -855,7 +872,6 @@ def main(args):
corr_cfg = merge_configs(cfg["mismatch_corrector"], cfg["common"])
result_corrected_filename = os.path.join(tmp_dir_for_corrector, "corrected_contigs.fasta")
-
corrector_logic.run_corrector( tmp_configs_dir, bin_home, corr_cfg,
ext_python_modules_home, log, assembled, result_corrected_filename)
@@ -893,6 +909,9 @@ def main(args):
if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename):
message = " * Assembly graph is in " + support.process_spaces(result_assembly_graph_filename)
log.info(message)
+ if "assembly" in cfg and os.path.isfile(result_assembly_graph_filename_gfa):
+ message = " * Assembly graph in GFA format is in " + support.process_spaces(result_assembly_graph_filename_gfa)
+ log.info(message)
if "assembly" in cfg and os.path.isfile(result_contigs_paths_filename):
message = " * Paths in the assembly graph corresponding to the contigs are in " + \
support.process_spaces(result_contigs_paths_filename)
@@ -933,7 +952,10 @@ def main(args):
result_fasta = list(support.read_fasta(result_filename))
# correctness check: should be one contig of length 1000 bp
correct_number = 1
- correct_length = 1000
+ if options_storage.plasmid:
+ correct_length = 9667
+ else:
+ correct_length = 1000
if not len(result_fasta):
support.error("TEST FAILED: %s does not contain contigs!" % result_filename)
elif len(result_fasta) > correct_number:
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6ef1d66..d539593 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,7 +20,7 @@ set(CMAKE_MODULE_PATH
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
# Define various dirs
set(SPADES_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-set(SPADES_MODULES_DIR ${SPADES_MAIN_SRC_DIR}/modules)
+set(SPADES_MODULES_DIR ${SPADES_MAIN_SRC_DIR}/common)
set(SPADES_MAIN_INCLUDE_DIR ${SPADES_MAIN_SRC_DIR}/include)
set(SPADES_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(SPADES_TOOLS_BINARY_DIR ${SPADES_BINARY_DIR}/bin)
@@ -82,7 +82,7 @@ if (NOT OPENMP_FOUND)
endif()
# sub projects
-add_subdirectory(modules)
+add_subdirectory(common)
add_subdirectory(projects)
add_subdirectory(spades_pipeline)
@@ -119,6 +119,8 @@ install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/../test_dataset"
DESTINATION share/spades)
install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/../test_dataset_truspades"
DESTINATION share/spades)
+install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/../test_dataset_plasmid"
+ DESTINATION share/spades)
# manual, LICENSE, and GPLv2
install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/../manual.html"
DESTINATION share/spades
diff --git a/src/cmake/options.cmake b/src/cmake/options.cmake
index 370c73e..3bc0aef 100644
--- a/src/cmake/options.cmake
+++ b/src/cmake/options.cmake
@@ -26,6 +26,9 @@ if (SPADES_STATIC_BUILD)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
set(LINK_SEARCH_START_STATIC TRUE)
set(LINK_SEARCH_END_STATIC TRUE)
+ # This is a dirty hack to get rid of -Wl,-Bdynamic
+ set(CMAKE_EXE_LINK_DYNAMIC_C_FLAGS "-Wl,-Bstatic")
+ set(CMAKE_EXE_LINK_DYNAMIC_CXX_FLAGS "-Wl,-Bstatic")
if (APPLE)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc")
diff --git a/src/cmake/pack.cmake b/src/cmake/pack.cmake
index b5982c0..7cf99c9 100644
--- a/src/cmake/pack.cmake
+++ b/src/cmake/pack.cmake
@@ -12,10 +12,10 @@ set(CPACK_PACKAGE_NAME "SPAdes")
set(CPACK_PACKAGE_VENDOR "Saint Petersburg State University")
set(CPACK_PACKAGE_DESCRIPTION_FILE "${SPADES_MAIN_SRC_DIR}/../README")
set(CPACK_RESOURCE_FILE_LICENSE "${SPADES_MAIN_SRC_DIR}/../LICENSE")
-set(CPACK_PACKAGE_VERSION "3.9.1")
+set(CPACK_PACKAGE_VERSION "3.10.0")
set(CPACK_PACKAGE_VERSION_MAJOR "3")
-set(CPACK_PACKAGE_VERSION_MINOR "9")
-set(CPACK_PACKAGE_VERSION_PATCH "1")
+set(CPACK_PACKAGE_VERSION_MINOR "10")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
set(CPACK_STRIP_FILES bin/spades bin/hammer bin/ionhammer bin/dipspades bin/spades-bwa bin/corrector bin/scaffold_correction)
# Source stuff
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
new file mode 100644
index 0000000..52bd90a
--- /dev/null
+++ b/src/common/CMakeLists.txt
@@ -0,0 +1,22 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(common_modules CXX)
+
+add_subdirectory(pipeline)
+add_subdirectory(assembly_graph)
+add_subdirectory(modules/path_extend)
+add_subdirectory(modules)
+add_subdirectory(stages)
+add_subdirectory(utils)
+add_subdirectory(io)
+add_subdirectory(utils/mph_index)
+add_subdirectory(utils/coverage_model)
+
+add_library(common_modules STATIC empty.cpp)
+
+target_link_libraries(common_modules assembly_graph input pipeline coverage_model path_extend stages utils mph_index modules)
diff --git a/src/utils/adt/array_vector.hpp b/src/common/adt/array_vector.hpp
similarity index 100%
rename from src/utils/adt/array_vector.hpp
rename to src/common/adt/array_vector.hpp
diff --git a/src/common/adt/bag.hpp b/src/common/adt/bag.hpp
new file mode 100644
index 0000000..47d58ad
--- /dev/null
+++ b/src/common/adt/bag.hpp
@@ -0,0 +1,87 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/verify.hpp"
+
+template<class T, class hash = std::hash<T>>
+class bag {
+ typedef std::unordered_map<T, size_t, hash> Data;
+ Data data_;
+ size_t size_;
+public:
+
+ bag() : size_(0) {
+ }
+
+ typedef typename Data::const_iterator const_iterator;
+
+ void put(const T& t, size_t mult) {
+ VERIFY(mult > 0);
+ data_[t] += mult;
+ size_ += mult;
+ }
+
+ void put(const T& t) {
+ put(t, 1);
+ }
+
+ bool take(const T& t, size_t mult) {
+ VERIFY(mult > 0);
+ /*typename map<T, size_t>::iterator*/auto it = data_.find(t);
+ if (it == data_.end()) {
+ return false;
+ } else {
+ size_t have = it->second;
+ if (have < mult) {
+ data_.erase(it->first);
+ size_ -= have;
+ return false;
+ } else if (have == mult) {
+ data_.erase(it->first);
+ size_ -= have;
+ return true;
+ } else {
+ it->second -= mult;
+ size_ -= mult;
+ return true;
+ }
+ }
+ }
+
+ bool take(const T& t) {
+ return take(t, 1);
+ }
+
+ size_t mult(const T& t) const {
+ auto it = data_.find(t);
+ if (it == data_.end()) {
+ return 0;
+ } else {
+ return it->second;
+ }
+ }
+
+ void clear() {
+ data_.clear();
+ size_ = 0;
+ }
+
+ const_iterator begin() const {
+ return data_.begin();
+ }
+
+ const_iterator end() const {
+ return data_.end();
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+};
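The new bag<T> above is a counted multiset: put() adds multiplicity, take() removes it (erasing the key once its count is exhausted and reporting whether the requested multiplicity was fully available), and size() tracks the total multiplicity. A behavioural sketch of the same semantics in Python, ignoring the hash template parameter and the VERIFY checks:

    class Bag:
        """Counted multiset mirroring the semantics of bag<T> above."""

        def __init__(self):
            self._counts = {}
            self._size = 0

        def put(self, item, mult=1):
            assert mult > 0
            self._counts[item] = self._counts.get(item, 0) + mult
            self._size += mult

        def take(self, item, mult=1):
            # Remove up to `mult` copies; as in the C++ version, a partial take
            # still erases the key and returns False.
            assert mult > 0
            have = self._counts.get(item, 0)
            if have == 0:
                return False
            if have <= mult:
                del self._counts[item]
                self._size -= have
                return have == mult
            self._counts[item] -= mult
            self._size -= mult
            return True

        def mult(self, item):
            return self._counts.get(item, 0)

        def clear(self):
            self._counts.clear()
            self._size = 0

        def __len__(self):
            return self._size

        def __iter__(self):
            return iter(self._counts.items())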
diff --git a/src/utils/adt/bf.hpp b/src/common/adt/bf.hpp
similarity index 100%
rename from src/utils/adt/bf.hpp
rename to src/common/adt/bf.hpp
diff --git a/src/utils/adt/chained_iterator.hpp b/src/common/adt/chained_iterator.hpp
similarity index 100%
rename from src/utils/adt/chained_iterator.hpp
rename to src/common/adt/chained_iterator.hpp
diff --git a/src/common/adt/concurrent_dsu.hpp b/src/common/adt/concurrent_dsu.hpp
new file mode 100644
index 0000000..b45445c
--- /dev/null
+++ b/src/common/adt/concurrent_dsu.hpp
@@ -0,0 +1,297 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef CONCURRENTDSU_HPP_
+#define CONCURRENTDSU_HPP_
+
+#include "io/kmers/mmapped_writer.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdint>
+
+#include <algorithm>
+#include <vector>
+#include <unordered_map>
+#include <atomic>
+#include <fstream>
+
+// Silence bogus gcc warnings
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+
+class ConcurrentDSU {
+ struct atomic_set_t {
+ uint64_t data : 61;
+ uint64_t aux : 2;
+ bool root : 1;
+ } __attribute__ ((packed));
+
+ static_assert(sizeof(atomic_set_t) == 8, "Unexpected size of atomic_set_t");
+
+public:
+ ConcurrentDSU(size_t size)
+ : data_(size) {
+
+ for (size_t i = 0; i < size; i++)
+ data_[i] = {.data = 1, .aux = 0, .root = true};
+ }
+
+ ~ConcurrentDSU() { }
+
+ void unite(size_t x, size_t y) {
+ uint64_t x_size, y_size;
+ uint64_t x_aux, y_aux;
+
+ // Step one: update the links
+ while (true) {
+ x = find_set(x);
+ y = find_set(y);
+ if (x == y)
+ return;
+
+ atomic_set_t x_entry = data_[x], y_entry = data_[y];
+ // If someone already changed roots => retry
+ if (!x_entry.root || !y_entry.root)
+ continue;
+
+ // We need to link the smallest subtree to the largest
+ x_size = x_entry.data, y_size = y_entry.data;
+ x_aux = x_entry.aux, y_aux = y_entry.aux;
+ if (x_size > y_size || (x_size == y_size && x > y)) {
+ std::swap(x, y);
+ std::swap(x_size, y_size);
+ std::swap(x_aux, y_aux);
+ std::swap(x_entry, y_entry);
+ }
+
+ // Link 'x' to 'y'. If someone already changed 'x' => try again.
+ atomic_set_t new_x_entry = {.data = y, .aux = x_aux, .root = false};
+ if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
+ continue;
+
+ break;
+ }
+
+ // Step two: update the size. We already linked 'x' to 'y'. Therefore we
+ // need to add 'x_size' to whichever value is currently inside 'y'.
+ while (true) {
+ y = find_set(y);
+ atomic_set_t y_entry = data_[y];
+ // If someone already changed the roots => retry
+ if (!y_entry.root)
+ continue;
+
+ // Update the size. If someone already changed 'y' => try again.
+ atomic_set_t new_y_entry = {.data = x_size + y_entry.data, .aux = y_aux, .root = true};
+ if (!data_[y].compare_exchange_strong(y_entry, new_y_entry))
+ continue;
+
+ break;
+ }
+ }
+
+ size_t set_size(size_t i) const {
+ while (true) {
+ size_t el = find_set(i);
+ atomic_set_t entry = data_[el];
+ if (!entry.root)
+ continue;
+
+ return entry.data;
+ }
+ }
+
+ size_t find_set(size_t x) const {
+ // Step one: find the root
+ size_t r = x;
+ atomic_set_t r_entry = data_[r];
+ while (!r_entry.root) {
+ r = r_entry.data;
+ r_entry = data_[r];
+ }
+
+ // Step two: traverse the path from 'x' to root trying to update the links
+ // Note that the links might change, therefore we stop as soon as we
+ // end at 'some' root.
+ while (x != r) {
+ atomic_set_t x_entry = data_[x];
+ if (x_entry.root)
+ break;
+
+ // Try to update parent (may fail, it's ok)
+ atomic_set_t new_x_entry = {.data = r, .aux = x_entry.aux, .root = false};
+ data_[x].compare_exchange_weak(x_entry, new_x_entry);
+ x = x_entry.data;
+ }
+
+ return x;
+ }
+
+ bool same(size_t x, size_t y) const {
+ while (true) {
+ x = find_set(x);
+ y = find_set(y);
+ if (x == y)
+ return true;
+ if (data_[x].load().root)
+ return false;
+ }
+ }
+
+ size_t num_sets() const {
+ size_t count = 0;
+ for (const auto &entry : data_) {
+ count += entry.load(std::memory_order_relaxed).root;
+ }
+
+ return count;
+ }
+
+ bool is_root(size_t x) const {
+ return data_[x].load(std::memory_order_relaxed).root;
+ }
+
+ uint64_t aux(size_t x) const {
+ return data_[x].load(std::memory_order_relaxed).aux;
+ }
+
+ uint64_t root_aux(size_t x) const {
+ while (true) {
+ x = find_set(x);
+ atomic_set_t entry = data_[x];
+
+ if (!entry.root)
+ continue;
+
+ return entry.aux;
+ }
+ }
+
+ void set_aux(size_t x, uint64_t data) {
+ while (true) {
+ atomic_set_t x_entry = data_[x];
+ atomic_set_t new_x_entry = {.data = x_entry.data, .aux = data, .root = x_entry.root};
+ if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
+ continue;
+
+ break;
+ }
+ }
+
+ void set_root_aux(size_t x, uint64_t data) {
+ while (true) {
+ x = find_set(x);
+ atomic_set_t x_entry = data_[x];
+ if (!x_entry.root)
+ continue;
+
+ atomic_set_t new_x_entry = {.data = x_entry.data, .aux = data, .root = true};
+ if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
+ continue;
+
+ break;
+ }
+ }
+
+ size_t extract_to_file(const std::string &Prefix) {
+ // First, touch all the sets to make them directly connect to the root
+# pragma omp parallel for
+ for (size_t x = 0; x < data_.size(); ++x)
+ (void) find_set(x);
+
+ std::unordered_map<size_t, size_t> sizes;
+
+#if 0
+ for (size_t x = 0; x < size; ++x) {
+ if (data_[x].parent != x) {
+ size_t t = data_[x].parent;
+ VERIFY(data_[t].parent == t)
+ }
+ }
+#endif
+
+ // Insert all the root elements into the map
+ sizes.reserve(num_sets());
+ for (size_t x = 0; x < data_.size(); ++x) {
+ if (is_root(x))
+ sizes[x] = 0;
+ }
+
+ // Now, calculate the counts. We can do this in parallel, because we know no
+ // insertion can occur.
+# pragma omp parallel for
+ for (size_t x = 0; x < data_.size(); ++x) {
+ size_t &entry = sizes[parent(x)];
+# pragma omp atomic
+ entry += 1;
+ }
+
+ // Now we know the sizes of each cluster. Go over again and calculate the
+ // file-relative (cumulative) offsets.
+ size_t off = 0;
+ for (size_t x = 0; x < data_.size(); ++x) {
+ if (is_root(x)) {
+ size_t &entry = sizes[x];
+ size_t noff = off + entry;
+ entry = off;
+ off = noff;
+ }
+ }
+
+ // Write down the entries
+ std::vector<size_t> out(off);
+ for (size_t x = 0; x < data_.size(); ++x) {
+ size_t &entry = sizes[parent(x)];
+ out[entry++] = x;
+ }
+ std::ofstream os(Prefix, std::ios::binary | std::ios::out);
+ os.write((char *) &out[0], out.size() * sizeof(out[0]));
+ os.close();
+
+ // Write down the sizes
+ MMappedRecordWriter<size_t> index(Prefix + ".idx");
+ index.reserve(sizes.size());
+ size_t *idx = index.data();
+ for (size_t x = 0, i = 0, sz = 0; x < data_.size(); ++x) {
+ if (is_root(x)) {
+ idx[i++] = sizes[x] - sz;
+ sz = sizes[x];
+ }
+ }
+
+ return sizes.size();
+ }
+
+ void get_sets(std::vector<std::vector<size_t> > &otherWay) {
+ otherWay.resize(data_.size());
+ for (size_t i = 0; i < data_.size(); i++) {
+ size_t set = find_set(i);
+ otherWay[set].push_back(i);
+ }
+ otherWay.erase(remove_if(otherWay.begin(), otherWay.end(), zero_size),
+ otherWay.end());
+ }
+
+private:
+ size_t parent(size_t x) const {
+ atomic_set_t val = data_[x];
+ return (val.root ? x : val.data);
+ }
+
+ static bool zero_size(const std::vector<size_t> &v) {
+ return v.size() == 0;
+ }
+
+ mutable std::vector<std::atomic<atomic_set_t> > data_;
+};
+
+#pragma GCC diagnostic pop
+
+#endif /* CONCURRENTDSU_HPP_ */
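ConcurrentDSU packs (size, aux, root) into a single 8-byte word per element and performs union-by-size with path compression through compare-and-swap retry loops, so unite() and find_set() can be called from several OpenMP threads at once. The following sequential Python sketch keeps only the union-by-size and path-compression logic; it deliberately drops the atomics, the aux bits and extract_to_file, and is meant only to illustrate how sets are merged and looked up.

    class DSU:
        """Sequential union-by-size with path compression."""

        def __init__(self, n):
            self.parent = list(range(n))
            self.size = [1] * n

        def find_set(self, x):
            root = x
            while self.parent[root] != root:
                root = self.parent[root]
            # Path compression: point the traversed chain directly at the root.
            while self.parent[x] != root:
                self.parent[x], x = root, self.parent[x]
            return root

        def unite(self, x, y):
            x, y = self.find_set(x), self.find_set(y)
            if x == y:
                return
            # Link the smaller tree under the larger one; ties are broken by
            # index, as in the C++ code above.
            if self.size[x] > self.size[y] or (self.size[x] == self.size[y] and x > y):
                x, y = y, x
            self.parent[x] = y
            self.size[y] += self.size[x]

        def set_size(self, x):
            return self.size[self.find_set(x)]

        def same(self, x, y):
            return self.find_set(x) == self.find_set(y)

        def num_sets(self):
            return sum(1 for i, p in enumerate(self.parent) if p == i)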
diff --git a/src/utils/adt/filter_iterator.hpp b/src/common/adt/filter_iterator.hpp
similarity index 100%
rename from src/utils/adt/filter_iterator.hpp
rename to src/common/adt/filter_iterator.hpp
diff --git a/src/utils/adt/flat_map.hpp b/src/common/adt/flat_map.hpp
similarity index 100%
rename from src/utils/adt/flat_map.hpp
rename to src/common/adt/flat_map.hpp
diff --git a/src/utils/adt/flat_set.hpp b/src/common/adt/flat_set.hpp
similarity index 100%
rename from src/utils/adt/flat_set.hpp
rename to src/common/adt/flat_set.hpp
diff --git a/src/utils/adt/hll.hpp b/src/common/adt/hll.hpp
similarity index 100%
rename from src/utils/adt/hll.hpp
rename to src/common/adt/hll.hpp
diff --git a/src/utils/adt/iterator_range.hpp b/src/common/adt/iterator_range.hpp
similarity index 100%
rename from src/utils/adt/iterator_range.hpp
rename to src/common/adt/iterator_range.hpp
diff --git a/src/common/adt/kmer_hash_vector.hpp b/src/common/adt/kmer_hash_vector.hpp
new file mode 100644
index 0000000..fcc486f
--- /dev/null
+++ b/src/common/adt/kmer_hash_vector.hpp
@@ -0,0 +1,370 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * kmer_hash_vector.hpp
+ *
+ * Created on: Jul 19, 2012
+ * Author: alex
+ */
+
+#ifndef KMER_HASH_VECTOR_HPP_
+#define KMER_HASH_VECTOR_HPP_
+
+
+#include "sequence/runtime_k.hpp"
+#include "kmer_map.hpp"
+
+
+namespace runtime_k {
+
+class IKmerHashVector {
+
+protected:
+ static const size_t LOAD_OVERHEAD = 1000;
+
+ size_t nthreads_;
+
+ size_t cell_size_;
+
+public:
+ typedef RtSeq input_value_type;
+
+ IKmerHashVector(size_t nthreads)
+ : nthreads_ (nthreads)
+ , cell_size_ (LOAD_OVERHEAD) {
+ }
+
+ virtual ~IKmerHashVector() {
+
+ }
+
+ virtual IKmerHashVector * copy() const = 0;
+
+ virtual void clear() = 0;
+
+ virtual void clear(size_t i) = 0;
+
+ virtual bool is_full() const = 0;
+
+ virtual bool is_presisely_full() const = 0;
+
+ virtual size_t capacity(size_t i) const = 0;
+
+ virtual size_t size(size_t i) const = 0;
+
+
+ virtual void insert(const input_value_type& value) = 0;
+
+ virtual void reserve(size_t cell_size) = 0;
+
+
+ virtual size_t get_k() const = 0;
+
+ size_t get_threads_num() const
+ {
+ return nthreads_;
+ }
+
+ virtual void dump (KmerMap<int>& destination, size_t bucketNum) = 0;
+};
+
+
+
+class KmerHashVector {
+
+public:
+
+ typedef IKmerHashVector base_vector_type;
+
+private:
+
+ base_vector_type * data_;
+
+public:
+
+ typedef KmerHashVector vector_type;
+
+ typedef base_vector_type::input_value_type input_value_type;
+
+
+ KmerHashVector(size_t k, size_t nthreads);
+
+ KmerHashVector(base_vector_type * vec): data_(vec) {
+ }
+
+ KmerHashVector(const vector_type& vec) {
+ data_ = vec.data_->copy();
+ }
+
+ vector_type& operator=(const vector_type& vec) {
+ if (vec.data_ != data_) {
+ delete data_;
+ data_ = vec.data_->copy();
+ }
+
+ return *this;
+ }
+
+ ~KmerHashVector() {
+ delete data_;
+ }
+
+
+
+ bool is_full() const {
+ return data_->is_full();
+ }
+
+ bool is_presisely_full() const {
+ return data_->is_presisely_full();
+ }
+
+ size_t get_threads_num() const
+ {
+ return data_->get_threads_num();
+ }
+
+
+ void insert(const input_value_type& value) {
+ data_->insert(value);
+ }
+
+ void clear() {
+ data_->clear();
+ }
+
+
+ void clear(size_t i) {
+ data_->clear(i);
+ }
+
+ size_t get_k() const {
+ return data_->get_k();
+ }
+
+ size_t capacity(size_t i) const {
+ return data_->capacity(i);
+ }
+
+ void reserve(size_t cell_size) {
+ data_->reserve(cell_size);
+ }
+
+ base_vector_type * get_data() const {
+ return data_;
+ }
+
+ void print_sizes() {
+ for (size_t i = 0; i < data_->get_threads_num(); ++i) {
+ INFO("Size " << i << ": " << data_->size(i));
+ }
+ }
+
+ void dump (KmerMap<int>& destination, size_t bucketNum) {
+ data_->dump(destination, bucketNum);
+ }
+};
+
+
+// ================================= VECTOR IMPLEMENTATION =================================
+
+template <size_t size_>
+class KmerHashVectorImpl: public IKmerHashVector {
+
+public:
+
+ typedef TypeContainerImpl<size_> type_container;
+
+ typedef typename type_container::Kmer Kmer;
+
+ typedef typename type_container::vector_type vector_type;
+
+ typedef std::vector<vector_type> data_type;
+
+ typedef IKmerHashVector base_type;
+
+ typedef typename base_type::input_value_type input_value_type;
+
+private:
+
+ data_type data_;
+
+ size_t k_;
+
+public:
+
+ KmerHashVectorImpl(size_t k, size_t nthreads):
+ IKmerHashVector(nthreads)
+ , data_ (nthreads)
+ , k_ (k) {
+ }
+
+ virtual base_type * copy() const {
+ return new KmerHashVectorImpl<size_>(*this);
+ }
+
+ virtual bool is_full() const {
+ return data_[0].size() >= cell_size_;
+ }
+
+ virtual bool is_presisely_full() const {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ if (data_[i].size() >= cell_size_)
+ return true;
+ }
+ return false;
+ }
+
+ virtual void insert(const input_value_type& value) {
+ Kmer kmer = type_container::from_sequence(value);
+ data_[kmer.GetHash() % nthreads_].push_back(kmer);
+ }
+
+ virtual void clear() {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ data_[i].clear();
+ }
+ }
+
+ virtual void clear(size_t i) {
+ data_[i].clear();
+ }
+
+ virtual size_t get_k() const {
+ return k_;
+ }
+
+ virtual size_t capacity(size_t i) const {
+ return data_[i].capacity();
+ }
+
+ virtual size_t size(size_t i) const {
+ return data_[i].size();
+ }
+
+ virtual void reserve(size_t cell_size) {
+ cell_size_ = cell_size;
+ for (size_t i = 0; i < nthreads_; ++i) {
+ data_[i].reserve(cell_size_ + LOAD_OVERHEAD);
+ }
+ }
+
+ const data_type& get_data() const {
+ return data_;
+ }
+
+ virtual void dump (KmerMap<int>& destination, size_t bucketNum) {
+ KmerMapImpl<size_, int>& destImpl = dynamic_cast<KmerMapImpl<size_, int>&>(destination.get_data());
+
+ for (auto it = data_[bucketNum].begin(), end = data_[bucketNum].end(); it != end; ++it) {
+ ++destImpl[*it];
+ }
+ }
+};
+
+
+// ================================= VECTOR FACTORIES =================================
+// Single factory interface
+class SingleKmerHashVectorFactory {
+
+public:
+
+ virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const = 0;
+
+ virtual ~SingleKmerHashVectorFactory() {
+
+ }
+};
+
+
+// Single factory for specific k and value
+template <size_t ts_>
+class SingleKmerHashVectorFactoryImpl: public SingleKmerHashVectorFactory {
+
+public:
+
+ virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const {
+ VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
+ //INFO(k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
+
+ return new KmerHashVectorImpl< GET_K_BY_TS(ts_) >(k, nthreads);
+ }
+
+};
+
+//Factory generator
+template<size_t ts_>
+class HashVectorGenerator {
+
+public:
+
+ static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
+ factories[ts_] = new SingleKmerHashVectorFactoryImpl<ts_>();
+ HashVectorGenerator<ts_ - 1> :: GenerateHashVectors (factories);
+ }
+};
+
+//Terminating factory generator
+template<>
+class HashVectorGenerator<MIN_TS> {
+
+public:
+
+ static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
+ factories[MIN_TS] = new SingleKmerHashVectorFactoryImpl<MIN_TS>;
+ }
+};
+
+
+//Lazy singleton for factory for every required value
+class KmerHashVectorFactory {
+
+private:
+
+ std::vector < SingleKmerHashVectorFactory* > single_factories_;
+
+ KmerHashVectorFactory() {
+ VERIFY_MSG(MIN_K <= MAX_K, "Invalid K value range");
+
+ single_factories_ = std::vector < SingleKmerHashVectorFactory* >(MAX_TS + 1);
+ HashVectorGenerator<MAX_TS>::GenerateHashVectors(single_factories_);
+ }
+
+public:
+
+ static KmerHashVectorFactory& GetInstance() {
+ static KmerHashVectorFactory instance;
+
+ return instance;
+ }
+
+ KmerHashVector GetHashVector(size_t k, size_t nthreads) {
+ VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
+ ToString(MIN_K) + " and <= " + ToString(MAX_K));
+
+ return KmerHashVector(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads));
+ }
+
+ IKmerHashVector * GetRawHashVector(size_t k, size_t nthreads) {
+ VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
+ ToString(MIN_K) + " and <= " + ToString(MAX_K));
+
+ return single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads);
+ }
+};
+
+KmerHashVector GetHashVector(size_t k, size_t nthreads) {
+ return KmerHashVectorFactory::GetInstance().GetHashVector(k, nthreads);
+}
+
+KmerHashVector::KmerHashVector(size_t k, size_t nthreads): data_(KmerHashVectorFactory::GetInstance().GetRawHashVector(k, nthreads)) {
+}
+
+} //namespace runtime_k
+
+#endif /* KMER_HASH_VECTOR_HPP_ */
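KmerHashVectorImpl distributes incoming k-mers across per-thread buckets by hashing (data_[kmer.GetHash() % nthreads_]), so that every bucket can later be counted independently and without locking. A stripped-down sketch of the same partition-by-hash idea, using std::hash over strings as a stand-in for k-mers (none of the names below come from the SPAdes code):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        const std::size_t nbuckets = 4;
        std::vector<std::vector<std::string>> buckets(nbuckets);
        std::hash<std::string> hasher;

        // Stand-in "k-mers"; in SPAdes these would be RtSeq objects
        for (const std::string kmer : {"ACGT", "CGTA", "GTAC", "TACG", "ACGA"}) {
            buckets[hasher(kmer) % nbuckets].push_back(kmer);
        }

        // All occurrences of a given k-mer land in the same bucket, so per-bucket
        // counting needs no synchronization between buckets.
        for (std::size_t i = 0; i < nbuckets; ++i)
            std::cout << "bucket " << i << ": " << buckets[i].size() << " k-mers\n";
    }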
diff --git a/src/common/adt/kmer_vector.hpp b/src/common/adt/kmer_vector.hpp
new file mode 100644
index 0000000..2be2fb2
--- /dev/null
+++ b/src/common/adt/kmer_vector.hpp
@@ -0,0 +1,192 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __KMER_VECTOR_HPP__
+#define __KMER_VECTOR_HPP__
+
+#include "array_vector.hpp"
+#include "config.hpp"
+
+#ifdef SPADES_USE_JEMALLOC
+
+# include <jemalloc/jemalloc.h>
+
+#endif
+
+template<class Seq>
+class KMerVector {
+private:
+ typedef typename Seq::DataType ElTy;
+
+ ElTy *realloc() {
+#ifdef SPADES_USE_JEMALLOC
+ // First, try to expand in-place
+ if (storage_ && sizeof(ElTy) * capacity_ * el_sz_ > 4096 &&
+ je_rallocm((void **) &storage_, NULL, sizeof(ElTy) * capacity_ * el_sz_, 0, ALLOCM_NO_MOVE) ==
+ ALLOCM_SUCCESS)
+ return storage_;
+
+ // Failed, do usual malloc / memcpy / free cycle
+ ElTy *res = (ElTy *) je_malloc(sizeof(ElTy) * capacity_ * el_sz_);
+ if (storage_)
+ std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
+ je_free(storage_);
+ storage_ = res;
+#else
+ // No JEMalloc, no cookies
+ ElTy *res = new ElTy[capacity_ * el_sz_];
+ if (storage_)
+ std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
+
+ delete[] storage_;
+ storage_ = res;
+#endif
+
+ return storage_;
+ }
+
+public:
+ typedef typename array_vector<ElTy>::reference reference;
+ typedef typename array_vector<ElTy>::value_type value_type;
+ typedef typename array_vector<ElTy>::iterator iterator;
+ typedef typename array_vector<ElTy>::const_iterator const_iterator;
+
+ typedef array_less<ElTy> less2_fast;
+ typedef array_equal_to<ElTy> equal_to;
+
+ explicit KMerVector(unsigned K, size_t capacity = 1)
+ : K_(K), size_(0), capacity_(std::max(capacity, (size_t) 1)), el_sz_(Seq::GetDataSize(K)), storage_(NULL),
+ vector_(realloc(), size_, el_sz_) {
+ }
+
+ KMerVector(KMerVector &&that)
+ : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(that.storage_),
+ vector_(storage_, size_, el_sz_) {
+ that.storage_ = NULL;
+ }
+
+ KMerVector(const KMerVector &that)
+ : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(NULL),
+ vector_(realloc(), size_, el_sz_) {
+ memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
+ }
+
+ ~KMerVector() {
+#ifdef SPADES_USE_JEMALLOC
+ je_free(storage_);
+#else
+ delete[] storage_;
+#endif
+ }
+
+ KMerVector &operator=(const KMerVector &that) {
+ if (this != &that) {
+ K_ = that.K_;
+ size_ = that.size_;
+ capacity_ = that.capacity_;
+ el_sz_ = that.el_sz_;
+
+ // Release the old buffer before allocating a fresh one so it is not leaked
+#ifdef SPADES_USE_JEMALLOC
+ je_free(storage_);
+#else
+ delete[] storage_;
+#endif
+ storage_ = NULL;
+ realloc();
+ memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
+
+ vector_.set_data(storage_);
+ vector_.set_size(size_);
+ }
+
+ return *this;
+ }
+
+ void push_back(const ElTy *data) {
+ if (capacity_ == size_)
+ reserve(capacity_ * 2);
+
+ vector_[size_] = data;
+ size_ += 1;
+ vector_.set_size(size_);
+ }
+
+ void push_back(const Seq &s) {
+ push_back(s.data());
+ }
+
+ void push_back(reference s) {
+ push_back(s.data());
+ }
+
+ void push_back(const value_type &s) {
+ push_back(s.data());
+ }
+
+ void reserve(size_t amount) {
+ if (capacity_ < amount) {
+ capacity_ = amount;
+ vector_.set_data(realloc());
+ }
+ }
+
+ void clear() {
+ size_ = 0;
+ vector_.set_size(size_);
+ }
+
+ void shrink_to_fit() {
+ capacity_ = std::max(size_, size_t(1));
+ vector_.set_data(realloc());
+ }
+
+ iterator begin() {
+ return vector_.begin();
+ }
+
+ const_iterator begin() const {
+ return vector_.begin();
+ }
+
+ iterator end() {
+ return vector_.end();
+ }
+
+ const_iterator end() const {
+ return vector_.end();
+ }
+
+ const ElTy *data() const {
+ return storage_;
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+ size_t el_size() const {
+ return el_sz_;
+ }
+
+ size_t el_data_size() const {
+ return el_sz_ * sizeof(ElTy);
+ }
+
+ size_t capacity() const {
+ return capacity_;
+ }
+
+ const ElTy *operator[](size_t idx) const {
+ return vector_[idx];
+ }
+
+private:
+ unsigned K_;
+ size_t size_;
+ size_t capacity_;
+ size_t el_sz_;
+ ElTy *storage_;
+ array_vector<ElTy> vector_;
+};
+
+
+#endif /* __KMER_VECTOR_HPP__ */
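KMerVector packs each k-mer as el_sz_ consecutive machine words in one flat buffer and doubles its capacity when it fills up, which keeps the data contiguous and cheap to compare with array_less. A simplified standalone sketch of that packed, doubling storage scheme (mirroring the non-jemalloc branch above); the class and member names are illustrative only:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Flat storage: el_sz 64-bit words per entry, capacity doubles when full.
    class PackedVector {
        std::size_t el_sz_, size_, capacity_;
        std::uint64_t *storage_;

        void grow(std::size_t new_capacity) {
            std::uint64_t *fresh = new std::uint64_t[new_capacity * el_sz_];
            if (storage_)
                std::memcpy(fresh, storage_, size_ * el_sz_ * sizeof(std::uint64_t));
            delete[] storage_;
            storage_ = fresh;
            capacity_ = new_capacity;
        }

    public:
        explicit PackedVector(std::size_t el_sz)
                : el_sz_(el_sz), size_(0), capacity_(1), storage_(nullptr) { grow(1); }
        PackedVector(const PackedVector &) = delete;
        PackedVector &operator=(const PackedVector &) = delete;
        ~PackedVector() { delete[] storage_; }

        void push_back(const std::uint64_t *entry) {
            if (size_ == capacity_)
                grow(capacity_ * 2);                       // amortized O(1) growth
            std::memcpy(storage_ + size_ * el_sz_, entry, el_sz_ * sizeof(std::uint64_t));
            ++size_;
        }

        const std::uint64_t *operator[](std::size_t i) const { return storage_ + i * el_sz_; }
        std::size_t size() const { return size_; }
    };

    int main() {
        PackedVector v(2);                                 // two words per "k-mer"
        std::uint64_t a[2] = {1, 2}, b[2] = {3, 4};
        v.push_back(a);
        v.push_back(b);
        std::cout << v.size() << " entries, first word of entry 1: " << v[1][0] << '\n';
    }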
diff --git a/src/common/adt/loser_tree.hpp b/src/common/adt/loser_tree.hpp
new file mode 100644
index 0000000..7dbab36
--- /dev/null
+++ b/src/common/adt/loser_tree.hpp
@@ -0,0 +1,134 @@
+#pragma once
+
+#include "iterator_range.hpp"
+#include <vector>
+
+namespace adt {
+
+template<typename IntegerType>
+IntegerType ilog2(IntegerType x) {
+ IntegerType lg = 0;
+ while (x >= 256) { x >>= 8; lg += 8; }
+ while (x >>= 1) lg += 1;
+
+ return lg;
+}
+
+template<typename IntegerType>
+IntegerType ilog2ceil(IntegerType x) {
+ return ilog2(x - 1) + 1;
+}
+
+template<class It, class Cmp>
+class loser_tree {
+ typedef typename std::iterator_traits<It>::value_type value_type;
+
+ size_t log_k_;
+ size_t k_;
+ std::vector<size_t> entry_;
+ Cmp inner_cmp_;
+
+ bool cmp(const adt::iterator_range<It> &a, const adt::iterator_range<It> &b) const {
+ // Emulate sentinels
+ if (b.end() == b.begin())
+ return true;
+ if (a.end() == a.begin())
+ return false;
+
+ return inner_cmp_(*a.begin(), *b.begin());
+ }
+
+ size_t init_winner(size_t root) {
+ if (root >= k_)
+ return root - k_;
+
+ size_t left = init_winner(2 * root);
+ size_t right = init_winner(2 * root + 1);
+ if (cmp(runs_[left], runs_[right])) {
+ entry_[root] = right;
+ return left;
+ } else {
+ entry_[root] = left;
+ return right;
+ }
+ }
+
+ public:
+ loser_tree(const std::vector<adt::iterator_range<It>> &runs,
+ Cmp inner_cmp = Cmp())
+ : inner_cmp_(inner_cmp), runs_(runs) {
+ log_k_ = ilog2ceil(runs.size());
+ k_ = (size_t(1) << log_k_);
+
+ // fprintf(stderr, "k: %zu, logK: %zu, nruns: %zu\n", k_, log_k_, runs.size());
+
+ entry_.resize(2 * k_);
+ for (size_t i = 0; i < k_; ++i)
+ entry_[k_ + i] = i;
+
+ // Insert sentinels
+ for (size_t i = runs.size(); i < k_; ++i)
+ runs_.emplace_back(adt::make_range(runs_[0].end(), runs_[0].end()));
+
+ // Populate tree
+ entry_[0] = init_winner(1);
+
+ // for (const auto &entry : entry_)
+ // fprintf(stderr, "%zu, ", entry);
+ // fprintf(stderr, "\n");
+ }
+
+ size_t replay(size_t winner_index) {
+ auto &winner = runs_[winner_index];
+ if (winner.begin() == winner.end())
+ return winner_index;
+
+ winner = adt::make_range(std::next(winner.begin()), winner.end());
+ for (size_t i = (winner_index + k_) >> 1; i > 0; i >>= 1)
+ if (cmp(runs_[entry_[i]], runs_[winner_index]))
+ std::swap(entry_[i], winner_index);
+
+ return winner_index;
+ }
+
+ bool empty() const {
+ size_t winner_index = entry_[0];
+ const auto &winner = runs_[winner_index];
+ return (winner.begin() == winner.end());
+ }
+
+
+ template<class It2>
+ size_t multi_merge(It2 out, size_t amount = -1ULL) {
+ size_t cnt = 0;
+ size_t winner_index = entry_[0];
+
+ for (cnt = 0; cnt < amount; ++cnt) {
+ auto &winner = runs_[winner_index];
+ if (winner.begin() == winner.end())
+ break;
+
+ *out++ = *winner.begin();
+
+ winner_index = replay(winner_index);
+ }
+
+ entry_[0] = winner_index;
+
+ return cnt;
+ }
+
+ value_type pop() {
+ size_t winner_index = entry_[0];
+ value_type res = *runs_[winner_index].begin();
+ entry_[0] = replay(winner_index);
+
+ return res;
+ }
+
+
+ private:
+ std::vector<adt::iterator_range<It>> runs_;
+};
+
+}
\ No newline at end of file
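The loser tree above performs a k-way merge over sorted runs: each run is an adt::iterator_range, empty ranges act as sentinels, and entry_[0] always points at the run whose current head is smallest under Cmp. A hedged usage sketch merging three sorted vectors; the include paths are assumptions, while the constructor, multi_merge and empty/pop calls are taken from the class above.

    #include "adt/iterator_range.hpp"   // assumed include path within src/common
    #include "adt/loser_tree.hpp"

    #include <functional>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main() {
        std::vector<int> a = {1, 4, 7}, b = {2, 5, 8}, c = {3, 6, 9};
        using It = std::vector<int>::const_iterator;

        std::vector<adt::iterator_range<It>> runs;
        runs.push_back(adt::make_range(a.cbegin(), a.cend()));
        runs.push_back(adt::make_range(b.cbegin(), b.cend()));
        runs.push_back(adt::make_range(c.cbegin(), c.cend()));

        adt::loser_tree<It, std::less<int>> tree(runs);

        std::vector<int> merged;
        tree.multi_merge(std::back_inserter(merged));   // drains all runs in sorted order
        for (int x : merged)
            std::cout << x << ' ';                      // 1 2 3 4 5 6 7 8 9
        std::cout << '\n';
    }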
diff --git a/src/common/adt/parallel_seq_vector.hpp b/src/common/adt/parallel_seq_vector.hpp
new file mode 100644
index 0000000..44c8d6c
--- /dev/null
+++ b/src/common/adt/parallel_seq_vector.hpp
@@ -0,0 +1,110 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "parallel_unordered_map.hpp"
+#include "utils/openmp_wrapper.h"
+
+#include "sequence/runtime_k.hpp"
+#include "kmer_map.hpp"
+#include "kmer_hash_vector.hpp"
+
+class ParallelSeqVector {
+
+public:
+ typedef runtime_k::KmerHashVector par_container_t;
+
+ typedef runtime_k::KmerMap<int> destination_container_t;
+
+ typedef RtSeq Kmer;
+
+private:
+
+ size_t k_;
+
+ size_t nthreads_;
+
+ std::vector<par_container_t> nodes_;
+
+public:
+
+ ParallelSeqVector(size_t k, size_t nthreads, size_t cell_size) :
+ k_(k),
+ nthreads_(nthreads),
+ nodes_()
+
+ {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ nodes_.push_back(runtime_k::GetHashVector(k_, nthreads_));
+ }
+
+ for (size_t i = 0; i < nthreads_; ++i) {
+ nodes_[i].reserve(cell_size);
+ }
+ }
+
+
+ void AddEdge(const Kmer &kmer, size_t thread_number) {
+ nodes_[thread_number].insert(kmer);
+ }
+
+ void CountSequence(const Sequence& s, size_t thread_number) {
+ if (s.size() < k_)
+ return;
+
+ Kmer kmer = s.start<Kmer>(k_);
+
+ AddEdge(kmer, thread_number);
+ for (size_t j = k_; j < s.size(); ++j) {
+ kmer <<= s[j];
+ AddEdge(kmer, thread_number);
+ }
+
+ }
+//
+// void MergeMaps(destination_container_t & dest_container, size_t i) {
+// for (size_t j = 0; j < nthreads_; ++j) {
+// dest_container.transfer(nodes_[j], i);
+// }
+// }
+
+ void Dump(destination_container_t & bucket, size_t bucket_number) {
+ for (size_t i = 0; i < nodes_.size(); ++i) {
+ nodes_[i].dump(bucket, bucket_number);
+ nodes_[i].clear(bucket_number);
+ }
+ }
+
+
+ size_t SingleBucketCount() const {
+ return nodes_[0].capacity(0);
+ }
+
+ bool IsFull(size_t i) const {
+ return nodes_[i].is_full();
+ }
+
+ void Clear(size_t i) {
+ nodes_[i].clear();
+ }
+
+ void Clear() {
+ for (size_t i = 0; i < nthreads_; ++i) {
+ nodes_[i].clear();
+ }
+ }
+
+ void print_sizes() {
+ for (size_t i = 0; i < nodes_.size(); ++i) {
+ INFO("Size " << i << "::: ");
+ nodes_[i].print_sizes();
+ }
+ }
+
+
+};
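ParallelSeqVector gives every thread its own KmerHashVector, so sequences can be decomposed into k-mers without locking, and the per-thread buckets are only merged later in Dump. A self-contained sketch of the same count-privately-then-merge pattern with OpenMP and plain std::unordered_map counters (all names below are illustrative, not SPAdes API):

    #include <omp.h>

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        const std::vector<std::string> reads = {"ACGTACGT", "CGTACGTA", "GTACGTAC"};
        const std::size_t k = 4;

        const int nthreads = omp_get_max_threads();
        // One private counter per thread: no synchronization needed while counting
        std::vector<std::unordered_map<std::string, int>> local(nthreads);

    #pragma omp parallel for
        for (long i = 0; i < (long) reads.size(); ++i) {
            auto &mine = local[omp_get_thread_num()];
            const std::string &s = reads[i];
            for (std::size_t j = 0; j + k <= s.size(); ++j)
                ++mine[s.substr(j, k)];
        }

        // Merge the per-thread counters once the parallel region is done
        std::unordered_map<std::string, int> total;
        for (const auto &m : local)
            for (const auto &kv : m)
                total[kv.first] += kv.second;

        std::cout << "distinct 4-mers: " << total.size() << '\n';
    }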
diff --git a/src/utils/adt/parallel_unordered_map.hpp b/src/common/adt/parallel_unordered_map.hpp
similarity index 100%
rename from src/utils/adt/parallel_unordered_map.hpp
rename to src/common/adt/parallel_unordered_map.hpp
diff --git a/src/utils/adt/pointer_iterator.hpp b/src/common/adt/pointer_iterator.hpp
similarity index 100%
rename from src/utils/adt/pointer_iterator.hpp
rename to src/common/adt/pointer_iterator.hpp
diff --git a/src/common/adt/queue_iterator.hpp b/src/common/adt/queue_iterator.hpp
new file mode 100644
index 0000000..5a867af
--- /dev/null
+++ b/src/common/adt/queue_iterator.hpp
@@ -0,0 +1,143 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef QUEUE_ITERATOR_HPP_
+#define QUEUE_ITERATOR_HPP_
+
+#include "utils/verify.hpp"
+#include <set>
+
+template<typename T, typename Comparator>
+class erasable_priority_queue {
+private:
+ std::set<T, Comparator> storage_;
+public:
+ /*
+ * Be careful! This constructor requires Comparator to have a default constructor even if you call it with
+ * a specified comparator. In that case just create a default constructor with VERIFY(false) inside it.
+ */
+ erasable_priority_queue(const Comparator& comparator = Comparator()) :
+ storage_(comparator) {
+ }
+
+ template<typename InputIterator>
+ erasable_priority_queue(InputIterator begin, InputIterator end,
+ const Comparator& comparator = Comparator()) :
+ storage_(begin, end, comparator) {
+ }
+
+ void pop() {
+ VERIFY(!storage_.empty());
+ storage_.erase(storage_.begin());
+ }
+
+ const T& top() const {
+ VERIFY(!storage_.empty());
+ return *(storage_.begin());
+ }
+
+ void push(const T& key) {
+ storage_.insert(key);
+ }
+
+ bool erase(const T& key) {
+ bool res = storage_.erase(key) > 0;
+ return res;
+ }
+
+ void clear() {
+ storage_.clear();
+ }
+
+ bool empty() const {
+ return storage_.empty();
+ }
+
+ size_t size() const {
+ return storage_.size();
+ }
+
+ template <class InputIterator>
+ void insert ( InputIterator first, InputIterator last ) {
+ storage_.insert(first, last);
+ }
+
+};
+
+template<typename T, typename Comparator = std::less<T>>
+class DynamicQueueIterator {
+
+ bool current_actual_;
+ bool current_deleted_;
+ T current_;
+ erasable_priority_queue<T, Comparator> queue_;
+
+public:
+
+ DynamicQueueIterator(const Comparator& comparator = Comparator()) :
+ current_actual_(false), current_deleted_(false), queue_(comparator) {
+ }
+
+ template<typename InputIterator>
+ void insert(InputIterator begin, InputIterator end) {
+ queue_.insert(begin, end);
+ }
+
+ void push(const T& to_add) {
+ queue_.push(to_add);
+ }
+
+ void erase(const T& to_remove) {
+ if (current_actual_ && to_remove == current_) {
+ current_deleted_ = true;
+ }
+ queue_.erase(to_remove);
+ }
+
+ void clear() {
+ queue_.clear();
+ current_actual_ = false;
+ current_deleted_ = false;
+ }
+
+ bool IsEnd() const {
+ return queue_.empty();
+ }
+
+ size_t size() const {
+ return queue_.size();
+ }
+
+ const T& operator*() {
+ VERIFY(!queue_.empty());
+ if(!current_actual_ || current_deleted_) {
+ current_ = queue_.top();
+ current_actual_ = true;
+ current_deleted_ = false;
+ }
+ return current_;
+ }
+
+ void operator++() {
+ if (!current_actual_) {
+ queue_.pop();
+ } else if (!current_deleted_) {
+ queue_.erase(current_);
+ }
+ current_actual_ = false;
+ }
+
+ //use carefully!
+ void ReleaseCurrent() {
+ current_actual_ = false;
+ }
+
+};
+
+
+#endif /* QUEUE_ITERATOR_HPP_ */
+
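DynamicQueueIterator behaves like a priority queue that tolerates erasing elements, including the current one, while it is being traversed; this is what lets graph simplification keep iterating as edges and vertices disappear. A hedged usage sketch over plain ints (the include path is an assumption; push, IsEnd, operator*, operator++ and erase come from the class above):

    #include "adt/queue_iterator.hpp"   // assumed include path within src/common

    #include <iostream>

    int main() {
        DynamicQueueIterator<int> it;    // std::less<int>: smallest element first
        for (int x : {5, 1, 4, 2, 3})
            it.push(x);

        while (!it.IsEnd()) {
            int current = *it;
            std::cout << "processing " << current << '\n';
            if (current == 1)
                it.erase(3);             // removing a not-yet-visited element is fine
            ++it;
        }                                // prints 1, 2, 4, 5 (3 was erased)
    }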
diff --git a/src/utils/adt/small_pod_vector.hpp b/src/common/adt/small_pod_vector.hpp
similarity index 100%
rename from src/utils/adt/small_pod_vector.hpp
rename to src/common/adt/small_pod_vector.hpp
diff --git a/src/common/assembly_graph/CMakeLists.txt b/src/common/assembly_graph/CMakeLists.txt
new file mode 100644
index 0000000..953a25e
--- /dev/null
+++ b/src/common/assembly_graph/CMakeLists.txt
@@ -0,0 +1,12 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(assembly_graph CXX)
+
+add_library(assembly_graph STATIC
+ components/connected_component.cpp paths/bidirectional_path.cpp paths/bidirectional_path_io/io_support.cpp paths/bidirectional_path_io/bidirectional_path_output.cpp graph_support/scaff_supplementary.cpp ../modules/alignment/edge_index_refiller.cpp graph_support/coverage_uniformity_analyzer.cpp)
+target_link_libraries(assembly_graph hattrie)
diff --git a/src/modules/assembly_graph/components/component_filters.hpp b/src/common/assembly_graph/components/component_filters.hpp
similarity index 100%
rename from src/modules/assembly_graph/components/component_filters.hpp
rename to src/common/assembly_graph/components/component_filters.hpp
diff --git a/src/modules/assembly_graph/components/connected_component.cpp b/src/common/assembly_graph/components/connected_component.cpp
similarity index 100%
rename from src/modules/assembly_graph/components/connected_component.cpp
rename to src/common/assembly_graph/components/connected_component.cpp
diff --git a/src/common/assembly_graph/components/connected_component.hpp b/src/common/assembly_graph/components/connected_component.hpp
new file mode 100644
index 0000000..2fa958f
--- /dev/null
+++ b/src/common/assembly_graph/components/connected_component.hpp
@@ -0,0 +1,26 @@
+//
+// Created by lab42 on 8/24/15.
+//
+#pragma once
+#include <map>
+//#include "path_extend/bidirectional_path.hpp"
+#include "assembly_graph/core/graph.hpp"
+
+namespace debruijn_graph{
+
+class ConnectedComponentCounter {
+public:
+ mutable std::map<EdgeId, size_t> component_ids_;
+ mutable std::map<size_t, size_t> component_edges_quantity_;
+ mutable std::map<size_t, size_t> component_total_len_;
+ const Graph &g_;
+ ConnectedComponentCounter(const Graph &g):g_(g) {}
+ void CalculateComponents() const;
+// size_t GetComponent(path_extend::BidirectionalPath * p) const;
+ size_t GetComponent(EdgeId & e) const;
+ bool IsFilled() const {
+ return (component_ids_.size() != 0);
+ }
+
+};
+}
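ConnectedComponentCounter assigns a numeric component id to every edge of the assembly graph; CalculateComponents fills the cached maps and GetComponent then answers per-edge queries. A hedged usage sketch (the include path and the debruijn_graph::Graph spelling are assumptions; the calls themselves come from the class above):

    #include "assembly_graph/components/connected_component.hpp"  // assumed include path

    #include <iostream>

    // Assumes g is an already-constructed assembly graph and e one of its edges.
    void PrintComponentId(const debruijn_graph::Graph &g,
                          debruijn_graph::Graph::EdgeId e) {
        debruijn_graph::ConnectedComponentCounter counter(g);
        counter.CalculateComponents();      // fills component_ids_ for all edges
        std::cout << "edge belongs to component " << counter.GetComponent(e) << '\n';
    }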
diff --git a/src/common/assembly_graph/components/graph_component.hpp b/src/common/assembly_graph/components/graph_component.hpp
new file mode 100644
index 0000000..2abcaec
--- /dev/null
+++ b/src/common/assembly_graph/components/graph_component.hpp
@@ -0,0 +1,226 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/standard_base.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+class GraphComponent {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename std::set<VertexId>::const_iterator vertex_iterator;
+ typedef typename std::set<EdgeId>::const_iterator edge_iterator;
+ const Graph& graph_;
+ std::set<VertexId> vertices_;
+ std::set<EdgeId> edges_;
+ std::set<VertexId> exits_;
+ std::set<VertexId> entrances_;
+ std::string name_;
+
+ template<class VertexIt>
+ void FillVertices(VertexIt begin, VertexIt end, bool add_conjugate = false) {
+ for (auto it = begin; it != end; ++it) {
+ vertices_.insert(*it);
+ if (add_conjugate)
+ vertices_.insert(graph_.conjugate(*it));
+ }
+ }
+
+ template<class EdgeIt>
+ void FillEdges(EdgeIt begin, EdgeIt end, bool add_conjugate = false) {
+ for (auto it = begin; it != end; ++it) {
+ edges_.insert(*it);
+ if (add_conjugate)
+ edges_.insert(graph_.conjugate(*it));
+ }
+ }
+
+ void FillInducedEdges() {
+ for (VertexId v : vertices_) {
+ for (EdgeId e : graph_.OutgoingEdges(v)) {
+ if (vertices_.count(graph_.EdgeEnd(e)) > 0) {
+ edges_.insert(e);
+ }
+ }
+ }
+ }
+
+ void FillRelevantVertices() {
+ for (EdgeId e : edges_) {
+ vertices_.insert(graph_.EdgeStart(e));
+ vertices_.insert(graph_.EdgeEnd(e));
+ }
+ }
+
+ void FindEntrancesAndExits() {
+ for (auto v : vertices_) {
+ for (auto e : graph_.IncomingEdges(v)) {
+ if (!contains(e)) {
+ entrances_.insert(v);
+ break;
+ }
+ }
+
+ for (auto e : graph_.OutgoingEdges(v)) {
+ if (!contains(e)) {
+ exits_.insert(v);
+ break;
+ }
+ }
+ }
+ }
+
+ void Swap(GraphComponent<Graph> &that) {
+ VERIFY(&this->graph_ == &that.graph_);
+ std::swap(this->name_, that.name_);
+ std::swap(this->vertices_, that.vertices_);
+ std::swap(this->edges_, that.edges_);
+ std::swap(this->exits_, that.exits_);
+ std::swap(this->entrances_, that.entrances_);
+ }
+
+ template<class EdgeIt>
+ void FillFromEdges(EdgeIt begin, EdgeIt end,
+ bool add_conjugate) {
+ FillEdges(begin, end, add_conjugate);
+ FillRelevantVertices();
+ FindEntrancesAndExits();
+ }
+
+ GraphComponent<Graph> &operator=(const GraphComponent<Graph> &);
+ GraphComponent(const GraphComponent<Graph> &);
+
+public:
+
+ template<class VertexIt>
+ static GraphComponent FromVertices(const Graph &g, VertexIt begin, VertexIt end,
+ bool add_conjugate = false, const string &name = "") {
+ GraphComponent answer(g, name);
+ answer.FillVertices(begin, end, add_conjugate);
+ answer.FillInducedEdges();
+ answer.FindEntrancesAndExits();
+ return answer;
+ }
+
+ template<class EdgeIt>
+ static GraphComponent FromEdges(const Graph &g, EdgeIt begin, EdgeIt end,
+ bool add_conjugate = false, const string &name = "") {
+ GraphComponent answer(g, name);
+ answer.FillFromEdges(begin, end, add_conjugate);
+ return answer;
+ }
+
+ template<class Container>
+ static GraphComponent FromVertices(const Graph &g, const Container &c,
+ bool add_conjugate = false, const string &name = "") {
+ return FromVertices(g, c.begin(), c.end(), add_conjugate, name);
+ }
+
+ template<class Container>
+ static GraphComponent FromEdges(const Graph &g, const Container &c,
+ bool add_conjugate = false, const string &name = "") {
+ return FromEdges(g, c.begin(), c.end(), add_conjugate, name);
+ }
+
+ static GraphComponent WholeGraph(const Graph &g, const string &name = "") {
+ return FromVertices(g, g.begin(), g.end(), false, name);
+ }
+
+ static GraphComponent Empty(const Graph &g, const string &name = "") {
+ return GraphComponent(g, name);
+ }
+
+ GraphComponent(const Graph &g, const string &name = "") :
+ graph_(g), name_(name) {
+ }
+
+ //may be used for conjugate closure
+ GraphComponent(const GraphComponent& component,
+ bool add_conjugate,
+ const string &name = "") : graph_(component.graph_), name_(name) {
+ FillFromEdges(component.e_begin(), component.e_end(), add_conjugate);
+ }
+
+ GraphComponent(GraphComponent&& that) : graph_(that.graph_) {
+ Swap(that);
+ }
+
+ GraphComponent<Graph> &operator=(GraphComponent<Graph> &&that) {
+ Swap(that);
+ return *this;
+ }
+
+ const Graph& g() const {
+ return graph_;
+ }
+
+ string name() const {
+ return name_;
+ }
+
+ size_t v_size() const {
+ return vertices_.size();
+ }
+
+ size_t e_size() const {
+ return edges_.size();
+ }
+
+ bool contains(EdgeId e) const {
+ return edges_.count(e) > 0;
+ }
+
+ bool contains(VertexId v) const {
+ return vertices_.count(v) > 0;
+ }
+
+ edge_iterator e_begin() const {
+ return edges_.begin();
+ }
+
+ edge_iterator e_end() const {
+ return edges_.end();
+ }
+
+ const std::set<EdgeId>& edges() const {
+ return edges_;
+ }
+
+ const std::set<VertexId>& vertices() const{
+ return vertices_;
+ }
+
+ vertex_iterator v_begin() const {
+ return vertices_.begin();
+ }
+
+ vertex_iterator v_end() const {
+ return vertices_.end();
+ }
+
+ const std::set<VertexId>& exits() const {
+ return exits_;
+ }
+
+ const std::set<VertexId>& entrances() const {
+ return entrances_;
+ }
+
+ bool IsBorder(VertexId v) const {
+ return exits_.count(v) || entrances_.count(v);
+ }
+
+ bool empty() const {
+ return v_size() == 0;
+ }
+
+};
+
+}
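GraphComponent is a lightweight view over a subset of a graph's vertices and edges; entrances and exits are the border vertices that have incident edges outside the component. The sketch below shows typical use, written as a function template so it only relies on the Graph interface already used by GraphComponent itself (the function name and the std::set of edges are illustrative; the include path is assumed):

    #include "assembly_graph/components/graph_component.hpp"  // assumed include path

    #include <iostream>
    #include <set>

    // Build a component from a set of edges (plus conjugates) and report its border.
    template<class Graph>
    void ReportComponent(const Graph &g, const std::set<typename Graph::EdgeId> &edges) {
        auto component = omnigraph::GraphComponent<Graph>::FromEdges(
                g, edges.begin(), edges.end(), /*add_conjugate=*/true, "example");

        std::cout << component.name() << ": "
                  << component.v_size() << " vertices, "
                  << component.e_size() << " edges, "
                  << component.entrances().size() << " entrances, "
                  << component.exits().size() << " exits\n";
    }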
diff --git a/src/common/assembly_graph/components/splitters.hpp b/src/common/assembly_graph/components/splitters.hpp
new file mode 100644
index 0000000..9aa5d0d
--- /dev/null
+++ b/src/common/assembly_graph/components/splitters.hpp
@@ -0,0 +1,882 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/standard_base.hpp"
+#include "graph_component.hpp"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
+#include "component_filters.hpp"
+
+namespace omnigraph {
+
+
+template<typename Element>
+class JSIterator {
+public:
+
+ virtual Element Next() = 0;
+
+ virtual bool HasNext() = 0;
+
+ virtual ~JSIterator() {
+ }
+};
+
+template<class Graph>
+class GraphSplitter : public JSIterator<GraphComponent<Graph>>{
+private:
+ const Graph& graph_;
+public:
+ GraphSplitter(const Graph& graph)
+ : graph_(graph) {
+ }
+
+ const Graph& graph() const {
+ return graph_;
+ }
+protected:
+ //todo remove after returning to optional
+ std::unique_ptr<GraphComponent<Graph>> MakeUniquePtr(GraphComponent<Graph>&& component) const {
+ return std::unique_ptr<GraphComponent<Graph>>(new GraphComponent<Graph>(std::move(component)));
+ }
+
+ //todo remove after returning to optional
+ GraphComponent<Graph> GetValueAndReset(std::unique_ptr<GraphComponent<Graph>>& component_ptr) const {
+ VERIFY(component_ptr);
+ auto answer = std::move(*component_ptr);
+ component_ptr = nullptr;
+ return answer;
+ }
+};
+
+template<class Graph>
+class PrecountedComponentSplitter : public GraphSplitter<Graph> {
+ bool HasNext_;
+ GraphComponent<Graph> component_;
+public:
+
+ template<class It>
+ PrecountedComponentSplitter(const Graph &graph, It begin, It end)
+ : GraphSplitter<Graph>(graph), HasNext_(false),
+ component_(graph, begin, end) {
+ }
+
+ template<class It>
+ PrecountedComponentSplitter(GraphComponent<Graph> component)
+ : GraphSplitter<Graph>(component.g()), HasNext_(false),
+ component_(component) {
+ }
+
+ GraphComponent<Graph> Next() {
+ HasNext_ = false;
+ return component_;
+ }
+
+// virtual bool CheckPutVertex(VertexId /*vertex*/, EdgeId edge, size_t /*length*/) const {
+// return edges_.count(edge) != 0;
+// }
+ bool HasNext() {
+ return HasNext_;
+ }
+};
+
+template<typename Element>
+class RelaxingIterator : public JSIterator<Element> {
+public:
+ template<typename It>
+ void Relax(It begin, It end) {
+ Relax(vector<Element>(begin, end));
+ }
+
+// virtual bool CheckProcessVertex(VertexId /*vertex*/, size_t distance) {
+// return distance <= bound_;
+// }
+ virtual void Relax(const vector<Element> &v) = 0;
+
+ virtual void Relax(Element) = 0;
+
+ virtual ~RelaxingIterator() {
+ }
+};
+
+template<class Collection>
+class CollectionIterator : public RelaxingIterator<typename Collection::value_type> {
+private:
+ typedef typename Collection::value_type Element;
+ typedef typename Collection::const_iterator Iter;
+ shared_ptr<Collection> storage_;
+ Iter current_;
+ const Iter end_;
+ set<Element> relaxed_;
+public:
+ CollectionIterator(const Collection &collection)
+ : current_(collection.begin()), end_(collection.end()) {
+ }
+
+ CollectionIterator(shared_ptr<Collection> collection)
+ : storage_(collection), current_(collection->begin()), end_(collection->end()) {
+ }
+
+ CollectionIterator(Iter begin, Iter end)
+ : current_(begin), end_(end) {
+ }
+
+ Element Next() {
+ if(!HasNext()) { //Note: HasNext() advances current_ past relaxed elements, so this call is not just a check!
+ //fixme use VERIFY_MSG instead
+ VERIFY(HasNext());
+ }
+ Element next = *current_;
+ ++current_;
+ return next;
+ }
+
+ bool HasNext() {
+ while(current_ != end_ && relaxed_.count(*current_) == 1) {
+ ++current_;
+ }
+ return current_ != end_;
+ }
+
+ void Relax(Element e) {
+ relaxed_.insert(e);
+ }
+
+ void Relax(const vector<Element> &v) {
+ for (auto it = v.begin(); it != v.end(); ++it)
+ Relax(*it);
+ }
+
+ virtual ~CollectionIterator() {
+ }
+};
+
+template<class Graph>
+class PathIterator : public RelaxingIterator<typename Graph::VertexId> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ vector<VertexId> path_;
+ size_t current_;
+
+ static vector<VertexId> ExtractVertices(const Graph &graph, const vector<EdgeId> &path) {
+ vector<VertexId> result;
+ for(size_t i = 0; i < path.size(); i++) {
+ if(i == 0 || path[i] != path[i - 1]) {
+ result.push_back(graph.EdgeStart(path[i]));
+ result.push_back(graph.EdgeEnd(path[i]));
+ }
+ }
+ return result;
+ }
+
+public:
+ PathIterator(const Graph &graph, const vector<EdgeId> &path)
+ : graph_(graph), path_(ExtractVertices(graph, path)), current_(0) {
+ }
+
+ VertexId Next() {
+ if(!HasNext()) {
+ VERIFY(HasNext());
+ }
+ VertexId next = path_[current_];
+ Relax(next);
+ return next;
+ }
+
+ bool HasNext() {
+ return current_ < path_.size();
+ }
+
+ void Relax(const vector<VertexId> &v) {
+ set<VertexId> toRelax(v.begin(), v.end());
+ while(current_ < path_.size() && toRelax.count(path_[current_]) == 1)
+ current_++;
+ }
+
+ void Relax(VertexId e) {
+ Relax(vector<VertexId>({e}));
+ }
+};
+
+template<class Graph>
+class AbstractNeighbourhoodFinder {
+private:
+ const Graph &graph_;
+public:
+ AbstractNeighbourhoodFinder(const Graph &graph) : graph_(graph) {
+ }
+
+ const Graph &graph() const {
+ return graph_;
+ }
+
+ virtual GraphComponent<Graph> Find(typename Graph::VertexId v) const = 0;
+
+ virtual vector<typename Graph::VertexId> InnerVertices(const GraphComponent<Graph> &component) const = 0;
+
+ virtual ~AbstractNeighbourhoodFinder() {
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class ComponentCloser {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &graph_;
+ size_t edge_length_bound_;
+
+public:
+ ComponentCloser(const Graph &graph, size_t edge_length_bound)
+ : graph_(graph),
+ edge_length_bound_(edge_length_bound) {
+ }
+
+ void CloseComponent(set<VertexId> &component) const {
+ set<VertexId> additional_vertices;
+ for (auto it = component.begin(); it != component.end(); ++it) {
+ for (EdgeId e : graph_.OutgoingEdges(*it)) {
+ if (graph_.length(e) >= edge_length_bound_) {
+ additional_vertices.insert(graph_.EdgeEnd(e));
+ }
+ }
+ for (EdgeId e : graph_.IncomingEdges(*it)) {
+ if (graph_.length(e) >= edge_length_bound_) {
+ additional_vertices.insert(graph_.EdgeStart(e));
+ }
+ }
+ }
+ component.insert(additional_vertices.begin(),
+ additional_vertices.end());
+ }
+
+ GraphComponent<Graph> CloseComponent(const GraphComponent<Graph>& component) const {
+ set<VertexId> vertices(component.v_begin(), component.v_end());
+ CloseComponent(vertices);
+ return GraphComponent<Graph>::FromVertices(graph_, vertices);
+ }
+};
+
+template<class Graph>
+class HighCoverageComponentFinder : public AbstractNeighbourhoodFinder<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ class CoverageBoundedDFS {
+ private:
+ const Graph &graph_;
+ const double coverage_bound_;
+ const size_t edge_limit_;
+ mutable size_t edge_summary_length_;
+
+ void Find(EdgeId edge, std::set<EdgeId> &result) const {
+ if (result.size() > edge_limit_) {
+ return;
+ }
+
+ if (math::ls(graph_.coverage(edge), coverage_bound_)) {
+ return;
+ }
+
+ if (result.count(edge) || result.count(graph_.conjugate(edge))) {
+ return;
+ }
+
+ edge_summary_length_ += graph_.length(edge);
+ result.insert(edge);
+ result.insert(graph_.conjugate(edge));
+
+ VertexId v = graph_.EdgeEnd(edge);
+ for (auto e : graph_.IncidentEdges(v)) {
+ Find(e, result);
+ }
+
+ v = graph_.EdgeStart(edge);
+ for (auto e : graph_.IncidentEdges(v)) {
+ Find(e, result);
+ }
+ }
+
+ public:
+ CoverageBoundedDFS(const Graph &graph, double coverage_bound,
+ size_t edge_limit = 10000)
+ : graph_(graph),
+ coverage_bound_(coverage_bound),
+ edge_limit_(edge_limit),
+ edge_summary_length_(0) {
+ }
+
+ std::set<EdgeId> Find(VertexId v) const {
+ edge_summary_length_ = 0;
+ std::set<EdgeId> result;
+ for (auto e : graph_.OutgoingEdges(v)) {
+ Find(e, result);
+ }
+ for (auto e : graph_.IncomingEdges(v)) {
+ Find(e, result);
+ }
+ return result;
+ }
+
+ size_t EdgeSummaryLength() const {
+ return edge_summary_length_;
+ }
+ };
+
+
+ const double coverage_bound_;
+ CoverageBoundedDFS dfs_helper;
+
+public:
+ HighCoverageComponentFinder(const Graph &graph, double max_coverage)
+ : AbstractNeighbourhoodFinder<Graph>(graph), coverage_bound_(max_coverage), dfs_helper(graph, max_coverage) {
+ }
+
+ GraphComponent<Graph> Find(typename Graph::VertexId v) const {
+ std::set<EdgeId> result = dfs_helper.Find(v);
+ return GraphComponent<Graph>::FromEdges(this->graph(), result, false);
+ }
+
+ size_t EdgeSummaryLength(VertexId v) const {
+ GraphComponent<Graph> component = Find(v);
+ DEBUG("Summary edge length for vertex " << v.int_id() << " is " << dfs_helper.EdgeSummaryLength());
+ return dfs_helper.EdgeSummaryLength();
+ }
+
+ vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) const {
+ return vector<VertexId>(component.v_begin(), component.v_end());
+ }
+};
+
+
+//This class finds a neighbourhood of a set of vertices. Vertices connected by an edge longer than edge_length_bound (500 by default) are not considered adjacent.
+template<class Graph>
+class ReliableNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ set<VertexId> FindNeighbours(const set<VertexId> &s) const {
+ set<VertexId> result(s.begin(), s.end());
+ for (VertexId v : result) {
+ for (EdgeId e : this->graph().IncidentEdges(v)) {
+ if(this->graph().length(e) <= edge_length_bound_) {
+ result.insert(this->graph().EdgeEnd(e));
+ result.insert(this->graph().EdgeStart(e));
+ }
+ }
+ }
+ return result;
+ }
+
+ set<VertexId> FindNeighbours(const set<VertexId> &s, size_t eps) const {
+ set<VertexId> result = s;
+ for(size_t i = 0; i < eps; i++) {
+ result = FindNeighbours(result);
+ }
+ return result;
+ }
+
+ set<VertexId> FindBorder(const GraphComponent<Graph>& component) const {
+ set<VertexId> result;
+ insert_all(result, component.entrances());
+ insert_all(result, component.exits());
+ return result;
+ }
+
+public:
+ static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
+ static const size_t DEFAULT_MAX_SIZE = 100;
+
+ const size_t edge_length_bound_;
+ const size_t max_size_;
+
+ ReliableNeighbourhoodFinder(const Graph &graph, size_t edge_length_bound =
+ DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = DEFAULT_MAX_SIZE)
+ : AbstractNeighbourhoodFinder<Graph>(graph),
+ edge_length_bound_(edge_length_bound),
+ max_size_(max_size) {
+ }
+
+ GraphComponent<Graph> Find(typename Graph::VertexId v) const {
+ auto cd = DijkstraHelper<Graph>::CreateCountingDijkstra(this->graph(), max_size_,
+ edge_length_bound_);
+ cd.Run(v);
+ vector<VertexId> result_vector = cd.ReachedVertices();
+ set<VertexId> result(result_vector.begin(), result_vector.end());
+ ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
+ cc.CloseComponent(result);
+ return GraphComponent<Graph>::FromVertices(this->graph(), result);
+ }
+
+ vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) const {
+ set<VertexId> border = FindNeighbours(FindBorder(component), 2);
+ std::vector<VertexId> result;
+ std::set_difference(component.vertices().begin(), component.vertices().end(),
+ border.begin(), border.end(),
+ std::inserter(result, result.end()));
+ return result;
+ }
+};
+
+template<class Graph>
+class PathNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ VertexId OtherEnd(EdgeId e, VertexId v) const {
+ if (this->graph().EdgeStart(e) == v)
+ return this->graph().EdgeEnd(e);
+ else
+ return this->graph().EdgeStart(e);
+ }
+
+ bool Go(VertexId v, size_t curr_depth, set<VertexId>& grey, set<VertexId>& black) const {
+ //allows single vertex to be visited many times with different depth values
+ TRACE("Came to vertex " << this->graph().str(v) << " on depth " << curr_depth);
+ if (curr_depth >= max_depth_) {
+ TRACE("Too deep");
+ return true;
+ }
+ if (grey.size() >= max_size_) {
+ TRACE("Too many vertices");
+ return false;
+ }
+
+ TRACE("Started processing of vertex " << this->graph().str(v));
+ grey.insert(v);
+
+ TRACE("Sorting incident edges");
+ vector<EdgeId> incident_path;
+ vector<EdgeId> incident_non_path;
+ for (EdgeId e : this->graph().IncidentEdges(v)) {
+ if (path_edges_.count(e) != 0) {
+ /*condition not to go backward*/
+ if (this->graph().EdgeStart(e) == v) {
+ incident_path.push_back(e);
+ }
+ } else {
+ incident_non_path.push_back(e);
+ }
+ }
+
+ for (EdgeId e : incident_non_path) {
+ if (this->graph().length(e) > edge_length_bound_) {
+ TRACE("Edge " << this->graph().str(e) << " is too long");
+ continue;
+ }
+ TRACE("Going along edge " << this->graph().str(e));
+ if (!Go(OtherEnd(e, v), curr_depth + 1, grey, black))
+ return false;
+ }
+
+ TRACE("End processing of vertex " << this->graph().str(v));
+ black.insert(v);
+
+ for (EdgeId e : incident_path) {
+ if (grey.count(OtherEnd(e, v)) != 0)
+ continue;
+ TRACE("Going along next path edge " << this->graph().str(e));
+ if (!Go(OtherEnd(e, v), 0, grey, black))
+ return false;
+ }
+
+ return true;
+ }
+
+public:
+ static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
+ static const size_t DEFAULT_MAX_DEPTH = 2;
+ static const size_t DEFAULT_MAX_SIZE = 20;
+
+ set<EdgeId> path_edges_;
+ const size_t edge_length_bound_;
+ const size_t max_size_;
+ const size_t max_depth_;
+
+ mutable set<VertexId> last_inner_;
+
+ PathNeighbourhoodFinder(const Graph &graph, const vector<EdgeId>& path, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = DEFAULT_MAX_SIZE, size_t max_depth = DEFAULT_MAX_DEPTH)
+ : AbstractNeighbourhoodFinder<Graph>(graph),
+ path_edges_(path.begin(), path.end()),
+ edge_length_bound_(edge_length_bound),
+ max_size_(max_size),
+ max_depth_(max_depth) {
+ }
+
+
+ GraphComponent<Graph> Find(VertexId v) const {
+ TRACE("Starting from vertex " << this->graph().str(v));
+ last_inner_.clear();
+ set<VertexId> grey;
+ set<VertexId> black;
+ Go(v, 0, grey, black);
+ last_inner_ = black;
+ last_inner_.insert(v);
+ ComponentCloser<Graph>(this->graph(), 0).CloseComponent(grey);
+ return GraphComponent<Graph>::FromVertices(this->graph(), grey);
+ }
+
+ vector<VertexId> InnerVertices(const GraphComponent<Graph> &/*component*/) const {
+ return vector<VertexId>(last_inner_.begin(), last_inner_.end());
+ }
+private:
+ DECL_LOGGER("PathNeighbourhoodFinder");
+};
+
+//todo delete and think if we really need hierarchy
+template<class Graph>
+class ShortEdgeComponentFinder : public AbstractNeighbourhoodFinder<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ static const size_t DEFAULT_EDGE_LENGTH_BOUND = 100;
+
+ const size_t edge_length_bound_;
+
+ ShortEdgeComponentFinder(const Graph &graph, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND)
+ : AbstractNeighbourhoodFinder<Graph>(graph),
+ edge_length_bound_(edge_length_bound) {
+ }
+
+ GraphComponent<Graph> Find(VertexId v) const {
+ auto cd = DijkstraHelper<Graph>::CreateShortEdgeDijkstra(this->graph(), edge_length_bound_);
+ cd.Run(v);
+ return GraphComponent<Graph>::FromVertices(this->graph(), cd.ProcessedVertices());
+ }
+
+ vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) const {
+ return vector<VertexId>(component.v_begin(), component.v_end());
+ }
+};
+
+template<class Graph>
+class FilteringSplitterWrapper : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ shared_ptr<GraphSplitter<Graph>> inner_splitter_;
+ shared_ptr<GraphComponentFilter<Graph>> checker_;
+ shared_ptr<GraphComponent<Graph>> next_;
+public:
+ FilteringSplitterWrapper(
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphComponentFilter<Graph>> checker)
+ : GraphSplitter<Graph>(inner_splitter->graph()),
+ inner_splitter_(inner_splitter),
+ checker_(checker) {
+ }
+
+ GraphComponent<Graph> Next() {
+ if (!HasNext()) {
+ VERIFY(false);
+ return omnigraph::GraphComponent<Graph>(this->graph());
+ }
+ auto result = std::move(*next_);
+ next_ = nullptr;
+ return result;
+ }
+
+ bool HasNext() {
+ while (!next_ && inner_splitter_->HasNext()) {
+ next_ = std::make_shared<GraphComponent<Graph>>(inner_splitter_->Next());
+ if (!checker_->Check(*next_)) {
+ next_ = nullptr;
+ }
+ }
+ return next_ != nullptr;
+ }
+
+private:
+ DECL_LOGGER("FilteringSplitterWrapper");
+};
+
+//TODO split combined component into several.
+template<class Graph>
+class CollectingSplitterWrapper : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ shared_ptr<GraphSplitter<Graph>> inner_splitter_;
+ shared_ptr<GraphComponentFilter<Graph>> checker_;
+ std::unique_ptr<GraphComponent<Graph>> next_;
+ set<VertexId> filtered_;
+public:
+ CollectingSplitterWrapper(
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphComponentFilter<Graph>> checker)
+ : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
+ checker_(checker) {
+ }
+
+ GraphComponent<Graph> Next() {
+ if (!HasNext()) {
+ VERIFY(false);
+ return omnigraph::GraphComponent<Graph>::Empty(this->graph());
+ } else {
+ if (next_) {
+ return this->GetValueAndReset(next_);
+ } else {
+ auto result = GraphComponent<Graph>::FromVertices(this->graph(),
+ filtered_,
+ false, "filtered");
+ filtered_.clear();
+ return result;
+ }
+ }
+ }
+
+ bool HasNext() {
+ while (!next_ && inner_splitter_->HasNext()) {
+ next_ = this->MakeUniquePtr(inner_splitter_->Next());
+ if (!checker_->Check(*next_)) {
+ filtered_.insert(next_->v_begin(), next_->v_end());
+ next_ = nullptr;
+ }
+ }
+ return next_ || !filtered_.empty();
+ }
+private:
+ DECL_LOGGER("FilteringSplitterWrapper");
+};
+
+template<class Graph>
+class CondensingSplitterWrapper : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ shared_ptr<GraphSplitter<Graph>> inner_splitter_;
+ shared_ptr<GraphComponentFilter<Graph>> checker_;
+ std::unique_ptr<GraphComponent<Graph>> next_;
+
+ string CutName(const string &name, size_t max_length) {
+ VERIFY(max_length >= 7);
+ size_t length = name.size();
+ if (length <= max_length)
+ return name;
+ else {
+ return name.substr(0, (max_length - 5) / 2) + "....." + name.substr(length - (max_length - 5) / 2, (max_length - 5) / 2);
+ }
+ }
+
+ GraphComponent<Graph> ConstructComponent() {
+ GraphComponent<Graph> next = inner_splitter_->Next();
+ if (checker_->Check(next)) {
+ return next;
+ }
+ set<VertexId> vertices(next.v_begin(), next.v_end());
+ string name = next.name();
+ for(size_t i = 0; i < 10 && inner_splitter_->HasNext(); i++) {
+ next = inner_splitter_->Next();
+ if (checker_->Check(next)) {
+ VERIFY(!next_);
+ next_ = this->MakeUniquePtr(std::move(next));
+ break;
+ } else {
+ vertices.insert(next.v_begin(), next.v_end());
+ if (next.name() != "") {
+ name += ";";
+ name += next.name();
+ }
+ }
+ }
+ return GraphComponent<Graph>::FromVertices(this->graph(), vertices, false, CutName(name, 60));
+ }
+
+
+public:
+ CondensingSplitterWrapper(
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<GraphComponentFilter<Graph>> checker)
+ : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
+ checker_(checker) {
+ }
+
+ GraphComponent<Graph> Next() {
+ if (!HasNext()) {
+ VERIFY(false);
+ return GraphComponent<Graph>(this->graph());
+ }
+
+ if (next_) {
+ return this->GetValueAndReset(next_);
+ } else {
+ return ConstructComponent();
+ }
+ }
+
+ bool HasNext() {
+ if (next_)
+ return true;
+ if (!inner_splitter_->HasNext())
+ return false;
+ return true;
+ }
+private:
+ DECL_LOGGER("FilteringSplitterWrapper");
+};
+
+template<class Graph>
+class NeighbourhoodFindingSplitter : public GraphSplitter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator_;
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder_;
+
+public:
+ NeighbourhoodFindingSplitter(
+ const Graph& graph,
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator,
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder)
+ : GraphSplitter<Graph>(graph),
+ inner_iterator_(inner_iterator),
+ neighbourhood_finder_(neighbourhood_finder) {
+ }
+
+ NeighbourhoodFindingSplitter(
+ const Graph& graph,
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator)
+ : GraphSplitter<Graph>(graph),
+ inner_iterator_(inner_iterator),
+ neighbourhood_finder_(
+ make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
+ }
+
+ NeighbourhoodFindingSplitter(const Graph& graph)
+ : GraphSplitter<Graph>(graph),
+ inner_iterator_(
+ make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end())),
+ neighbourhood_finder_(make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
+ }
+
+ GraphComponent<Graph> Next() {
+ VertexId next_vertex = inner_iterator_->Next();
+ GraphComponent<Graph> result = neighbourhood_finder_->Find(next_vertex);
+ vector<VertexId> to_relax = neighbourhood_finder_->InnerVertices(result);
+ to_relax.push_back(next_vertex);
+ inner_iterator_->Relax(to_relax);
+ return result;
+ }
+
+ bool HasNext() {
+ return inner_iterator_->HasNext();
+ }
+};
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> ReliableSplitter(const Graph &graph,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> ConnectedSplitter(const Graph &graph,
+ size_t edge_length_bound = 1000000,
+ size_t max_size = 1000000) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> ReliableSplitterAlongPath(
+ const Graph &graph, const vector<typename Graph::EdgeId>& path, size_t edge_length_bound = PathNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
+ size_t max_size = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t max_depth = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_DEPTH) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
+ PathIterator<Graph>>(graph, path);
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<PathNeighbourhoodFinder<Graph>>(graph, path,
+ edge_length_bound, max_size, max_depth);
+
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> LongEdgesExclusiveSplitter(
+ const Graph &graph, size_t bound =
+ ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
+ CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
+ ShortEdgeComponentFinder<Graph>>(graph, bound);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
+ inner_iterator, nf);
+}
+
+template<class Graph, typename Collection>
+shared_ptr<GraphSplitter<Graph>> StandardSplitter(
+ const Graph &graph, const Collection &collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
+ ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
+ max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
+}
+
+template<class Graph, typename Collection>
+shared_ptr<GraphSplitter<Graph>> StandardSplitter(
+ const Graph &graph, shared_ptr<Collection> collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ typedef typename Graph::VertexId VertexId;
+ shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
+ shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
+ ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
+ max_size);
+ return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
+}
+
+template<class Graph>
+shared_ptr<GraphSplitter<Graph>> WholeGraphSplitter(
+ const Graph &graph, size_t max_size,
+ size_t edge_length_bound) {
+ return ReliableSplitter(graph, edge_length_bound, max_size);
+}
+
+template<class Graph>
+GraphComponent<Graph> VertexNeighborhood(
+ const Graph &graph, typename Graph::VertexId vertex, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ vector<typename Graph::VertexId> vv = {vertex};
+ shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
+ return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
+}
+
+//TODO make a method that is guaranteed to draw a picture containing a given set of edges. Maybe refactor this into just drawing instead of splitting?
+template<class Graph>
+GraphComponent<Graph> EdgeNeighborhood(
+ const Graph &graph, typename Graph::EdgeId edge, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
+ size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
+ vector<typename Graph::VertexId> vv = {graph.EdgeStart(edge)};
+ shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
+ return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
+}
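+
+//Illustrative sketch (not part of the upstream sources): consuming the components produced by one of
+//the splitter factories above. HasNext() is assumed to be part of the GraphSplitter interface
+//(only Next() is used in this excerpt); the function name is chosen for the example only.
+template<class Graph>
+size_t CountSplitterComponents(const Graph &graph) {
+    auto splitter = LongEdgesExclusiveSplitter<Graph>(graph);
+    size_t component_cnt = 0;
+    while (splitter->HasNext()) {
+        splitter->Next();
+        ++component_cnt;
+    }
+    return component_cnt;
+}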
+
+}
diff --git a/src/common/assembly_graph/core/action_handlers.hpp b/src/common/assembly_graph/core/action_handlers.hpp
new file mode 100644
index 0000000..6395991
--- /dev/null
+++ b/src/common/assembly_graph/core/action_handlers.hpp
@@ -0,0 +1,347 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __OMNI_ACTION_HANDLERS_HPP__
+#define __OMNI_ACTION_HANDLERS_HPP__
+
+#include "utils/verify.hpp"
+#include "utils/logger/logger.hpp"
+
+#include <boost/noncopyable.hpp>
+#include <string>
+#include <vector>
+
+namespace omnigraph {
+
+using std::vector;
+
+/**
+* ActionHandler is the base listener class for graph events. Any structure or information storage
+* that has to stay synchronized with the graph should derive from it. To make a handler listen to
+* graph events, add it to the graph's listeners.
+* Normally the structure itself extends ActionHandler and overrides several handling methods: in its
+* constructor it adds itself to the graph handler list and in its destructor it removes itself from this list.
+* All events are divided into two levels: low level events and high level events.
+* Low level events are addition/deletion of vertices/edges. These events should be triggered only after
+* high level events, when all data has already been transferred and the graph structure is consistent.
+* High level events should be used to keep external data synchronized with the graph and to keep internal data
+* consistent. The current high level events are merge, glue and split; this list may be extended in the future.
+*/
+template<typename VertexId, typename EdgeId>
+class ActionHandler : private boost::noncopyable {
+ const std::string handler_name_;
+private:
+ bool attached_;
+public:
+ /**
+ * Create an action handler with the given name. The name can be used to find out what type of handler it is.
+ */
+ ActionHandler(const std::string &name)
+ : handler_name_(name), attached_(true) {
+ }
+
+ virtual ~ActionHandler() {
+ TRACE("~ActionHandler " << handler_name_);
+ }
+
+ /**
+ * Returns the name of this handler.
+ */
+ const std::string &name() const {
+ return handler_name_;
+ }
+
+ /**
+ * Low level event which is triggered when vertex is added to graph.
+ * @param v new vertex
+ */
+ virtual void HandleAdd(VertexId /*v*/) { }
+
+ /**
+ * Low level event which is triggered when edge is added to graph.
+ * @param e new edge
+ */
+ virtual void HandleAdd(EdgeId /*e*/) { }
+
+ /**
+ * Low level event which is triggered when vertex is deleted from graph.
+ * @param v vertex to delete
+ */
+ virtual void HandleDelete(VertexId /*v*/) { }
+
+ /**
+ * Low level event which is triggered when edge is deleted from graph.
+ * @param e edge to delete
+ */
+ virtual void HandleDelete(EdgeId /*e*/) { }
+
+ /**
+ * High level event which is triggered when a merge operation is performed on the graph, i.e. when a
+ * path of edges whose inner vertices all have exactly one incoming and one outgoing edge is
+ * replaced with a single edge. Since this is a high level operation, the creation event for the new edge
+ * and the deletion events for the old edges have not been triggered yet at the moment this event fires.
+ * @param old_edges path of edges to be replaced with a single edge
+ * @param new_edge new edge added as a replacement for the path
+ */
+ virtual void HandleMerge(const vector<EdgeId> & /*old_edges*/, EdgeId /*new_edge*/) { }
+
+ /**
+ * High level event which is triggered when a glue operation is performed on the graph, i.e. when an
+ * edge is completely replaced with another edge. This operation is widely used in bulge removal,
+ * where an alternative path is glued to the main path. Since this is a high level operation, the deletion
+ * event for the old edge has not been triggered yet at the moment this event fires.
+ * @param new_edge result of the glue
+ * @param edge1 edge to be glued to edge2
+ * @param edge2 edge that edge1 is glued onto
+ */
+ virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) { }
+
+ /**
+ * High level event which is triggered when a split operation is performed on the graph, i.e. when an
+ * edge is split into two shorter edges. The split operation is the reverse of the merge operation.
+ * Since this is a high level operation, the deletion event for the old edge and the creation events for
+ * the new edges have not been triggered yet at the moment this event fires.
+ * @param old_edge edge to be split
+ * @param new_edge_1 first edge resulting from the split
+ * @param new_edge_2 second edge resulting from the split
+ */
+ virtual void HandleSplit(EdgeId /*old_edge*/, EdgeId /*new_edge_1*/,
+ EdgeId /*new_edge_2*/) { }
+
+ /**
+ * Every thread safe descendant should override this method for correct concurrent graph processing.
+ */
+ virtual bool IsThreadSafe() const {
+ return false;
+ }
+
+ bool IsAttached() const {
+ return attached_;
+ }
+
+ void Attach() {
+ VERIFY(!attached_);
+ attached_ = true;
+ }
+
+ void Detach() {
+ VERIFY(attached_);
+ attached_ = false;
+ }
+};
+
+template<class Graph>
+class GraphActionHandler : public ActionHandler<typename Graph::VertexId,
+ typename Graph::EdgeId> {
+ typedef ActionHandler<typename Graph::VertexId, typename Graph::EdgeId> base;
+
+ const Graph &g_;
+
+protected:
+ const Graph &g() const {
+ return g_;
+ }
+
+public:
+ GraphActionHandler(const Graph &g, const std::string &name)
+ : base(name),
+ g_(g) {
+ TRACE("Adding new action handler: " << this->name());
+ g_.AddActionHandler(this);
+ }
+
+ GraphActionHandler(const GraphActionHandler<Graph> &other)
+ : base(other.name()),
+ g_(other.g_) {
+ TRACE("Adding new action handler: " << this->name());
+ g_.AddActionHandler(this);
+ }
+
+ virtual ~GraphActionHandler() {
+ TRACE("Removing action handler: " << this->name());
+ if (this->IsAttached())
+ this->Detach();
+ g_.RemoveActionHandler(this);
+ }
+};
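+
+/**
+* Illustrative sketch (not part of the upstream sources): a minimal handler that keeps an up-to-date
+* edge count by overriding the low level events described above. The class name is chosen for the
+* example only.
+*/
+template<class Graph>
+class EdgeCountTracker : public GraphActionHandler<Graph> {
+    typedef typename Graph::EdgeId EdgeId;
+    size_t edge_cnt_;
+public:
+    EdgeCountTracker(const Graph &g)
+            : GraphActionHandler<Graph>(g, "EdgeCountTracker"), edge_cnt_(0) {
+    }
+
+    virtual void HandleAdd(EdgeId /*e*/) {
+        ++edge_cnt_;
+    }
+
+    virtual void HandleDelete(EdgeId /*e*/) {
+        --edge_cnt_;
+    }
+
+    size_t edge_count() const {
+        return edge_cnt_;
+    }
+};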
+
+/**
+* HandlerApplier was introduced in order to support various types of graphs and make the handler structure
+* more flexible. If a particular graph implementation requires a special handler triggering scheme,
+* one can store a suitable extension of HandlerApplier in the graph and trigger HandlerApplier methods
+* instead of calling the handler methods directly.
+* HandlerApplier contains one method per graph event, which defines exactly how that event
+* should be delivered to a handler.
+*/
+template<typename VertexId, typename EdgeId>
+class HandlerApplier {
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+public:
+
+ virtual void
+ ApplyAdd(Handler &handler, VertexId v) const = 0;
+
+ virtual void
+ ApplyAdd(Handler &handler, EdgeId e) const = 0;
+
+ virtual void
+ ApplyDelete(Handler &handler, VertexId v) const = 0;
+
+ virtual void
+ ApplyDelete(Handler &handler, EdgeId e) const = 0;
+
+ virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
+ EdgeId new_edge) const = 0;
+
+ virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
+ EdgeId edge2) const = 0;
+
+ virtual void ApplySplit(Handler &handler, EdgeId old_edge,
+ EdgeId new_edge_1, EdgeId new_edge2) const = 0;
+
+ virtual ~HandlerApplier() {
+ }
+};
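+
+/**
+* Illustrative sketch (not part of the upstream sources): how a graph could dispatch a single event to all
+* of its attached handlers through an applier. The actual dispatch lives in the Fire* methods of
+* ObservableGraph elsewhere in this patch; the function name here is chosen for the example only.
+*/
+template<typename VertexId, typename EdgeId>
+void ExampleFireAddEdge(const HandlerApplier<VertexId, EdgeId> &applier,
+                        const vector<ActionHandler<VertexId, EdgeId>*> &handlers,
+                        EdgeId e) {
+    for (auto handler : handlers) {
+        if (handler->IsAttached())
+            applier.ApplyAdd(*handler, e);
+    }
+}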
+
+/**
+* SimpleHandlerApplier is a straightforward implementation of HandlerApplier that forwards each event to the handler without any special filtering.
+*/
+template<class Graph>
+class SimpleHandlerApplier : public HandlerApplier<typename Graph::VertexId,
+ typename Graph::EdgeId> {
+public:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+
+ virtual void ApplyAdd(Handler &handler, VertexId v) const {
+ handler.HandleAdd(v);
+ }
+
+ virtual void ApplyAdd(Handler &handler, EdgeId e) const {
+ handler.HandleAdd(e);
+ }
+
+ virtual void ApplyDelete(Handler &handler, VertexId v) const {
+ handler.HandleDelete(v);
+ }
+
+ virtual void ApplyDelete(Handler &handler, EdgeId e) const {
+ handler.HandleDelete(e);
+ }
+
+ virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
+ EdgeId new_edge) const {
+ handler.HandleMerge(old_edges, new_edge);
+ }
+
+ virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
+ EdgeId edge2) const {
+ handler.HandleGlue(new_edge, edge1, edge2);
+ }
+
+ virtual void ApplySplit(Handler &handler, EdgeId old_edge, EdgeId new_edge1,
+ EdgeId new_edge2) const {
+ handler.HandleSplit(old_edge, new_edge1, new_edge2);
+ }
+
+};
+
+/**
+* PairedHandlerApplier is an implementation of HandlerApplier for graphs that keep vertices/edges
+* synchronized with their reverse-complement counterparts. Although the corresponding Apply* method is
+* called only once, the event is triggered twice: once for the parameters the method was called with
+* and once for their reverse-complement counterparts. Assertions guard against invalid cases.
+*/
+template<class Graph>
+class PairedHandlerApplier : public HandlerApplier<typename Graph::VertexId,
+ typename Graph::EdgeId> {
+private:
+ Graph &graph_;
+public:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+
+ PairedHandlerApplier(Graph &graph)
+ : graph_(graph) {
+ }
+
+ virtual void ApplyAdd(Handler &handler, VertexId v) const {
+ VertexId rcv = graph_.conjugate(v);
+ handler.HandleAdd(v);
+ if (v != rcv) {
+ handler.HandleAdd(rcv);
+ }
+ }
+
+ virtual void ApplyAdd(Handler &handler, EdgeId e) const {
+ EdgeId rce = graph_.conjugate(e);
+ handler.HandleAdd(e);
+ if (e != rce) {
+ handler.HandleAdd(rce);
+ }
+ }
+
+ virtual void ApplyDelete(Handler &handler, VertexId v) const {
+ VertexId rcv = graph_.conjugate(v);
+ handler.HandleDelete(v);
+ if (v != rcv) {
+ handler.HandleDelete(rcv);
+ }
+ }
+
+ virtual void ApplyDelete(Handler &handler, EdgeId e) const {
+ EdgeId rce = graph_.conjugate(e);
+ handler.HandleDelete(e);
+ if (e != rce) {
+ handler.HandleDelete(rce);
+ }
+ }
+
+ virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
+ EdgeId new_edge) const {
+ EdgeId rce = graph_.conjugate(new_edge);
+ handler.HandleMerge(old_edges, new_edge);
+ if (new_edge != rce) {
+ vector<EdgeId> rc_old_edges;
+ for (int i = (int) old_edges.size() - 1; i >= 0; i--) {
+ rc_old_edges.push_back(graph_.conjugate(old_edges[i]));
+ }
+ handler.HandleMerge(rc_old_edges, rce);
+ }
+ }
+
+ virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
+ EdgeId edge2) const {
+ EdgeId rc_edge1 = graph_.conjugate(edge1);
+ EdgeId rc_edge2 = graph_.conjugate(edge2);
+ VERIFY(edge1 != edge2);
+ VERIFY(edge2 != rc_edge2);
+ handler.HandleGlue(new_edge, edge1, edge2);
+ if (edge1 != rc_edge1) {
+ handler.HandleGlue(graph_.conjugate(new_edge), rc_edge1, rc_edge2);
+ }
+ }
+
+ virtual void ApplySplit(Handler &handler, EdgeId old_edge,
+ EdgeId new_edge_1, EdgeId new_edge2) const {
+ EdgeId rce = graph_.conjugate(old_edge);
+ //VERIFY(old_edge != rce);
+ handler.HandleSplit(old_edge, new_edge_1, new_edge2);
+ if (old_edge != rce) {
+ handler.HandleSplit(rce, graph_.conjugate(new_edge2),
+ graph_.conjugate(new_edge_1));
+ }
+ }
+
+private:
+ DECL_LOGGER("PairedHandlerApplier")
+};
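+
+// Illustrative note (not part of the upstream sources): with this applier a single call such as
+// applier.ApplyAdd(handler, e) invokes handler.HandleAdd(e) and, unless e is self-conjugate,
+// handler.HandleAdd(graph.conjugate(e)) as well, keeping both strands in sync.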
+
+}
+
+#endif
diff --git a/src/common/assembly_graph/core/basic_graph_stats.hpp b/src/common/assembly_graph/core/basic_graph_stats.hpp
new file mode 100644
index 0000000..bad128f
--- /dev/null
+++ b/src/common/assembly_graph/core/basic_graph_stats.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "utils/standard_base.hpp"
+namespace omnigraph {
+
+template<class Graph>
+class AvgCovereageCounter {
+private:
+ const Graph &graph_;
+ const size_t min_length_;
+public:
+ AvgCovereageCounter(const Graph &graph, size_t min_length = 0) :
+ graph_(graph), min_length_(min_length) {
+ }
+
+ double Count() const {
+ double cov = 0;
+ size_t length = 0;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (graph_.length(*it) >= min_length_) {
+ cov += graph_.coverage(*it) * (double) graph_.length(*it);
+ length += graph_.length(*it);
+ }
+ }
+ if (length == 0)
+ return 0.;
+ return cov / (double) length;
+ }
+};
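+
+//Illustrative sketch (not part of the upstream sources): length-weighted average coverage over
+//sufficiently long edges, computed with the counter above. The 1000 bp threshold and the function
+//name are example values only.
+template<class Graph>
+double LongEdgeAvgCoverage(const Graph &graph) {
+    //edges shorter than min_length are skipped by AvgCovereageCounter::Count()
+    return AvgCovereageCounter<Graph>(graph, 1000).Count();
+}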
+
+template<class Graph>
+size_t CumulativeLength(const Graph& g,
+ const std::vector<typename Graph::EdgeId>& path) {
+ size_t s = 0;
+ for (auto it = path.begin(); it != path.end(); ++it)
+ s += g.length(*it);
+
+ return s;
+}
+
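+//Length-weighted average coverage of a path. Note: assumes a non-empty path of non-zero total length,
+//otherwise the division below is undefined.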
+template<class Graph>
+double AvgCoverage(const Graph& g,
+ const std::vector<typename Graph::EdgeId>& path) {
+ double unnormalized_coverage = 0;
+ size_t path_length = 0;
+ for (auto edge : path) {
+ size_t length = g.length(edge);
+ path_length += length;
+ unnormalized_coverage += g.coverage(edge) * (double) length;
+ }
+ return unnormalized_coverage / (double) path_length;
+}
+}
\ No newline at end of file
diff --git a/src/common/assembly_graph/core/construction_helper.hpp b/src/common/assembly_graph/core/construction_helper.hpp
new file mode 100644
index 0000000..229a228
--- /dev/null
+++ b/src/common/assembly_graph/core/construction_helper.hpp
@@ -0,0 +1,84 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+//#include "core.hpp"
+#include "observable_graph.hpp"
+
+namespace omnigraph {
+
+template<class DataMaster>
+class ConstructionHelper {
+ //typedef GraphCore<DataMaster> Graph;
+ typedef ObservableGraph<DataMaster> Graph;
+ typedef typename Graph::DataMasterT DataMasterT;
+ typedef typename Graph::VertexData VertexData;
+ typedef typename Graph::EdgeData EdgeData;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::VertexIt VertexIt;
+ typedef typename Graph::edge_const_iterator edge_const_iterator;
+
+ Graph &graph_;
+
+public:
+
+ ConstructionHelper(Graph &graph) : graph_(graph) {
+ }
+
+ Graph &graph() {
+ return graph_;
+ }
+
+ EdgeId AddEdge(const EdgeData &data) {
+ return AddEdge(data, graph_.GetGraphIdDistributor());
+ }
+
+ EdgeId AddEdge(const EdgeData &data, restricted::IdDistributor &id_distributor) {
+ return graph_.AddEdge(data, id_distributor);
+ }
+
+ void LinkIncomingEdge(VertexId v, EdgeId e) {
+ VERIFY(graph_.EdgeEnd(e) == VertexId(0));
+ graph_.conjugate(v)->AddOutgoingEdge(graph_.conjugate(e));
+ e->SetEndVertex(v);
+ }
+
+ void LinkOutgoingEdge(VertexId v, EdgeId e) {
+ VERIFY(graph_.EdgeEnd(graph_.conjugate(e)) == VertexId(0));
+ v->AddOutgoingEdge(e);
+ graph_.conjugate(e)->SetEndVertex(graph_.conjugate(v));
+ }
+
+ void DeleteLink(VertexId v, EdgeId e) {
+ v->RemoveOutgoingEdge(e);
+ }
+
+ void DeleteUnlinkedEdge(EdgeId e) {
+ EdgeId rc = graph_.conjugate(e);
+ if (e != rc) {
+ delete rc.get();
+ }
+ delete e.get();
+ }
+
+ VertexId CreateVertex(const VertexData &data) {
+ return CreateVertex(data, graph_.GetGraphIdDistributor());
+ }
+
+ VertexId CreateVertex(const VertexData &data, restricted::IdDistributor &id_distributor) {
+ return graph_.CreateVertex(data, id_distributor);
+ }
+
+ template<class Iter>
+ void AddVerticesToGraph(Iter begin, Iter end) {
+ for(; begin != end; ++begin) {
+ graph_.AddVertexToGraph(*begin);
+ }
+ }
+};
+
+}
diff --git a/src/common/assembly_graph/core/coverage.hpp b/src/common/assembly_graph/core/coverage.hpp
new file mode 100644
index 0000000..8385b04
--- /dev/null
+++ b/src/common/assembly_graph/core/coverage.hpp
@@ -0,0 +1,335 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * coverage.hpp
+ *
+ * Created on: Jun 21, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "utils/logger/logger.hpp"
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include "math/xmath.h"
+#include "action_handlers.hpp"
+namespace omnigraph {
+
+using std::vector;
+//todo save/load absolute coverage
+template<class Graph>
+class CoverageIndex : public GraphActionHandler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ //typedef unordered_map<EdgeId, int> map_type;
+
+ Graph& g_;
+// map_type storage_;
+
+// size_t KPlusOneMerCoverage(EdgeId edge) const {
+// return (size_t) math::round(coverage(edge) * (double) this->g().length(edge));
+// }
+
+// template<class ReadThreader>
+// Path<EdgeId> ProcessSequence(const ReadThreader& threader,
+// const Sequence& sequence) const {
+// return threader.MapSequence(sequence);
+// }
+
+// void AddPathsToGraph(const Path<EdgeId>& path) {
+//
+// if (path.sequence().size() == 0)
+// return;
+//
+// const vector<EdgeId>& edges_list = path.sequence();
+//
+// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
+// IncCoverage(*it, this->g().length(*it));
+// }
+// IncCoverage(edges_list[0], -int(path.start_pos()));
+// EdgeId last = edges_list[edges_list.size() - 1];
+// IncCoverage(last, int(path.end_pos()) - int(this->g().length(last)));
+// }
+
+// void IncCoverageInMap(EdgeId edge, int toAdd, map_type& map) {
+// //VERIFY(toAdd >= 0);
+// map[edge] += toAdd;
+// VERIFY(map[edge] >= 0);
+// }
+//
+// void AddPathsToMap(const Path<EdgeId>& path, map_type& map) {
+//
+// if (path.sequence().size() == 0)
+// return;
+//
+// const vector<EdgeId>& edges_list = path.sequence();
+//
+// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
+// IncCoverageInMap(*it, this->g().length(*it), map);
+// }
+// IncCoverageInMap(edges_list[0], -int(path.start_pos()), map);
+// EdgeId last = edges_list[edges_list.size() - 1];
+// IncCoverageInMap(last,
+// int(path.end_pos()) - int(this->g().length(last)),
+// map);
+// }
+
+ public:
+ CoverageIndex(Graph &g)
+ : GraphActionHandler<Graph>(g, "CoverageIndex"), g_(g) {
+ }
+
+ virtual ~CoverageIndex() {
+ }
+
+ /**
+ * Sets the raw coverage of the edge, in NON length-averaged units.
+ */
+ void SetRawCoverage(EdgeId e, unsigned cov) {
+ g_.data(e).set_raw_coverage(cov);
+ }
+
+ void IncRawCoverage(EdgeId e, unsigned count) {
+ g_.data(e).inc_raw_coverage((int)count);
+ }
+
+ void SetAvgCoverage(EdgeId e, double cov) {
+ g_.data(e).set_raw_coverage((int) math::round(cov * (double) this->g().length(e)));
+ }
+
+ /**
+ * Returns average coverage of the edge
+ */
+ double coverage(EdgeId edge) const {
+ return (double) RawCoverage(edge) / (double) this->g().length(edge);
+ }
+
+ unsigned RawCoverage(EdgeId edge) const {
+ return g_.data(edge).raw_coverage();
+ }
+// /**
+// * Returns average coverage of the edge
+// */
+// double operator[](EdgeId e) const {
+// return coverage(e);
+// }
+
+// /**
+// * Method increases coverage value
+// */
+// void IncCoverage(EdgeId edge, int to_add) {
+// edge->IncCoverage(to_add);
+// VERIFY(edge->GetRawCoverage() >= 0);
+// }
+//
+// /**
+// * Method increases coverage value by 1
+// */
+// void IncCoverage(EdgeId edge) {
+// IncCoverage(edge, 1);
+// }
+
+// template<class ReadThreader, class Read>
+// void Fill(io::IReader<Read>& stream, const ReadThreader& threader) {
+//
+// INFO("Processing reads (takes a while)");
+// size_t counter = 0;
+// stream.reset();
+//
+// while (!stream.eof()) {
+// Read r;
+// stream >> r;
+// Path<EdgeId> path = ProcessSequence(threader, r.sequence());
+// AddPathsToGraph(path);
+//
+// VERBOSE_POWER(++counter, " reads processed");
+// }
+//
+// INFO("DeBruijn graph coverage counted, reads used: " << counter);
+// }
+//
+// template<class ReadThreader, class Read>
+// void FillParallel(io::ReadStreamVector<io::IReader<Read> >& streams,
+// const ReadThreader& threader, size_t buffer_size) {
+//
+// INFO("Processing reads (takes a while)");
+// perf_counter pc;
+// size_t counter = 0;
+//
+// size_t nthreads = streams.size();
+// size_t buf_size = buffer_size
+// / (nthreads * (sizeof(Path<EdgeId> ) + 32));
+//
+//#pragma omp parallel num_threads(nthreads)
+// {
+//#pragma omp for reduction(+ : counter)
+// for (size_t i = 0; i < nthreads; ++i) {
+//
+// Read r;
+// io::IReader<Read>& stream = streams[i];
+// stream.reset();
+// std::vector<Path<EdgeId> > buffer(buf_size);
+//
+// size_t j = 0;
+// while (!stream.eof()) {
+// stream >> r;
+// ++counter;
+// buffer[j++] = ProcessSequence(threader, r.sequence());
+//
+// if (j == buf_size) {
+// j = 0;
+//
+//#pragma omp critical
+// {
+// for (size_t l = 0; l < buf_size; ++l) {
+// AddPathsToGraph(buffer[l]);
+// }
+// }
+// }
+// }
+//
+//#pragma omp critical
+// {
+// for (size_t l = 0; l < j; ++l) {
+// AddPathsToGraph(buffer[l]);
+// }
+// }
+// }
+//
+// }
+//
+// INFO("DeBruijn graph coverage counted, reads used: " << counter);
+//
+// INFO("Elapsed time: " << pc.time_ms());
+// }
+//
+// template<class ReadThreader, class Read>
+// void FillFastParallel(
+// io::ReadStreamVector<io::IReader<Read> >& streams,
+// const ReadThreader& threader) {
+//
+// INFO("Processing reads (takes a while)");
+// perf_counter pc;
+// size_t counter = 0;
+//
+// size_t nthreads = streams.size();
+////
+// std::vector<map_type*> maps(nthreads);
+//// maps[0] = &storage_;
+//
+// for (size_t i = 0; i < nthreads; ++i) {
+// maps[i] = new map_type();
+// }
+//
+//#pragma omp parallel num_threads(nthreads)
+// {
+//#pragma omp for reduction(+ : counter)
+// for (size_t i = 0; i < nthreads; ++i) {
+//
+// Read r;
+// io::IReader<Read>& stream = streams[i];
+// stream.reset();
+// Path<EdgeId> path;
+//
+// while (!stream.eof()) {
+// stream >> r;
+// ++counter;
+// path = ProcessSequence(threader, r.sequence());
+//
+// AddPathsToMap(path, *maps[i]);
+// }
+// }
+// }
+//
+// INFO("Merging maps");
+// for (size_t i = 0; i < nthreads; ++i) {
+// for (auto it = maps[i]->begin(); it != maps[i]->end(); ++it) {
+// it->first->IncCoverage(it->second);
+// }
+// delete maps[i];
+// }
+//
+// INFO("DeBruijn graph coverage counted, reads used: " << counter);
+//
+// INFO("Elapsed time: " << pc.time_ms());
+// }
+
+// template<class Index>
+// void FillFromIndex(Index& index) {
+// for (auto I = index.value_cbegin(), E = index.value_cend();
+// I != E; ++I) {
+// const auto& edge_info = *I;
+// VERIFY(edge_info.offset != -1u);
+// VERIFY(edge_info.edge_id.get() != NULL);
+// IncRawCoverage(edge_info.edge_id, edge_info.count);
+// }
+//
+// DEBUG("Coverage counted");
+// }
+
+ virtual void HandleDelete(EdgeId edge) {
+ SetRawCoverage(edge, 0);
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+ unsigned coverage = 0;
+ for (auto it = old_edges.begin(); it != old_edges.end(); ++it) {
+ coverage += RawCoverage(*it);
+ }
+ SetRawCoverage(new_edge, coverage);
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1, EdgeId new_edge2) {
+// size_t length1 = this->g().length(newEdge1);
+// size_t length = this->g().length(oldEdge);
+// size_t coverage = KPlusOneMerCoverage(oldEdge);
+// size_t coverage1 = coverage * length1 / length;
+// if (coverage1 == 0)
+// coverage1 = 1;
+// size_t coverage2 = coverage - coverage1;
+// if (coverage2 == 0)
+// coverage2 = 1;
+// SetCoverage(newEdge1, coverage1);
+// SetCoverage(newEdge2, coverage2);
+ double avg_cov = coverage(old_edge);
+ if (old_edge == g_.conjugate(old_edge)) {
+ int raw1 = std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1)));
+ SetRawCoverage(new_edge1, raw1);
+ SetRawCoverage(g_.conjugate(new_edge1), raw1);
+ SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
+ } else {
+ SetRawCoverage(new_edge1, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1))));
+ SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
+ }
+ }
+
+ void Save(EdgeId e, std::ostream& out) const {
+ out << fmt::format("{:.6f}", coverage(e));
+ }
+
+ void Load(EdgeId e, std::istream& in) {
+ double cov;
+ in >> cov;
+ SetAvgCoverage(e, cov);
+ }
+
+ /*
+ * Is thread safe if different threads process different edges.
+ */
+ bool IsThreadSafe() const {
+ return true;
+ }
+};
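+
+/**
+* Illustrative sketch (not part of the upstream sources): the relation between raw counts and the
+* length-averaged coverage() value maintained by CoverageIndex. The function name and the locally
+* constructed index are for the example only; a graph normally keeps a single long-lived CoverageIndex.
+*/
+template<class Graph>
+double ExampleAveragedCoverage(Graph &g, typename Graph::EdgeId e, unsigned raw_count) {
+    CoverageIndex<Graph> index(g); //registers itself as a graph action handler
+    index.SetRawCoverage(e, raw_count);
+    //coverage() divides the raw (non length-normalized) count by the edge length
+    return index.coverage(e);
+}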
+
+}
diff --git a/src/common/assembly_graph/core/debruijn_data.hpp b/src/common/assembly_graph/core/debruijn_data.hpp
new file mode 100644
index 0000000..f196c2e
--- /dev/null
+++ b/src/common/assembly_graph/core/debruijn_data.hpp
@@ -0,0 +1,170 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <set>
+#include <cstring>
+#include "utils/verify.hpp"
+#include "utils/logger/logger.hpp"
+#include "sequence/sequence_tools.hpp"
+#include "utils/standard_base.hpp"
+
+namespace debruijn_graph {
+class DeBruijnDataMaster;
+
+class DeBruijnVertexData {
+ friend class DeBruijnDataMaster;
+public:
+ DeBruijnVertexData() {
+
+ }
+};
+
+class CoverageData {
+ private:
+ unsigned coverage_;
+
+ public:
+ CoverageData()
+ : coverage_(0) {
+ }
+
+ void inc_coverage(int value) {
+ VERIFY(value >= 0 || coverage_ > unsigned(-value));
+ coverage_ += value;
+ }
+
+ void set_coverage(unsigned coverage) {
+ coverage_ = coverage;
+ }
+
+ //not length normalized
+ unsigned coverage() const {
+ return coverage_;
+ }
+};
+
+class DeBruijnEdgeData {
+ friend class DeBruijnDataMaster;
+ CoverageData coverage_;
+ CoverageData flanking_cov_;
+ Sequence nucls_;
+public:
+
+ DeBruijnEdgeData(const Sequence &nucls) :
+ nucls_(nucls) {
+ }
+
+ const Sequence& nucls() const {
+ return nucls_;
+ }
+
+ void inc_raw_coverage(int value) {
+ coverage_.inc_coverage(value);
+ }
+
+ void set_raw_coverage(unsigned coverage) {
+ coverage_.set_coverage(coverage);
+ }
+
+ unsigned raw_coverage() const {
+ return coverage_.coverage();
+ }
+
+ void inc_flanking_coverage(int value) {
+ flanking_cov_.inc_coverage(value);
+ }
+
+ void set_flanking_coverage(unsigned flanking_coverage) {
+ flanking_cov_.set_coverage(flanking_coverage);
+ }
+
+ //not length normalized
+ unsigned flanking_coverage() const {
+ return flanking_cov_.coverage();
+ }
+
+ size_t size() const {
+ return nucls_.size();
+ }
+};
+
+class DeBruijnDataMaster {
+private:
+ const size_t k_;
+
+public:
+ typedef DeBruijnVertexData VertexData;
+ typedef DeBruijnEdgeData EdgeData;
+
+ DeBruijnDataMaster(size_t k) :
+ k_(k) {
+ }
+
+ const EdgeData MergeData(const std::vector<const EdgeData*>& to_merge, bool safe_merging = true) const;
+
+ std::pair<VertexData, std::pair<EdgeData, EdgeData>> SplitData(const EdgeData& edge, size_t position, bool is_self_conj = false) const;
+
+ EdgeData GlueData(const EdgeData&, const EdgeData& data2) const;
+
+ bool isSelfConjugate(const EdgeData &data) const {
+ return data.nucls() == !(data.nucls());
+ }
+
+ EdgeData conjugate(const EdgeData &data) const {
+ return EdgeData(!(data.nucls()));
+ }
+
+ VertexData conjugate(const VertexData & /*data*/) const {
+ return VertexData();
+ }
+
+ size_t length(const EdgeData& data) const {
+ return data.nucls().size() - k_;
+ }
+
+ size_t length(const VertexData& ) const {
+ return k_;
+ }
+
+ size_t k() const {
+ return k_;
+ }
+
+};
+
+//typedef DeBruijnVertexData VertexData;
+//typedef DeBruijnEdgeData EdgeData;
+//typedef DeBruijnDataMaster DataMaster;
+
+inline const DeBruijnEdgeData DeBruijnDataMaster::MergeData(const std::vector<const DeBruijnEdgeData*>& to_merge, bool safe_merging) const {
+ std::vector<Sequence> ss;
+ ss.reserve(to_merge.size());
+ for (auto it = to_merge.begin(); it != to_merge.end(); ++it) {
+ ss.push_back((*it)->nucls());
+ }
+ return EdgeData(MergeOverlappingSequences(ss, k_, safe_merging));
+}
+
+inline std::pair<DeBruijnVertexData, std::pair<DeBruijnEdgeData, DeBruijnEdgeData>> DeBruijnDataMaster::SplitData(const EdgeData& edge,
+ size_t position,
+ bool is_self_conj) const {
+ const Sequence& nucls = edge.nucls();
+ size_t end = nucls.size();
+ if (is_self_conj) {
+ VERIFY(position < end);
+ end -= position;
+ }
+ return std::make_pair(VertexData(), std::make_pair(EdgeData(edge.nucls().Subseq(0, position + k_)), EdgeData(nucls.Subseq(position, end))));
+}
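+
+//Illustrative sketch (not part of the upstream sources): nucleotide lengths of the two parts produced by
+//SplitData for a non-self-conjugate edge of nucleotide length nucl_length split at position
+//(0 < position < nucl_length - k); the two parts share the k-mer at the split vertex.
+inline std::pair<size_t, size_t> ExampleSplitNuclLengths(size_t nucl_length, size_t position, size_t k) {
+    return std::make_pair(position + k, nucl_length - position);
+}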
+
+inline DeBruijnEdgeData DeBruijnDataMaster::GlueData(const DeBruijnEdgeData&, const DeBruijnEdgeData& data2) const {
+ return data2;
+}
+
+}
diff --git a/src/modules/assembly_graph/graph_core/directions.hpp b/src/common/assembly_graph/core/directions.hpp
similarity index 100%
rename from src/modules/assembly_graph/graph_core/directions.hpp
rename to src/common/assembly_graph/core/directions.hpp
diff --git a/src/modules/assembly_graph/graph_core/graph.hpp b/src/common/assembly_graph/core/graph.hpp
similarity index 100%
rename from src/modules/assembly_graph/graph_core/graph.hpp
rename to src/common/assembly_graph/core/graph.hpp
diff --git a/src/common/assembly_graph/core/graph_core.hpp b/src/common/assembly_graph/core/graph_core.hpp
new file mode 100644
index 0000000..71dd589
--- /dev/null
+++ b/src/common/assembly_graph/core/graph_core.hpp
@@ -0,0 +1,620 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <set>
+#include "utils/verify.hpp"
+#include "utils/logger/logger.hpp"
+#include "order_and_law.hpp"
+#include <boost/iterator/iterator_facade.hpp>
+#include "utils/simple_tools.hpp"
+
+namespace omnigraph {
+
+using std::vector;
+template<class DataMaster>
+class GraphCore;
+
+template<class DataMaster>
+class ConstructionHelper;
+
+template<class T>
+class PairedElementManipulationHelper;
+
+template<class DataMaster>
+class PairedVertex;
+
+template<class DataMaster>
+class PairedEdge;
+
+template<class DataMaster>
+class PairedEdge {
+ private:
+ typedef typename DataMaster::EdgeData EdgeData;
+ typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
+ typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
+ friend class GraphCore<DataMaster>;
+ friend class ConstructionHelper<DataMaster>;
+ friend class PairedElementManipulationHelper<EdgeId>;
+ //todo unfriend
+ friend class PairedVertex<DataMaster>;
+ VertexId end_;
+ EdgeData data_;
+ EdgeId conjugate_;
+
+ PairedEdge(VertexId end, const EdgeData &data)
+ : end_(end),
+ data_(data) {
+ }
+
+ EdgeData &data() {
+ return data_;
+ }
+
+ void set_data(const EdgeData &data) {
+ data_ = data;
+ }
+
+ VertexId end() const {
+ return end_;
+ }
+
+ VertexId start() const {
+ return conjugate_->end()->conjugate();
+ }
+
+ void set_conjugate(EdgeId conjugate) {
+ conjugate_ = conjugate;
+ }
+
+ void SetEndVertex(VertexId end) {
+ end_ = end;
+ }
+
+public:
+ EdgeId conjugate() const {
+ return conjugate_;
+ }
+
+ size_t length(size_t k) const {
+ return data_.size() - k;
+ }
+};
+
+template<class DataMaster>
+class PairedVertex {
+private:
+ typedef typename DataMaster::VertexData VertexData;
+ typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
+ typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
+ typedef typename std::vector<EdgeId>::const_iterator edge_raw_iterator;
+
+ class conjugate_iterator : public boost::iterator_facade<conjugate_iterator,
+ EdgeId, boost::forward_traversal_tag, EdgeId> {
+ public:
+ explicit conjugate_iterator(edge_raw_iterator it,
+ bool conjugate = false)
+ : it_(it),
+ conjugate_(conjugate) {
+ }
+
+ //todo do we need it?
+ conjugate_iterator()
+ : conjugate_(false) {
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ it_++;
+ }
+
+ bool equal(const conjugate_iterator &other) const {
+ return other.it_ == it_ && other.conjugate_ == conjugate_;
+ }
+
+ EdgeId dereference() const {
+ return (conjugate_ ? (*it_)->conjugate() : *it_);
+ }
+
+ edge_raw_iterator it_;
+ bool conjugate_;
+ };
+
+public:
+ typedef conjugate_iterator edge_const_iterator;
+
+private:
+ friend class GraphCore<DataMaster>;
+ friend class ConstructionHelper<DataMaster>;
+ friend class PairedEdge<DataMaster>;
+ friend class PairedElementManipulationHelper<VertexId>;
+ friend class conjugate_iterator;
+
+ std::vector<EdgeId> outgoing_edges_;
+
+ VertexId conjugate_;
+
+ VertexData data_;
+
+ bool IsMinimal() const {
+ return conjugate_->conjugate_ <= conjugate_;
+ }
+
+ VertexId conjugate() const {
+ return conjugate_;
+ }
+
+ void set_conjugate(VertexId conjugate) {
+ conjugate_ = conjugate;
+ }
+
+ size_t OutgoingEdgeCount() const {
+ return outgoing_edges_.size();
+ }
+
+ edge_const_iterator out_begin() const {
+ return edge_const_iterator(outgoing_edges_.cbegin(), false);
+ }
+
+ edge_const_iterator out_end() const {
+ return edge_const_iterator(outgoing_edges_.cend(), false);
+ }
+
+ size_t IncomingEdgeCount() const {
+ return conjugate_->OutgoingEdgeCount();
+ }
+
+ size_t IncomingEdgesCount() const {
+ return conjugate_->OutgoingEdgeCount();
+ }
+
+ edge_const_iterator in_begin() const {
+ return edge_const_iterator(conjugate_->outgoing_edges_.cbegin(), true);
+ }
+
+ edge_const_iterator in_end() const {
+ return edge_const_iterator(conjugate_->outgoing_edges_.cend(), true);
+ }
+
+ PairedVertex(VertexData data)
+ : data_(data) {
+ }
+
+ VertexData &data() {
+ return data_;
+ }
+
+ void set_data(VertexData data) {
+ data_ = data;
+ }
+
+ const std::vector<EdgeId> OutgoingEdgesTo(VertexId v) const {
+ vector<EdgeId> result;
+ for (auto it = outgoing_edges_.begin(); it != outgoing_edges_.end(); ++it) {
+ if ((*it)->end() == v) {
+ result.push_back(*it);
+ }
+ }
+ return result;
+ }
+
+ void AddOutgoingEdge(EdgeId e) {
+ outgoing_edges_.insert(std::upper_bound(outgoing_edges_.begin(), outgoing_edges_.end(), e), e);
+ //outgoing_edges_.push_back(e);
+ }
+
+ bool RemoveOutgoingEdge(const EdgeId e) {
+ auto it = std::find(outgoing_edges_.begin(), outgoing_edges_.end(), e);
+ if (it == outgoing_edges_.end())
+ return false;
+
+ outgoing_edges_.erase(it);
+ return true;
+ }
+
+ ~PairedVertex() {
+ VERIFY(outgoing_edges_.size() == 0);
+ }
+};
+
+template<class DataMaster>
+class GraphCore: private boost::noncopyable {
+public:
+ typedef DataMaster DataMasterT;
+ typedef typename DataMasterT::VertexData VertexData;
+ typedef typename DataMasterT::EdgeData EdgeData;
+ typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
+ typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
+ typedef typename std::set<VertexId>::const_iterator VertexIt;
+ typedef typename PairedVertex<DataMaster>::edge_const_iterator edge_const_iterator;
+
+private:
+ restricted::LocalIdDistributor id_distributor_;
+ DataMaster master_;
+ std::set<VertexId> vertices_;
+
+ friend class ConstructionHelper<DataMaster>;
+public:
+ VertexIt begin() const {
+ return vertices_.begin();
+ }
+
+ VertexIt end() const {
+ return vertices_.end();
+ }
+
+ const std::set<VertexId>& vertices() const {
+ return vertices_;
+ }
+
+ size_t size() const {
+ return vertices_.size();
+ }
+
+ edge_const_iterator out_begin(VertexId v) const {
+ return v->out_begin();
+ }
+
+ edge_const_iterator out_end(VertexId v) const {
+ return v->out_end();
+ }
+
+ edge_const_iterator in_begin(VertexId v) const {
+ return v->in_begin();
+ }
+
+ edge_const_iterator in_end(VertexId v) const {
+ return v->in_end();
+ }
+
+private:
+ void DeleteVertexFromGraph(VertexId vertex) {
+ this->vertices_.erase(vertex);
+ this->vertices_.erase(conjugate(vertex));
+ }
+
+ void DestroyVertex(VertexId vertex) {
+ VertexId conjugate = vertex->conjugate();
+ delete vertex.get();
+ delete conjugate.get();
+ }
+
+ bool AdditionalCompressCondition(VertexId v) const {
+ return !(EdgeEnd(GetUniqueOutgoingEdge(v)) == conjugate(v) && EdgeStart(GetUniqueIncomingEdge(v)) == conjugate(v));
+ }
+
+protected:
+
+ VertexId CreateVertex(const VertexData& data1, const VertexData& data2, restricted::IdDistributor& id_distributor) {
+ VertexId vertex1(new PairedVertex<DataMaster>(data1), id_distributor);
+ VertexId vertex2(new PairedVertex<DataMaster>(data2), id_distributor);
+ vertex1->set_conjugate(vertex2);
+ vertex2->set_conjugate(vertex1);
+ return vertex1;
+ }
+
+ VertexId CreateVertex(const VertexData &data, restricted::IdDistributor &id_distributor) {
+ return CreateVertex(data, master_.conjugate(data), id_distributor);
+ }
+
+ VertexId CreateVertex(const VertexData &data) {
+ return CreateVertex(data, id_distributor_);
+ }
+
+ void AddVertexToGraph(VertexId vertex) {
+ vertices_.insert(vertex);
+ vertices_.insert(conjugate(vertex));
+ }
+
+ VertexId HiddenAddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
+ VertexId vertex = CreateVertex(data, id_distributor);
+ AddVertexToGraph(vertex);
+ return vertex;
+ }
+
+ VertexId HiddenAddVertex(const VertexData& data) {
+ return HiddenAddVertex(data, id_distributor_);
+ }
+
+ void HiddenDeleteVertex(VertexId vertex) {
+ DeleteVertexFromGraph(vertex);
+ DestroyVertex(vertex);
+ }
+
+ /////////////////////////low-level ops (move to helper?!)
+
+ ////what should be done with this method?
+ EdgeId AddSingleEdge(VertexId v1, VertexId v2, const EdgeData &data,
+ restricted::IdDistributor &idDistributor) {
+ EdgeId newEdge(new PairedEdge<DataMaster>(v2, data), idDistributor);
+ if (v1 != VertexId(0))
+ v1->AddOutgoingEdge(newEdge);
+ return newEdge;
+ }
+
+ EdgeId HiddenAddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId result = AddSingleEdge(VertexId(0), VertexId(0), data, id_distributor);
+ if (this->master().isSelfConjugate(data)) {
+ result->set_conjugate(result);
+ return result;
+ }
+ EdgeId rcEdge = AddSingleEdge(VertexId(0), VertexId(0), this->master().conjugate(data), id_distributor);
+ result->set_conjugate(rcEdge);
+ rcEdge->set_conjugate(result);
+ return result;
+ }
+
+ EdgeId HiddenAddEdge(const EdgeData &data) {
+ return HiddenAddEdge(data, id_distributor_);
+ }
+
+ EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ // todo was suppressed for concurrent execution reasons (see concurrent_graph_component.hpp)
+ // VERIFY(this->vertices_.find(v1) != this->vertices_.end() && this->vertices_.find(v2) != this->vertices_.end());
+ EdgeId result = AddSingleEdge(v1, v2, data, id_distributor);
+ if (this->master().isSelfConjugate(data) && (v1 == conjugate(v2))) {
+ // todo why was it removed???
+ // Because of some split issues: when a self-conjugate edge is split, armageddon happens
+ // VERIFY(v1 == conjugate(v2));
+ // VERIFY(v1 == conjugate(v2));
+ result->set_conjugate(result);
+ return result;
+ }
+ EdgeId rcEdge = AddSingleEdge(v2->conjugate(), v1->conjugate(), this->master().conjugate(data), id_distributor);
+ result->set_conjugate(rcEdge);
+ rcEdge->set_conjugate(result);
+ return result;
+ }
+
+ EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
+ return HiddenAddEdge(v1, v2, data, id_distributor_);
+ }
+
+ void HiddenDeleteEdge(EdgeId edge) {
+ TRACE("Hidden delete edge " << edge.int_id());
+ EdgeId rcEdge = conjugate(edge);
+ VertexId rcStart = conjugate(edge->end());
+ VertexId start = conjugate(rcEdge->end());
+ start->RemoveOutgoingEdge(edge);
+ rcStart->RemoveOutgoingEdge(rcEdge);
+ if (edge != rcEdge) {
+ delete rcEdge.get();
+ }
+ delete edge.get();
+ }
+
+ void HiddenDeletePath(const std::vector<EdgeId>& edgesToDelete, const std::vector<VertexId>& verticesToDelete) {
+ for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
+ HiddenDeleteEdge(*it);
+ for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
+ HiddenDeleteVertex(*it);
+ }
+
+public:
+
+ GraphCore(const DataMaster& master) : master_(master) {
+ }
+
+ virtual ~GraphCore() {
+ VERIFY(size() == 0);
+ }
+
+ class IteratorContainer {
+ public:
+ typedef edge_const_iterator const_iterator;
+ private:
+ const_iterator begin_;
+ const_iterator end_;
+ public:
+ IteratorContainer(const_iterator begin, const_iterator end) :
+ begin_(begin), end_(end) {
+
+ }
+
+ const_iterator begin() const {
+ return begin_;
+ }
+
+ const_iterator end() const {
+ return end_;
+ }
+ };
+
+ restricted::LocalIdDistributor &GetGraphIdDistributor() {
+ return id_distributor_;
+ }
+
+ const restricted::LocalIdDistributor &GetGraphIdDistributor() const {
+ return id_distributor_;
+ }
+
+ size_t int_id(EdgeId edge) const {
+ return edge.int_id();
+ }
+
+ size_t int_id(VertexId vertex) const {
+ return vertex.int_id();
+ }
+
+ const DataMaster& master() const {
+ return master_;
+ }
+
+ const EdgeData& data(EdgeId edge) const {
+ return edge->data();
+ }
+
+ const VertexData& data(VertexId v) const {
+ return v->data();
+ }
+
+ EdgeData& data(EdgeId edge) {
+ return edge->data();
+ }
+
+ VertexData& data(VertexId v) {
+ return v->data();
+ }
+
+ size_t OutgoingEdgeCount(VertexId v) const {
+ return v->OutgoingEdgeCount();
+ }
+
+ IteratorContainer OutgoingEdges(VertexId v) const {
+ //INFO("Outgoing");
+ return IteratorContainer(out_begin(v), out_end(v));
+ }
+
+ size_t IncomingEdgeCount(VertexId v) const {
+ return v->IncomingEdgeCount();
+ }
+
+ IteratorContainer IncomingEdges(VertexId v) const {
+ return IteratorContainer(in_begin(v), in_end(v));
+ }
+
+ std::vector<EdgeId> GetEdgesBetween(VertexId v, VertexId u) const {
+ return v->OutgoingEdgesTo(u);
+ }
+
+ bool RelatedVertices(VertexId v1, VertexId v2) const {
+ return v1 == v2 || v1 == conjugate(v2);
+ }
+
+ ////////////////////////edge information
+ VertexId EdgeStart(EdgeId edge) const {
+ return edge->start();
+ }
+
+ VertexId EdgeEnd(EdgeId edge) const {
+ //INFO("Edge end");
+ return edge->end();
+ }
+
+ VertexId conjugate(VertexId v) const {
+ return v->conjugate();
+ }
+
+ EdgeId conjugate(EdgeId edge) const {
+ return edge->conjugate();
+ }
+
+ size_t length(const EdgeId edge) const {
+ return master_.length(data(edge));
+ }
+
+ size_t length(const VertexId v) const {
+ return master_.length(data(v));
+ }
+
+ //////////////////////shortcut methods
+
+ std::vector<EdgeId> IncidentEdges(VertexId v) const {
+ vector<EdgeId> answer;
+ push_back_all(answer, IncomingEdges(v));
+ push_back_all(answer, OutgoingEdges(v));
+ return answer;
+ }
+
+ EdgeId GetUniqueOutgoingEdge(VertexId v) const {
+ VERIFY(CheckUniqueOutgoingEdge(v));
+ return *out_begin(v);
+ }
+
+ bool CheckUniqueIncomingEdge(VertexId v) const {
+ return IncomingEdgeCount(v) == 1;
+ }
+
+ EdgeId GetUniqueIncomingEdge(VertexId v) const {
+ VERIFY(CheckUniqueIncomingEdge(v));
+ return *in_begin(v);
+ }
+
+ bool CheckUniqueOutgoingEdge(VertexId v) const {
+ return OutgoingEdgeCount(v) == 1;
+ }
+
+ bool IsDeadEnd(VertexId v) const {
+ return OutgoingEdgeCount(v) == 0;
+ }
+
+ bool IsDeadStart(VertexId v) const {
+ return IncomingEdgeCount(v) == 0;
+ }
+
+ bool CanCompressVertex(VertexId v) const {
+ // TRACE("Compress vertex check: ");
+ // TRACE("Outgoing check: " << (OutgoingEdgeCount(v) == 1));
+ // TRACE("Outgoing check: " << (CheckUniqueOutgoingEdge(v)));
+ // TRACE("Incoming check: " << (IncomingEdgeCount(v) == 1));
+ // TRACE("Incoming check: " << (CheckUniqueIncomingEdge(v) == 1));
+ // if((OutgoingEdgeCount(v) == 1) && (IncomingEdgeCount(v) == 1)) {
+ // TRACE("Loop check: " << (GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v)));
+ // TRACE("Additional check: " << AdditionalCompressCondition(v));
+ // }
+ return OutgoingEdgeCount(v) == 1 && IncomingEdgeCount(v) == 1 &&
+ GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v) &&
+ AdditionalCompressCondition(v);
+ }
+
+ //////////////////////printing
+ std::string str(const EdgeId e) const {
+// return master_.str(data(edge));
+ std::stringstream ss;
+ ss << int_id(e) << " (" << length(e) << ")";
+ return ss.str();
+ }
+
+ std::string str(const VertexId v) const {
+// return master_.str(data(v));
+ return ToString(int_id(v));
+ }
+
+ std::string detailed_str(const VertexId v) const {
+ std::stringstream ss;
+ ss << str(v) << ";";
+ ss << "Incoming edges" << str(IncomingEdges(v)) << "; ";
+ ss << "Outgoing edges" << str(OutgoingEdges(v)) << ";";
+ return ss.str();
+ }
+
+ std::string detailed_str(const std::vector<EdgeId>& path) const {
+ std::stringstream ss;
+ ss << "Path: ";
+ ss << "Vertex " << detailed_str(EdgeStart(path[0])) << " | ";
+ for (auto it = path.begin(); it != path.end(); ++it) {
+ EdgeId e = *it;
+ ss << "Edge " << str(e) << " | ";
+ ss << "Vertex " << detailed_str(EdgeEnd(e)) << " | ";
+ }
+ return ss.str();
+ }
+
+ template<class Container>
+ std::string str(const Container& container) const {
+ return str(container.begin(), container.end());
+ }
+
+ template<class It>
+ std::string str(It begin, It end) const {
+ std::stringstream ss;
+ std::string delim = "";
+ for (auto it = begin; it != end; ++it) {
+ ss << delim << str(*it);
+ delim = ", ";
+ }
+ return ss.str();
+ }
+
+private:
+ DECL_LOGGER("GraphCore");
+};
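+
+/**
+* Illustrative sketch (not part of the upstream sources): counting self-loops at a vertex via the public
+* GraphCore interface above. The function name is chosen for the example only.
+*/
+template<class DataMaster>
+size_t ExampleCountLoops(const GraphCore<DataMaster> &g,
+                         typename GraphCore<DataMaster>::VertexId v) {
+    size_t loops = 0;
+    for (auto e : g.OutgoingEdges(v)) {
+        if (g.EdgeEnd(e) == v)
+            ++loops;
+    }
+    return loops;
+}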
+
+}
diff --git a/src/common/assembly_graph/core/graph_iterators.hpp b/src/common/assembly_graph/core/graph_iterators.hpp
new file mode 100644
index 0000000..4edb985
--- /dev/null
+++ b/src/common/assembly_graph/core/graph_iterators.hpp
@@ -0,0 +1,408 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "common/adt/queue_iterator.hpp"
+#include "func/pred.hpp"
+#include "action_handlers.hpp"
+#include "utils/simple_tools.hpp"
+#include <boost/iterator/iterator_facade.hpp>
+
+namespace omnigraph {
+
+/**
+ * SmartIterator iterates through a collection whose content can change during the iteration. Being a
+ * GraphActionHandler, SmartIterator updates the collection contents in response to the way the
+ * graph is changed. The order of iteration can be defined by specifying a Comparator.
+ */
+template<class Graph, typename ElementId, typename Comparator = std::less<ElementId>>
+class SmartIterator : public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+ DynamicQueueIterator<ElementId, Comparator> inner_it_;
+ bool add_new_;
+ bool canonical_only_;
+ //todo think of checking it in HandleAdd
+ func::TypedPredicate<ElementId> add_condition_;
+
+protected:
+
+ void push(const ElementId& el) {
+ if ((!canonical_only_ || el <= this->g().conjugate(el)) &&
+ add_condition_(el)) {
+ inner_it_.push(el);
+ }
+ }
+
+ template<typename InputIterator>
+ void insert(InputIterator begin, InputIterator end) {
+ for (auto it = begin; it != end; ++it) {
+ push(*it);
+ }
+ }
+
+ void erase(const ElementId& el) {
+ if (!canonical_only_ || el <= this->g().conjugate(el)) {
+ inner_it_.erase(el);
+ }
+ }
+
+ void clear() {
+ inner_it_.clear();
+ }
+
+ SmartIterator(const Graph &g, const std::string &name, bool add_new,
+ const Comparator& comparator, bool canonical_only,
+ func::TypedPredicate<ElementId> add_condition = func::AlwaysTrue<ElementId>())
+ : base(g, name),
+ inner_it_(comparator),
+ add_new_(add_new),
+ canonical_only_(canonical_only),
+ add_condition_(add_condition) {
+ }
+
+public:
+
+ bool canonical_only() const {
+ return canonical_only_;
+ }
+
+ bool IsEnd() const {
+ return inner_it_.IsEnd();
+ }
+
+ size_t size() const {
+ return inner_it_.size();
+ }
+
+ ElementId operator*() {
+ return *inner_it_;
+ }
+
+ void operator++() {
+ ++inner_it_;
+ }
+
+ void HandleAdd(ElementId v) override {
+ if (add_new_)
+ push(v);
+ }
+
+ void HandleDelete(ElementId v) override {
+ erase(v);
+ }
+
+ //use carefully!
+ void ReleaseCurrent() {
+ inner_it_.ReleaseCurrent();
+ }
+
+};
+
+/**
+ * SmartSetIterator is a SmartIterator over an explicitly supplied set of elements: the elements to
+ * iterate over are inserted manually (via the constructor or push/insert). By default newly added
+ * graph elements are not picked up (add_new = false), while deleted elements are removed from the set.
+ * As for the other smart iterators, the order of iteration can be defined by specifying a Comparator.
+ */
+template<class Graph, typename ElementId,
+ typename Comparator = std::less<ElementId>>
+class SmartSetIterator : public SmartIterator<Graph, ElementId, Comparator> {
+ typedef SmartIterator<Graph, ElementId, Comparator> base;
+
+public:
+ SmartSetIterator(const Graph &g,
+ bool add_new = false,
+ const Comparator& comparator = Comparator(),
+ bool canonical_only = false,
+ func::TypedPredicate<ElementId> add_condition = func::AlwaysTrue<ElementId>())
+ : base(g, "SmartSet " + ToString(this), add_new, comparator, canonical_only, add_condition) {
+ }
+
+ template<class Iterator>
+ SmartSetIterator(const Graph &g, Iterator begin, Iterator end,
+ bool add_new = false,
+ const Comparator& comparator = Comparator(),
+ bool canonical_only = false,
+ func::TypedPredicate<ElementId> add_condition = func::AlwaysTrue<ElementId>())
+ : SmartSetIterator(g, add_new, comparator, canonical_only, add_condition) {
+ insert(begin, end);
+ }
+
+ template<typename InputIterator>
+ void insert(InputIterator begin, InputIterator end) {
+ base::insert(begin, end);
+ }
+
+ void push(const ElementId& el) {
+ base::push(el);
+ }
+
+ void clear() {
+ base::clear();
+ }
+};
+
+/**
+ * SmartVertexIterator iterates through the vertices of a graph. It listens to AddVertex/DeleteVertex graph
+ * events and edits the set of vertices to iterate through accordingly. Note: high level event handlers are
+ * triggered before low level event handlers such as HandleAdd/HandleDelete. Thus, if the Comparator relies
+ * on a structure which is also updated by handlers, make sure that all of its information is updated in the
+ * high level event handlers.
+ */
+template<class Graph, typename Comparator = std::less<typename Graph::VertexId> >
+class SmartVertexIterator : public SmartIterator<Graph,
+ typename Graph::VertexId, Comparator> {
+ public:
+ typedef typename Graph::VertexId VertexId;
+
+ static size_t get_id() {
+ static size_t id = 0;
+ return id++;
+ }
+
+ public:
+ SmartVertexIterator(const Graph &g, const Comparator& comparator =
+ Comparator(), bool canonical_only = false)
+ : SmartIterator<Graph, VertexId, Comparator>(
+ g, "SmartVertexIterator " + ToString(get_id()), true,
+ comparator, canonical_only) {
+ this->insert(g.begin(), g.end());
+ }
+
+};
+
+//todo return verifies when they can be switched off
+template<class Graph>
+class GraphEdgeIterator : public boost::iterator_facade<GraphEdgeIterator<Graph>
+ , typename Graph::EdgeId, boost::forward_traversal_tag
+ , typename Graph::EdgeId> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexIt const_vertex_iterator;
+ typedef typename Graph::edge_const_iterator const_edge_iterator;
+
+ const Graph& g_;
+ const_vertex_iterator v_it_;
+ const_edge_iterator e_it_;
+ bool canonical_only_;
+
+public:
+
+ GraphEdgeIterator(const Graph& g, const_vertex_iterator v_it, bool canonical_only = false)
+ : g_(g),
+ v_it_(v_it),
+ canonical_only_(canonical_only) {
+ if (v_it_ != g_.end()) {
+ e_it_ = g_.out_begin(*v_it_);
+ Skip();
+ }
+ }
+
+private:
+
+ bool Canonical(EdgeId e) const {
+ return e <= g_.conjugate(e);
+ }
+
+ friend class boost::iterator_core_access;
+
+ void Skip() {
+ //VERIFY(v_it_ != g_.end());
+ while (true) {
+ if (e_it_ == g_.out_end(*v_it_)) {
+ v_it_++;
+ if (v_it_ == g_.end())
+ return;
+ e_it_ = g_.out_begin(*v_it_);
+ } else {
+ if (!canonical_only_ || Canonical(*e_it_))
+ return;
+ else
+ e_it_++;
+ }
+ }
+ }
+
+ void increment() {
+ if (v_it_ == g_.end())
+ return;
+ e_it_++;
+ Skip();
+ }
+
+ bool equal(const GraphEdgeIterator &other) const {
+ if (other.v_it_ != v_it_)
+ return false;
+ if (v_it_ != g_.end() && other.e_it_ != e_it_)
+ return false;
+ if (other.canonical_only_ != canonical_only_)
+ return false;
+ return true;
+ }
+
+ EdgeId dereference() const {
+ //VERIFY(v_it_ != g_.end());
+ return *e_it_;
+ }
+
+};
+
+template<class Graph>
+class ConstEdgeIterator {
+ typedef typename Graph::EdgeId EdgeId;
+ GraphEdgeIterator<Graph> begin_, end_;
+
+ public:
+ ConstEdgeIterator(const Graph &g, bool canonical_only = false)
+ : begin_(g, g.begin(), canonical_only), end_(g, g.end(), canonical_only) {
+ }
+
+ bool IsEnd() const {
+ return begin_ == end_;
+ }
+
+ EdgeId operator*() const {
+ return *begin_;
+ }
+
+ const ConstEdgeIterator& operator++() {
+ begin_++;
+ return *this;
+ }
+};
+
+/**
+ * SmartEdgeIterator iterates through the edges of a graph. It listens to AddEdge/DeleteEdge graph events
+ * and edits the set of edges to iterate through accordingly. Note: high level event handlers are
+ * triggered before low level event handlers such as HandleAdd/HandleDelete. Thus, if the Comparator relies
+ * on a structure which is also updated by handlers, make sure that all of its information is updated in the
+ * high level event handlers.
+ */
+template<class Graph, typename Comparator = std::less<typename Graph::EdgeId> >
+class SmartEdgeIterator : public SmartIterator<Graph, typename Graph::EdgeId, Comparator> {
+ typedef GraphEdgeIterator<Graph> EdgeIt;
+ public:
+ typedef typename Graph::EdgeId EdgeId;
+
+ static size_t get_id() {
+ static size_t id = 0;
+ return id++;
+ }
+
+ public:
+ SmartEdgeIterator(const Graph &g, Comparator comparator = Comparator(),
+ bool canonical_only = false)
+ : SmartIterator<Graph, EdgeId, Comparator>(
+ g, "SmartEdgeIterator " + ToString(get_id()), true,
+ comparator, canonical_only) {
+ this->insert(EdgeIt(g, g.begin()), EdgeIt(g, g.end()));
+
+// for (auto it = graph.begin(); it != graph.end(); ++it) {
+// //todo: this solution doesn't work with parallel simplification
+// this->insert(graph.out_begin(*it), graph.out_end(*it));
+// //this does
+// //auto out = graph.OutgoingEdges(*it);
+// //this->base::insert(out.begin(), out.end());
+// }
+ }
+};
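+
+/**
+* Illustrative sketch (not part of the upstream sources): the usage pattern the smart iterators are
+* designed for -- modifying the graph while iterating over it. Assumes the graph type provides
+* SmartEdgeBegin() (see ObservableGraph elsewhere in this patch) and a DeleteEdge(EdgeId) operation;
+* the function name and the length threshold are for the example only.
+*/
+template<class Graph>
+void ExampleRemoveShortEdges(Graph &g, size_t min_length) {
+    for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+        if (g.length(*it) < min_length)
+            g.DeleteEdge(*it); //the deleted edge is removed from the iteration queue via HandleDelete
+    }
+}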
+
+//todo move out
+template<class Graph, class ElementId>
+class IterationHelper {
+};
+
+template<class Graph>
+class IterationHelper<Graph, typename Graph::VertexId> {
+ const Graph& g_;
+public:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::VertexIt const_vertex_iterator;
+
+ IterationHelper(const Graph& g)
+ : g_(g) {
+ }
+
+ const_vertex_iterator begin() const {
+ return g_.begin();
+ }
+
+ const_vertex_iterator end() const {
+ return g_.end();
+ }
+
+ std::vector<const_vertex_iterator> Chunks(size_t chunk_cnt) const {
+ VERIFY(chunk_cnt > 0);
+ if (chunk_cnt == 1) {
+ return {begin(), end()};
+ }
+
+ //trying to split vertices into equal chunks, leftovers put into first chunk
+ vector<const_vertex_iterator> answer;
+ size_t vertex_cnt = g_.size();
+ size_t chunk_size = vertex_cnt / chunk_cnt;
+ auto it = g_.begin();
+ answer.push_back(it);
+ for (size_t i = 0; i + chunk_cnt * chunk_size < vertex_cnt; ++i) {
+ it++;
+ }
+ if (chunk_size > 0) {
+ size_t i = 0;
+ do {
+ ++it;
+ if (++i % chunk_size == 0)
+ answer.push_back(it);
+ } while (it != g_.end());
+
+ VERIFY(i == chunk_cnt * chunk_size);
+ } else {
+ VERIFY(it == g_.end());
+ answer.push_back(it);
+ }
+ VERIFY(answer.back() == g_.end());
+ return answer;
+ }
+
+};
+
+//todo move out
+template<class Graph>
+class IterationHelper<Graph, typename Graph::EdgeId> {
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef GraphEdgeIterator<Graph> const_edge_iterator;
+
+ IterationHelper(const Graph& g)
+ : g_(g) {
+ }
+
+ const_edge_iterator begin() const {
+ return const_edge_iterator(g_, g_.begin());
+ }
+
+ const_edge_iterator end() const {
+ return const_edge_iterator(g_, g_.end());
+ }
+
+ std::vector<omnigraph::GraphEdgeIterator<Graph>> Chunks(size_t chunk_cnt) const {
+ if (chunk_cnt == 1) {
+ return {begin(), end()};
+ }
+
+ vector<omnigraph::GraphEdgeIterator<Graph>> answer;
+
+ for (auto v_it : IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt)) {
+ answer.push_back(omnigraph::GraphEdgeIterator<Graph>(g_, v_it));
+ }
+ return answer;
+ }
+};
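+//Editor-added usage sketch: splitting edge iteration into chunks produced by Chunks(), e.g.
+//for OpenMP processing. Assumes GraphEdgeIterator behaves as a forward iterator
+//(operator++/operator!=/operator*), which is not shown in this hunk, and that any attached
+//handlers are thread safe; the helper name is hypothetical.
+template<class Graph, class Action>
+void ForEachEdgeChunked(const Graph &g, size_t chunk_cnt, Action action) {
+    IterationHelper<Graph, typename Graph::EdgeId> helper(g);
+    auto bounds = helper.Chunks(chunk_cnt);
+    //consecutive elements of bounds delimit the per-chunk edge ranges
+    #pragma omp parallel for
+    for (size_t i = 0; i < bounds.size() - 1; ++i) {
+        for (auto it = bounds[i]; it != bounds[i + 1]; ++it)
+            action(*it);
+    }
+}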
+
+}
diff --git a/src/common/assembly_graph/core/observable_graph.hpp b/src/common/assembly_graph/core/observable_graph.hpp
new file mode 100644
index 0000000..5b62e24
--- /dev/null
+++ b/src/common/assembly_graph/core/observable_graph.hpp
@@ -0,0 +1,499 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <set>
+#include <cstring>
+#include "utils/logger/logger.hpp"
+#include "graph_core.hpp"
+#include "graph_iterators.hpp"
+
+namespace omnigraph {
+
+using std::vector;
+using std::set;
+template<class DataMaster>
+class ObservableGraph: public GraphCore<DataMaster> {
+public:
+ typedef GraphCore<DataMaster> base;
+ typedef typename base::DataMasterT DataMasterT;
+ typedef typename base::VertexData VertexData;
+ typedef typename base::EdgeData EdgeData;
+ typedef typename base::EdgeId EdgeId;
+ typedef typename base::VertexId VertexId;
+ typedef typename base::VertexIt VertexIt;
+ typedef typename base::edge_const_iterator edge_const_iterator;
+
+ typedef HandlerApplier<VertexId, EdgeId> Applier;
+ typedef SmartVertexIterator<ObservableGraph> SmartVertexIt;
+ typedef SmartEdgeIterator<ObservableGraph> SmartEdgeIt;
+ typedef ConstEdgeIterator<ObservableGraph> ConstEdgeIt;
+ typedef ActionHandler<VertexId, EdgeId> Handler;
+
+private:
+ //todo switch to smart iterators
+ mutable std::vector<Handler*> action_handler_list_;
+ const HandlerApplier<VertexId, EdgeId> *applier_;
+
+public:
+//todo move to graph core
+ typedef ConstructionHelper<DataMaster> HelperT;
+
+ HelperT GetConstructionHelper() {
+// TODO: fix everything and restore this check
+// VERIFY(this->VerifyAllDetached());
+ return HelperT(*this);
+ }
+
+ const Applier& GetHandlerApplier() const {
+ return *applier_;
+ }
+
+ void AddActionHandler(Handler* action_handler) const;
+
+ bool RemoveActionHandler(const Handler* action_handler) const;
+
+ bool AllHandlersThreadSafe() const;
+
+ // TODO: for debug. remove.
+ void PrintHandlersNames() const;
+
+ //todo make Fire* protected once again with helper friend class
+ void FireAddVertex(VertexId v) const;
+
+ void FireAddEdge(EdgeId e) const;
+
+ void FireDeleteVertex(VertexId v) const;
+
+ void FireDeleteEdge(EdgeId e) const;
+
+ void FireMerge(std::vector<EdgeId> old_edges, EdgeId new_edge) const;
+
+ void FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const;
+
+ void FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const;
+
+ bool VerifyAllDetached();
+
+ //smart iterators
+ template<typename Comparator>
+ SmartVertexIterator<ObservableGraph, Comparator> SmartVertexBegin(
+ const Comparator& comparator, bool canonical_only = false) const {
+ return SmartVertexIterator<ObservableGraph, Comparator>(*this,
+ comparator, canonical_only);
+ }
+
+ SmartVertexIterator<ObservableGraph> SmartVertexBegin(bool canonical_only = false) const {
+ return SmartVertexIterator<ObservableGraph>(*this, std::less<VertexId>(), canonical_only);
+ }
+
+ template<typename Comparator>
+ SmartEdgeIterator<ObservableGraph, Comparator> SmartEdgeBegin(
+ const Comparator& comparator, bool canonical_only = false) const {
+ return SmartEdgeIterator<ObservableGraph, Comparator>(*this, comparator, canonical_only);
+ }
+
+ SmartEdgeIterator<ObservableGraph> SmartEdgeBegin(bool canonical_only = false) const {
+ return SmartEdgeIterator<ObservableGraph>(*this, std::less<EdgeId>(), canonical_only);
+ }
+
+ ConstEdgeIterator<ObservableGraph> ConstEdgeBegin(bool canonical_only = false) const {
+ return ConstEdgeIterator<ObservableGraph>(*this, canonical_only);
+ }
+
+ void FireDeletePath(const std::vector<EdgeId>& edges_to_delete, const std::vector<VertexId>& vertices_to_delete) const;
+
+ ObservableGraph(const DataMaster& master) :
+ base(master), applier_(new PairedHandlerApplier<ObservableGraph>(*this)) {
+ }
+
+ virtual ~ObservableGraph();
+
+ /////////////////////////graph operations
+ //adding/removing vertices and edges
+ VertexId AddVertex(const VertexData& data) {
+ return AddVertex(data, GetGraphIdDistributor());
+ }
+
+ VertexId AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor);
+
+ void DeleteVertex(VertexId v);
+
+ void ForceDeleteVertex(VertexId v);
+
+ using base::GetGraphIdDistributor;
+ using base::conjugate;
+
+ EdgeId AddEdge(const EdgeData &data) {
+ return AddEdge(data, GetGraphIdDistributor());
+ }
+
+ EdgeId AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor);
+
+ EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
+ return AddEdge(v1, v2, data, GetGraphIdDistributor());
+ }
+
+ EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor);
+
+ void DeleteEdge(EdgeId e);
+
+ void DeleteAllOutgoing(VertexId v);
+
+ void DeleteAllIncoming(VertexId v);
+
+ void CompressVertex(VertexId v);
+
+ EdgeId UnsafeCompressVertex(VertexId v);
+
+ std::vector<EdgeId> EdgesToDelete(const std::vector<EdgeId>& path) const;
+
+ std::vector<VertexId> VerticesToDelete(const std::vector<EdgeId>& path) const;
+
+ std::vector<EdgeId> CorrectMergePath(const std::vector<EdgeId>& path) const;
+
+ EdgeId MergePath(const std::vector<EdgeId>& path, bool safe_merging = true);
+
+ std::pair<EdgeId, EdgeId> SplitEdge(EdgeId edge, size_t position);
+
+ EdgeId GlueEdges(EdgeId edge1, EdgeId edge2);
+
+private:
+ DECL_LOGGER("ObservableGraph")
+};
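+//Editor-added usage sketch: the usual "delete an edge, then tidy up" pattern built only from
+//operations declared above. EdgeStart/EdgeEnd are assumed to be public members inherited from
+//GraphCore (they are invoked via base:: in the definitions below); the helper name is hypothetical.
+template<class DataMaster>
+void DeleteEdgeAndCompress(ObservableGraph<DataMaster> &g,
+                           typename ObservableGraph<DataMaster>::EdgeId e) {
+    auto start_v = g.EdgeStart(e);
+    auto end_v = g.EdgeEnd(e);
+    g.DeleteEdge(e);            //fires FireDeleteEdge for all attached handlers
+    g.CompressVertex(start_v);  //no-op unless the vertex became compressible
+    g.CompressVertex(end_v);
+}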
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::VertexId ObservableGraph<DataMaster>::AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
+ VertexId v = base::HiddenAddVertex(data, id_distributor);
+ FireAddVertex(v);
+ return v;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteVertex(VertexId v) {
+ VERIFY(base::IsDeadEnd(v) && base::IsDeadStart(v));
+ VERIFY(v != VertexId(NULL));
+ FireDeleteVertex(v);
+ base::HiddenDeleteVertex(v);
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::ForceDeleteVertex(VertexId v) {
+ DeleteAllOutgoing(v);
+ DeleteAllIncoming(v);
+ DeleteVertex(v);
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId e = base::HiddenAddEdge(v1, v2, data, id_distributor);
+ FireAddEdge(e);
+ return e;
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId e = base::HiddenAddEdge(data, id_distributor);
+ FireAddEdge(e);
+ return e;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteEdge(EdgeId e) {
+ FireDeleteEdge(e);
+ base::HiddenDeleteEdge(e);
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteAllOutgoing(VertexId v) {
+ while (base::OutgoingEdgeCount(v) > 0) {
+ EdgeId edge = *base::out_begin(v);
+ DeleteEdge(edge);
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::DeleteAllIncoming(VertexId v) {
+ while (base::IncomingEdgeCount(v) > 0) {
+ EdgeId edge = *base::in_begin(v);
+ DeleteEdge(edge);
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::CompressVertex(VertexId v) {
+ //VERIFY(CanCompressVertex(v));
+ if (base::CanCompressVertex(v)) {
+ UnsafeCompressVertex(v);
+ } else {
+ TRACE("Vertex " << base::str(v) << " can't be compressed");
+ }
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::UnsafeCompressVertex(VertexId v) {
+ VERIFY(base::CanCompressVertex(v));
+ std::vector<EdgeId> edges_to_merge;
+ edges_to_merge.push_back(base::GetUniqueIncomingEdge(v));
+ edges_to_merge.push_back(base::GetUniqueOutgoingEdge(v));
+ return MergePath(edges_to_merge);
+}
+
+template<class DataMaster>
+std::vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::EdgesToDelete(const std::vector<EdgeId>& path) const {
+ std::set<EdgeId> edgesToDelete;
+ edgesToDelete.insert(path[0]);
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ if (edgesToDelete.find(base::conjugate(e)) == edgesToDelete.end())
+ edgesToDelete.insert(e);
+ }
+ return std::vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
+}
+
+template<class DataMaster>
+vector<typename ObservableGraph<DataMaster>::VertexId> ObservableGraph<DataMaster>::VerticesToDelete(const vector<EdgeId>& path) const {
+ std::set<VertexId> verticesToDelete;
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ VertexId v = base::EdgeStart(e);
+ if (verticesToDelete.find(base::conjugate(v)) == verticesToDelete.end())
+ verticesToDelete.insert(v);
+ }
+ return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::AddActionHandler(Handler* action_handler) const {
+#pragma omp critical(action_handler_list_modification)
+ {
+ TRACE("Action handler " << action_handler->name() << " added");
+ if (find(action_handler_list_.begin(), action_handler_list_.end(), action_handler) != action_handler_list_.end()) {
+ VERIFY_MSG(false, "Action handler " << action_handler->name() << " has already been added");
+ } else {
+ action_handler_list_.push_back(action_handler);
+ }
+ }
+}
+
+template<class DataMaster>
+bool ObservableGraph<DataMaster>::RemoveActionHandler(const Handler* action_handler) const {
+ bool result = false;
+#pragma omp critical(action_handler_list_modification)
+ {
+ auto it = std::find(action_handler_list_.begin(), action_handler_list_.end(), action_handler);
+ if (it != action_handler_list_.end()) {
+ action_handler_list_.erase(it);
+ TRACE("Action handler " << action_handler->name() << " removed");
+ result = true;
+ } else {
+ TRACE("Action handler " << action_handler->name() << " wasn't found among graph action handlers");
+ }
+ }
+ return result;
+}
+
+template<class DataMaster>
+bool ObservableGraph<DataMaster>::AllHandlersThreadSafe() const {
+ for (Handler* handler : action_handler_list_) {
+ if (handler->IsAttached() && !handler->IsThreadSafe()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::PrintHandlersNames() const {
+ for (Handler* handler : action_handler_list_) {
+ std::cout << handler->name() << " attached=" << handler->IsAttached() << std::endl;
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireAddVertex(VertexId v) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ TRACE("FireAddVertex to handler " << handler_ptr->name());
+ applier_->ApplyAdd(*handler_ptr, v);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireAddEdge(EdgeId e) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ TRACE("FireAddEdge to handler " << handler_ptr->name());
+ applier_->ApplyAdd(*handler_ptr, e);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireDeleteVertex(VertexId v) const {
+ for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
+ if ((*it)->IsAttached()) {
+ applier_->ApplyDelete(**it, v);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireDeleteEdge(EdgeId e) const {
+ for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
+ if ((*it)->IsAttached()) {
+ applier_->ApplyDelete(**it, e);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireMerge(vector<EdgeId> old_edges, EdgeId new_edge) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ applier_->ApplyMerge(*handler_ptr, old_edges, new_edge);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ applier_->ApplyGlue(*handler_ptr, new_edge, edge1, edge2);
+ }
+ }
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ applier_->ApplySplit(*handler_ptr, edge, new_edge1, new_edge2);
+ }
+ }
+}
+
+template<class DataMaster>
+bool ObservableGraph<DataMaster>::VerifyAllDetached() {
+ for (Handler* handler_ptr : action_handler_list_) {
+ if (handler_ptr->IsAttached()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+template<class DataMaster>
+void ObservableGraph<DataMaster>::FireDeletePath(const vector<EdgeId>& edgesToDelete, const vector<VertexId>& verticesToDelete) const {
+ for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
+ FireDeleteEdge(*it);
+ for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
+ FireDeleteVertex(*it);
+}
+
+template<class DataMaster>
+ObservableGraph<DataMaster>::~ObservableGraph() {
+ while (base::size() > 0) {
+ ForceDeleteVertex(*base::begin());
+ }
+}
+
+template<class DataMaster>
+vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::CorrectMergePath(const vector<EdgeId>& path) const {
+ for (size_t i = 0; i < path.size(); i++) {
+ if (path[i] == base::conjugate(path[i])) {
+ vector<EdgeId> result;
+ if (i < path.size() - 1 - i) {
+ for (size_t j = 0; j < path.size(); j++)
+ result.push_back(base::conjugate(path[path.size() - 1 - j]));
+ i = path.size() - 1 - i;
+ } else {
+ result = path;
+ }
+ size_t size = 2 * i + 1;
+ for (size_t j = result.size(); j < size; j++) {
+ result.push_back(base::conjugate(result[size - 1 - j]));
+ }
+ return result;
+ }
+ }
+ return path;
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::MergePath(const vector<EdgeId>& path, bool safe_merging) {
+ VERIFY(!path.empty());
+ for (size_t i = 0; i < path.size(); i++)
+ for (size_t j = i + 1; j < path.size(); j++) {
+ VERIFY(path[i] != path[j]);
+ }
+ if (path.size() == 1) {
+ TRACE("Path of single edge " << base::str(*(path.begin())) << ". Nothing to merge.");
+ }
+ // cerr << "Merging " << PrintDetailedPath(path) << endl;
+ // cerr << "Conjugate " << PrintConjugatePath(path) << endl;
+ vector<EdgeId> corrected_path = CorrectMergePath(path);
+ VertexId v1 = base::EdgeStart(corrected_path[0]);
+ VertexId v2 = base::EdgeEnd(corrected_path[corrected_path.size() - 1]);
+ vector<const EdgeData*> to_merge;
+ for (auto it = corrected_path.begin(); it != corrected_path.end(); ++it) {
+ to_merge.push_back(&(base::data(*it)));
+ }
+ EdgeId new_edge = base::HiddenAddEdge(v1, v2, base::master().MergeData(to_merge, safe_merging));
+ FireMerge(corrected_path, new_edge);
+ vector<EdgeId> edges_to_delete = EdgesToDelete(corrected_path);
+ vector<VertexId> vertices_to_delete = VerticesToDelete(corrected_path);
+ FireDeletePath(edges_to_delete, vertices_to_delete);
+ FireAddEdge(new_edge);
+ base::HiddenDeletePath(edges_to_delete, vertices_to_delete);
+ return new_edge;
+}
+
+template<class DataMaster>
+std::pair<typename ObservableGraph<DataMaster>::EdgeId, typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::SplitEdge(EdgeId edge, size_t position) {
+ bool sc_flag = (edge == conjugate(edge));
+ VERIFY_MSG(position > 0 && position < (sc_flag ? base::length(edge) / 2 + 1 : base::length(edge)),
+ "Edge length is " << base::length(edge) << " but split pos was " << position);
+ std::pair<VertexData, std::pair<EdgeData, EdgeData> > newData = base::master().SplitData(base::data(edge), position, sc_flag);
+ VertexId splitVertex = base::HiddenAddVertex(newData.first);
+ EdgeId new_edge1 = base::HiddenAddEdge(base::EdgeStart(edge), splitVertex, newData.second.first);
+ EdgeId new_edge2 = base::HiddenAddEdge(splitVertex, sc_flag ? conjugate(splitVertex) : base::EdgeEnd(edge), newData.second.second);
+ VERIFY(!sc_flag || new_edge2 == conjugate(new_edge2));
+ FireSplit(edge, new_edge1, new_edge2);
+ FireDeleteEdge(edge);
+ FireAddVertex(splitVertex);
+ FireAddEdge(new_edge1);
+ FireAddEdge(new_edge2);
+ base::HiddenDeleteEdge(edge);
+ return make_pair(new_edge1, new_edge2);
+}
+
+template<class DataMaster>
+typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::GlueEdges(EdgeId edge1, EdgeId edge2) {
+ EdgeId new_edge = base::HiddenAddEdge(base::EdgeStart(edge2), base::EdgeEnd(edge2), base::master().GlueData(base::data(edge1), base::data(edge2)));
+ FireGlue(new_edge, edge1, edge2);
+ FireDeleteEdge(edge1);
+ FireDeleteEdge(edge2);
+ FireAddEdge(new_edge);
+ VertexId start = base::EdgeStart(edge1);
+ VertexId end = base::EdgeEnd(edge1);
+ base::HiddenDeleteEdge(edge1);
+ base::HiddenDeleteEdge(edge2);
+ if (base::IsDeadStart(start) && base::IsDeadEnd(start)) {
+ DeleteVertex(start);
+ }
+ if (base::IsDeadStart(end) && base::IsDeadEnd(end)) {
+ DeleteVertex(end);
+ }
+ return new_edge;
+}
+}
diff --git a/src/common/assembly_graph/core/order_and_law.hpp b/src/common/assembly_graph/core/order_and_law.hpp
new file mode 100644
index 0000000..1f0373c
--- /dev/null
+++ b/src/common/assembly_graph/core/order_and_law.hpp
@@ -0,0 +1,644 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <boost/utility.hpp>
+
+#include <ostream>
+#include <unordered_set>
+#include <unordered_map>
+#include "utils/stacktrace.hpp"
+#include <algorithm>
+#include <map>
+#include "utils/openmp_wrapper.h"
+#include "folly/PackedSyncPtr.h"
+
+
+namespace restricted {
+
+//todo discuss with Anton
+static const uint16_t MAX_THREAD_CNT = 128;
+
+class IdDistributor {
+public:
+ virtual size_t GetId() = 0;
+
+ virtual ~IdDistributor() {
+ }
+};
+
+template<class Iter>
+class ListIdDistributor : public IdDistributor {
+ friend class IdSegmentStorage;
+
+private:
+ Iter left_;
+ Iter right_;
+ size_t shift_;
+ size_t max_;
+
+ ListIdDistributor(Iter left, Iter right, size_t shift = 0, size_t max = size_t(-1)) : left_(left),
+ right_(right),
+ shift_(shift), max_(max) {
+ }
+
+public:
+ bool valid() {
+ return left_ < right_;
+ }
+
+ size_t GetId() {
+ size_t result = *(left_);
+ VERIFY(result < max_);
+ ++left_;
+ return shift_ + result;
+ }
+};
+
+class SegmentIterator {
+private:
+ size_t value_;
+public:
+ SegmentIterator(size_t value) : value_(value) {
+ }
+
+ size_t operator*() const {
+ return value_;
+ }
+
+ void operator++() {
+ value_++;
+ }
+
+ void operator++(int) {
+ ++value_;
+ }
+
+ bool operator==(const SegmentIterator &that) const {
+ return value_ == that.value_;
+ }
+
+ bool operator!=(const SegmentIterator &that) const {
+ return value_ != that.value_;
+ }
+};
+
+class IdSegmentStorage {
+ friend class LocalIdDistributor;
+
+public:
+ ListIdDistributor<SegmentIterator> GetSegmentIdDistributor(size_t left, size_t right) {
+ VERIFY(left < right);
+ VERIFY(right <= size_);
+ return ListIdDistributor<SegmentIterator>(SegmentIterator(left), SegmentIterator(right), min_value_, size_);
+ }
+
+ template<class Iter>
+ ListIdDistributor<Iter> GetSegmentIdDistributor(Iter left, Iter right) {
+ VERIFY(left < right);
+ return ListIdDistributor<Iter>(left, right, min_value_, size_);
+ }
+
+ IdSegmentStorage() : min_value_(0), size_(0) { }
+
+private:
+ IdSegmentStorage(size_t min_value, size_t size) : min_value_(min_value), size_(size) { }
+
+ size_t min_value_;
+ size_t size_;
+};
+
+// Id distributor for pure_pointer. Singleton.
+class LocalIdDistributor : public IdDistributor, boost::noncopyable {
+ friend class PeriodicIdDistributor;
+
+ static const size_t INITIAL_MAX_INT_ID = 2;
+public:
+ size_t GetId() {
+ return max_int_id_++;
+ }
+
+ IdSegmentStorage Reserve(size_t size) {
+ max_int_id_ += size;
+ return IdSegmentStorage(max_int_id_ - size, size);
+ }
+
+ IdSegmentStorage ReserveUpTo(size_t max) {
+ VERIFY(max_int_id_ == INITIAL_MAX_INT_ID);
+ max_int_id_ = max;
+ return IdSegmentStorage(0, max);
+ }
+
+// static GlobalIdDistributor &GetInstance() {
+// static GlobalIdDistributor instance(INITIAL_MAX_INT_ID);
+// return instance;
+// }
+
+ size_t GetMax() const {
+ return max_int_id_;
+ }
+
+ LocalIdDistributor(size_t min_id_value = INITIAL_MAX_INT_ID) : max_int_id_(min_id_value) { }
+
+private:
+ size_t max_int_id_;
+};
+
+/* Id distributor used for concurrent algorithms.
+* Each thread uses its own PeriodicIdDistributor with a period equal to
+* the number of threads. After a thread's job is done, a Synchronize call is required
+* to push the local maximum id back into the shared LocalIdDistributor.
+*/
+class PeriodicIdDistributor : public IdDistributor {
+
+public:
+ PeriodicIdDistributor(LocalIdDistributor &id_distributor, size_t first_id, size_t period)
+ : id_distributor_(id_distributor), cur_id_(first_id), period_(period) {
+ }
+
+ virtual size_t GetId() {
+ size_t id = cur_id_;
+ cur_id_ += period_;
+
+ return id;
+ }
+
+ void Synchronize() const {
+ size_t &global_max_id = id_distributor_.max_int_id_;
+ global_max_id = std::max(cur_id_, global_max_id);
+ }
+
+private:
+ LocalIdDistributor &id_distributor_;
+ size_t cur_id_;
+ size_t period_;
+};
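+//Editor-added usage sketch of the scheme described above. Assumes the openmp_wrapper also
+//exposes omp_get_num_threads (only omp_get_thread_num is used elsewhere in this file); the
+//function name is hypothetical.
+inline void AssignIdsInParallel(LocalIdDistributor &global, size_t ids_per_thread) {
+    size_t base = global.GetMax();
+    #pragma omp parallel
+    {
+        size_t nthreads = (size_t) omp_get_num_threads();
+        PeriodicIdDistributor local(global, base + (size_t) omp_get_thread_num(), nthreads);
+        for (size_t i = 0; i < ids_per_thread; ++i)
+            local.GetId();              //hand this id to a freshly created object
+        #pragma omp critical(periodic_id_sync)
+        local.Synchronize();            //publish the local maximum back to 'global'
+    }
+}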
+
+template<class PurePtrT>
+class PurePtrLock;
+
+template<class PurePtrT>
+class PurePtrMarker;
+
+//todo maybe make it extend folly::PackedSyncPtr<T>?
+template<class T>
+struct pure_pointer {
+ typedef T type;
+ typedef T *pointer_type;
+
+ explicit pure_pointer()
+ : int_id_(0) {
+ ptr_.init(pointer_type(0), MAX_THREAD_CNT);
+ }
+
+ explicit pure_pointer(T *ptr)
+ : int_id_(size_t(ptr)) {
+ ptr_.init(ptr, MAX_THREAD_CNT);
+ VERIFY(int_id_ < 2);
+ }
+
+ explicit pure_pointer(T *ptr, IdDistributor &idDistributor)
+ : int_id_(generate_id(ptr, idDistributor)) {
+ ptr_.init(ptr, MAX_THREAD_CNT);
+ }
+
+// lock_pointer_type& get_lockable() {
+// return ptr_;
+// }
+
+ T *get() const {
+ return ptr_.get();
+ }
+
+ T &operator*() const {
+ return *ptr_;
+ }
+
+ T *operator->() const {
+ return ptr_.get();
+ }
+
+ bool operator==(const pure_pointer &rhs) const {
+ if (int_id_ == rhs.int_id_) {
+ VERIFY(ptr_.get() == rhs.ptr_.get());
+ return true;
+ }
+ return false;
+ }
+
+ bool operator!=(const pure_pointer &rhs) const {
+ return !operator==(rhs);
+ }
+
+ bool operator<(const pure_pointer &rhs) const {
+ return this->int_id_ < rhs.int_id_;
+ }
+
+ bool operator<=(const pure_pointer &rhs) const {
+ return *this < rhs || *this == rhs;
+ }
+
+ size_t hash() const {
+ return this->int_id_;
+ }
+
+ size_t int_id() const {
+ return int_id_;
+ }
+
+private:
+ friend class PurePtrLock<pure_pointer<T>>;
+
+ friend class PurePtrMarker<pure_pointer<T>>;
+
+ typedef folly::PackedSyncPtr<T> lock_pointer_type;
+
+ static size_t generate_id(T *ptr, IdDistributor &idDistributor) {
+ if (ptr == 0 || ptr == (T *) 1 || ptr == (T *) (-1)) {
+ return size_t(ptr);
+ }
+
+ return idDistributor.GetId();
+ }
+
+ lock_pointer_type ptr_;
+
+ size_t int_id_;
+};
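+//Editor-added usage sketch: wrapping a freshly allocated object so that it carries a compact
+//integer id taken from an IdDistributor (sentinel addresses 0/1/-1 keep their raw values, see
+//generate_id above); the helper name is hypothetical.
+template<class T>
+pure_pointer<T> WrapWithId(T *raw, IdDistributor &ids) {
+    return pure_pointer<T>(raw, ids);   //int_id() is drawn from ids.GetId() for real pointers
+}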
+
+template<class LockT>
+class ReEnteringLock {
+ LockT &lock_;
+ bool reentered_;
+
+ uint16_t locking_thread() const {
+ //don't need barrier here (as folly documentation says)
+ return lock_.extra();
+ }
+
+ uint16_t current_thread() const {
+ return uint16_t(omp_get_thread_num());
+ }
+
+ void Lock() {
+ lock_.lock();
+ lock_.setExtra(current_thread());
+ }
+
+ void Unlock() {
+ lock_.setExtra(MAX_THREAD_CNT);
+ lock_.unlock();
+ }
+
+public:
+ ReEnteringLock(LockT &lock) :
+ lock_(lock),
+ reentered_(false) {
+ if (locking_thread() == current_thread()) {
+ reentered_ = true;
+ } else {
+ Lock();
+ }
+ }
+
+ ~ReEnteringLock() {
+ if (!reentered_) {
+ Unlock();
+ }
+ }
+};
+
+/**
+* Lock that uses a pure_pointer as its target.
+* Be careful NOT to pass a COPY of the pure_pointer you want to use as the locked object!
+*/
+template<class PurePtrT>
+class PurePtrLock {
+ ReEnteringLock<typename PurePtrT::lock_pointer_type> inner_lock_;
+
+public:
+ PurePtrLock(PurePtrT &pure_ptr) :
+ inner_lock_(pure_ptr.ptr_) {
+ }
+
+};
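+//Editor-added usage sketch: RAII locking of the object a pure_pointer refers to. The argument
+//must be the stored pure_pointer itself, not a copy, because the lock bit lives in that
+//particular PackedSyncPtr; the helper name is hypothetical.
+template<class T, class F>
+void WithLockedPurePtr(pure_pointer<T> &ptr, F action) {
+    PurePtrLock<pure_pointer<T>> guard(ptr);   //acquired here (re-entrant within a thread)
+    action(*ptr);                              //mutate the pointee while holding the lock
+}                                              //released when guard goes out of scope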
+
+/**
+* A way to "mark" a pure_pointer without using additional memory.
+* Marking/unmarking operations are atomic.
+* Be careful NOT to pass a COPY of the pure_pointer you want to mark!
+* Do not use together with PurePtrLock: both store their data in the same extra bits.
+*/
+template<class PurePtrT>
+class PurePtrMarker {
+ typedef typename PurePtrT::lock_pointer_type LockWithData;
+
+ void ChangeMark(PurePtrT &pure_ptr, uint16_t new_mark) const {
+ LockWithData &lock_with_data = pure_ptr.ptr_;
+ lock_with_data.lock();
+ lock_with_data.setExtra(new_mark);
+ lock_with_data.unlock();
+ }
+
+public:
+
+ void mark(PurePtrT &pure_ptr) const {
+ ChangeMark(pure_ptr, 0);
+ }
+
+ void unmark(PurePtrT &pure_ptr) const {
+ ChangeMark(pure_ptr, MAX_THREAD_CNT);
+ }
+
+ bool is_marked(const PurePtrT &pure_ptr) const {
+ uint16_t curr_mark = pure_ptr.ptr_.extra();
+ VERIFY(curr_mark == 0 || curr_mark == MAX_THREAD_CNT);
+ return curr_mark == 0;
+ }
+
+};
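+//Editor-added usage sketch: toggling the mark stored in a pure_pointer's extra bits. As noted
+//above, never combine this with PurePtrLock on the same pointer; the helper name is hypothetical.
+template<class T>
+bool ToggleMark(pure_pointer<T> &ptr) {
+    PurePtrMarker<pure_pointer<T>> marker;
+    if (marker.is_marked(ptr)) {
+        marker.unmark(ptr);
+        return false;                          //the pointer is now unmarked
+    }
+    marker.mark(ptr);
+    return true;                               //the pointer is now marked
+}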
+
+//template<class T>
+//struct Comparator
+//{
+// typedef pure_pointer<T> pointer_type_t;
+//
+// bool operator()(pointer_type_t const& a, pointer_type_t const& b) const {
+// return a.get() < b.get();
+// }
+//};
+
+template<class T>
+struct Hash {
+ typedef pure_pointer<T> pointer_type_t;
+ std::hash<T *> inner_hash_;
+
+ size_t operator()(pointer_type_t const &a) const {
+ return inner_hash_(a.get());
+ }
+};
+
+template<class It>
+struct iterator_wrapper {
+ typedef typename It::value_type value_type;
+ typedef typename It::difference_type difference_type;
+ typedef typename It::reference reference;
+ typedef typename It::pointer pointer;
+
+ explicit iterator_wrapper(It it) : it_(it) { }
+
+ reference operator*() const { return it_.operator*(); }
+
+ pointer operator->() const { return it_.operator->(); }
+
+ bool operator==(const iterator_wrapper &rhs) const { return it_ == rhs.it_; }
+
+ bool operator!=(const iterator_wrapper &rhs) const { return it_ != rhs.it_; }
+
+private:
+ It it_;
+};
+
+template<class T>
+struct set {
+ typedef Hash<typename T::type> hash_t;
+ typedef std::unordered_set<T, hash_t> base_set_t;
+ typedef typename base_set_t::value_type value_type;
+
+ typedef iterator_wrapper<typename base_set_t::iterator> iterator;
+ typedef iterator_wrapper<typename base_set_t::const_iterator> const_iterator;
+
+public:
+ set() : base_set_(10, hash_t()) {
+ }
+
+ template<class It>
+ set(It begin, It end) : base_set_(begin, end, 10, hash_t()) {
+ }
+
+ const_iterator begin() const { return const_iterator(base_set_.begin()); }
+
+ const_iterator end() const { return const_iterator(base_set_.end()); }
+
+ iterator begin() { return iterator(base_set_.begin()); }
+
+ iterator end() { return iterator(base_set_.end()); }
+
+ const_iterator find(const T &key) const { return const_iterator(base_set_.find(key)); }
+
+ iterator find(const T &key) { return iterator(base_set_.find(key)); }
+
+ size_t count(T const &item) const { return base_set_.count(item); }
+
+ std::pair<iterator, bool> insert(value_type const &item) {
+ const std::pair<iterator, bool> &ret = base_set_.insert(item);
+ return make_pair(iterator(ret.first), ret.second);
+ }
+
+ template<class It>
+ void insert(It first, It last) { base_set_.insert(first, last); }
+
+ size_t erase(const T &x) { return base_set_.erase(x); }
+
+ void clear() { base_set_.clear(); }
+
+ size_t size() const { return base_set_.size(); }
+
+ bool operator==(const set &rhs) const {
+ if (this->size() != rhs.size())
+ return false;
+
+ for (auto i = base_set_.begin(), j = rhs.base_set_.begin();
+ i != base_set_.end() && j != rhs.base_set_.end();
+ ++i, ++j) {
+ if (*i != *j)
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator!=(const set &rhs) const {
+ return !(*this == rhs);
+ }
+
+ template<class Comparator>
+ void Copy(std::set<T, Comparator> &container) const {
+ container.insert(base_set_.begin(), base_set_.end());
+ }
+
+private:
+ base_set_t base_set_;
+};
+
+
+template<class Key, class Value>
+struct map {
+ typedef Hash<typename Key::type> hash_t;
+ typedef std::unordered_map<Key, Value, hash_t> base_map_t;
+ typedef typename base_map_t::value_type value_type;
+
+ typedef iterator_wrapper<typename base_map_t::iterator> iterator;
+ typedef iterator_wrapper<typename base_map_t::const_iterator> const_iterator;
+
+public:
+ map()
+ : base_map_(10, hash_t()) {
+ }
+
+ template<class It>
+ map(It begin, It end)
+ : base_map_(begin, end, 10, hash_t()) {
+ }
+
+ const_iterator begin() const { return const_iterator(base_map_.begin()); }
+
+ const_iterator end() const { return const_iterator(base_map_.end()); }
+
+ iterator begin() { return iterator(base_map_.begin()); }
+
+ iterator end() { return iterator(base_map_.end()); }
+
+ const_iterator find(const Key &key) const {
+ return const_iterator(base_map_.find(key));
+ }
+
+ iterator find(const Key &key) { return iterator(base_map_.find(key)); }
+
+ size_t count(Key const &item) const { return base_map_.count(item); }
+
+ Value &operator[](Key const &x) { return base_map_[x]; }
+
+ std::pair<iterator, bool> insert(value_type const &value) {
+ std::pair<iterator, bool> ret = base_map_.insert(value);
+ return make_pair(iterator(ret.first), ret.second);
+ }
+
+ template<class It>
+ void insert(It first, It last) { base_map_.insert(first, last); }
+
+ size_t erase(Key const &x) { return base_map_.erase(x); }
+
+ void clear() { base_map_.clear(); }
+
+ size_t size() const { return base_map_.size(); }
+
+ bool operator==(const map &rhs) const {
+ if (size() != rhs.size())
+ return false;
+
+ for (auto i = base_map_.begin(), j = rhs.base_map_.begin();
+ i != base_map_.end() && j != rhs.base_map_.end();
+ ++i, ++j) {
+ if (*i != *j)
+ return false;
+ }
+
+ return true;
+ }
+
+ bool operator!=(const map &rhs) const {
+ return !(*this == rhs);
+ }
+
+ template<class Comparator>
+ void Copy(std::map<Key, Value, Comparator> &container) const {
+ container.insert(base_map_.begin(), base_map_.end());
+ }
+
+private:
+ base_map_t base_map_;
+};
+
+template<class T>
+std::ostream &operator<<(std::ostream &stream, const pure_pointer<T> &pointer) {
+ stream << pointer.int_id();
+ return stream;
+}
+
+} // namespace restricted
+
+namespace std {
+template<class T>
+struct hash<restricted::pure_pointer<T>> {
+ size_t operator()(const restricted::pure_pointer<T> &pointer) const {
+ return pointer.hash();
+ }
+};
+}
+
+template<class T, class Comparator>
+class PairComparator {
+private:
+ Comparator comparator_;
+public:
+ PairComparator(Comparator comparator) : comparator_(comparator) {
+ }
+
+ bool operator()(std::pair<T, T> a, std::pair<T, T> b) const {
+ return a.first == b.first ? comparator_(a.second, b.second) : comparator_(a.first, b.first);
+ }
+};
+
+//
+//template<typename T, class Comparator>
+//class MixedComparator {
+//private:
+// Comparator c1_;
+// Comparator c2_;
+//public:
+// MixedComparator(const Comparator &c1, const Comparator &c2) : c1_(c1), c2_(c2) {
+// }
+//
+// bool operator()(const T &a, const T &b) const {
+// if(c1_.IsAFAKE(a) || c1_.IsAFAKE(b)) {
+// if(c1_.IsAFAKEMin(a))
+// return !c1_.IsAFAKEMin(b);
+// if(c1_.IsAFAKEMax(b))
+// return c1_.IsAFAKEMax(a);
+// return false;
+// }
+// if(c1_.IsValidId(a) && c1_.IsValidId(b))
+// return c1_(a, b);
+// if(c1_.IsValidId(a))
+// return true;
+// if(c1_.IsValidId(b))
+// return false;
+// if(c2_.IsValidId(a) && c2_.IsValidId(b)) {
+// return c2_(a, b);
+// }
+// VERIFY(false);
+// return false;
+// }
+//
+// bool IsValidId(T element) {
+// return c1_.IsValid(element) || c2_.IsValid(element);
+// }
+//};
+
+template<class Container, class Comparator>
+class ContainerComparator {
+private:
+ Comparator comparator_;
+public:
+ ContainerComparator(const Comparator &comparator) : comparator_(comparator) {
+ }
+
+ bool operator()(const Container &a, const Container &b) const {
+ for (auto ita = a.begin(), itb = b.begin(); ita != a.end() && itb != b.end(); ++ita, ++itb) {
+ if (*ita != *itb)
+ return comparator_(*ita, *itb);
+ }
+ if (a.size() < b.size()) {
+ return true;
+ }
+ return false;
+ }
+
+};
+
diff --git a/src/common/assembly_graph/dijkstra/dijkstra_algorithm.hpp b/src/common/assembly_graph/dijkstra/dijkstra_algorithm.hpp
new file mode 100644
index 0000000..536e4ed
--- /dev/null
+++ b/src/common/assembly_graph/dijkstra/dijkstra_algorithm.hpp
@@ -0,0 +1,288 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "utils/simple_tools.hpp"
+#include "dijkstra_settings.hpp"
+
+#include <queue>
+#include <vector>
+#include <set>
+#include <map>
+
+namespace omnigraph {
+
+template<typename Graph, typename distance_t = size_t>
+struct element_t{
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ distance_t distance;
+ VertexId curr_vertex;
+ VertexId prev_vertex;
+ EdgeId edge_between;
+
+ element_t(distance_t new_distance, VertexId new_cur_vertex, VertexId new_prev_vertex,
+ EdgeId new_edge_between) : distance(new_distance), curr_vertex(new_cur_vertex),
+ prev_vertex(new_prev_vertex), edge_between(new_edge_between) { }
+};
+
+template<typename T>
+class ReverseDistanceComparator {
+public:
+ ReverseDistanceComparator() {
+ }
+
+ bool operator()(T obj1, T obj2){
+ if(obj1.distance != obj2.distance)
+ return obj2.distance < obj1.distance;
+ if(obj2.curr_vertex != obj1.curr_vertex)
+ return obj2.curr_vertex < obj1.curr_vertex;
+ if(obj2.prev_vertex != obj1.prev_vertex)
+ return obj2.prev_vertex < obj1.prev_vertex;
+ return obj2.edge_between < obj1.edge_between;
+ }
+};
+
+template<class Graph, class DijkstraSettings, typename distance_t = size_t>
+class Dijkstra {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef distance_t DistanceType;
+
+ typedef std::map<VertexId, distance_t> distances_map;
+ typedef typename distances_map::const_iterator distances_map_ci;
+ typedef typename std::priority_queue<element_t<Graph, distance_t>, std::vector<element_t<Graph, distance_t>>,
+ ReverseDistanceComparator<element_t<Graph, distance_t>>> queue_t;
+
+ // constructor parameters
+ const Graph& graph_;
+ DijkstraSettings settings_;
+ const size_t max_vertex_number_;
+
+ // changeable parameters
+ bool finished_;
+ size_t vertex_number_;
+ bool vertex_limit_exceeded_;
+
+ // accumulative structures
+ distances_map distances_;
+ std::set<VertexId> processed_vertices_;
+ std::map<VertexId, pair<VertexId, EdgeId>> prev_vert_map_;
+
+ void Init(VertexId start, queue_t &queue) {
+ vertex_number_ = 0;
+ distances_.clear();
+ processed_vertices_.clear();
+ prev_vert_map_.clear();
+ set_finished(false);
+ settings_.Init(start);
+ queue.push(element_t<Graph, distance_t>(0, start, VertexId(0), EdgeId(0)));
+ prev_vert_map_[start] = std::pair<VertexId, EdgeId>(VertexId(0), EdgeId(0));
+ }
+
+ void set_finished(bool state) {
+ finished_ = state;
+ }
+
+ bool CheckPutVertex(VertexId vertex, EdgeId edge, distance_t length) const {
+ return settings_.CheckPutVertex(vertex, edge, length);
+ }
+
+ bool CheckProcessVertex(VertexId vertex, distance_t distance) {
+ ++vertex_number_;
+ if (vertex_number_ > max_vertex_number_) {
+ vertex_limit_exceeded_ = true;
+ return false;
+ }
+ return (vertex_number_ < max_vertex_number_) && settings_.CheckProcessVertex(vertex, distance);
+ }
+
+ distance_t GetLength(EdgeId edge) const {
+ return settings_.GetLength(edge);
+ }
+
+ void AddNeighboursToQueue(VertexId cur_vertex, distance_t cur_dist, queue_t& queue) {
+ auto neigh_iterator = settings_.GetIterator(cur_vertex);
+ while (neigh_iterator.HasNext()) {
+ TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " started");
+ auto cur_pair = neigh_iterator.Next();
+ if (!DistanceCounted(cur_pair.vertex)) {
+ TRACE("Adding new entry to queue");
+ distance_t new_dist = GetLength(cur_pair.edge) + cur_dist;
+ TRACE("Entry: vertex " << graph_.str(cur_vertex) << " distance " << new_dist);
+ if (CheckPutVertex(cur_pair.vertex, cur_pair.edge, new_dist)) {
+ TRACE("CheckPutVertex returned true and new entry is added");
+ queue.push(element_t<Graph, distance_t>(new_dist, cur_pair.vertex,
+ cur_vertex, cur_pair.edge));
+ }
+ }
+ TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " finished");
+ }
+ TRACE("All neighbours of vertex " << graph_.str(cur_vertex) << " processed");
+ }
+
+public:
+ Dijkstra(const Graph &graph, DijkstraSettings settings, size_t max_vertex_number = size_t(-1)) :
+ graph_(graph),
+ settings_(settings),
+ max_vertex_number_(max_vertex_number),
+ finished_(false),
+ vertex_number_(0),
+ vertex_limit_exceeded_(false) {}
+
+ Dijkstra(Dijkstra&& /*other*/) = default;
+
+ Dijkstra& operator=(Dijkstra&& /*other*/) = default;
+
+ Dijkstra(const Dijkstra& /*other*/) = delete;
+
+ Dijkstra& operator=(const Dijkstra& /*other*/) = delete;
+
+ bool finished() const {
+ return finished_;
+ }
+
+ bool DistanceCounted(VertexId vertex) const {
+ return distances_.find(vertex) != distances_.end();
+ }
+
+ distance_t GetDistance(VertexId vertex) const {
+ VERIFY(DistanceCounted(vertex));
+ return distances_.find(vertex)->second;
+ }
+
+ std::pair<distances_map_ci, distances_map_ci> GetDistances() const {
+ distances_map_ci begin = distances_.begin();
+ distances_map_ci end = distances_.end();
+ return make_pair(begin, end);
+ }
+
+ void Run(VertexId start) {
+ TRACE("Starting dijkstra run from vertex " << graph_.str(start));
+ queue_t queue;
+ Init(start, queue);
+ TRACE("Priority queue initialized. Starting search");
+
+ while (!queue.empty() && !finished()) {
+ TRACE("Dijkstra iteration started");
+ const element_t<Graph, distance_t>& next = queue.top();
+ distance_t distance = next.distance;
+ VertexId vertex = next.curr_vertex;
+
+ prev_vert_map_[vertex] = std::pair<VertexId, EdgeId>(next.prev_vertex, next.edge_between);
+ queue.pop();
+ TRACE("Vertex " << graph_.str(vertex) << " with distance " << distance << " fetched from queue");
+
+ if (DistanceCounted(vertex)) {
+ TRACE("Distance to vertex " << graph_.str(vertex) << " already counted. Proceeding to next queue entry.");
+ continue;
+ }
+ distances_.insert(make_pair(vertex, distance));
+
+ TRACE("Vertex " << graph_.str(vertex) << " is found to be at distance "
+ << distance << " from vertex " << graph_.str(start));
+ if (!CheckProcessVertex(vertex, distance)) {
+ TRACE("Check for processing vertex failed. Proceeding to the next queue entry.");
+ continue;
+ }
+ processed_vertices_.insert(vertex);
+ AddNeighboursToQueue(vertex, distance, queue);
+ }
+ set_finished(true);
+ TRACE("Finished dijkstra run from vertex " << graph_.str(start));
+ }
+
+ std::vector<EdgeId> GetShortestPathTo(VertexId vertex) {
+ std::vector<EdgeId> path;
+ if (prev_vert_map_.find(vertex) == prev_vert_map_.end())
+ return path;
+
+ VertexId curr_vertex = vertex;
+ VertexId prev_vertex = get(prev_vert_map_, vertex).first;
+ EdgeId edge = get(prev_vert_map_, curr_vertex).second;
+
+ while (prev_vertex != VertexId(0)) {
+ if (graph_.EdgeStart(edge) == prev_vertex)
+ path.insert(path.begin(), edge);
+ else
+ path.push_back(edge);
+ curr_vertex = prev_vertex;
+ const auto& prev_v_e = get(prev_vert_map_, curr_vertex);
+ prev_vertex = prev_v_e.first;
+ edge = prev_v_e.second;
+ }
+ return path;
+ }
+
+ vector<VertexId> ReachedVertices() const {
+ vector<VertexId> result;
+ for (auto it = distances_.begin(); it != distances_.end(); ++it) {
+ result.push_back(it->first);
+ }
+ return result;
+ }
+
+ const set<VertexId>& ProcessedVertices() const {
+ return processed_vertices_;
+ }
+
+ bool VertexLimitExceeded() const {
+ return vertex_limit_exceeded_;
+ }
+
+private:
+ DECL_LOGGER("Dijkstra");
+};
+
+template<class Graph>
+class DistanceCounter {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph>> BaseDijkstraSettings;
+
+public:
+ DistanceCounter(const Graph& graph) :
+ graph_(graph),
+ dijkstra_(graph, BaseDijkstraSettings(
+ LengthCalculator<Graph>(),
+ VertexProcessChecker<Graph>(),
+ VertexPutChecker<Graph>(),
+ ForwardNeighbourIteratorFactory<Graph>())),
+ ready_(false) {
+ }
+
+ bool IsReachable(VertexId from, VertexId to) {
+ EnsureFrom(from);
+ return dijkstra_.DistanceCounted(to);
+ }
+
+ size_t Distance(VertexId from, VertexId to) {
+ EnsureFrom(from);
+ return dijkstra_.GetDistance(to);
+ }
+
+private:
+ void EnsureFrom(VertexId from) {
+ if (!ready_ || prev_ != from) {
+ dijkstra_.Run(from);
+ ready_ = true;
+ prev_ = from;
+ }
+ }
+
+ const Graph& graph_;
+ Dijkstra<Graph, BaseDijkstraSettings> dijkstra_;
+ VertexId prev_;
+ bool ready_;
+};
+
+}
diff --git a/src/common/assembly_graph/dijkstra/dijkstra_helper.hpp b/src/common/assembly_graph/dijkstra/dijkstra_helper.hpp
new file mode 100644
index 0000000..a912a31
--- /dev/null
+++ b/src/common/assembly_graph/dijkstra/dijkstra_helper.hpp
@@ -0,0 +1,163 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "dijkstra_algorithm.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+class DijkstraHelper {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > UnorientedDijkstra;
+
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ BackwardNeighbourIteratorFactory<Graph> > > BackwardDijkstra;
+
+ //------------------------------
+ // bounded dijkstra
+ //------------------------------
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph> > BoundedDijkstraSettings;
+
+ typedef Dijkstra<Graph, BoundedDijkstraSettings> BoundedDijkstra;
+
+ static BoundedDijkstra CreateBoundedDijkstra(const Graph &graph, size_t length_bound,
+ size_t max_vertex_number = -1ul){
+ return BoundedDijkstra(graph, BoundedDijkstraSettings(
+ LengthCalculator<Graph>(graph),
+ BoundProcessChecker<Graph>(length_bound),
+ BoundPutChecker<Graph>(length_bound),
+ ForwardNeighbourIteratorFactory<Graph>(graph)),
+ max_vertex_number);
+ }
+
+ //------------------------------
+ // bounded backward dijkstra
+ //------------------------------
+
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ BackwardNeighbourIteratorFactory<Graph> > BackwardBoundedDijkstraSettings;
+
+ typedef Dijkstra<Graph, BackwardBoundedDijkstraSettings> BackwardBoundedDijkstra;
+
+ static BackwardBoundedDijkstra CreateBackwardBoundedDijkstra(const Graph &graph,
+ size_t bound, size_t max_vertex_number = size_t(-1)){
+ return BackwardBoundedDijkstra(graph, BackwardBoundedDijkstraSettings(
+ LengthCalculator<Graph>(graph),
+ BoundProcessChecker<Graph>(bound),
+ BoundPutChecker<Graph>(bound),
+ BackwardNeighbourIteratorFactory<Graph>(graph)), max_vertex_number);
+ }
+
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ EdgeComponentPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > ComponentFinder;
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ ComponentLenCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > NeighbourhoodFinder;
+ //------------------------------
+
+ typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ VertexProcessChecker<Graph>,
+ SubgraphPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > > SubgraphDijkstra;
+
+ typedef ComposedDijkstraSettings<Graph,
+ PathIgnoringLengthCalculator<Graph>,
+ BoundProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph> > PathIgnoringDijkstraSettings;
+
+
+ //------------------------------
+ // short edge dijkstra settings
+ //------------------------------
+ typedef ComposedDijkstraSettings<Graph,
+ BoundedEdgeLenCalculator<Graph>,
+ ZeroLengthProcessChecker<Graph>,
+ VertexPutChecker<Graph>,
+ UnorientedNeighbourIteratorFactory<Graph> > ShortEdgeDijkstraSettings;
+
+ typedef Dijkstra<Graph, ShortEdgeDijkstraSettings> ShortEdgeDijkstra;
+
+ static ShortEdgeDijkstra CreateShortEdgeDijkstra(const Graph &graph, size_t edge_length_bound,
+ size_t max_vertex_number = size_t(-1)){
+ return ShortEdgeDijkstra(graph, ShortEdgeDijkstraSettings(
+ BoundedEdgeLenCalculator<Graph>(graph, edge_length_bound),
+ ZeroLengthProcessChecker<Graph>(),
+ VertexPutChecker<Graph>(),
+ UnorientedNeighbourIteratorFactory<Graph>(graph)),
+ max_vertex_number);
+ }
+
+ //------------------------------
+ // counting dijkstra
+ //------------------------------
+ typedef CountingDijkstraSettings<Graph,
+ UnorientedNeighbourIteratorFactory<Graph> > UnorientCountingDijkstraSettings;
+
+ typedef Dijkstra<Graph, UnorientCountingDijkstraSettings> CountingDijkstra;
+
+ static CountingDijkstra CreateCountingDijkstra(const Graph &graph, size_t max_size,
+ size_t edge_length_bound, size_t max_vertex_number = size_t(-1)){
+ return CountingDijkstra(graph, UnorientCountingDijkstraSettings(graph,
+ UnorientedNeighbourIteratorFactory<Graph>(graph),
+ max_size, edge_length_bound), max_vertex_number);
+ }
+
+
+ //------------------------------
+ // targeted bounded dijkstra
+ //------------------------------
+
+ typedef ComposedDijkstraSettings<Graph,
+ LengthCalculator<Graph>,
+ BoundedVertexTargetedProcessChecker<Graph>,
+ BoundPutChecker<Graph>,
+ ForwardNeighbourIteratorFactory<Graph> > TargetedBoundedDijkstraSettings;
+
+ typedef Dijkstra<Graph, TargetedBoundedDijkstraSettings> TargetedBoundedDijkstra;
+
+ static TargetedBoundedDijkstra CreateTargetedBoundedDijkstra(const Graph &graph,
+ VertexId target_vertex, size_t bound, size_t max_vertex_number = size_t(-1)){
+ return TargetedBoundedDijkstra(graph,
+ TargetedBoundedDijkstraSettings(LengthCalculator<Graph>(graph),
+ BoundedVertexTargetedProcessChecker<Graph>(target_vertex, bound),
+ BoundPutChecker<Graph>(bound),
+ ForwardNeighbourIteratorFactory<Graph>(graph)),
+ max_vertex_number);
+ }
+};
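+//Editor-added usage sketch: the most common pattern with these factories, a length-bounded
+//forward Dijkstra from a single vertex. Run, DistanceCounted, GetDistance and
+//VertexLimitExceeded are all part of Dijkstra as defined in dijkstra_algorithm.hpp; the
+//function name is hypothetical.
+template<class Graph>
+bool WithinDistance(const Graph &g, typename Graph::VertexId from,
+                    typename Graph::VertexId to, size_t length_bound) {
+    auto dijkstra = DijkstraHelper<Graph>::CreateBoundedDijkstra(g, length_bound);
+    dijkstra.Run(from);
+    if (dijkstra.VertexLimitExceeded())
+        return false;                          //search was truncated, result unreliable
+    return dijkstra.DistanceCounted(to) && dijkstra.GetDistance(to) <= length_bound;
+}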
+
+}
diff --git a/src/modules/algorithms/dijkstra/dijkstra_settings.hpp b/src/common/assembly_graph/dijkstra/dijkstra_settings.hpp
similarity index 100%
rename from src/modules/algorithms/dijkstra/dijkstra_settings.hpp
rename to src/common/assembly_graph/dijkstra/dijkstra_settings.hpp
diff --git a/src/common/assembly_graph/dijkstra/length_calculator.hpp b/src/common/assembly_graph/dijkstra/length_calculator.hpp
new file mode 100644
index 0000000..78fe439
--- /dev/null
+++ b/src/common/assembly_graph/dijkstra/length_calculator.hpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "utils/standard_base.hpp"
+
+namespace omnigraph {
+
+template<class Graph, typename distance_t = size_t>
+class LengthCalculator {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+protected:
+ const Graph &graph_;
+public:
+ LengthCalculator(const Graph &graph) : graph_(graph) { }
+ virtual distance_t GetLength(EdgeId edge) const{
+ return distance_t(graph_.length(edge));
+ }
+ virtual ~LengthCalculator() { }
+};
+
+template<class Graph, typename distance_t = size_t>
+class ComponentLenCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ set<EdgeId> &component_;
+public:
+ ComponentLenCalculator(const Graph &graph, set<EdgeId> &component) :
+ LengthCalculator<Graph, distance_t>(graph), component_(component) { }
+
+ distance_t GetLength(EdgeId edge) const{
+ if (component_.count(edge) != 0)
+ return 0;
+ return this->graph_.length(edge);
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class BoundedEdgeLenCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ distance_t bound_;
+public:
+ BoundedEdgeLenCalculator(const Graph &graph, distance_t bound) :
+ LengthCalculator<Graph, distance_t>(graph), bound_(bound) { }
+
+ distance_t GetLength(EdgeId edge) const{
+ if(this->graph_.length(edge) <= bound_)
+ return 0;
+ return 1;
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class AlongPathLengthCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef LengthCalculator<Graph, distance_t> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ set<VertexId> vertex_path_;
+ distance_t bound_;
+
+ set<VertexId> CollectVertices(vector<EdgeId> &edge_path){
+ set<VertexId> result;
+ for(auto e = edge_path.begin(); e != edge_path.end(); e++){
+ result.insert(this->graph_.EdgeStart(*e));
+ result.insert(this->graph_.EdgeEnd(*e));
+ }
+ return result;
+ }
+
+public:
+ AlongPathLengthCalculator(const Graph &graph, vector<EdgeId> &edge_path, distance_t bound) :
+ LengthCalculator<Graph, distance_t>(graph),
+ vertex_path_(CollectVertices(edge_path)),
+ bound_(bound) { }
+
+ distance_t GetLength(EdgeId edge) const{
+ if (vertex_path_.count(this->graph_.EdgeStart(edge))
+ && vertex_path_.count(this->graph_.EdgeEnd(edge)))
+ return min(int(base::GetLength(edge)), 200);
+ return base::GetLength(edge);
+ }
+};
+
+template<class Graph, typename distance_t = size_t>
+class PathIgnoringLengthCalculator : public LengthCalculator<Graph, distance_t> {
+ typedef LengthCalculator<Graph, distance_t> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ set<EdgeId> path_;
+ distance_t bound_;
+
+public:
+ PathIgnoringLengthCalculator(const Graph &graph, const vector<EdgeId> &edge_path) :
+ LengthCalculator<Graph, distance_t>(graph), path_(edge_path.begin(), edge_path.end())
+ { }
+
+ distance_t GetLength(EdgeId edge) const {
+ if (path_.find(edge) != path_.end()) {
+ return 0;
+ }
+ return base::GetLength(edge);
+ }
+};
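+//Editor-added sketch: the calculators above redefine edge weights for Dijkstra by overriding
+//GetLength(). A minimal custom calculator that counts hops instead of nucleotide length would
+//look like this (hypothetical class, not part of the upstream sources):
+template<class Graph, typename distance_t = size_t>
+class UnitLengthCalculator : public LengthCalculator<Graph, distance_t> {
+    typedef typename Graph::EdgeId EdgeId;
+public:
+    UnitLengthCalculator(const Graph &graph) : LengthCalculator<Graph, distance_t>(graph) { }
+
+    distance_t GetLength(EdgeId) const override {
+        return 1;                              //every edge costs one hop
+    }
+};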
+
+
+}
diff --git a/src/modules/algorithms/dijkstra/neighbours_iterator.hpp b/src/common/assembly_graph/dijkstra/neighbours_iterator.hpp
similarity index 100%
rename from src/modules/algorithms/dijkstra/neighbours_iterator.hpp
rename to src/common/assembly_graph/dijkstra/neighbours_iterator.hpp
diff --git a/src/modules/algorithms/dijkstra/vertex_process_checker.hpp b/src/common/assembly_graph/dijkstra/vertex_process_checker.hpp
similarity index 100%
rename from src/modules/algorithms/dijkstra/vertex_process_checker.hpp
rename to src/common/assembly_graph/dijkstra/vertex_process_checker.hpp
diff --git a/src/modules/algorithms/dijkstra/vertex_put_checker.hpp b/src/common/assembly_graph/dijkstra/vertex_put_checker.hpp
similarity index 100%
rename from src/modules/algorithms/dijkstra/vertex_put_checker.hpp
rename to src/common/assembly_graph/dijkstra/vertex_put_checker.hpp
diff --git a/src/common/assembly_graph/graph_support/basic_edge_conditions.hpp b/src/common/assembly_graph/graph_support/basic_edge_conditions.hpp
new file mode 100644
index 0000000..a32a2f3
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/basic_edge_conditions.hpp
@@ -0,0 +1,151 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "func/func.hpp"
+#include "func/pred.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
+#include "assembly_graph/core/directions.hpp"
+#include "assembly_graph/paths/path_finders.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+using EdgePredicate = func::TypedPredicate<typename Graph::EdgeId>;
+
+template<class Graph>
+class EdgeCondition : public func::AbstractPredicate<typename Graph::EdgeId> {
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &g_;
+protected:
+
+ EdgeCondition(const Graph &g)
+ : g_(g) {
+ }
+
+ const Graph &g() const {
+ return g_;
+ }
+
+};
+
+template<class Graph>
+class IsolatedEdgeCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ bool IsTerminalVertex(VertexId v) const {
+ return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
+ }
+
+public:
+ IsolatedEdgeCondition(const Graph &g) : base(g) {
+ }
+
+ bool Check(EdgeId e) const {
+ return IsTerminalVertex(this->g().EdgeStart(e)) && IsTerminalVertex(this->g().EdgeEnd(e));
+ }
+
+};
+
+template<class Graph>
+inline bool HasAlternatives(const Graph &g, typename Graph::EdgeId e) {
+ return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1
+ && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
+}
+
+
+template<class Graph>
+class AlternativesPresenceCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+public:
+
+ AlternativesPresenceCondition(const Graph &g)
+ : base(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ return HasAlternatives(this->g(), e);
+ }
+
+};
+
+template<class Graph>
+func::TypedPredicate<typename Graph::EdgeId> AddAlternativesPresenceCondition(const Graph &g,
+ func::TypedPredicate<typename Graph::EdgeId> condition) {
+ return func::And(AlternativesPresenceCondition<Graph>(g), condition);
+}
+
+
+template<class Graph>
+class CoverageUpperBound : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef EdgeCondition<Graph> base;
+ const double max_coverage_;
+
+public:
+
+ CoverageUpperBound(const Graph &g, double max_coverage)
+ : base(g),
+ max_coverage_(max_coverage) {
+ }
+
+ bool Check(EdgeId e) const {
+ return math::le(this->g().coverage(e), max_coverage_);
+ }
+
+};
+
+template<class Graph>
+class LengthUpperBound : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef EdgeCondition<Graph> base;
+
+ const size_t max_length_;
+
+public:
+
+ LengthUpperBound(const Graph &g, size_t max_length)
+ : base(g),
+ max_length_(max_length) {
+ }
+
+ bool Check(EdgeId e) const {
+ return this->g().length(e) <= max_length_;
+ }
+
+};
+
+template<class Graph>
+class SelfConjugateCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+public:
+
+ SelfConjugateCondition(const Graph& g)
+ : base(g) {
+ }
+
+ bool Check(EdgeId e) const {
+ return e == this->g().conjugate(e);
+ }
+
+private:
+ DECL_LOGGER("SelfConjugateCondition");
+};
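+//Editor-added sketch: conditions compose through func::And exactly as in
+//AddAlternativesPresenceCondition above; here, a predicate selecting short, low-coverage
+//edges (the helper name is hypothetical).
+template<class Graph>
+func::TypedPredicate<typename Graph::EdgeId>
+ShortLowCoveredEdgeCondition(const Graph &g, size_t max_length, double max_coverage) {
+    return func::And(LengthUpperBound<Graph>(g, max_length),
+                     CoverageUpperBound<Graph>(g, max_coverage));
+}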
+
+
+}
diff --git a/src/common/assembly_graph/graph_support/basic_vertex_conditions.hpp b/src/common/assembly_graph/graph_support/basic_vertex_conditions.hpp
new file mode 100644
index 0000000..c3e6427
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/basic_vertex_conditions.hpp
@@ -0,0 +1,66 @@
+#pragma once
+#include "func/pred.hpp"
+#include "func/func.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+class VertexCondition : public func::AbstractPredicate<typename Graph::VertexId> {
+ typedef typename Graph::VertexId VertexId;
+ const Graph &g_;
+protected:
+
+ VertexCondition(const Graph &g)
+ : g_(g) {
+ }
+
+ const Graph &g() const {
+ return g_;
+ }
+
+};
+
+template<class Graph>
+class CompressCondition : public VertexCondition<Graph> {
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ CompressCondition(const Graph &g) :
+ VertexCondition<Graph>(g) {
+ }
+
+ bool Check(VertexId v) const override {
+ return this->g().CanCompressVertex(v);
+ }
+};
+
+template<class Graph>
+class IsolatedVertexCondition : public VertexCondition<Graph> {
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ IsolatedVertexCondition(const Graph& g) :
+ VertexCondition<Graph>(g) {
+ }
+
+ bool Check(VertexId v) const override {
+ return this->g().IsDeadStart(v) && this->g().IsDeadEnd(v);
+ }
+};
+
+template<class Graph>
+class TerminalVertexCondition : public VertexCondition<Graph> {
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ TerminalVertexCondition(const Graph& g) :
+ VertexCondition<Graph>(g) {
+ }
+
+ bool Check(VertexId v) const override {
+ return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
+ }
+
+};
+
+}
\ No newline at end of file
diff --git a/src/modules/assembly_graph/graph_support/chimera_stats.hpp b/src/common/assembly_graph/graph_support/chimera_stats.hpp
similarity index 100%
rename from src/modules/assembly_graph/graph_support/chimera_stats.hpp
rename to src/common/assembly_graph/graph_support/chimera_stats.hpp
diff --git a/src/modules/assembly_graph/graph_support/comparators.hpp b/src/common/assembly_graph/graph_support/comparators.hpp
similarity index 100%
rename from src/modules/assembly_graph/graph_support/comparators.hpp
rename to src/common/assembly_graph/graph_support/comparators.hpp
diff --git a/src/common/assembly_graph/graph_support/contig_output.hpp b/src/common/assembly_graph/graph_support/contig_output.hpp
new file mode 100644
index 0000000..f3ef639
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/contig_output.hpp
@@ -0,0 +1,602 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/stats/picture_dump.hpp"
+#include <io/reads/osequencestream.hpp>
+#include "assembly_graph/components/connected_component.hpp"
+#include "assembly_graph/stats/statistics.hpp"
+#include "assembly_graph/paths/path_finders.hpp"
+#include "assembly_graph/paths/path_utils.hpp"
+
+namespace debruijn_graph {
+
+// This class corrects mismatches, masks repeat differences, and applies similar fixes to the sequence of an edge.
+template<class Graph>
+class ContigCorrector {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+protected:
+ const Graph &graph() const {
+ return graph_;
+ }
+
+public:
+ ContigCorrector(const Graph &graph) : graph_(graph) {
+ }
+
+ virtual string correct(EdgeId e) = 0;
+
+ virtual ~ContigCorrector() {
+ }
+};
+
+template<class Graph>
+class DefaultContigCorrector : public ContigCorrector<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+public:
+ DefaultContigCorrector(const Graph &graph) : ContigCorrector<Graph>(graph) {
+ }
+
+ string correct(EdgeId e) {
+ return this->graph().EdgeNucls(e).str();
+ }
+};
+
+
+class GFASegmentWriter {
+private:
+ std::ostream &ostream_;
+
+
+public:
+
+ GFASegmentWriter(std::ostream &stream) : ostream_(stream) {
+ }
+
+ void Write(size_t edge_id, const Sequence &seq, double cov) {
+ ostream_ << "S\t" << edge_id << "\t";
+ ostream_ << seq.str() << "\t";
+ ostream_ << "KC:i:" << int(cov) << std::endl;
+ }
+};
+
+class GFALinkWriter {
+private:
+ std::ostream &ostream_;
+ size_t overlap_size_;
+
+public:
+
+ GFALinkWriter(std::ostream &stream, size_t overlap_size) : ostream_(stream), overlap_size_(overlap_size) {
+ }
+
+ void Write(size_t first_segment, std::string &first_orientation, size_t second_segment, std::string &second_orientation) {
+ ostream_ << "L\t" << first_segment << "\t" << first_orientation << "\t" ;
+ ostream_ << second_segment << "\t" << second_orientation << "\t" << overlap_size_ << "M";
+ ostream_ << std::endl;
+
+ }
+};
+
+
+struct PathSegmentSequence {
+ size_t path_id_;
+ size_t segment_number_;
+ std::vector<std::string> segment_sequence_;
+ PathSegmentSequence(size_t path_id, std::vector<std::string> &segment_sequence)
+ : path_id_(path_id), segment_number_(1), segment_sequence_(segment_sequence) {
+ }
+
+ PathSegmentSequence()
+ : path_id_(0), segment_number_(1), segment_sequence_(){
+ }
+ void Reset() {
+ segment_sequence_.clear();
+ }
+};
+
+class GFAPathWriter {
+private:
+ std::ostream &ostream_;
+
+public:
+
+ GFAPathWriter(std::ostream &stream)
+ : ostream_(stream) {
+ }
+
+ void Write(const PathSegmentSequence &path_segment_sequence) {
+ ostream_ << "P" << "\t" ;
+ ostream_ << path_segment_sequence.path_id_ << "_" << path_segment_sequence.segment_number_ << "\t";
+ std::string delimeter = "";
+ for (size_t i = 0; i < path_segment_sequence.segment_sequence_.size(); ++i) {
+ ostream_ << delimeter << path_segment_sequence.segment_sequence_[i];
+ delimeter = ",";
+ }
+ ostream_ << "\t";
+ std::string delimeter2 = "";
+ for (size_t i = 0; i < path_segment_sequence.segment_sequence_.size(); ++i) {
+ ostream_ << delimeter2 << "*";
+ delimeter2 = ",";
+ }
+ ostream_ << std::endl;
+ }
+
+};
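+
+// For reference, a sketch of the records the three writers above emit
+// (tab-separated GFA lines; identifiers and values below are made up):
+//
+//   S <edge_id> <sequence> KC:i:<coverage>      e.g.  S  1  ACGT...  KC:i:150
+//   L <id1> <+/-> <id2> <+/-> <overlap>M        e.g.  L  1  +  2  -  55M
+//   P <path_id>_<segment_number> <id1+,id2-,...> *,*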
+
+template<class Graph>
+class GFAWriter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ const path_extend::PathContainer &paths_;
+ const string filename_;
+ std::set<EdgeId> set_of_authentic_edges_;
+
+ bool IsCanonical(EdgeId e) const {
+ if (e <= graph_.conjugate(e)) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ std::string GetOrientation(EdgeId e) const {
+ return IsCanonical(e) ? "+" : "-";
+ }
+
+ void WriteSegments(std::ofstream &stream) {
+ GFASegmentWriter segment_writer(stream);
+ for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ segment_writer.Write((*it).int_id(), graph_.EdgeNucls(*it), graph_.coverage(*it) * graph_.length(*it));
+ }
+ }
+
+ void WriteLinks(std::ofstream &stream) {
+ GFALinkWriter link_writer(stream, graph_.k());
+ for (auto it = graph_.SmartVertexBegin(); !it.IsEnd(); ++it) {
+ for (auto inc_edge : graph_.IncomingEdges(*it)) {
+ std::string orientation_first = GetOrientation(inc_edge);
+ size_t segment_first = IsCanonical(inc_edge) ? inc_edge.int_id() : graph_.conjugate(inc_edge).int_id();
+ for (auto out_edge : graph_.OutgoingEdges(*it)) {
+ size_t segment_second = IsCanonical(out_edge) ? out_edge.int_id() : graph_.conjugate(out_edge).int_id();
+ std::string orientation_second = GetOrientation(out_edge);
+ link_writer.Write(segment_first, orientation_first, segment_second, orientation_second);
+ }
+ }
+ }
+ }
+
+ void UpdateSegmentedPath(PathSegmentSequence &segmented_path, EdgeId e) {
+ std::string segment_id = IsCanonical(e) ? ToString(e.int_id()) : ToString(graph_.conjugate(e).int_id());
+ std::string orientation = GetOrientation(e);
+ segmented_path.segment_sequence_.push_back(segment_id + orientation);
+ }
+
+ void WritePaths(std::ofstream &stream) {
+ GFAPathWriter path_writer(stream);
+ for (const auto &path_pair : paths_) {
+ const path_extend::BidirectionalPath &p = (*path_pair.first);
+ if (p.Size() == 0) {
+ continue;
+ }
+ PathSegmentSequence segmented_path;
+ segmented_path.path_id_ = p.GetId();
+ for (size_t i = 0; i < p.Size() - 1; ++i) {
+ EdgeId e = p[i];
+ UpdateSegmentedPath(segmented_path, e);
+ if (graph_.EdgeEnd(e) != graph_.EdgeStart(p[i+1])) {
+ path_writer.Write(segmented_path);
+ segmented_path.segment_number_++;
+ segmented_path.Reset();
+ }
+ }
+ UpdateSegmentedPath(segmented_path, p.Back());
+ path_writer.Write(segmented_path);
+
+ }
+ }
+
+public:
+ GFAWriter(const Graph &graph, const path_extend::PathContainer &paths, const string &filename)
+ : graph_(graph), paths_(paths), filename_(filename) {
+ }
+
+ void Write() {
+ std::ofstream stream;
+ stream.open(filename_);
+ WriteSegments(stream);
+ WriteLinks(stream);
+ WritePaths(stream);
+ }
+};
+
+// This class uses corrected sequences to construct a contig (either returned as is, extended to its unipath, or trimmed).
+template<class Graph>
+class ContigConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ ContigCorrector<Graph> &corrector_;
+protected:
+ string correct(EdgeId e) {
+ return corrector_.correct(e);
+ }
+
+ const Graph &graph() const {
+ return graph_;
+ }
+
+public:
+
+ ContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : graph_(graph), corrector_(corrector) {
+ }
+
+ virtual pair<string, double> construct(EdgeId e) = 0;
+
+ virtual ~ContigConstructor(){
+ }
+};
+
+template<class Graph>
+class DefaultContigConstructor : public ContigConstructor<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+public:
+
+ DefaultContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
+ }
+
+ pair<string, double> construct(EdgeId e) {
+ return make_pair(this->correct(e), this->graph().coverage(e));
+ }
+};
+
+template<class Graph>
+vector<typename Graph::EdgeId> Unipath(const Graph& g, typename Graph::EdgeId e) {
+ omnigraph::UniquePathFinder<Graph> unipath_finder(g);
+ vector<typename Graph::EdgeId> answer = unipath_finder.UniquePathBackward(e);
+ const vector<typename Graph::EdgeId>& forward = unipath_finder.UniquePathForward(e);
+ for (size_t i = 1; i < forward.size(); ++i) {
+ answer.push_back(forward[i]);
+ }
+ return answer;
+}
+
+template<class Graph>
+class UnipathConstructor : public ContigConstructor<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+
+
+
+ string MergeOverlappingSequences(std::vector<string>& ss, size_t overlap) {
+ if (ss.empty()) {
+ return "";
+ }
+ stringstream result;
+ result << ss.front().substr(0, overlap);
+// prev_end = ss.front().substr(0, overlap);
+ for (auto it = ss.begin(); it != ss.end(); ++it) {
+// VERIFY(prev_end == it->substr(0, overlap));
+ result << it->substr(overlap);
+// prev_end = it->substr(it->size() - overlap);
+ }
+ return result.str();
+ }
+
+
+ string MergeSequences(const Graph& g,
+ const vector<typename Graph::EdgeId>& continuous_path) {
+ vector<string> path_sequences;
+ for (size_t i = 0; i < continuous_path.size(); ++i) {
+ if(i > 0)
+ VERIFY(
+ g.EdgeEnd(continuous_path[i - 1])
+ == g.EdgeStart(continuous_path[i]));
+ path_sequences.push_back(this->correct(continuous_path[i]));
+ }
+ return MergeOverlappingSequences(path_sequences, g.k());
+ }
+
+public:
+
+ UnipathConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
+ }
+
+ pair<string, double> construct(EdgeId e) {
+ vector<EdgeId> unipath = Unipath(this->graph(), e);
+ return make_pair(MergeSequences(this->graph(), unipath), stats::AvgCoverage(this->graph(), unipath));
+ }
+};
+
+template<class Graph>
+class CuttingContigConstructor : public ContigConstructor<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+
+ bool ShouldCut(VertexId v) const {
+ const Graph &g = this->graph();
+ vector<EdgeId> edges;
+ push_back_all(edges, g.OutgoingEdges(v));
+ if(edges.size() == 0)
+ return false;
+ for(size_t i = 1; i < edges.size(); i++) {
+ if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
+ return false;
+ }
+ edges.clear();
+ push_back_all(edges, g.IncomingEdges(v));
+ for(size_t i = 0; i < edges.size(); i++)
+ for(size_t j = i + 1; j < edges.size(); j++) {
+ if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
+ return true;
+ }
+ return false;
+ }
+
+public:
+
+ CuttingContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
+ }
+
+ pair<string, double> construct(EdgeId e) {
+ string result = this->correct(e);
+ if(result.size() > this->graph().k() && ShouldCut(this->graph().EdgeEnd(e))) {
+ result = result.substr(0, result.size() - this->graph().k());
+ }
+ if(result.size() > this->graph().k() && ShouldCut(this->graph().conjugate(this->graph().EdgeStart(e)))) {
+ result = result.substr(this->graph().k(), result.size());
+ }
+ return make_pair(result, this->graph().coverage(e));
+ }
+};
+
+struct ExtendedContigIdT {
+ string full_id_;
+ string short_id_;
+
+ ExtendedContigIdT(): full_id_(""), short_id_("") {}
+
+ ExtendedContigIdT(string full_id, string short_id): full_id_(full_id), short_id_(short_id) {}
+};
+
+template <class Graph>
+void MakeContigIdMap(const Graph& graph, map<EdgeId, ExtendedContigIdT>& ids, const ConnectedComponentCounter &cc_counter_, string prefix) {
+ int counter = 0;
+ for (auto it = graph.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ if (ids.count(e) == 0) {
+ string id;
+ if (cfg::get().pd) {
+ size_t c_id = cc_counter_.GetComponent(e);
+ id = io::MakeContigComponentId(++counter, graph.length(e) + graph.k(), graph.coverage(e), c_id, prefix);
+ }
+ else
+ id = io::MakeContigId(++counter, graph.length(e) + graph.k(), graph.coverage(e), prefix);
+ ids[e] = ExtendedContigIdT(id, ToString(counter) + "+");
+ if (e != graph.conjugate(e))
+ ids[graph.conjugate(e)] = ExtendedContigIdT(id + "'", ToString(counter) + "-");
+ }
+ }
+}
+
+template<class Graph>
+class ContigPrinter {
+private:
+ const Graph &graph_;
+ ContigConstructor<Graph> &constructor_;
+ template<class sequence_stream>
+ void ReportEdge(sequence_stream& oss
+ , const pair<string, double> sequence_data) {
+ oss << sequence_data.second;
+ oss << sequence_data.first;
+ }
+
+ void ReportEdge(io::osequencestream_for_fastg& oss,
+ const string& sequence,
+ const string& id,
+ const set<string>& nex_ids) {
+ oss.set_header(id);
+ oss << nex_ids;
+ oss << sequence;
+ }
+
+public:
+ ContigPrinter(const Graph &graph, ContigConstructor<Graph> &constructor) : graph_(graph), constructor_(constructor) {
+ }
+
+ template<class sequence_stream>
+ void PrintContigs(sequence_stream &os) {
+ for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ ReportEdge<sequence_stream>(os, constructor_.construct(*it));
+ }
+ }
+
+ template<class sequence_stream>
+ void PrintContigsFASTG(sequence_stream &os, const ConnectedComponentCounter & cc_counter) {
+ map<EdgeId, ExtendedContigIdT> ids;
+ MakeContigIdMap(graph_, ids, cc_counter, "EDGE");
+ for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ set<string> next;
+ VertexId v = graph_.EdgeEnd(*it);
+ auto edges = graph_.OutgoingEdges(v);
+ for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) {
+ next.insert(ids[*next_it].full_id_);
+ }
+ ReportEdge(os, constructor_.construct(*it).first, ids[*it].full_id_, next);
+ if (*it != graph_.conjugate(*it))
+ {
+ set<string> next_conj;
+ v = graph_.EdgeEnd(graph_.conjugate(*it));
+ edges = graph_.OutgoingEdges(v);
+ for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) {
+ next_conj.insert(ids[*next_it].full_id_);
+ }
+ ReportEdge(os, constructor_.construct(graph_.conjugate(*it)).first, ids[graph_.conjugate(*it)].full_id_, next_conj);
+ }
+ }
+ }
+};
+
+template<class Graph>
+bool PossibleECSimpleCheck(const Graph& g
+ , typename Graph::EdgeId e) {
+ return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1 && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
+}
+
+template<class Graph>
+void ReportEdge(io::osequencestream_cov& oss
+ , const Graph& g
+ , typename Graph::EdgeId e
+ , bool output_unipath = false
+ , size_t solid_edge_length_bound = 0) {
+ typedef typename Graph::EdgeId EdgeId;
+ if (!output_unipath || (PossibleECSimpleCheck(g, e) && g.length(e) <= solid_edge_length_bound)) {
+ TRACE("Outputting edge " << g.str(e) << " as single edge");
+ oss << g.coverage(e);
+ oss << g.EdgeNucls(e);
+ } else {
+ TRACE("Outputting edge " << g.str(e) << " as part of unipath");
+ vector<EdgeId> unipath = Unipath(g, e);
+ TRACE("Unipath is " << g.str(unipath));
+ oss << stats::AvgCoverage(g, unipath);
+ TRACE("Merged sequence is of length " << MergeSequences(g, unipath).size());
+ oss << MergeSequences(g, unipath);
+ }
+}
+
+inline void OutputContigs(ConjugateDeBruijnGraph &g, const string &contigs_output_filename, bool output_unipath) {
+ INFO("Outputting contigs to " << contigs_output_filename << ".fasta");
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
+ io::osequencestream_cov oss(contigs_output_filename + ".fasta");
+
+ if(!output_unipath) {
+ DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+
+ ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
+ } else {
+ UnipathConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+ ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
+ }
+
+// {
+// osequencestream_cov oss(contigs_output_filename);
+// set<ConjugateDeBruijnGraph::EdgeId> edges;
+// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+// if (edges.count(*it) == 0) {
+// ReportEdge(oss, g, *it, output_unipath, solid_edge_length_bound + ".oppa.fasta");
+// edges.insert(g.conjugate(*it));
+// }
+// // oss << g.EdgeNucls(*it);
+// }
+// DEBUG("Contigs written");
+// }
+// if(!output_unipath) {
+// OutputContigs(g, contigs_output_filename + ".2.fasta", true, solid_edge_length_bound);
+// }
+}
+
+inline void OutputContigsToGFA(ConjugateDeBruijnGraph &g, path_extend::PathContainer &paths, const string &contigs_output_filename) {
+ INFO("Outputting graph to " << contigs_output_filename << ".gfa");
+ GFAWriter<ConjugateDeBruijnGraph> writer(g, paths, contigs_output_filename + ".gfa");
+ writer.Write();
+}
+
+
+inline void OutputContigsToFASTG(ConjugateDeBruijnGraph& g,
+ const string& contigs_output_filename, const ConnectedComponentCounter & cc_counter) {
+
+ INFO("Outputting graph to " << contigs_output_filename << ".fastg");
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
+ DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+ io::osequencestream_for_fastg ossfg(contigs_output_filename + ".fastg");
+ ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigsFASTG(ossfg, cc_counter);
+}
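+
+// Illustrative call sequence for the helpers above (output name stems are
+// placeholders; 'paths' and 'cc_counter' come from the surrounding pipeline;
+// each function appends its own extension):
+//
+//   OutputContigs(g, output_dir + "contigs", false);
+//   OutputContigsToFASTG(g, output_dir + "assembly_graph", cc_counter);
+//   OutputContigsToGFA(g, paths, output_dir + "assembly_graph_with_scaffolds");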
+
+
+
+
+inline bool ShouldCut(ConjugateDeBruijnGraph& g, VertexId v) {
+ vector<EdgeId> edges;
+ push_back_all(edges, g.OutgoingEdges(v));
+
+ if(edges.size() == 0)
+ return false;
+ for(size_t i = 1; i < edges.size(); i++) {
+ if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
+ return false;
+ }
+ edges.clear();
+ push_back_all(edges, g.IncomingEdges(v));
+ for(size_t i = 0; i < edges.size(); i++)
+ for(size_t j = i + 1; j < edges.size(); j++) {
+ if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
+ return true;
+ }
+ return false;
+}
+
+inline void OutputCutContigs(ConjugateDeBruijnGraph& g,
+ const string& contigs_output_filename,
+ bool /*output_unipath*/ = false,
+ size_t /*solid_edge_length_bound*/ = 0) {
+ INFO("Outputting contigs to " << contigs_output_filename);
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
+ io::osequencestream_cov oss(contigs_output_filename);
+ CuttingContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
+
+// osequencestream_cov oss(contigs_output_filename);
+// set<ConjugateDeBruijnGraph::EdgeId> edges;
+// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+// EdgeId e = *it;
+// cout << g.length(e) << endl;
+// if (edges.count(e) == 0) {
+// Sequence s = g.EdgeNucls(e);
+// cout << s.size() << endl;
+// cout << "oppa " << ShouldCut(g, g.EdgeEnd(e)) << endl;
+// if(s.size() > g.k() && ShouldCut(g, g.EdgeEnd(e))) {
+// s = s.Subseq(0, s.size() - g.k());
+// cout << s.size() << endl;
+// }
+// cout << "oppa1 " << ShouldCut(g, g.conjugate(g.EdgeStart(e))) << endl;
+// if(s.size() > g.k() && ShouldCut(g, g.conjugate(g.EdgeStart(e)))) {
+// s = s.Subseq(g.k(), s.size());
+// cout << s.size() << endl;
+// }
+// oss << g.coverage(e);
+// oss << s;
+// edges.insert(g.conjugate(*it));
+// }
+// // oss << g.EdgeNucls(*it);
+// }
+}
+
+inline void OutputSingleFileContigs(ConjugateDeBruijnGraph& g,
+ const string& contigs_output_dir) {
+ INFO("Outputting contigs to " << contigs_output_dir);
+ int n = 0;
+ make_dir(contigs_output_dir);
+ char n_str[20];
+ set<ConjugateDeBruijnGraph::EdgeId> edges;
+ for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
+ if (edges.count(*it) == 0) {
+ sprintf(n_str, "%d.fa", n);
+ edges.insert(g.conjugate(*it));
+ io::osequencestream oss(contigs_output_dir + n_str);
+ oss << g.EdgeNucls(*it);
+ n++;
+ }
+ }
+ DEBUG("SingleFileContigs(Conjugate) written");
+}
+
+}
diff --git a/src/common/assembly_graph/graph_support/coverage_filling.hpp b/src/common/assembly_graph/graph_support/coverage_filling.hpp
new file mode 100644
index 0000000..ad2516e
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/coverage_filling.hpp
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "assembly_graph/core/coverage.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+
+namespace debruijn_graph {
+
+template<class StoringType>
+struct SimultaneousCoverageCollector {
+};
+
+template<>
+struct SimultaneousCoverageCollector<SimpleStoring> {
+ template<class SimultaneousCoverageFiller, class Info>
+ static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
+ filler.inc_coverage(edge_info);
+ }
+};
+
+template<>
+struct SimultaneousCoverageCollector<InvertableStoring> {
+ template<class SimultaneousCoverageFiller, class Info>
+ static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
+ filler.inc_coverage(edge_info);
+ filler.inc_coverage(edge_info.conjugate(filler.k()));
+ }
+};
+
+template<class Graph, class CountIndex>
+class SimultaneousCoverageFiller {
+ const Graph& g_;
+ const CountIndex& count_index_;
+ omnigraph::FlankingCoverage<Graph>& flanking_coverage_;
+ omnigraph::CoverageIndex<Graph>& coverage_index_;
+ typedef typename CountIndex::KmerPos Value;
+public:
+ SimultaneousCoverageFiller(const Graph& g, const CountIndex& count_index,
+ omnigraph::FlankingCoverage<Graph>& flanking_coverage,
+ omnigraph::CoverageIndex<Graph>& coverage_index) :
+ g_(g),
+ count_index_(count_index),
+ flanking_coverage_(flanking_coverage),
+ coverage_index_(coverage_index) {
+ }
+
+ size_t k() const {
+ return count_index_.k();
+ }
+
+ void inc_coverage(const Value &edge_info) {
+ coverage_index_.IncRawCoverage(edge_info.edge_id, edge_info.count);
+ if (edge_info.offset < flanking_coverage_.averaging_range()) {
+ flanking_coverage_.IncRawCoverage(edge_info.edge_id, edge_info.count);
+ }
+ }
+
+ void Fill() {
+ for (auto I = count_index_.value_cbegin(), E = count_index_.value_cend();
+ I != E; ++I) {
+ const auto& edge_info = *I;
+ //VERIFY(edge_info.valid());
+ if (edge_info.valid()) {
+ VERIFY(edge_info.edge_id.get() != NULL);
+ SimultaneousCoverageCollector<typename CountIndex::storing_type>::CollectCoverage(*this, edge_info);
+ } else {
+ VERIFY(edge_info.removed());
+ WARN("Duplicating k+1-mers in graph (known bug in construction)");
+ }
+ }
+ }
+};
+
+template<class Graph, class CountIndex>
+void FillCoverageAndFlanking(const CountIndex& count_index, Graph& g,
+ FlankingCoverage<Graph>& flanking_coverage) {
+ SimultaneousCoverageFiller<Graph, CountIndex> filler(g, count_index, flanking_coverage, g.coverage_index());
+ filler.Fill();
+}
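+
+// Sketch of the intended use: once the k-mer count index has been built during
+// graph construction, edge and flanking coverage are filled in a single pass
+// ('count_index', 'graph' and 'flanking_coverage' are assumed to exist):
+//
+//   FillCoverageAndFlanking(count_index, graph, flanking_coverage);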
+
+}
\ No newline at end of file
diff --git a/src/common/assembly_graph/graph_support/coverage_uniformity_analyzer.cpp b/src/common/assembly_graph/graph_support/coverage_uniformity_analyzer.cpp
new file mode 100644
index 0000000..b1bb38a
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/coverage_uniformity_analyzer.cpp
@@ -0,0 +1,70 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#include "coverage_uniformity_analyzer.hpp"
+namespace debruijn_graph {
+double CoverageUniformityAnalyzer::CountMedianCoverage() const{
+ vector <pair<double, size_t> > coverages;
+ size_t total_len = 0, short_len = 0, cur_len = 0;
+ for (auto iter = g_.ConstEdgeBegin(); ! iter.IsEnd(); ++iter){
+ if (g_.length(*iter) > length_bound_) {
+ coverages.push_back(make_pair(g_.coverage(*iter), g_.length(*iter)));
+ total_len += g_.length(*iter);
+ } else {
+ short_len += g_.length(*iter);
+ }
+ }
+ if (total_len == 0){
+ INFO("Median coverage detection failed, not enough long edges");
+ return -1.0;
+ }
+ std::sort(coverages.begin(), coverages.end());
+ size_t i = 0;
+ while (cur_len < total_len/2 && i <coverages.size()) {
+ cur_len += coverages[i].second;
+ i++;
+ }
+ INFO("Genomic coverage is " << coverages[i - 1].first << ", calculated over a length of " << size_t(double(total_len) * 0.5));
+ return coverages[i - 1].first;
+}
+
+std::pair<size_t, size_t> CoverageUniformityAnalyzer::TotalLengthsNearMedian(double allowed_variation, double median_coverage) const{
+ std::pair<size_t, size_t> res(0,0);
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.length(*iter) > length_bound_) {
+ if (g_.coverage(*iter) < median_coverage * (1 + allowed_variation) &&
+ g_.coverage(*iter) > median_coverage * (1 - allowed_variation)) {
+ res.first += g_.length(*iter);
+ } else {
+ res.second += g_.length(*iter);
+ }
+ }
+ }
+ return res;
+}
+
+size_t CoverageUniformityAnalyzer::TotalLongEdgeLength() const {
+ size_t res = 0;
+ for (auto iter = g_.ConstEdgeBegin(); ! iter.IsEnd(); ++iter){
+ if (g_.length(*iter) > length_bound_) {
+ res += g_.length(*iter);
+ }
+ }
+ return res;
+}
+
+double CoverageUniformityAnalyzer::UniformityFraction(double allowed_variation, double median_coverage) const {
+ std::pair<size_t, size_t> lengths = TotalLengthsNearMedian(allowed_variation, median_coverage);
+ size_t total_len = lengths.first + lengths.second;
+ if (total_len == 0) {
+ WARN("No edges longer than the length bound (" << length_bound_ << ")");
+ return 0;
+ }
+ return double(lengths.first) / double(total_len);
+}
+
+}
\ No newline at end of file
diff --git a/src/common/assembly_graph/graph_support/coverage_uniformity_analyzer.hpp b/src/common/assembly_graph/graph_support/coverage_uniformity_analyzer.hpp
new file mode 100644
index 0000000..016605f
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/coverage_uniformity_analyzer.hpp
@@ -0,0 +1,23 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+#include "assembly_graph/core/graph.hpp"
+
+namespace debruijn_graph {
+
+class CoverageUniformityAnalyzer {
+private:
+ const Graph& g_;
+ const size_t length_bound_;
+public:
+ CoverageUniformityAnalyzer(const Graph& g, const size_t length_bound): g_(g), length_bound_(length_bound){}
+ double CountMedianCoverage() const;
+ double UniformityFraction(double allowed_variation, double median_coverage) const;
+// first: total length inside [median * (1 - allowed_variation), median * (1 + allowed_variation)]; second: total length outside
+ std::pair<size_t, size_t> TotalLengthsNearMedian(double allowed_variation, double median_coverage) const;
+ size_t TotalLongEdgeLength() const;
+};
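+
+// Illustrative usage (length bound and allowed variation are example values):
+//
+//   CoverageUniformityAnalyzer analyzer(g, 1000);
+//   double median = analyzer.CountMedianCoverage();   // returns -1.0 if there are no long edges
+//   if (median > 0.0) {
+//       double uniform_fraction = analyzer.UniformityFraction(0.3, median);
+//   }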
+}
diff --git a/src/common/assembly_graph/graph_support/detail_coverage.hpp b/src/common/assembly_graph/graph_support/detail_coverage.hpp
new file mode 100644
index 0000000..15600e2
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/detail_coverage.hpp
@@ -0,0 +1,190 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/indices/perfect_hash_map.hpp"
+#include "assembly_graph/core/coverage.hpp"
+#include "assembly_graph/core/action_handlers.hpp"
+#include "utils/verify.hpp"
+#include <vector>
+#include <map>
+#include <set>
+#include <string>
+#include <iostream>
+#include <fstream>
+
+namespace omnigraph {
+
+template<class Graph>
+class FlankingCoverage : public omnigraph::GraphActionHandler<Graph> {
+ typedef omnigraph::GraphActionHandler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef pair<EdgeId, unsigned> Pos;
+
+ Graph& g_;
+ const size_t averaging_range_;
+
+ void SetRawCoverage(EdgeId e, unsigned cov) {
+ g_.data(e).set_flanking_coverage(cov);
+ }
+
+ unsigned RawCoverage(EdgeId e) const {
+ return g_.data(e).flanking_coverage();
+ }
+
+ size_t EdgeAveragingRange(EdgeId e) const {
+ return std::min(this->g().length(e), averaging_range_);
+ }
+
+ double AverageFlankingCoverage(EdgeId e) const {
+ return double(RawCoverage(e)) / double(EdgeAveragingRange(e));
+ }
+
+ unsigned InterpolateCoverage(EdgeId e, size_t l) const {
+ VERIFY(l <= averaging_range_);
+ VERIFY(l < g_.length(e));
+ return unsigned(math::round(AverageFlankingCoverage(e) * double(l)));
+ }
+
+ void SetCoverageSimilarToAverageFlanking(EdgeId target, EdgeId source) {
+ SetRawCoverage(target, unsigned(math::round(AverageFlankingCoverage(source) * double(EdgeAveragingRange(target)))));
+ }
+
+ void SetCoverageSimilarToAverageGlobal(EdgeId target, EdgeId source) {
+ SetRawCoverage(target, unsigned(math::round(g_.coverage(source) * double(EdgeAveragingRange(target)))));
+ }
+
+public:
+
+ //todo think about interactions with gap closer
+ FlankingCoverage(Graph& g, size_t averaging_range)
+ : base(g, "FlankingCoverage"), g_(g),
+ averaging_range_(averaging_range) {
+ }
+
+ size_t averaging_range() const {
+ return averaging_range_;
+ }
+
+ // left for saves compatibility and tests; remove later!
+ template<class CoverageIndex>
+ void Fill(const CoverageIndex& count_index) {
+ TRACE("Filling flanking coverage from index");
+
+ for (auto I = count_index.value_cbegin(), E = count_index.value_cend();
+ I != E; ++I) {
+ const auto& edge_info = *I;
+ EdgeId e = edge_info.edge_id;
+ unsigned offset = edge_info.offset;
+ unsigned count = edge_info.count;
+ VERIFY(edge_info.valid());
+ VERIFY(e.get() != NULL);
+ if (offset < averaging_range_) {
+ IncRawCoverage(e, count);
+ }
+ }
+ }
+
+ void IncRawCoverage(EdgeId e, unsigned count) {
+ g_.data(e).inc_flanking_coverage(count);
+ }
+
+ double CoverageOfStart(EdgeId e) const {
+ return AverageFlankingCoverage(e);
+ }
+
+ double CoverageOfEnd(EdgeId e) const {
+ return CoverageOfStart(this->g().conjugate(e));
+ }
+
+ virtual void HandleAdd(EdgeId /*e*/) {
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+// SetRawCoverage(new_edge, RawCoverage(old_edges.front()));
+ size_t kpomers_left = averaging_range_;
+ unsigned acc = 0;
+ for (EdgeId e : old_edges) {
+ if (kpomers_left >= g_.length(e)) {
+ acc += RawCoverage(e);
+ kpomers_left -= g_.length(e);
+ } else {
+ if (kpomers_left != 0)
+ acc += InterpolateCoverage(e, kpomers_left);
+ break;
+ }
+ }
+ SetRawCoverage(new_edge, acc);
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId new_edge_2) {
+ //todo maybe improve later
+ SetCoverageSimilarToAverageFlanking(new_edge_1, old_edge);
+ SetCoverageSimilarToAverageGlobal(new_edge_2, old_edge);
+ if (old_edge == g_.conjugate(old_edge)) {
+ SetCoverageSimilarToAverageGlobal(g_.conjugate(new_edge_1), old_edge);
+ }
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ SetRawCoverage(e, 0);
+ }
+
+ double LocalCoverage(EdgeId e, VertexId v) const {
+ if (this->g().EdgeStart(e) == v) {
+ return GetInCov(e);
+ } else if (this->g().EdgeEnd(e) == v) {
+ return GetOutCov(e);
+ } else {
+ VERIFY(false);
+ return 0.0;
+ }
+ }
+
+ //left for compatibility
+ //todo rename
+ double GetInCov(EdgeId e) const {
+ return CoverageOfStart(e);
+ }
+
+ //left for compatibility
+ //todo rename
+ double GetOutCov(EdgeId e) const {
+ return CoverageOfEnd(e);
+ }
+
+ //////////////////////////
+
+ void Save(EdgeId e, ostream& out) const {
+ out << RawCoverage(e);
+ }
+
+ void Load(EdgeId e, istream& in) {
+ unsigned cov;
+ in >> cov;
+ SetRawCoverage(e, cov);
+ }
+
+ /*
+ * Is thread safe if different threads process different edges.
+ */
+ bool IsThreadSafe() const {
+ return true;
+ }
+
+private:
+ DECL_LOGGER("FlankingCoverage");
+};
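+
+// Sketch of typical queries against this handler (e is any EdgeId; the handler
+// is filled either via Fill() or through SimultaneousCoverageFiller):
+//
+//   FlankingCoverage<Graph> flanking_cov(g, averaging_range);
+//   double start_cov = flanking_cov.CoverageOfStart(e);
+//   double end_cov = flanking_cov.CoverageOfEnd(e);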
+
+}
diff --git a/src/common/assembly_graph/graph_support/edge_removal.hpp b/src/common/assembly_graph/graph_support/edge_removal.hpp
new file mode 100644
index 0000000..e4fbe75
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/edge_removal.hpp
@@ -0,0 +1,172 @@
+#pragma once
+#include "utils/logger/logger.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+void RemoveIsolatedOrCompress(Graph& g, typename Graph::VertexId v) {
+ if (g.IsDeadStart(v) && g.IsDeadEnd(v)) {
+ g.DeleteVertex(v);
+ } else {
+ g.CompressVertex(v);
+ }
+}
+
+template<class Graph>
+class EdgeRemover {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(EdgeId)> HandlerF;
+
+ Graph& g_;
+ HandlerF removal_handler_;
+
+public:
+ EdgeRemover(Graph& g, HandlerF removal_handler = nullptr)
+ : g_(g),
+ removal_handler_(removal_handler) {
+ }
+
+ void DeleteEdge(EdgeId e) {
+ VertexId start = g_.EdgeStart(e);
+ VertexId end = g_.EdgeEnd(e);
+ DeleteEdgeNoCompress(e);
+ // NOTE: e here is already dead!
+ TRACE("Compressing locality");
+ if (!g_.RelatedVertices(start, end)) {
+ TRACE("Vertices not related");
+ TRACE("Processing end");
+ RemoveIsolatedOrCompress(g_, end);
+ TRACE("End processed");
+ }
+ TRACE("Processing start");
+ RemoveIsolatedOrCompress(g_, start);
+ TRACE("Start processed");
+ }
+
+ void DeleteEdgeNoCompress(EdgeId e) {
+ TRACE("Deletion of edge " << g_.str(e));
+ TRACE("Start " << g_.str(g_.EdgeStart(e)));
+ TRACE("End " << g_.str(g_.EdgeEnd(e)));
+ if (removal_handler_) {
+ TRACE("Calling handler");
+ removal_handler_(e);
+ }
+ TRACE("Deleting edge");
+ g_.DeleteEdge(e);
+ }
+
+ void DeleteEdgeOptCompress(EdgeId e, bool compress) {
+ if (compress)
+ DeleteEdge(e);
+ else
+ DeleteEdgeNoCompress(e);
+ }
+
+private:
+ DECL_LOGGER("EdgeRemover");
+};
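+
+// Minimal sketch (the removal handler is optional and defaults to nullptr):
+//
+//   EdgeRemover<Graph> remover(g, removal_handler);
+//   remover.DeleteEdge(e);             // removes e and compresses/cleans its endpoints
+//   remover.DeleteEdgeNoCompress(e2);  // removes e2 and leaves its vertices untouched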
+
+//todo rewrite with SmartSetIterator
+template<class Graph>
+class ComponentRemover {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(const std::set<EdgeId>&)> HandlerF;
+
+private:
+ Graph& g_;
+ HandlerF removal_handler_;
+
+ template<class ElemType>
+ void InsertIfNotConjugate(std::set<ElemType>& elems, ElemType elem) {
+ if (elems.count(g_.conjugate(elem)) == 0) {
+ elems.insert(elem);
+ }
+ }
+
+public:
+ ComponentRemover(Graph& g, HandlerF removal_handler = 0)
+ : g_(g),
+ removal_handler_(removal_handler) {
+ }
+
+ template<class EdgeIt>
+ void DeleteComponent(EdgeIt begin, EdgeIt end, bool alter_vertices = true) {
+ using std::set;
+ set<EdgeId> edges;
+ set<VertexId> vertices;
+
+ //cleaning conjugates and gathering vertices
+ for (EdgeIt it = begin; it != end; ++it) {
+ EdgeId e = *it;
+ InsertIfNotConjugate(edges, e);
+ InsertIfNotConjugate(vertices, g_.EdgeStart(e));
+ InsertIfNotConjugate(vertices, g_.EdgeEnd(e));
+ }
+
+ if (removal_handler_) {
+ removal_handler_(edges);
+ }
+
+ for (EdgeId e: edges) {
+ g_.DeleteEdge(e);
+ }
+
+ if (alter_vertices) {
+ for (VertexId v: vertices) {
+ RemoveIsolatedOrCompress(g_, v);
+ }
+ }
+ }
+
+ template<class Container>
+ void DeleteComponent(const Container& container, bool alter_vertices = true) {
+ DeleteComponent(container.begin(), container.end(), alter_vertices);
+ }
+
+};
+
+// Removes the first 'trim_len' (k+1)-mers of a graph edge, disconnecting it from its starting vertex.
+// If the whole edge gets removed, its end vertex is compressed even when the "compress = false" parameter is passed.
+template<class Graph>
+class EdgeDisconnector {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ Graph& g_;
+ EdgeRemover<Graph> edge_remover_;
+ const size_t trim_len_;
+ typedef std::function<void(EdgeId)> HandlerF;
+
+public:
+ EdgeDisconnector(Graph& g,
+ HandlerF removal_handler = nullptr,
+ size_t trim_len = 1):
+ g_(g),
+ edge_remover_(g, removal_handler),
+ trim_len_(trim_len) {
+ VERIFY(trim_len_ > 0);
+ }
+
+ EdgeId operator()(EdgeId e, bool compress = true) {
+ if (g_.length(e) <= trim_len_
+ || (e == g_.conjugate(e) && g_.length(e) <= 2 * trim_len_)) {
+ VertexId start = g_.EdgeStart(e);
+ VertexId end = g_.EdgeEnd(e);
+ edge_remover_.DeleteEdgeOptCompress(e, compress);
+ if (!compress && !g_.RelatedVertices(start, end)) {
+ TRACE("Processing end");
+ RemoveIsolatedOrCompress(g_, end);
+ TRACE("End processed");
+ }
+ return EdgeId(0);
+ } else {
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, trim_len_);
+ edge_remover_.DeleteEdgeOptCompress(split_res.first, compress);
+ return split_res.second;
+ }
+ }
+};
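+
+// Sketch: trimming trim_len (k+1)-mers off the start of an edge. A returned
+// EdgeId(0) signals that the whole edge was removed instead of being split:
+//
+//   EdgeDisconnector<Graph> disconnector(g, removal_handler, /*trim_len*/ 1);
+//   EdgeId rest = disconnector(e);
+//   if (rest != EdgeId(0)) {
+//       // rest is what remains of e, now disconnected from its former start vertex
+//   }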
+
+}
diff --git a/src/common/assembly_graph/graph_support/genomic_quality.hpp b/src/common/assembly_graph/graph_support/genomic_quality.hpp
new file mode 100644
index 0000000..608d120
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/genomic_quality.hpp
@@ -0,0 +1,555 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "visualization/visualization.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
+#include "assembly_graph/core/action_handlers.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+class EdgeQuality: public visualization::graph_labeler::GraphLabeler<Graph>, public omnigraph::GraphActionHandler<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ map<EdgeId, size_t> quality_;
+ size_t k_;
+
+ template<class Index>
+ void FillQuality(const Index &index
+ , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
+ if (genome.size() < k_)
+ return;
+ RtSeq cur = genome.start<RtSeq>(k_);
+ cur >>= 0;
+ for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
+ cur <<= genome[i + k_ - 1];
+ auto corr_cur = kmer_mapper.Substitute(cur);
+ if (index.contains(corr_cur)) {
+ quality_[index.get(corr_cur).first]++;
+ }
+ }
+ }
+
+public:
+
+ template<class Index>
+ void Fill(const Index &index
+ , const KmerMapper<Graph>& kmer_mapper
+ , const Sequence &genome) {
+ DEBUG("Filling quality values");
+ FillQuality(index, kmer_mapper, genome);
+ FillQuality(index, kmer_mapper, !genome);
+ DEBUG(quality_.size() << " edges have non-zero quality");
+ }
+
+ EdgeQuality(const Graph &graph) :
+ omnigraph::GraphActionHandler<Graph>(graph, "EdgeQuality"),
+ k_(graph.k() + 1) {
+ }
+
+ virtual void HandleAdd(EdgeId /*e*/) {
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ quality_.erase(e);
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+ size_t res = 0;
+ for (size_t i = 0; i < old_edges.size(); i++) {
+ res += quality_[old_edges[i]];
+ }
+ quality_[new_edge] += res;
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ quality_[new_edge] += quality_[edge2];
+ quality_[new_edge] += quality_[edge1];
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
+ EdgeId new_edge2) {
+ if (old_edge == this->g().conjugate(old_edge)) {
+ WARN("EdgeQuality does not support self-conjugate splits");
+ return;
+ }
+ VERIFY(old_edge != this->g().conjugate(old_edge));
+ quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
+ / (this->g().length(new_edge1) + this->g().length(new_edge2));
+ quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
+ / (this->g().length(new_edge1) + this->g().length(new_edge2));
+ }
+
+ double quality(EdgeId edge) const {
+ auto it = quality_.find(edge);
+ if (it == quality_.end())
+ return 0.;
+ else
+ return 1. * (double) it->second / (double) this->g().length(edge);
+ }
+
+ bool IsPositiveQuality(EdgeId edge) const {
+ return math::gr(quality(edge), 0.);
+ }
+
+ bool IsZeroQuality(EdgeId edge) const {
+ return math::eq(quality(edge), 0.);
+ }
+
+ virtual std::string label(VertexId /*vertexId*/) const {
+ return "";
+ }
+
+ virtual std::string label(EdgeId edge) const {
+ double q = quality(edge);
+ return (q == 0) ? "" : "quality: " + ToString(q);
+ }
+
+ void clear() {
+ quality_.clear();
+ }
+
+private:
+ DECL_LOGGER("EdgeQuality");
+};
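+
+// Sketch of debug usage against a known reference genome ('index', 'kmer_mapper'
+// and 'genome' are assumed to come from the surrounding graph pack):
+//
+//   EdgeQuality<Graph> edge_quality(g);
+//   edge_quality.Fill(index, kmer_mapper, genome);
+//   if (edge_quality.IsZeroQuality(e)) {
+//       // e is not supported by the reference
+//   }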
+
+template<class Graph>
+class QualityLoggingRemovalHandler {
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+ const EdgeQuality<Graph>& quality_handler_;
+ size_t black_removed_;
+ size_t total_;
+ bool handle_all_;
+
+ virtual void HandlePositiveQuality(EdgeId /*e*/) {
+
+ }
+
+public:
+ QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph>& quality_handler,
+ bool handle_all = false) :
+ g_(g), quality_handler_(quality_handler), black_removed_(0), total_(0), handle_all_(handle_all) {
+ }
+
+ void HandleDelete(EdgeId e) {
+ total_++;
+ if (handle_all_ || math::gr(quality_handler_.quality(e), 0.)) {
+ TRACE("Deleting good edge id = " << g_.int_id(e)
+ << "; length = " << g_.length(e)
+ << "; quality = " << quality_handler_.quality(e)
+ << "; cov = " << g_.coverage(e));
+ HandlePositiveQuality(e);
+ } else {
+ black_removed_++;
+ }
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ const EdgeQuality<Graph>& quality_handler() const {
+ return quality_handler_;
+ }
+
+ virtual ~QualityLoggingRemovalHandler() {
+ TRACE("Overall stats: total removed = " << total_
+ << "; bad removed = " << black_removed_
+ << "; good removed = " << total_ - black_removed_);
+ }
+
+private:
+ DECL_LOGGER("QualityLoggingRemovalHandler");
+};
+
+template<class Graph>
+class QualityEdgeLocalityPrintingRH : public QualityLoggingRemovalHandler<Graph> {
+ typedef QualityLoggingRemovalHandler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ visualization::visualization_utils::LocalityPrintingRH<Graph> printing_rh_;
+public:
+ QualityEdgeLocalityPrintingRH(const Graph& g
+ , const EdgeQuality<Graph>& quality_handler
+ , const visualization::graph_labeler::GraphLabeler<Graph>& labeler
+ , std::shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> colorer
+ , const string& output_folder, bool handle_all = false) :
+ base(g, quality_handler, handle_all),
+ printing_rh_(g, labeler, colorer, output_folder)
+ {}
+
+ virtual void HandlePositiveQuality(EdgeId e) {
+ printing_rh_.HandleDelete(e, "_" + ToString(this->quality_handler().quality(e)));
+ }
+
+private:
+ DECL_LOGGER("QualityEdgeLocalityPrintingRH");
+};
+
+//earlier version from rel_cov branch
+//template<class Graph>
+//class EdgeNeighborhoodFinder: public omnigraph::GraphSplitter<Graph> {
+//private:
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// EdgeId edge_;
+// size_t max_size_;
+// size_t edge_length_bound_;
+// bool finished_;
+//public:
+// EdgeNeighborhoodFinder(const Graph &graph, EdgeId edge, size_t max_size
+// , size_t edge_length_bound) :
+// GraphSplitter<Graph>(graph), edge_(edge), max_size_(
+// max_size), edge_length_bound_(edge_length_bound), finished_(
+// false) {
+// }
+//
+// GraphComponent<Graph> NextComponent() {
+// CountingDijkstra<Graph> cf(this->graph(), max_size_,
+// edge_length_bound_);
+// set<VertexId> result_set;
+// cf.run(this->graph().EdgeStart(edge_));
+// vector<VertexId> result_start = cf.ReachedVertices();
+// result_set.insert(result_start.begin(), result_start.end());
+// cf.run(this->graph().EdgeEnd(edge_));
+// vector<VertexId> result_end = cf.ReachedVertices();
+// result_set.insert(result_end.begin(), result_end.end());
+//
+// ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
+// cc.CloseComponent(result_set);
+//
+// finished_ = true;
+// return GraphComponent<Graph>(this->graph(), result_set.begin(), result_set.end());
+// }
+//
+// /*virtual*/ bool Finished() {
+// return finished_;
+// }
+//};
+//
+//template<class Graph>
+//class EdgeLocalityPrintingRH {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// const Graph& g_;
+// const GraphLabeler<Graph>& labeler_;
+// const string& output_folder_;
+// std::function<double (EdgeId)>& quality_f_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// EdgeLocalityPrintingRH(const Graph& g
+// , const GraphLabeler<Graph>& labeler
+// , const string& output_folder
+// , std::function<double (EdgeId)> quality_f = 0) :
+// g_(g),
+// labeler_(labeler), output_folder_(output_folder),
+// quality_f_(quality_f){
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// TRACE("Deleting edge " << g_.str(edge));
+// if (quality_f_ && math::gr(quality_f_(edge), 0.))
+// INFO("EdgeLocalityPrintRH handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
+//
+// string folder = output_folder_ + "edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+// map<EdgeId, string> empty_coloring;
+// visualization::visualization_utils::WriteComponent(g_, EdgeNeighborhood<Graph>(g_, edge, 50, 250),
+// folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
+// }
+//
+//private:
+// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
+// ;
+//};
+
+//template<class Graph, class Index>
+//class EdgeQuality: public GraphLabeler<Graph>, public GraphActionHandler<Graph> {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// map<EdgeId, size_t> quality_;
+// size_t k_;
+//
+//public:
+//
+// void FillQuality(const Index &index
+// , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
+// if (genome.size() < k_)
+// return;
+// RtSeq cur = genome.start<RtSeq>(k_);
+// cur >>= 0;
+// for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
+// cur <<= genome[i + k_ - 1];
+// auto corr_cur = kmer_mapper.Substitute(cur);
+// if (index.contains(corr_cur)) {
+// quality_[index.get(corr_cur).first]++;
+// }
+// }
+// }
+//
+// EdgeQuality(const Graph &graph, const Index &index,
+// const KmerMapper<Graph>& kmer_mapper,
+// const Sequence &genome) :
+//
+// GraphActionHandler<Graph>(graph, "EdgeQualityLabeler"),
+// k_(kmer_mapper.get_k()) {
+// FillQuality(index, kmer_mapper, genome);
+// FillQuality(index, kmer_mapper, !genome);
+// }
+//
+// virtual ~EdgeQuality() {
+// }
+//
+// virtual void HandleAdd(EdgeId /*e*/) {
+// }
+//
+// virtual void HandleDelete(EdgeId e) {
+// quality_.erase(e);
+// }
+//
+// virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
+// size_t res = 0;
+// for (size_t i = 0; i < old_edges.size(); i++) {
+// res += quality_[old_edges[i]];
+// }
+// quality_[new_edge] += res;
+// }
+//
+// virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+// quality_[new_edge] += quality_[edge2];
+// quality_[new_edge] += quality_[edge1];
+// }
+//
+// virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
+// EdgeId new_edge2) {
+// quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
+// / (this->g().length(new_edge1) + this->g().length(new_edge2));
+// quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
+// / (this->g().length(new_edge1) + this->g().length(new_edge2));
+// }
+//
+// double quality(EdgeId edge) const {
+// auto it = quality_.find(edge);
+// if (it == quality_.end())
+// return 0.;
+// else
+// return 1. * (double) it->second / (double) this->g().length(edge);
+// }
+//
+// bool IsPositiveQuality(EdgeId edge) const {
+// return math::gr(quality(edge), 0.);
+// }
+//
+// virtual std::string label(VertexId /*vertexId*/) const {
+// return "";
+// }
+//
+// virtual std::string label(EdgeId edge) const {
+// double q = quality(edge);
+// return (q == 0) ? "" : "quality: " + ToString(q);
+// }
+//
+//};
+//
+//template<class Graph, class Index>
+//class QualityLoggingRemovalHandler {
+// typedef typename Graph::EdgeId EdgeId;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
+// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
+// 0)*/{
+//
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// if (math::gr(quality_handler_.quality(edge), 0.)) {
+// TRACE("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
+// } else {
+//// TRACE("Deleting edge " << g_.int_id(edge) << " with zero quality");
+// }
+//// if (math::gr(quality_handler_.quality(edge), 0.))
+//// colored_removed_++;
+//// else
+//// black_removed_++;
+// }
+//
+//private:
+// DECL_LOGGER("QualityLoggingRemovalHandler")
+// ;
+//};
+//
+//template<class Graph, class Index>
+//class QualityLoggingRemovalCountHandler {
+// typedef typename Graph::EdgeId EdgeId;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+// size_t black_removed_;
+// size_t total;
+//
+//public:
+// QualityLoggingRemovalCountHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
+// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
+// 0)*/{
+// black_removed_ = 0;
+// total = 0;
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// total++;
+// if (math::gr(quality_handler_.quality(edge), 0.)) {
+// TRACE("Deleting good edge " << g_.int_id(edge) << " with quality " << quality_handler_.quality(edge) << " cov " << g_.coverage(edge) << " length " << g_.length(edge));
+// }else{
+// black_removed_++;
+// }
+// if ((total % (1<<10)) != 0)
+// TRACE("Removed still " << black_removed_ << " " << total);
+// }
+//
+//private:
+//};
+//
+//template<class Graph, class Index>
+//class QualityEdgeLocalityPrintingRH {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+// const omnigraph::GraphLabeler<Graph>& labeler_;
+// const visualization::graph_colorer::GraphColorer<Graph>& colorer_;
+// const string& output_folder_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// QualityEdgeLocalityPrintingRH(const Graph& g
+// , const EdgeQuality<Graph, Index>& quality_handler
+// , const visualization::graph_labeler::GraphLabeler<Graph>& labeler
+// , const visualization::graph_colorer::GraphColorer<Graph>& colorer
+// , const string& output_folder) :
+// g_(g), quality_handler_(quality_handler),
+// labeler_(labeler), colorer_(colorer), output_folder_(output_folder){
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// if (quality_handler_.IsPositiveQuality(edge)) {
+// DEBUG("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
+// string folder = output_folder_ + "colored_edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+//// map<EdgeId, string> empty_coloring;
+// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
+// visualization::visualization_utils::WriteComponents(g_, *splitter/*, "locality_of_edge_" + ToString(g_.int_id(edge))*/
+// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
+// , colorer_, labeler_);
+// } else {
+// TRACE("Deleting edge " << g_.str(edge) << " with zero quality");
+// }
+// }
+//
+//private:
+// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
+// ;
+//};
+//
+//template<class Graph, class Index>
+//class QualityPairInfoHandler {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// typedef omnigraph::PairInfo<EdgeId> PairInfo;
+// typedef vector<PairInfo> PairInfos;
+// const Graph& g_;
+// const EdgeQuality<Graph, Index>& quality_handler_;
+// const GraphLabeler<Graph>& labeler_;
+// const string& output_folder_;
+// const PairedInfoIndex<ConjugateDeBruijnGraph>& index_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// QualityPairInfoHandler(const Graph& g
+// , const EdgeQuality<Graph, Index>& quality_handler
+// , const GraphLabeler<Graph>& labeler
+// , const string& output_folder
+// , const PairedInfoIndex<ConjugateDeBruijnGraph>& index) :
+// g_(g), quality_handler_(quality_handler),
+// labeler_(labeler), output_folder_(output_folder), index_(index) {
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// if (quality_handler_.IsPositiveQuality(edge)) {
+// cout << "Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge) << endl;
+// string folder = output_folder_ + "colored_edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+// PairInfos infos = index_.GetEdgeInfo(edge);
+// if (infos.size() > 0){
+// for (size_t i = 0; i<infos.size(); i++){
+// cout << "Tip Info " << g_.int_id(infos[i].first) << " " << g_.int_id(infos[i].second) << " " << infos[i].d << " " << infos[i].weight << " " << infos[i].variance << endl;
+// }
+// }
+// map<EdgeId, string> empty_coloring;
+// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50,
+// 250);
+//
+// visualization::visualization_utils::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
+// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
+// , empty_coloring, labeler_);
+// }
+// }
+//
+//private:
+//};
+//
+////todo what is the difference with QELPRH?!
+//template<class Graph>
+//class EdgeLocalityPrintingRH {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// const Graph& g_;
+// const GraphLabeler<Graph>& labeler_;
+// const string& output_folder_;
+// std::function<double (EdgeId)>& quality_f_;
+//// size_t black_removed_;
+//// size_t colored_removed_;
+//public:
+// EdgeLocalityPrintingRH(const Graph& g
+// , const GraphLabeler<Graph>& labeler
+// , const string& output_folder
+// , std::function<double (EdgeId)> quality_f = 0) :
+// g_(g),
+// labeler_(labeler), output_folder_(output_folder),
+// quality_f_(quality_f){
+// }
+//
+// void HandleDelete(EdgeId edge) {
+// TRACE("Deleting edge " << g_.str(edge));
+// if (quality_f_ && math::gr(quality_f_(edge), 0.))
+// INFO("Handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
+//
+// string folder = output_folder_ + "edges_deleted/";
+// path::make_dir(folder);
+// //todo magic constant
+// map<EdgeId, string> empty_coloring;
+// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
+// visualization::visualization_utils::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
+// , folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
+// }
+//
+//private:
+// DECL_LOGGER("EdgeLocalityPrintingRH")
+// ;
+//};
+
+}
diff --git a/src/common/assembly_graph/graph_support/graph_processing_algorithm.hpp b/src/common/assembly_graph/graph_support/graph_processing_algorithm.hpp
new file mode 100644
index 0000000..8a27010
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/graph_processing_algorithm.hpp
@@ -0,0 +1,146 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "func/func.hpp"
+#include <boost/none.hpp>
+#include <atomic>
+#include "assembly_graph/core/graph_iterators.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "edge_removal.hpp"
+#include "func/pred.hpp"
+#include "utils/logger/logger.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+using EdgeRemovalHandlerF = std::function<void(typename Graph::EdgeId)>;
+
+template<class Graph>
+class EdgeProcessingAlgorithm {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef func::TypedPredicate<EdgeId> ProceedConditionT;
+
+ Graph& g_;
+ bool conjugate_symmetry_;
+ protected:
+
+ Graph& g() {
+ return g_;
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ virtual bool ProcessEdge(EdgeId e) = 0;
+
+ public:
+ EdgeProcessingAlgorithm(Graph& g,
+ bool conjugate_symmetry = false)
+ : g_(g), conjugate_symmetry_(conjugate_symmetry) {
+
+ }
+
+ virtual ~EdgeProcessingAlgorithm() {
+ }
+
+// bool conjugate_symmetry() const {
+// return conjugate_symmetry_;
+// }
+
+ template<class Comparator = std::less<EdgeId>>
+ bool Run(const Comparator& comp = Comparator(), ProceedConditionT proceed_condition = func::AlwaysTrue<EdgeId>()) {
+ bool triggered = false;
+ for (auto it = g_.SmartEdgeBegin(comp, conjugate_symmetry_); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ TRACE("Current edge " << g_.str(e));
+ if (!proceed_condition(e)) {
+ TRACE("Stop condition was reached.");
+ break;
+ }
+
+ TRACE("Processing edge " << this->g().str(e));
+ triggered |= ProcessEdge(e);
+        }
+ return triggered;
+ }
+
+ private:
+ DECL_LOGGER("EdgeProcessingAlgorithm");
+};
+
+template<class Graph>
+class CountingCallback {
+ typedef typename Graph::EdgeId EdgeId;
+ bool report_on_destruction_;
+ std::atomic<size_t> cnt_;
+
+public:
+ CountingCallback(bool report_on_destruction = false) :
+ report_on_destruction_(report_on_destruction), cnt_(0) {
+ }
+
+ ~CountingCallback() {
+ if (report_on_destruction_)
+ Report();
+ }
+
+ void HandleDelete(EdgeId /*e*/) {
+ cnt_++;
+ }
+
+ void Report() {
+ TRACE(cnt_ << " edges were removed.")
+ cnt_ = 0;
+ }
+
+private:
+ DECL_LOGGER("CountingCallback");
+};
+
+template<class Graph>
+std::function<void(typename Graph::EdgeId)> AddCountingCallback(CountingCallback<Graph>& cnt_callback, std::function<void(typename Graph::EdgeId)> handler) {
+ std::function<void(typename Graph::EdgeId)> cnt_handler = std::bind(&CountingCallback<Graph>::HandleDelete, std::ref(cnt_callback), std::placeholders::_1);
+ return func::CombineCallbacks<typename Graph::EdgeId>(handler, cnt_handler);
+}
+
+template<class Graph>
+class EdgeRemovingAlgorithm : public EdgeProcessingAlgorithm<Graph> {
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+
+ func::TypedPredicate<EdgeId> remove_condition_;
+ EdgeRemover<Graph> edge_remover_;
+
+ protected:
+ virtual bool ProcessEdge(EdgeId e) {
+ TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
+ if (remove_condition_(e)) {
+ TRACE("Check passed, removing");
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+ TRACE("Check not passed");
+ return false;
+ }
+
+ public:
+ EdgeRemovingAlgorithm(Graph& g,
+ func::TypedPredicate<EdgeId> remove_condition,
+ std::function<void (EdgeId)> removal_handler = boost::none,
+ bool conjugate_symmetry = false)
+ : base(g, conjugate_symmetry),
+ remove_condition_(remove_condition),
+ edge_remover_(g, removal_handler) {}
+
+ private:
+ DECL_LOGGER("EdgeRemovingAlgorithm");
+};
+
+}
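
A minimal stand-alone sketch of the callback-composition pattern used by AddCountingCallback above. Plain int stands in for EdgeId, and a local Combine helper stands in for func::CombineCallbacks; both are illustrative assumptions, not SPAdes code.

    #include <functional>
    #include <iostream>

    // Compose two optional callbacks into one; either may be empty.
    template<class T>
    std::function<void(T)> Combine(std::function<void(T)> a, std::function<void(T)> b) {
        return [a, b](T t) {
            if (a) a(t);
            if (b) b(t);
        };
    }

    int main() {
        size_t removed = 0;
        std::function<void(int)> counter = [&removed](int) { ++removed; };
        std::function<void(int)> logger = [](int e) { std::cout << "removed edge " << e << "\n"; };
        auto handler = Combine(logger, counter);  // analogous to wrapping a removal handler with a CountingCallback
        handler(7);
        handler(13);
        std::cout << removed << " edges were removed.\n";
        return 0;
    }

Keeping the counter separate from the user-supplied handler lets an algorithm report removal totals without every caller wiring that bookkeeping in by hand.
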
diff --git a/src/modules/assembly_graph/graph_support/marks_and_locks.hpp b/src/common/assembly_graph/graph_support/marks_and_locks.hpp
similarity index 100%
rename from src/modules/assembly_graph/graph_support/marks_and_locks.hpp
rename to src/common/assembly_graph/graph_support/marks_and_locks.hpp
diff --git a/src/common/assembly_graph/graph_support/parallel_processing.hpp b/src/common/assembly_graph/graph_support/parallel_processing.hpp
new file mode 100644
index 0000000..abd3149
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/parallel_processing.hpp
@@ -0,0 +1,306 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/logger/logger.hpp"
+#include "assembly_graph/core/graph_iterators.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "utils/openmp_wrapper.h"
+
+namespace omnigraph {
+
+template<class ItVec, class Condition, class Handler>
+void FindInterestingFromChunkIterators(const ItVec& chunk_iterators,
+ const Condition& predicate,
+ const Handler& handler) {
+ VERIFY(chunk_iterators.size() > 1);
+ typedef typename Condition::checked_type ElementType;
+ std::vector<std::vector<ElementType>> of_interest(omp_get_max_threads());
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ size_t cnt = 0;
+ for (auto it = chunk_iterators[i], end = chunk_iterators[i + 1]; it != end; ++it) {
+ ElementType t = *it;
+ if (predicate(t)) {
+ of_interest[omp_get_thread_num()].push_back(t);
+ }
+ cnt++;
+ }
+ DEBUG("Processed " << cnt << " elements as potential candidates by thread " << omp_get_thread_num());
+ }
+
+ for (auto& chunk : of_interest) {
+ for (const auto& el : chunk) {
+ handler(el);
+ }
+ chunk.clear();
+ }
+}
+
+template<class Graph, class ElementId>
+class InterestingElementFinder {
+protected:
+ typedef std::function<void (ElementId)> HandlerF;
+ const func::TypedPredicate<ElementId> condition_;
+public:
+
+ InterestingElementFinder(func::TypedPredicate<ElementId> condition):
+ condition_(condition) {
+ }
+
+ virtual ~InterestingElementFinder() {}
+
+ virtual bool Run(const Graph& /*g*/, HandlerF /*handler*/) const = 0;
+};
+
+template<class Graph, class ElementId = typename Graph::EdgeId>
+class TrivialInterestingElementFinder :
+ public InterestingElementFinder<Graph, ElementId> {
+public:
+
+ TrivialInterestingElementFinder() :
+ InterestingElementFinder<Graph, ElementId>(func::AlwaysTrue<ElementId>()) {
+ }
+
+ bool Run(const Graph& /*g*/, std::function<void (ElementId)> /*handler*/) const override {
+ return false;
+ }
+};
+
+template<class Graph, class ElementId = typename Graph::EdgeId>
+class SimpleInterestingElementFinder : public InterestingElementFinder<Graph, ElementId> {
+ typedef InterestingElementFinder<Graph, ElementId> base;
+ typedef typename base::HandlerF HandlerF;
+public:
+
+ SimpleInterestingElementFinder(func::TypedPredicate<ElementId> condition = func::AlwaysTrue<ElementId>())
+ : base(condition) {}
+
+ bool Run(const Graph& g, HandlerF handler) const override {
+ const IterationHelper<Graph, ElementId> it_helper(g);
+ for (auto it = it_helper.begin(), end = it_helper.end(); it != end; ++it) {
+ if (this->condition_(*it)) {
+ handler(*it);
+ }
+ }
+ return false;
+ }
+};
+
+template<class Graph, class ElementId = typename Graph::EdgeId>
+class ParallelInterestingElementFinder : public InterestingElementFinder<Graph, ElementId> {
+ typedef InterestingElementFinder<Graph, ElementId> base;
+ typedef typename base::HandlerF HandlerF;
+
+ const size_t chunk_cnt_;
+public:
+
+ ParallelInterestingElementFinder(func::TypedPredicate<ElementId> condition,
+ size_t chunk_cnt)
+ : base(condition), chunk_cnt_(chunk_cnt) {}
+
+ bool Run(const Graph& g, HandlerF handler) const override {
+ TRACE("Looking for interesting elements");
+ TRACE("Splitting graph into " << chunk_cnt_ << " chunks");
+ FindInterestingFromChunkIterators(IterationHelper<Graph, ElementId>(g).Chunks(chunk_cnt_),
+ this->condition_, handler);
+ return false;
+ }
+
+private:
+ DECL_LOGGER("ParallelInterestingElementFinder");
+};
+
+template<class Graph>
+class PersistentAlgorithmBase {
+ Graph& g_;
+protected:
+
+ PersistentAlgorithmBase(Graph& g) : g_(g) {}
+
+ Graph& g() { return g_; }
+ const Graph& g() const { return g_; }
+public:
+ virtual ~PersistentAlgorithmBase() {}
+ virtual size_t Run(bool force_primary_launch = false) = 0;
+};
+
+template<class Algo>
+inline size_t LoopedRun(Algo& algo) {
+ size_t total_triggered = 0;
+ bool run = true;
+ while (run) {
+ size_t triggered = algo.Run();
+ total_triggered += triggered;
+ run = (triggered > 0);
+ }
+ return total_triggered;
+}
+
+//todo only potentially relevant edges should be stored at any point
+template<class Graph, class ElementId,
+ class Comparator = std::less<ElementId>>
+class PersistentProcessingAlgorithm : public PersistentAlgorithmBase<Graph> {
+protected:
+ typedef std::shared_ptr<InterestingElementFinder<Graph, ElementId>> CandidateFinderPtr;
+ CandidateFinderPtr interest_el_finder_;
+
+private:
+ SmartSetIterator<Graph, ElementId, Comparator> it_;
+ bool tracking_;
+ size_t total_iteration_estimate_;
+ size_t curr_iteration_;
+
+protected:
+ void ReturnForConsideration(ElementId el) {
+ it_.push(el);
+ }
+
+ virtual bool Process(ElementId el) = 0;
+ virtual bool Proceed(ElementId /*el*/) const { return true; }
+
+ virtual void PrepareIteration(size_t /*it_cnt*/, size_t /*total_it_estimate*/) {}
+
+public:
+
+ PersistentProcessingAlgorithm(Graph& g,
+ const CandidateFinderPtr& interest_el_finder,
+ bool canonical_only = false,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true,
+ size_t total_iteration_estimate = -1ul) :
+ PersistentAlgorithmBase<Graph>(g),
+ interest_el_finder_(interest_el_finder),
+ it_(g, true, comp, canonical_only),
+ tracking_(track_changes),
+ total_iteration_estimate_(total_iteration_estimate),
+ curr_iteration_(0) {
+ it_.Detach();
+ }
+
+ size_t Run(bool force_primary_launch = false) override {
+ bool primary_launch = !tracking_ || (curr_iteration_ == 0) || force_primary_launch ;
+ if (!it_.IsAttached()) {
+ it_.Attach();
+ }
+ if (primary_launch) {
+ it_.clear();
+ TRACE("Primary launch.");
+ TRACE("Start searching for relevant elements");
+ interest_el_finder_->Run(this->g(), [&](ElementId el) {it_.push(el);});
+ TRACE(it_.size() << " elements to consider");
+ } else {
+ TRACE(it_.size() << " elements to consider");
+ VERIFY(tracking_);
+ }
+
+ PrepareIteration(std::min(curr_iteration_, total_iteration_estimate_ - 1), total_iteration_estimate_);
+
+ size_t triggered = 0;
+ TRACE("Start processing");
+ for (; !it_.IsEnd(); ++it_) {
+ ElementId el = *it_;
+ if (!Proceed(el)) {
+ TRACE("Proceed condition turned false on element " << this->g().str(el));
+ it_.ReleaseCurrent();
+ break;
+ }
+ TRACE("Processing edge " << this->g().str(el));
+ if (Process(el))
+ triggered++;
+ }
+ TRACE("Finished processing. Triggered = " << triggered);
+ if (!tracking_)
+ it_.Detach();
+
+ curr_iteration_++;
+ return triggered;
+ }
+private:
+ DECL_LOGGER("PersistentProcessingAlgorithm");
+};
+
+template<class Graph,
+ class Comparator = std::less<typename Graph::EdgeId>>
+class ParallelEdgeRemovingAlgorithm : public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId,
+ Comparator> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId, Comparator> base;
+
+ const func::TypedPredicate<EdgeId> remove_condition_;
+ EdgeRemover<Graph> edge_remover_;
+
+protected:
+
+ bool Process(EdgeId e) override {
+ TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
+ if (remove_condition_(e)) {
+ TRACE("Check passed, removing");
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+ TRACE("Check not passed");
+ return false;
+ }
+
+public:
+ ParallelEdgeRemovingAlgorithm(Graph& g,
+ func::TypedPredicate<EdgeId> remove_condition,
+ size_t chunk_cnt,
+ std::function<void(EdgeId)> removal_handler = boost::none,
+ bool canonical_only = false,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true)
+ : base(g,
+ std::make_shared<ParallelInterestingElementFinder<Graph>>(remove_condition, chunk_cnt),
+ canonical_only, comp, track_changes),
+ remove_condition_(remove_condition),
+ edge_remover_(g, removal_handler) {
+ }
+
+private:
+ DECL_LOGGER("ParallelEdgeRemovingAlgorithm");
+};
+
+template<class Graph, class Comparator = std::less<typename Graph::EdgeId>>
+class DisconnectionAlgorithm : public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId,
+ Comparator> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId, Comparator> base;
+ func::TypedPredicate<EdgeId> condition_;
+ EdgeDisconnector<Graph> disconnector_;
+
+public:
+ DisconnectionAlgorithm(Graph& g,
+ func::TypedPredicate<EdgeId> condition,
+ size_t chunk_cnt,
+ EdgeRemovalHandlerF<Graph> removal_handler,
+ const Comparator& comp = Comparator(),
+ bool track_changes = true)
+ : base(g,
+ std::make_shared<omnigraph::ParallelInterestingElementFinder<Graph>>(condition, chunk_cnt),
+ /*canonical_only*/false, comp, track_changes),
+ condition_(condition),
+ disconnector_(g, removal_handler) {
+ }
+
+ bool Process(EdgeId e) override {
+ if (condition_(e)) {
+ disconnector_(e);
+ return true;
+ }
+ return false;
+ }
+
+};
+
+
+}
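
FindInterestingFromChunkIterators above splits the element range into chunks, lets each worker collect its own candidate list, and only afterwards feeds the candidates to the handler sequentially, so the handler can stay single threaded. A simplified stand-alone sketch of that two-phase pattern over a plain vector, using std::thread in place of the OpenMP loop; the chunking and the "interesting" predicate are illustrative assumptions.

    #include <algorithm>
    #include <iostream>
    #include <thread>
    #include <vector>

    int main() {
        std::vector<int> edges(100);
        for (int i = 0; i < 100; ++i) edges[i] = i;
        auto interesting = [](int e) { return e % 7 == 0; };  // stand-in for the removal predicate

        const size_t chunk_cnt = 4;
        std::vector<std::vector<int>> of_interest(chunk_cnt);  // one bucket per worker, so no locking is needed
        std::vector<std::thread> workers;
        size_t chunk = (edges.size() + chunk_cnt - 1) / chunk_cnt;
        for (size_t c = 0; c < chunk_cnt; ++c) {
            workers.emplace_back([&, c]() {
                size_t begin = c * chunk;
                size_t end = std::min(edges.size(), begin + chunk);
                for (size_t i = begin; i < end; ++i)
                    if (interesting(edges[i]))
                        of_interest[c].push_back(edges[i]);
            });
        }
        for (auto& w : workers) w.join();

        // Phase 2: hand candidates to the handler in chunk order, keeping the result deterministic.
        for (const auto& bucket : of_interest)
            for (int e : bucket)
                std::cout << "candidate " << e << "\n";
        return 0;
    }
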
diff --git a/src/common/assembly_graph/graph_support/scaff_supplementary.cpp b/src/common/assembly_graph/graph_support/scaff_supplementary.cpp
new file mode 100644
index 0000000..84f3e9e
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/scaff_supplementary.cpp
@@ -0,0 +1,261 @@
+#include "scaff_supplementary.hpp"
+#include <algorithm>
+
+using namespace std;
+namespace path_extend {
+
+
+void ScaffoldingUniqueEdgeAnalyzer::SetCoverageBasedCutoff() {
+ vector <pair<double, size_t>> coverages;
+ map <EdgeId, size_t> long_component;
+ size_t total_len = 0, short_len = 0, cur_len = 0;
+
+ for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (gp_.g.length(*iter) > length_cutoff_) {
+ coverages.push_back(make_pair(gp_.g.coverage(*iter), gp_.g.length(*iter)));
+ total_len += gp_.g.length(*iter);
+ long_component[*iter] = 0;
+ } else {
+ short_len += gp_.g.length(*iter);
+ }
+ }
+ if (total_len == 0) {
+        WARN("Not enough edges longer than " << length_cutoff_);
+ return;
+ }
+ sort(coverages.begin(), coverages.end());
+ size_t i = 0;
+ while (cur_len < total_len / 2 && i < coverages.size()) {
+ cur_len += coverages[i].second;
+ i++;
+ }
+    //clamp: a single long edge covering more than half of the total length would push i past the last element
+    median_coverage_ = coverages[std::min(i, coverages.size() - 1)].first;
+}
+
+
+void ScaffoldingUniqueEdgeAnalyzer::FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_) {
+ storage_.unique_edges_.clear();
+ size_t total_len = 0;
+ size_t unique_len = 0;
+ size_t unique_num = 0;
+ storage_.SetMinLength(length_cutoff_);
+ for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ size_t tlen = gp_.g.length(*iter);
+ total_len += tlen;
+ if (gp_.g.length(*iter) >= length_cutoff_ && gp_.g.coverage(*iter) > median_coverage_ * (1 - relative_coverage_variation_)
+ && gp_.g.coverage(*iter) < median_coverage_ * (1 + relative_coverage_variation_) ) {
+ storage_.unique_edges_.insert(*iter);
+ unique_len += tlen;
+ unique_num ++;
+ }
+ }
+ for (auto iter = storage_.begin(); iter != storage_.end(); ++iter) {
+ DEBUG (gp_.g.int_id(*iter) << " " << gp_.g.coverage(*iter) << " " << gp_.g.length(*iter) );
+ }
+    INFO("With length cutoff: " << length_cutoff_ << ", median long edge coverage: " << median_coverage_
+         << ", and relative coverage variation: " << relative_coverage_variation_);
+ INFO("Unique edges quantity: " << unique_num << ", unique edges length " << unique_len <<", total edges length " << total_len);
+ if (unique_len * 2 < total_len) {
+ WARN("Less than half of genome in unique edges!");
+ }
+
+}
+
+bool ScaffoldingUniqueEdgeAnalyzer::ConservativeByLength(EdgeId e) {
+ return gp_.g.length(e) >= length_cutoff_;
+}
+
+map<EdgeId, size_t> ScaffoldingUniqueEdgeAnalyzer::FillNextEdgeVoting(BidirectionalPathMap<size_t>& active_paths, int direction) const {
+ map<EdgeId, size_t> voting;
+ for (const auto &pair: active_paths) {
+ int current_pos = int(pair.second) + direction;
+ auto path_iter = pair.first;
+        //by default mark this path as exhausted; overwritten below if a long enough edge is found
+        active_paths[path_iter] = path_iter->Size();
+ while (current_pos >= 0 && current_pos < (int) path_iter->Size()) {
+ if (gp_.g.length(path_iter->At(current_pos)) >= length_cutoff_) {
+ voting[path_iter->At(current_pos)] += size_t(round(path_iter->GetWeight()));
+ active_paths[path_iter] = size_t(current_pos);
+ break;
+ }
+ current_pos += direction;
+ }
+ }
+ return voting;
+}
+
+bool ScaffoldingUniqueEdgeAnalyzer::ConservativeByPaths(EdgeId e, shared_ptr<GraphCoverageMap> long_reads_cov_map, const pe_config::LongReads lr_config, int direction) const {
+ BidirectionalPathSet all_set = long_reads_cov_map->GetCoveringPaths(e);
+ BidirectionalPathMap<size_t> active_paths;
+ size_t loop_weight = 0;
+ for (auto path_iter: all_set) {
+ auto pos = path_iter->FindAll(e);
+ if (pos.size() > 1)
+//TODO:: path weight should be size_t?
+ loop_weight += size_t(round(path_iter->GetWeight()));
+ else
+ active_paths[path_iter] = pos[0];
+ }
+//TODO: small plasmid, paths a-b-a, b-a-b ?
+ if (loop_weight > 1)
+ return false;
+ EdgeId prev_unique = e;
+ while (active_paths.size() > 0) {
+ size_t alt = 0;
+ size_t maxx = 0;
+ map<EdgeId, size_t> voting = FillNextEdgeVoting(active_paths, direction);
+
+ if (voting.size() == 0)
+ break;
+ EdgeId next_unique = prev_unique;
+ for (const auto &pair: voting)
+ if (pair.second > maxx) {
+ next_unique = pair.first;
+ maxx = pair.second;
+ }
+ for (const auto &pair: voting)
+ //TODO:: 1 from config?
+ if (pair.first != next_unique && pair.second > 1)
+ alt += pair.second;
+ if (maxx < lr_config.unique_edge_priority * double(alt)) {
+ DEBUG("edge " << gp_.g.int_id(e) <<" dir "<< direction << " was not unique" );
+ DEBUG("current edge " << gp_.g.int_id(next_unique));
+ DEBUG("Paths " << active_paths.size());
+ return false;
+ } else {
+ DEBUG("cur " << gp_.g.int_id(prev_unique) << " next " << gp_.g.int_id(next_unique) <<" sz " << active_paths.size());
+ for (auto iter = active_paths.begin(); iter != active_paths.end();) {
+ if (iter->second >= iter->first->Size() || iter->first->At(iter->second) != next_unique) {
+ iter = active_paths.erase(iter);
+ } else {
+ iter++;
+ }
+ }
+ prev_unique = next_unique;
+ DEBUG(active_paths.size() << " "<< gp_.g.int_id(next_unique));
+ }
+ }
+ DEBUG("edge " << gp_.g.int_id(e) <<" dir "<< direction << " was unique" );
+ return true;
+}
+
+bool ScaffoldingUniqueEdgeAnalyzer::ConservativeByPaths(EdgeId e, shared_ptr<GraphCoverageMap> long_reads_cov_map, const pe_config::LongReads lr_config) const{
+ return (ConservativeByPaths(e, long_reads_cov_map, lr_config, 1) && ConservativeByPaths(e, long_reads_cov_map, lr_config, -1));
+}
+
+
+void ScaffoldingUniqueEdgeAnalyzer::CheckCorrectness(ScaffoldingUniqueEdgeStorage& unique_storage_pb) {
+ for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ EdgeId e = *iter;
+ bool e_unique = unique_storage_pb.IsUnique(e);
+ bool e_conj_unique = unique_storage_pb.IsUnique(gp_.g.conjugate(e));
+        VERIFY_MSG(e_unique == e_conj_unique, "Edge " << gp_.g.int_id(e) << " is not symmetrically unique with its conjugate");
+ if (ConservativeByLength(e)) {
+ if (e_unique) {
+                DEBUG("edge " << gp_.g.int_id(e) << " is unique");
+            } else {
+                DEBUG("edge " << gp_.g.int_id(e) << " is not unique");
+ }
+ }
+ }
+}
+
+set<VertexId> ScaffoldingUniqueEdgeAnalyzer::GetChildren(VertexId v, map <VertexId, set<VertexId>> &dijkstra_cash_) const {
+    if (dijkstra_cash_.find(v) == dijkstra_cash_.end()) {
+        //run the bounded Dijkstra only on a cache miss
+        DijkstraHelper<debruijn_graph::Graph>::BoundedDijkstra dijkstra(
+                DijkstraHelper<debruijn_graph::Graph>::CreateBoundedDijkstra(gp_.g, max_dijkstra_depth_, max_dijkstra_vertices_));
+        dijkstra.Run(v);
+        auto tmp = dijkstra.ReachedVertices();
+        tmp.push_back(v);
+        dijkstra_cash_[v] = set<VertexId> (tmp.begin(), tmp.end());
+    }
+ return dijkstra_cash_[v];
+}
+
+bool ScaffoldingUniqueEdgeAnalyzer::FindCommonChildren(EdgeId e1, EdgeId e2, map <VertexId, set<VertexId>> &dijkstra_cash_) const {
+ auto s1 = GetChildren(gp_.g.EdgeEnd(e1), dijkstra_cash_);
+ auto s2 = GetChildren(gp_.g.EdgeEnd(e2), dijkstra_cash_);
+ if (s1.find(gp_.g.EdgeStart(e2)) != s1.end()) {
+ return true;
+ }
+ if (s2.find(gp_.g.EdgeStart(e1)) != s2.end()) {
+ return true;
+ }
+ for (VertexId v: s1) {
+ if (s2.find(v) != s2.end()) {
+ DEBUG("bulge-like structure, edges "<< gp_.g.int_id(e1) << " " << gp_.g.int_id(e2));
+ return true;
+ }
+ }
+ return false;
+}
+
+bool ScaffoldingUniqueEdgeAnalyzer::FindCommonChildren(vector<pair<EdgeId, double>> &next_weights) const {
+ map <VertexId, set<VertexId>> dijkstra_cash_;
+ for (size_t i = 0; i < next_weights.size(); i ++) {
+ for (size_t j = i + 1; j < next_weights.size(); j++) {
+ if (!FindCommonChildren(next_weights[i].first, next_weights[j].first, dijkstra_cash_)) {
+ DEBUG("multiple paired info on edges " <<next_weights[i].first <<" and "<< next_weights[j].first);
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+bool ScaffoldingUniqueEdgeAnalyzer::FindCommonChildren(EdgeId from, size_t lib_index) const{
+ DEBUG("processing unique edge " << gp_.g.int_id(from));
+ auto next_edges = gp_.clustered_indices[lib_index].Get(from);
+ vector<pair<EdgeId, double>> next_weights;
+ for (auto hist_pair: next_edges) {
+ if (hist_pair.first == from || hist_pair.first == gp_.g.conjugate(from))
+ continue;
+ double total_w = 0;
+ for (auto w: hist_pair.second)
+ total_w += w.weight;
+ if (math::gr(total_w, 1.0))
+ next_weights.push_back(make_pair(hist_pair.first, total_w));
+ }
+ sort(next_weights.begin(), next_weights.end(), [&](pair<EdgeId, double>a, pair<EdgeId, double>b){
+ return math::gr(a.second, b.second);
+ });
+//most popular edges. think whether it can be done faster
+ if (next_weights.size() > max_different_edges_) {
+ DEBUG(next_weights.size() << " continuations");
+ next_weights.resize(max_different_edges_);
+ }
+ return FindCommonChildren(next_weights);
+}
+
+
+void ScaffoldingUniqueEdgeAnalyzer::ClearLongEdgesWithPairedLib(size_t lib_index, ScaffoldingUniqueEdgeStorage &storage_) const {
+ set<EdgeId> to_erase;
+ for (EdgeId edge: storage_ ) {
+ if (!FindCommonChildren(edge, lib_index)) {
+ to_erase.insert(edge);
+ to_erase.insert(gp_.g.conjugate(edge));
+ }
+ }
+ for (auto iter = storage_.begin(); iter != storage_.end(); ){
+ if (to_erase.find(*iter) != to_erase.end()){
+ iter = storage_.erase(iter);
+ } else {
+ iter++;
+ }
+ }
+}
+
+
+void ScaffoldingUniqueEdgeAnalyzer::FillUniqueEdgesWithLongReads(shared_ptr<GraphCoverageMap> long_reads_cov_map, ScaffoldingUniqueEdgeStorage& unique_storage_pb, const pe_config::LongReads lr_config) {
+ for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ EdgeId e = *iter;
+ if (ConservativeByLength(e) && ConservativeByPaths(e, long_reads_cov_map, lr_config)) {
+ unique_storage_pb.unique_edges_.insert(e);
+ }
+ }
+ CheckCorrectness(unique_storage_pb);
+}
+
+
+}
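
SetCoverageBasedCutoff above uses a length-weighted median of the coverages of long edges as the reference coverage for uniqueness detection. A self-contained sketch of a length-weighted median, written independently of the loop above with made-up numbers:

    #include <algorithm>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // (coverage, length) pairs of edges longer than the length cutoff; values are made up
        std::vector<std::pair<double, size_t>> coverages = {{35.0, 4000}, {20.0, 1500}, {60.0, 2500}};
        size_t total_len = 0;
        for (const auto& p : coverages) total_len += p.second;

        std::sort(coverages.begin(), coverages.end());  // ascending by coverage
        // walk up the sorted list until the accumulated length reaches half of the total
        size_t cur_len = 0, i = 0;
        while (i + 1 < coverages.size() && cur_len + coverages[i].second < (total_len + 1) / 2) {
            cur_len += coverages[i].second;
            ++i;
        }
        std::cout << "length-weighted median coverage: " << coverages[i].first << "\n";  // prints 35
        return 0;
    }
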
diff --git a/src/common/assembly_graph/graph_support/scaff_supplementary.hpp b/src/common/assembly_graph/graph_support/scaff_supplementary.hpp
new file mode 100644
index 0000000..f496b77
--- /dev/null
+++ b/src/common/assembly_graph/graph_support/scaff_supplementary.hpp
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "assembly_graph/core/graph.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "utils/logger/logger.hpp"
+//FIXME
+#include "modules/path_extend/pe_utils.hpp"
+#include "modules/path_extend/pe_config_struct.hpp"
+#include "modules/path_extend/paired_library.hpp"
+
+namespace path_extend {
+typedef debruijn_graph::EdgeId EdgeId;
+
+/* Storage of presumably unique, relatively long edges. Filled by ScaffoldingUniqueEdgeAnalyzer
+ *
+ */
+class ScaffoldingUniqueEdgeStorage {
+ friend class ScaffoldingUniqueEdgeAnalyzer;
+private:
+ set <EdgeId> unique_edges_;
+ size_t min_unique_length_;
+public:
+    ScaffoldingUniqueEdgeStorage(): unique_edges_(), min_unique_length_(0) {
+ DEBUG("storage created, empty");
+ }
+
+ bool IsUnique(EdgeId e) const {
+ return (unique_edges_.find(e) != unique_edges_.end());
+ }
+
+ decltype(unique_edges_.begin()) begin() const {
+ return unique_edges_.begin();
+ }
+
+ decltype(unique_edges_.end()) end() const {
+ return unique_edges_.end();
+ }
+
+ decltype(unique_edges_.begin()) erase(decltype(unique_edges_.begin()) iter){
+ return unique_edges_.erase(iter);
+ }
+
+ size_t size() const {
+ return unique_edges_.size();
+ }
+ size_t GetMinLength() const {
+ return min_unique_length_;
+ }
+ void SetMinLength(size_t min_length) {
+ min_unique_length_ = min_length;
+ }
+
+ const set<EdgeId>& GetSet() const {
+ return unique_edges_;
+ }
+
+protected:
+ DECL_LOGGER("ScaffoldingUniqueEdgeStorage")
+
+};
+
+//Auxiliary class required to fill in the unique edge storage.
+
+
+class ScaffoldingUniqueEdgeAnalyzer {
+
+ const debruijn_graph::conj_graph_pack &gp_;
+ size_t length_cutoff_;
+ double median_coverage_;
+ double relative_coverage_variation_;
+//for uniqueness detection
+ static const size_t max_different_edges_ = 20;
+ static const size_t max_dijkstra_depth_ = 1000;
+ static const size_t max_dijkstra_vertices_ = 1000;
+ set<VertexId> GetChildren(VertexId v, map <VertexId, set<VertexId>> &dijkstra_cash_) const;
+ bool FindCommonChildren(EdgeId e1, EdgeId e2, map <VertexId, set<VertexId>> &dijkstra_cash_) const;
+ bool FindCommonChildren(vector<pair<EdgeId, double>> &next_weights) const;
+ bool FindCommonChildren(EdgeId from, size_t lib_index) const;
+ map<EdgeId, size_t> FillNextEdgeVoting(BidirectionalPathMap<size_t>& active_paths, int direction) const;
+ bool ConservativeByPaths(EdgeId e, shared_ptr<GraphCoverageMap> long_reads_cov_map, const pe_config::LongReads lr_config) const;
+ bool ConservativeByPaths(EdgeId e, shared_ptr<GraphCoverageMap> long_reads_cov_map, const pe_config::LongReads lr_config, int direction) const;
+ bool ConservativeByLength(EdgeId e);
+ void CheckCorrectness(ScaffoldingUniqueEdgeStorage& unique_storage_pb);
+protected:
+ DECL_LOGGER("ScaffoldingUniqueEdgeAnalyzer")
+
+
+ void SetCoverageBasedCutoff();
+public:
+    ScaffoldingUniqueEdgeAnalyzer(const debruijn_graph::conj_graph_pack &gp, size_t apriori_length_cutoff, double max_relative_coverage) :
+            gp_(gp), length_cutoff_(apriori_length_cutoff), median_coverage_(0), relative_coverage_variation_(max_relative_coverage) {
+ SetCoverageBasedCutoff();
+ }
+ void FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_);
+ void ClearLongEdgesWithPairedLib(size_t lib_index, ScaffoldingUniqueEdgeStorage &storage_) const;
+ void FillUniqueEdgesWithLongReads(shared_ptr<GraphCoverageMap> long_reads_cov_map, ScaffoldingUniqueEdgeStorage& unique_storage_pb, const pe_config::LongReads lr_config);
+};
+}
+
+
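
FillUniqueEdgeStorage marks an edge as presumably unique when it is at least length_cutoff_ long and its coverage lies within a relative band around the median long-edge coverage. A stand-alone sketch of that criterion as a free predicate; the function name and the numbers are illustrative assumptions.

    #include <iostream>

    // Length plus coverage-band criterion, in the spirit of FillUniqueEdgeStorage.
    bool PresumablyUnique(size_t length, double coverage,
                          size_t length_cutoff, double median_coverage, double relative_variation) {
        return length >= length_cutoff
            && coverage > median_coverage * (1 - relative_variation)
            && coverage < median_coverage * (1 + relative_variation);
    }

    int main() {
        const double median = 40.0;
        const double variation = 0.5;  // accepted coverage band is (20, 60)
        std::cout << PresumablyUnique(3000, 42.0, 2000, median, variation) << "\n";  // 1: long edge, coverage near the median
        std::cout << PresumablyUnique(3000, 95.0, 2000, median, variation) << "\n";  // 0: coverage suggests a repeat
        std::cout << PresumablyUnique(800, 42.0, 2000, median, variation) << "\n";   // 0: too short to call unique
        return 0;
    }
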
diff --git a/src/common/assembly_graph/handlers/edge_labels_handler.hpp b/src/common/assembly_graph/handlers/edge_labels_handler.hpp
new file mode 100644
index 0000000..551939f
--- /dev/null
+++ b/src/common/assembly_graph/handlers/edge_labels_handler.hpp
@@ -0,0 +1,226 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ *
+ * Maintains a labeling of new_graph edges (produced by graph transformations) in terms of edges of the
+ * unresolved graph old_graph, in both directions: new edge -> old labels, old edge -> containing new edges
+ *
+ * Created on: Aug 5, 2011
+ * Author: undead
+ */
+
+#ifndef EDGE_LABELS_HANDLER_HPP_
+#define EDGE_LABELS_HANDLER_HPP_
+
+//#include "utils.hpp"
+#include "visualization/graph_labeler.hpp"
+#include "utils/simple_tools.hpp"
+#include <unordered_map>
+#include <map>
+
+using namespace omnigraph;
+
+namespace omnigraph {
+using std::map;
+
+//todo ask Shurik to remove new_graph_
+template<class Graph>
+class EdgeLabelHandler : public GraphActionHandler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+private:
+ Graph &new_graph_;
+ Graph &old_graph_;
+ //From new edge to sequence of old
+public:
+ map<EdgeId, vector<EdgeId> > edge_labels;
+ //From old edge to set of new ones, containing it.
+ map<EdgeId, set<EdgeId> > edge_inclusions;
+public:
+ //TODO: integrate this to resolver, remove "from_resolve" parameter
+ EdgeLabelHandler(Graph &new_graph, Graph &old_graph,
+ const std::map<EdgeId, EdgeId> &from_resolve)
+ : GraphActionHandler<Graph>(new_graph, "EdgePositionHandler"),
+ new_graph_(new_graph),
+ old_graph_(old_graph) {
+ // printing from resolve
+ FillLabels(from_resolve);
+ /* for(auto iter = from_resolve.begin(); iter != from_resolve.end(); ++iter) {
+ if (edge_inclusions.find(iter->second) == edge_inclusions.end()){
+ set<EdgeId> tmp;
+ edge_inclusions.insert(make_pair(iter->second, tmp));
+ }
+ edge_inclusions[iter->second].insert(iter->first);
+
+ if (edge_labels.find(iter->first) == edge_labels.end()) {
+ set<EdgeId> tmp;
+ edge_labels.insert(make_pair(iter->first, tmp));
+ }
+ edge_labels[iter->second].push_back(iter->second);
+ }
+ */}
+
+ EdgeLabelHandler(Graph &new_graph, Graph &old_graph)
+ : GraphActionHandler<Graph>(new_graph, "EdgePositionHandler"),
+ new_graph_(new_graph),
+ old_graph_(old_graph) {
+ }
+
+ void FillLabels(const map<EdgeId, EdgeId> &from_resolve) {
+ for (auto iter = from_resolve.begin(); iter != from_resolve.end();
+ ++iter) {
+ if (edge_inclusions.find(iter->second) == edge_inclusions.end()) {
+ set<EdgeId> tmp;
+ edge_inclusions.insert(make_pair(iter->second, tmp));
+ }
+ edge_inclusions.find(iter->second)->second.insert(iter->first);
+
+ if (edge_labels.find(iter->first) == edge_labels.end()) {
+ vector<EdgeId> tmp;
+ edge_labels.insert(make_pair(iter->first, tmp));
+ }
+ edge_labels[iter->first].push_back(iter->second);
+ }
+ }
+
+ virtual ~EdgeLabelHandler() {
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ TRACE("Handle glue");
+ if (edge_labels[edge1] != edge_labels[edge2])
+            WARN("Gluing two differently labeled edges at this step is not a good idea! EdgeLabelHandler can fail on such an operation");
+ vector<EdgeId> tmp;
+        for (size_t i = 0; i < edge_labels[edge1].size(); i++) {
+            edge_inclusions.find(edge_labels[edge1][i])->second.insert(new_edge);
+            edge_inclusions.find(edge_labels[edge1][i])->second.erase(edge1);
+            tmp.push_back(edge_labels[edge1][i]);
+        }
+        //erase the old records only after all their labels have been transferred;
+        //erasing inside the loops would empty edge_labels[edge1]/[edge2] after the first iteration
+        edge_labels.erase(edge1);
+        for (size_t i = 0; i < edge_labels[edge2].size(); i++) {
+            edge_inclusions.find(edge_labels[edge2][i])->second.insert(new_edge);
+            edge_inclusions.find(edge_labels[edge2][i])->second.erase(edge2);
+        }
+        edge_labels.erase(edge2);
+
+ edge_labels.insert(make_pair(new_edge, tmp));
+
+ }
+
+ virtual void HandleSplit(EdgeId /*oldEdge*/, EdgeId /*newEdge1*/, EdgeId /*newEdge2*/) {
+        WARN("EdgeLabelHandler does not support splits");
+ }
+
+ virtual void HandleMerge(const vector<EdgeId> &oldEdges, EdgeId newEdge) {
+ TRACE("HandleMerge by edge labels handler");
+ size_t n = oldEdges.size();
+ vector<EdgeId> tmp;
+ for (size_t j = 0; j < n; j++) {
+ TRACE("Edge " << oldEdges[j] << " was labeled by " << edge_labels[oldEdges[j]]);
+ for (size_t i = 0; i < edge_labels[oldEdges[j]].size(); i++) {
+ edge_inclusions[edge_labels[oldEdges[j]][i]].insert(newEdge);
+ edge_inclusions[edge_labels[oldEdges[j]][i]].erase(oldEdges[j]);
+ tmp.push_back(edge_labels[oldEdges[j]][i]);
+ }
+ edge_labels.erase(oldEdges[j]);
+ }
+ if (edge_labels.find(newEdge) != edge_labels.end()) {
+ DEBUG("Unexpected finding of new edge labels");
+        }
+ edge_labels[newEdge] = tmp;
+
+ }
+
+ /*
+ virtual void HandleAdd(VertexId v) {
+ AddVertexIntId(v);
+ }
+ virtual void HandleDelete(VertexId v) {
+ ClearVertexId(v);
+ }
+ */
+ virtual void HandleAdd(EdgeId e) {
+ TRACE("Add edge " << e);
+
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ for (size_t i = 0; i < edge_labels[e].size(); i++) {
+ edge_inclusions[edge_labels[e][i]].erase(e);
+ }
+ edge_labels.erase(e);
+ }
+
+ std::string str(EdgeId edgeId) const {
+ std::stringstream ss;
+
+ auto it = edge_labels.find(edgeId);
+ if (it != edge_labels.end()) {
+ TRACE("Number of labels " << it->second.size());
+ for (auto label_it = it->second.begin(), end = it->second.end();
+ label_it != end; ++label_it) {
+ ss << this->g().str(*label_it) << "\\n";
+ }
+ }
+ return ss.str();
+ }
+
+ vector<pair<EdgeId, size_t> > resolvedPositions(EdgeId old_edge, size_t position_on_edge) {
+ vector<pair<EdgeId, size_t> > res;
+ for (auto it = edge_inclusions[old_edge].begin(); it != edge_inclusions[old_edge].end(); it++) {
+ EdgeId cur_edge = *it;
+ size_t cur_shift = 0;
+ for (size_t i = 0; i < edge_labels[cur_edge].size(); i++) {
+ if (edge_labels[cur_edge][i] == old_edge) {
+ res.push_back(make_pair(cur_edge, cur_shift + position_on_edge));
+ }
+ cur_shift += old_graph_.length(edge_labels[cur_edge][i]);
+ }
+ }
+ return res;
+ }
+
+};
+
+template<class Graph>
+class EdgesLabelsGraphLabeler : public GraphLabeler<Graph> {
+
+protected:
+ typedef GraphLabeler<Graph> super;
+ typedef typename super::EdgeId EdgeId;
+ typedef typename super::VertexId VertexId;
+ Graph &g_;
+public:
+ EdgeLabelHandler<Graph> &EdgesLabels;
+
+ EdgesLabelsGraphLabeler(Graph &g, EdgeLabelHandler<Graph> &EdgesLab)
+ : g_(g),
+ EdgesLabels(EdgesLab) {
+ }
+
+ virtual std::string label(VertexId vertexId) const {
+ return g_.str(vertexId);
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ return EdgesLabels.str(edgeId) + ": " + g_.str(edgeId);
+ }
+
+ virtual ~EdgesLabelsGraphLabeler() {
+        TRACE("~EdgesLabelsGraphLabeler");
+ }
+
+};
+}
+
+#endif /* EDGE_LABELS_HANDLER_HPP_ */
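
EdgeLabelHandler above keeps two synchronized maps: edge_labels (new edge -> the old edges it was built from) and edge_inclusions (old edge -> the new edges that currently contain it), and HandleMerge has to update both consistently. A minimal sketch of that bookkeeping with int ids standing in for the graph types:

    #include <iostream>
    #include <map>
    #include <set>
    #include <vector>

    int main() {
        std::map<int, std::vector<int>> edge_labels;   // new edge -> old edges it is built from
        std::map<int, std::set<int>> edge_inclusions;  // old edge -> new edges containing it

        // Initial state: new edges 10 and 11 carry old labels 1 and 2 respectively.
        edge_labels[10] = {1};  edge_inclusions[1] = {10};
        edge_labels[11] = {2};  edge_inclusions[2] = {11};

        // Merge new edges {10, 11} into new edge 12: concatenate labels, repoint inclusions.
        std::vector<int> old_edges = {10, 11};
        int new_edge = 12;
        std::vector<int> merged;
        for (int oe : old_edges) {
            for (int label : edge_labels[oe]) {
                edge_inclusions[label].insert(new_edge);
                edge_inclusions[label].erase(oe);
                merged.push_back(label);
            }
            edge_labels.erase(oe);
        }
        edge_labels[new_edge] = merged;

        for (int label : edge_labels[new_edge]) std::cout << label << " ";  // prints: 1 2
        std::cout << "\n";
        return 0;
    }
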
diff --git a/src/common/assembly_graph/handlers/edges_position_handler.hpp b/src/common/assembly_graph/handlers/edges_position_handler.hpp
new file mode 100644
index 0000000..c3b4c4a
--- /dev/null
+++ b/src/common/assembly_graph/handlers/edges_position_handler.hpp
@@ -0,0 +1,212 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * edges_position_handler.hpp
+ *
+ * Created on: 22.07.2011
+ *
+ */
+
+#ifndef EDGES_POSITION_HANDLER_HPP_
+#define EDGES_POSITION_HANDLER_HPP_
+
+//#include "utils.hpp"
+#include "utils/simple_tools.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "assembly_graph/core/action_handlers.hpp"
+
+namespace omnigraph {
+
+struct EdgePosition {
+ string contigId;
+ MappingRange mr;
+ EdgePosition(string _contigId, MappingRange _mr) : contigId(_contigId), mr(_mr) {
+ }
+
+ EdgePosition() {
+ }
+};
+
+inline ostream& operator <<(ostream& os, const EdgePosition& ep) {
+ return os << ep.contigId << " " << ep.mr;
+}
+
+template<class Graph>
+class EdgesPositionHandler: public GraphActionHandler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ size_t max_mapping_gap_;
+ size_t max_gap_diff_;
+ map<EdgeId, map<string, std::set<MappingRange>>> edges_positions_;
+ //TODO extract set<MappingRange> as a storage class
+
+ MappingRange EraseAndExtract(set<MappingRange> &ranges, set<MappingRange>::iterator &position, const MappingRange &new_pos) {
+        //copy the range before erasing it: a reference into the set would dangle after erase()
+        auto old_pos = *position;
+ if(old_pos.IntersectLeftOf(new_pos) || old_pos.StrictlyContinuesWith(new_pos, max_mapping_gap_, max_gap_diff_)) {
+ ranges.erase(position);
+ return old_pos.Merge(new_pos);
+ } else if(new_pos.IntersectLeftOf(old_pos) || new_pos.StrictlyContinuesWith(old_pos, max_mapping_gap_, max_gap_diff_)) {
+ ranges.erase(position);
+ return new_pos.Merge(old_pos);
+ } else {
+ return new_pos;
+ }
+ }
+
+public:
+ MappingRange EraseAndExtract(set<MappingRange> &ranges, MappingRange new_pos) {
+ auto it = ranges.lower_bound(new_pos);
+ if(it != ranges.end()) {
+ new_pos = EraseAndExtract(ranges, it, new_pos);
+ it = ranges.lower_bound(new_pos);
+ }
+ if(it != ranges.begin()) {
+ new_pos = EraseAndExtract(ranges, --it, new_pos);
+ }
+ return new_pos;
+ }
+
+ set<MappingRange> GetEdgePositions(EdgeId edge, string contig_id) const {
+ VERIFY(this->IsAttached());
+ auto edge_it = edges_positions_.find(edge);
+ if(edge_it == edges_positions_.end())
+ return set<MappingRange>();
+ const auto& positions = edge_it->second;
+ auto it = positions.find(contig_id);
+ if(it == positions.end())
+ return set<MappingRange>();
+ else
+ return it->second;
+ }
+
+ vector<EdgePosition> GetEdgePositions(EdgeId edge) const {
+ VERIFY(this->IsAttached());
+ auto edge_it = edges_positions_.find(edge);
+ if(edge_it == edges_positions_.end())
+ return vector<EdgePosition>();
+ vector<EdgePosition> result;
+ for(auto it = edge_it->second.begin(); it != edge_it->second.end(); ++it) {
+ for(auto pos_it = it->second.begin(); pos_it != it->second.end(); ++pos_it) {
+ result.push_back(EdgePosition(it->first, *pos_it));
+ }
+ }
+ return result;
+ }
+
+ void AddEdgePosition(EdgeId edge, string contig_id, size_t start, size_t end, size_t m_start, size_t m_end) {
+ VERIFY(this->IsAttached());
+ AddEdgePosition(edge, contig_id, MappingRange(start, end, m_start, m_end));
+ }
+
+ void AddEdgePosition(EdgeId edge, string contig_id, MappingRange new_pos) {
+ VERIFY(this->IsAttached());
+ if(new_pos.empty())
+ return;
+ set<MappingRange> &new_set = edges_positions_[edge][contig_id];
+ new_pos = EraseAndExtract(new_set, new_pos);
+ new_set.insert(new_pos);
+ }
+
+ void AddAndShiftEdgePositions(EdgeId edge, const map<string, set<MappingRange>> &contig_map, int shift = 0) {
+ VERIFY(this->IsAttached());
+ for(auto contig_it = contig_map.begin(); contig_it != contig_map.end(); ++contig_it) {
+ for(auto it = contig_it->second.begin(); it != contig_it->second.end(); ++it) {
+ AddEdgePosition(edge, contig_it->first, it->Shift(shift).Fit(this->g().length(edge)));
+ }
+ }
+ }
+
+ template<typename Iter>
+ void AddEdgePositions(EdgeId edge, Iter begin, Iter end) {
+ VERIFY(this->IsAttached());
+ for(auto it = begin; it != end; ++it) {
+ AddEdgePosition(edge, it->contigId, it->mr);
+ }
+ }
+
+ std::string str(EdgeId edge) const {
+ VERIFY(this->IsAttached());
+ std::stringstream ss;
+ vector<EdgePosition> positions = GetEdgePositions(edge);
+ size_t counter = 0;
+ for (auto pos_it = positions.begin(), end = positions.end(); pos_it != end; ++pos_it) {
+ ss << "(" << pos_it->contigId << ": " << pos_it->mr << ")\\n";
+ counter++;
+ if(counter > 30) {
+                ss << "and many more. " << positions.size() << " positions in total.";
+ break;
+ }
+ }
+ return ss.str();
+ }
+
+ /**
+ * @param max_mapping_gap - maximal difference in positions of
+ * original sequence for two mapping ranges to be merged.
+ * @param max_gap_diff - maximal difference between gaps in initial and mapped ranges for
+ * mapping ranges to be merged
+ */
+ EdgesPositionHandler(const Graph &g, size_t max_mapping_gap, size_t max_gap_diff = 0) :
+ GraphActionHandler<Graph>(g, "EdgePositionHandler"),
+ max_mapping_gap_(max_mapping_gap),
+ max_gap_diff_(max_gap_diff) {
+ }
+
+ virtual ~EdgesPositionHandler() {
+ TRACE("~EdgePositionHandler ok");
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+// TRACE("Handle glue ");
+ auto positions1 = GetEdgePositions(edge1);
+ auto positions2 = GetEdgePositions(edge2);
+ AddEdgePositions(new_edge, positions1.begin(), positions1.end());
+ AddEdgePositions(new_edge, positions2.begin(), positions2.end());
+ }
+
+ virtual void HandleSplit(EdgeId oldEdge, EdgeId newEdge1, EdgeId newEdge2) {
+ if (oldEdge == this->g().conjugate(oldEdge)) {
+ WARN("EdgesPositionHandler does not support self-conjugate splits");
+ return;
+ }
+ if (edges_positions_.count(oldEdge) != 0) {
+ auto contig_map = edges_positions_[oldEdge];
+ AddAndShiftEdgePositions(newEdge1, contig_map, 0);
+ AddAndShiftEdgePositions(newEdge2, contig_map, -int(this->g().length(newEdge1)));
+ }
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& oldEdges, EdgeId newEdge) {
+ int shift = 0;
+ for(auto it = oldEdges.begin(); it != oldEdges.end(); ++it) {
+ if (edges_positions_.count(*it) != 0) {
+ AddAndShiftEdgePositions(newEdge, edges_positions_[*it], shift);
+ }
+ shift += int(this->g().length(*it));
+ }
+ }
+
+ virtual void HandleAdd(EdgeId /*e*/) {
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ edges_positions_.erase(e);
+ }
+
+ void clear() {
+ edges_positions_.clear();
+ }
+
+private:
+ DECL_LOGGER("EdgesPositionHandler");
+};
+
+}
+
+#endif /* EDGES_POSITION_HANDLER_HPP_ */
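
EdgesPositionHandler stores, per edge and contig, a set of mapping ranges, and EraseAndExtract merges a newly added range with neighbours that it overlaps or continues within max_mapping_gap_. A simplified stand-alone sketch of that merging over plain [start, end) intervals; the Range alias, the AddMerged name, and the single max_gap rule are assumptions here (the real handler also checks the gap-difference condition and only inspects the two neighbouring ranges found via lower_bound).

    #include <algorithm>
    #include <iostream>
    #include <set>
    #include <utility>

    using Range = std::pair<int, int>;  // [start, end), ordered by start then end

    // Insert new_pos into ranges, absorbing any range it overlaps or touches within max_gap.
    void AddMerged(std::set<Range>& ranges, Range new_pos, int max_gap) {
        auto touches = [max_gap](const Range& a, const Range& b) {
            return a.second + max_gap >= b.first && b.second + max_gap >= a.first;
        };
        for (auto it = ranges.begin(); it != ranges.end();) {
            if (touches(*it, new_pos)) {
                new_pos.first = std::min(new_pos.first, it->first);
                new_pos.second = std::max(new_pos.second, it->second);
                it = ranges.erase(it);  // merge into new_pos, then drop the absorbed range
            } else {
                ++it;
            }
        }
        ranges.insert(new_pos);
    }

    int main() {
        std::set<Range> ranges = {{0, 100}, {400, 500}};
        AddMerged(ranges, {90, 200}, 5);   // overlaps [0,100) -> becomes [0,200)
        AddMerged(ranges, {203, 300}, 5);  // within gap 5 of [0,200) -> becomes [0,300)
        for (const auto& r : ranges) std::cout << "[" << r.first << ", " << r.second << ") ";
        std::cout << "\n";                  // prints: [0, 300) [400, 500)
        return 0;
    }
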
diff --git a/src/common/assembly_graph/handlers/id_track_handler.hpp b/src/common/assembly_graph/handlers/id_track_handler.hpp
new file mode 100644
index 0000000..12ab12b
--- /dev/null
+++ b/src/common/assembly_graph/handlers/id_track_handler.hpp
@@ -0,0 +1,110 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <unordered_map>
+//#include "utils.hpp"
+#include "visualization/graph_labeler.hpp"
+#include "utils/simple_tools.hpp"
+#include "assembly_graph/core/action_handlers.hpp"
+using namespace omnigraph;
+
+namespace omnigraph {
+template<class Graph>
+class GraphElementFinder : public GraphActionHandler<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ unordered_map<size_t, VertexId> id2vertex_;
+ unordered_map<size_t, EdgeId> id2edge_;
+
+public:
+ GraphElementFinder(const Graph &graph) : GraphActionHandler<Graph>(graph, "Graph element finder") {
+ }
+
+ virtual ~GraphElementFinder() {
+ }
+
+ virtual void HandleAdd(EdgeId e) {
+#pragma omp critical
+ {
+ id2edge_[e.int_id()] = e;
+ }
+ }
+
+ virtual void HandleAdd(VertexId v) {
+#pragma omp critical
+ {
+ id2vertex_[v.int_id()] = v;
+ }
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ id2edge_[e.int_id()] = e;
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ id2vertex_[v.int_id()] = v;
+ }
+
+ VertexId ReturnVertexId(size_t id) const {
+ auto it = id2vertex_.find(id);
+ if(it == id2vertex_.end())
+ return VertexId();
+ else
+ return it->second;
+ }
+
+ EdgeId ReturnEdgeId(size_t id) const {
+ auto it = id2edge_.find(id);
+ if(it == id2edge_.end())
+ return EdgeId();
+ else
+ return it->second;
+ }
+
+ void Init() {
+ for(auto it = this->g().begin(); it != this->g().end(); ++it) {
+ HandleAdd(*it);
+ for(auto eit = this->g().OutgoingEdges(*it).begin(); eit != this->g().OutgoingEdges(*it).end(); ++eit) {
+ HandleAdd(*eit);
+ }
+ }
+ }
+};
+
+template<class VertexId, class EdgeId>
+class BaseIdTrackHandler {
+public:
+ BaseIdTrackHandler() {
+ }
+
+ size_t ReturnIntId(EdgeId e) const {
+ return e.int_id();
+ }
+
+ size_t ReturnIntId(VertexId v) const {
+ return v.int_id();
+ }
+};
+
+template<class Graph>
+class IdTrackHandler : public BaseIdTrackHandler<typename Graph::VertexId, typename Graph::EdgeId> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph &graph_;
+public:
+ IdTrackHandler(const Graph& g) : graph_(g) {
+ }
+
+ ~IdTrackHandler() {
+ }
+};
+
+}
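
GraphElementFinder above is essentially an id -> element registry that is filled from parallel regions, hence the omp critical sections around insertions. A stand-alone sketch of the same registry idea using a std::mutex and int ids; the class and method names here are illustrative, not SPAdes API.

    #include <iostream>
    #include <mutex>
    #include <thread>
    #include <unordered_map>

    class ElementFinder {
        std::unordered_map<size_t, int> id2element_;
        mutable std::mutex mutex_;  // plays the role of the omp critical section
    public:
        void HandleAdd(size_t id, int element) {
            std::lock_guard<std::mutex> lock(mutex_);
            id2element_[id] = element;
        }
        int ReturnElement(size_t id) const {
            std::lock_guard<std::mutex> lock(mutex_);
            auto it = id2element_.find(id);
            return it == id2element_.end() ? -1 : it->second;  // -1 stands in for a default-constructed element
        }
    };

    int main() {
        ElementFinder finder;
        std::thread t1([&]() { for (size_t i = 0; i < 100; i += 2) finder.HandleAdd(i, int(i) * 10); });
        std::thread t2([&]() { for (size_t i = 1; i < 100; i += 2) finder.HandleAdd(i, int(i) * 10); });
        t1.join();
        t2.join();
        std::cout << finder.ReturnElement(42) << "\n";  // prints 420
        return 0;
    }
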
diff --git a/src/common/assembly_graph/paths/bidirectional_path.cpp b/src/common/assembly_graph/paths/bidirectional_path.cpp
new file mode 100644
index 0000000..b9d45f4
--- /dev/null
+++ b/src/common/assembly_graph/paths/bidirectional_path.cpp
@@ -0,0 +1,21 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * bidirectional_path.cpp
+ *
+ * Created on: Jun 25, 2015
+ * Author: andrey
+ */
+
+#include "utils/standard_base.hpp"
+#include "assembly_graph/paths/bidirectional_path.hpp"
+
+namespace path_extend {
+
+std::atomic<uint64_t> BidirectionalPath::path_id_{0};
+
+}
diff --git a/src/common/assembly_graph/paths/bidirectional_path.hpp b/src/common/assembly_graph/paths/bidirectional_path.hpp
new file mode 100644
index 0000000..9861708
--- /dev/null
+++ b/src/common/assembly_graph/paths/bidirectional_path.hpp
@@ -0,0 +1,1098 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * bidirectional_path.h
+ *
+ * Created on: Nov 14, 2011
+ * Author: andrey
+ */
+#pragma once
+
+#include <atomic>
+#include "assembly_graph/core/graph.hpp"
+#include "assembly_graph/components/connected_component.hpp"
+
+using debruijn_graph::Graph;
+using debruijn_graph::EdgeId;
+using debruijn_graph::VertexId;
+
+namespace path_extend {
+
+class BidirectionalPath;
+
+struct Gap {
+ int gap_;
+ uint32_t trash_previous_;
+ uint32_t trash_current_;
+ Gap(int gap)
+ : gap_(gap), trash_previous_(0), trash_current_(0)
+ { }
+
+ Gap(int gap, uint32_t trash_previous, uint32_t trash_current)
+ : gap_(gap), trash_previous_(trash_previous), trash_current_(trash_current)
+ { }
+};
+
+
+class PathListener {
+public:
+ virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
+ virtual void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
+ virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
+ virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
+ virtual ~PathListener() {
+ }
+};
+
+
+class BidirectionalPath : public PathListener {
+private:
+ static std::atomic<uint64_t> path_id_;
+
+
+public:
+ BidirectionalPath(const Graph& g)
+ : g_(g),
+ data_(),
+ conj_path_(NULL),
+ cumulative_len_(),
+ gap_len_(),
+ listeners_(),
+ id_(path_id_++),
+ weight_(1.0),
+ has_overlaped_begin_(false),
+ has_overlaped_end_(false),
+ overlap_(false) {
+ }
+
+ BidirectionalPath(const Graph& g, const std::vector<EdgeId>& path)
+ : BidirectionalPath(g) {
+ for (size_t i = 0; i < path.size(); ++i) {
+ PushBack(path[i]);
+ }
+ RecountLengths();
+ }
+
+ BidirectionalPath(const Graph& g, EdgeId startingEdge)
+ : BidirectionalPath(g) {
+ PushBack(startingEdge);
+ }
+
+ BidirectionalPath(const BidirectionalPath& path)
+ : g_(path.g_),
+ data_(path.data_),
+ conj_path_(NULL),
+ cumulative_len_(path.cumulative_len_),
+ gap_len_(path.gap_len_),
+ listeners_(),
+ id_(path_id_++),
+ weight_(path.weight_),
+ has_overlaped_begin_(path.has_overlaped_begin_),
+ has_overlaped_end_(path.has_overlaped_end_),
+ overlap_(path.overlap_) {
+ }
+
+public:
+ void Subscribe(PathListener * listener) {
+ listeners_.push_back(listener);
+ }
+
+ void Unsubscribe(PathListener * listener) {
+ for (auto it = listeners_.begin(); it != listeners_.end(); ++it) {
+ if (*it == listener) {
+ listeners_.erase(it);
+ break;
+ }
+ }
+ }
+
+ void SetConjPath(BidirectionalPath* path) {
+ conj_path_ = path;
+ }
+
+ const BidirectionalPath* GetConjPath() const {
+ return conj_path_;
+ }
+
+ BidirectionalPath* GetConjPath() {
+ return conj_path_;
+ }
+
+ void SetWeight(float w) {
+ weight_ = w;
+ }
+
+ double GetWeight() const {
+ return weight_;
+ }
+
+ size_t Size() const {
+ return data_.size();
+ }
+
+ const Graph& graph() const {
+ return g_;
+ }
+
+ bool Empty() const {
+ return data_.empty();
+ }
+
+ size_t Length() const {
+ if (gap_len_.size() == 0 || cumulative_len_.size() == 0) {
+ return 0;
+ }
+ return cumulative_len_[0] + gap_len_[0].gap_;
+ }
+
+ //TODO iterators forward/reverse
+ EdgeId operator[](size_t index) const {
+ return data_[index];
+ }
+
+ EdgeId At(size_t index) const {
+ return data_[index];
+ }
+
+ EdgeId ReverseAt(size_t index) const {
+ return data_[data_.size() - index - 1];
+ }
+
+
+ // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN)
+ size_t LengthAt(size_t index) const {
+ return cumulative_len_[index];
+ }
+
+ int GapAt(size_t index) const {
+ return gap_len_[index].gap_;
+ }
+
+ uint32_t TrashCurrentAt(size_t index) const {
+ return gap_len_[index].trash_current_;
+ }
+
+ uint32_t TrashPreviousAt(size_t index) const {
+ return gap_len_[index].trash_previous_;
+ }
+
+ size_t GetId() const {
+ return id_;
+ }
+
+ EdgeId Back() const {
+ return data_.back();
+ }
+
+ EdgeId Front() const {
+ return data_.front();
+ }
+
+ void PushBack(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
+ data_.push_back(e);
+ Gap gap_struct(gap, trash_previous, trash_current);
+ gap_len_.push_back(gap_struct);
+ IncreaseLengths(g_.length(e), gap_struct);
+ NotifyBackEdgeAdded(e, gap_struct);
+ }
+
+ void PushBack(EdgeId e, Gap gap) {
+ data_.push_back(e);
+ gap_len_.push_back(gap);
+ IncreaseLengths(g_.length(e), gap);
+ NotifyBackEdgeAdded(e, gap);
+ }
+
+ void PushBack(const BidirectionalPath& path) {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ PushBack(path.At(i), path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
+ }
+ }
+
+ void PopBack() {
+ if (data_.empty()) {
+ return;
+ }
+ EdgeId e = data_.back();
+ DecreaseLengths();
+ gap_len_.pop_back();
+ data_.pop_back();
+ NotifyBackEdgeRemoved(e);
+ }
+
+ void PopBack(size_t count) {
+ for (size_t i = 0; i < count; ++i) {
+ PopBack();
+ }
+ }
+
+ void Clear() {
+ while (!Empty()) {
+ PopBack();
+ }
+ }
+
+ virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, int) {
+ }
+
+ virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, Gap) {
+ }
+
+
+ virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, int gap) {
+ PushFront(g_.conjugate(e), gap);
+ }
+
+ virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, Gap gap) {
+ PushFront(g_.conjugate(e), gap);
+ }
+
+ virtual void FrontEdgeRemoved(EdgeId, BidirectionalPath*) {
+ }
+
+ virtual void BackEdgeRemoved(EdgeId, BidirectionalPath *) {
+ PopFront();
+ }
+
+ int FindFirst(EdgeId e) const {
+ for (size_t i = 0; i < Size(); ++i) {
+ if (data_[i] == e) {
+ return (int) i;
+ }
+ }
+ return -1;
+ }
+
+ int FindLast(EdgeId e) const {
+ for (int i = (int) Size() - 1; i >= 0; --i) {
+ if (data_[i] == e) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ bool Contains(EdgeId e) const {
+ return FindFirst(e) != -1;
+ }
+
+ bool Contains(VertexId v) const {
+ for(auto edge : data_) {
+ if(g_.EdgeEnd(edge) == v || g_.EdgeStart(edge) == v ) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ vector<size_t> FindAll(EdgeId e, size_t start = 0) const {
+ vector<size_t> result;
+ for (size_t i = start; i < Size(); ++i) {
+ if (data_[i] == e) {
+ result.push_back(i);
+ }
+ }
+ return result;
+ }
+
+ bool CompareFrom(size_t from, const BidirectionalPath& sample) const {
+ if (from + sample.Size() > Size()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < sample.Size(); ++i) {
+ if (At(from + i) != sample[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ size_t CommonEndSize(const BidirectionalPath& p) const {
+ if (p.Size() == 0) {
+ return 0;
+ }
+ std::vector<size_t> begins = FindAll(p.At(0));
+ for (size_t i = 0; i < begins.size(); ++i) {
+ size_t it1 = begins[i];
+ size_t it2 = 0;
+ while (it2 < p.Size() and At(it1) == p.At(it2)) {
+ it1++;
+ it2++;
+ if (it1 == Size()) {
+ return it2;
+ }
+ }
+ }
+ return 0;
+ }
+
+ size_t OverlapEndSize(const BidirectionalPath* path2) const {
+ if (Size() == 0) {
+ return 0;
+ }
+ int last1 = (int) Size() - 1;
+ int max_over = 0;
+ vector<size_t> begins2 = path2->FindAll(At(last1));
+ for (size_t i = 0; i < begins2.size(); ++i) {
+ int begin2 = (int) begins2[i];
+ int cur1 = last1;
+ while (begin2 > 0 && cur1 > 0 && path2->At(begin2 - 1) == At(cur1 - 1)) {
+ cur1--;
+ begin2--;
+ }
+ int over = last1 - cur1 + 1;
+ if (begin2 == 0 && cur1 > 0 && over > max_over) {
+ max_over = over;
+ }
+ }
+ return (size_t) max_over;
+ }
+
+ int FindFirst(const BidirectionalPath& path, size_t from = 0) const {
+ if (path.Size() > Size()) {
+ return -1;
+ }
+ for (size_t i = from; i <= Size() - path.Size(); ++i) {
+ if (CompareFrom(i, path)) {
+ return (int) i;
+ }
+ }
+ return -1;
+ }
+//TODO: Why just naive search?
+ int FindLast(const BidirectionalPath& path) const {
+ if (path.Size() > Size()) {
+ return -1;
+ }
+ for (int i = (int) (Size() - path.Size()); i >= 0; --i) {
+ if (CompareFrom((size_t) i, path)) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ bool Contains(const BidirectionalPath& path) const {
+ return FindFirst(path) != -1;
+ }
+
+ bool Equal(const BidirectionalPath& path) const {
+ return operator==(path);
+ }
+
+ bool operator==(const BidirectionalPath& path) const {
+ return Size() == path.Size() && CompareFrom(0, path);
+ }
+
+ bool operator!=(const BidirectionalPath& path) const {
+ return !operator==(path);
+ }
+
+ void CheckConjugateEnd(size_t max_repeat_length) {
+ size_t prev_size = 0;
+ while (prev_size != Size()) {
+ prev_size = Size();
+ FindConjEdges(max_repeat_length);
+ }
+ }
+
+ size_t GetComponent(const debruijn_graph::ConnectedComponentCounter &component_counter) const {
+ std::unordered_map <size_t, size_t> component_sizes;
+ for (size_t i = 0; i < this->Size(); i++) {
+ auto e = this->At(i);
+ size_t comp_id = component_counter.GetComponent(e);
+ if (component_sizes.find(comp_id) == component_sizes.end())
+ component_sizes[comp_id] = 0;
+ component_sizes[comp_id] += g_.length(e);
+ }
+ size_t ans = 0;
+ size_t maxans = 0;
+ for (auto pp: component_sizes) {
+ if (pp.second > maxans) {
+ ans = pp.first;
+ maxans = pp.second;
+ }
+ }
+ return ans;
+ }
+
+ void FindConjEdges(size_t max_repeat_length) {
+ for (size_t begin_pos = 0; begin_pos < Size(); ++begin_pos) {
+ size_t begin = begin_pos;
+ vector<size_t> conj_pos = FindAll(g_.conjugate(At(begin_pos)), begin + 1);
+ for (auto end_pos = conj_pos.rbegin(); end_pos != conj_pos.rend(); ++end_pos) {
+ VERIFY(*end_pos < Size());
+ size_t end = *end_pos;
+ if (end <= begin) {
+ continue;
+ }
+ while (begin < end && At(begin) == g_.conjugate(At(end))) {
+ begin++;
+ end--;
+ }
+ DEBUG("Found palindromic fragment from " << begin_pos << " to " << *end_pos);
+ Print();
+ VERIFY(*end_pos < Size());
+ size_t tail_size = Size() - *end_pos - 1;
+ size_t head_size = begin_pos;
+ size_t palindrom_half_size = begin - begin_pos;
+ size_t head_len = Length() - LengthAt(begin_pos);
+ size_t tail_len = *end_pos < Size() - 1 ? LengthAt(*end_pos + 1) : 0;
+//TODO : this is not true in case of gaps inside the palindrom_len;
+ size_t palindrom_len = (size_t) max((int) LengthAt(begin_pos) - (int) LengthAt(begin), 0);
+ size_t between = (size_t) max(0, (int) LengthAt(begin) - (int) (end < Size() - 1 ? LengthAt(end + 1) : 0));
+ DEBUG("tail len " << tail_len << " head len " << head_len << " palindrom_len "<< palindrom_len << " between " << between);
+ if (palindrom_len <= max_repeat_length) {
+ if (palindrom_len < head_len && palindrom_len < tail_len) {
+ DEBUG("too big head and end");
+ continue;
+ }
+ if (between > palindrom_len) {
+ DEBUG("too big part between");
+ continue;
+ }
+ }
+ bool delete_tail = tail_size < head_size;
+ if (tail_size == head_size) {
+ delete_tail = tail_len < head_len;
+ }
+ if (delete_tail) {
+ PopBack(tail_size + palindrom_half_size);
+ DEBUG("Deleting tail because of palindrom removal");
+ return;
+ } else {
+ GetConjPath()->PopBack(head_size + palindrom_half_size);
+ DEBUG("Deleting head because of palindrom removal");
+ return;
+ }
+ }
+ }
+ }
+
+ BidirectionalPath SubPath(size_t from, size_t to) const {
+ BidirectionalPath result(g_);
+ for (size_t i = from; i < min(to, Size()); ++i) {
+ result.PushBack(data_[i], gap_len_[i]);
+ }
+ return result;
+ }
+
+ BidirectionalPath SubPath(size_t from) const {
+ return SubPath(from, Size());
+ }
+
+ double Coverage() const {
+ double cov = 0.0;
+
+ for (size_t i = 0; i < Size(); ++i) {
+ cov += g_.coverage(data_[i]) * (double) g_.length(data_[i]);
+ }
+ return cov / (double) Length();
+ }
+
+ BidirectionalPath Conjugate() const {
+ BidirectionalPath result(g_);
+ if (Empty()) {
+ return result;
+ }
+ result.PushBack(g_.conjugate(Back()), 0);
+ for (int i = ((int) Size()) - 2; i >= 0; --i) {
+ result.PushBack(g_.conjugate(data_[i]), gap_len_[i + 1].gap_ + gap_len_[i + 1].trash_current_ - gap_len_[i + 1].trash_previous_, gap_len_[i + 1].trash_current_, gap_len_[i + 1].trash_previous_);
+ }
+
+ return result;
+ }
+
+ vector<EdgeId> ToVector() const {
+ return vector<EdgeId>(data_.begin(), data_.end());
+ }
+
+ bool CameToInterstrandBulge() const {
+ if (Empty())
+ return false;
+
+ EdgeId lastEdge = Back();
+ VertexId lastVertex = g_.EdgeEnd(lastEdge);
+
+ if (g_.OutgoingEdgeCount(lastVertex) == 2) {
+ vector<EdgeId> bulgeEdges(g_.out_begin(lastVertex), g_.out_end(lastVertex));
+ VertexId nextVertex = g_.EdgeEnd(bulgeEdges[0]);
+
+ if (bulgeEdges[0] == g_.conjugate(bulgeEdges[1]) && nextVertex == g_.EdgeEnd(bulgeEdges[1]) && g_.CheckUniqueOutgoingEdge(nextVertex)
+ && *(g_.out_begin(nextVertex)) == g_.conjugate(lastEdge)) {
+
+ DEBUG("Came to interstrand bulge " << g_.int_id(lastEdge));
+ return true;
+ }
+ }
+ return false;
+ }
+
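+    //True if the last edge itself is one side of an interstrand bulge, i.e. it runs parallel to its own conjugate.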
+ bool IsInterstrandBulge() const {
+ if (Empty())
+ return false;
+
+ EdgeId lastEdge = Back();
+ VertexId lastVertex = g_.EdgeEnd(lastEdge);
+ VertexId prevVertex = g_.EdgeStart(lastEdge);
+
+ if (g_.OutgoingEdgeCount(prevVertex) == 2 && g_.IncomingEdgeCount(lastVertex) == 2 && g_.CheckUniqueOutgoingEdge(lastVertex)
+ && g_.CheckUniqueIncomingEdge(prevVertex) && *(g_.in_begin(prevVertex)) == g_.conjugate(*(g_.out_begin(lastVertex)))) {
+
+ vector<EdgeId> bulgeEdges(g_.out_begin(prevVertex), g_.out_end(prevVertex));
+ EdgeId bulgeEdge = bulgeEdges[0] == lastEdge ? bulgeEdges[1] : bulgeEdges[0];
+
+ if (bulgeEdge == g_.conjugate(lastEdge)) {
+ DEBUG("In interstrand bulge " << g_.int_id(lastEdge));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void Print() const {
+ DEBUG("Path " << id_);
+ DEBUG("Length " << Length());
+ DEBUG("Weight " << weight_);
+ DEBUG("#, edge, length, gap length, trash length, total length, total length from begin");
+ for (size_t i = 0; i < Size(); ++i) {
+ DEBUG(i << ", " << g_.int_id(At(i)) << ", "
+ << g_.length(At(i)) << ", " << GapAt(i) << ", "
+ << TrashPreviousAt(i) << "-" << TrashCurrentAt(i)
+ << ", " << LengthAt(i) << ", "
+ << ((Length() < LengthAt(i)) ? 0 : Length() - LengthAt(i)));
+ }
+ }
+
+ void PrintInString() const {
+ stringstream str;
+ for (size_t i = 0; i < Size(); ++i) {
+ str << g_.int_id(At(i)) << " ";
+ }
+ DEBUG(str.str());
+ }
+ void PrintInfo() const {
+ INFO("Path " << id_);
+ INFO("Length " << Length());
+ INFO("Weight " << weight_);
+ INFO("#, edge, length, gap length, total length");
+ for (size_t i = 0; i < Size(); ++i) {
+ INFO(i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i));
+ }
+ }
+
+ void Print(std::ostream& os) {
+ if (Empty()) {
+ return;
+ }
+ os << "Path " << GetId() << endl;
+ os << "Length " << Length() << endl;
+ os << "#, edge, length, gap, total length" << endl;
+ for (size_t i = 0; i < Size(); ++i) {
+ os << i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i) << endl;
+ }
+ }
+
+ void SetOverlapedBeginTo(BidirectionalPath* to) {
+ if (has_overlaped_begin_) {
+ to->SetOverlapBegin();
+ }
+ SetOverlapBegin();
+ to->SetOverlapEnd();
+ }
+
+ void SetOverlapedEndTo(BidirectionalPath* to) {
+ if (has_overlaped_end_) {
+ to->SetOverlapEnd();
+ }
+ SetOverlapEnd();
+ to->SetOverlapBegin();
+ }
+
+ void SetOverlap(bool overlap = true) {
+ overlap_ = overlap;
+ conj_path_->overlap_ = overlap;
+ }
+
+ bool HasOverlapedBegin() const {
+ return has_overlaped_begin_;
+ }
+
+ bool HasOverlapedEnd() const {
+ return has_overlaped_end_;
+ }
+
+ bool IsOverlap() const {
+ return overlap_;
+ }
+
+ void ResetOverlaps() {
+ overlap_ = false;
+ has_overlaped_begin_ = false;
+ has_overlaped_end_ = false;
+ conj_path_->overlap_ = false;
+ conj_path_->has_overlaped_begin_ = false;
+ conj_path_->has_overlaped_end_ = false;
+ }
+private:
+
+ void RecountLengths() {
+ cumulative_len_.clear();
+ size_t currentLength = 0;
+ for (auto iter = data_.rbegin(); iter != data_.rend(); ++iter) {
+ currentLength += g_.length((EdgeId) *iter);
+ cumulative_len_.push_front(currentLength);
+ }
+ }
+
+ void IncreaseLengths(size_t length, Gap gap_struct) {
+ for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
+ *iter += length + gap_struct.gap_ - gap_struct.trash_previous_;
+ }
+ cumulative_len_.push_back(length);
+ }
+
+ void DecreaseLengths() {
+ size_t length = g_.length(data_.back()) + gap_len_.back().gap_ - gap_len_.back().trash_previous_;
+
+ for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
+ *iter -= length;
+ }
+ cumulative_len_.pop_back();
+ }
+
+ void NotifyFrontEdgeAdded(EdgeId e, int gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->FrontEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyFrontEdgeAdded(EdgeId e, Gap gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->FrontEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyBackEdgeAdded(EdgeId e, int gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->BackEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyBackEdgeAdded(EdgeId e, Gap gap) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->BackEdgeAdded(e, this, gap);
+ }
+ }
+
+ void NotifyFrontEdgeRemoved(EdgeId e) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->FrontEdgeRemoved(e, this);
+ }
+ }
+
+ void NotifyBackEdgeRemoved(EdgeId e) {
+ for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
+ (*i)->BackEdgeRemoved(e, this);
+ }
+ }
+
+ void PushFront(EdgeId e, Gap gap) {
+ PushFront(e, gap.gap_ + gap.trash_current_ - gap.trash_previous_, gap.trash_current_, gap.trash_previous_);
+ }
+
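+    //Prepends an edge: the supplied gap is attributed to the former front edge, a zero gap is stored
+    //for the new front edge, and cumulative lengths are updated.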
+ void PushFront(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
+ data_.push_front(e);
+ if (gap_len_.size() > 0) {
+ gap_len_[0].gap_ += gap;
+ gap_len_[0].trash_previous_ += trash_previous;
+ gap_len_[0].trash_current_ += trash_current;
+ }
+ gap_len_.push_front(Gap(0, 0, 0));
+
+ int length = (int) g_.length(e);
+ if (cumulative_len_.empty()) {
+ cumulative_len_.push_front(length);
+ } else {
+ cumulative_len_.push_front(length + cumulative_len_.front() + gap - trash_previous );
+ }
+ NotifyFrontEdgeAdded(e, gap);
+ }
+
+ void PopFront() {
+ EdgeId e = data_.front();
+ if (gap_len_.size() > 1) {
+ gap_len_[1].gap_ = 0;
+ gap_len_[1].trash_previous_ = 0;
+ gap_len_[1].trash_current_ = 0;
+ }
+ data_.pop_front();
+ gap_len_.pop_front();
+
+ cumulative_len_.pop_front();
+ NotifyFrontEdgeRemoved(e);
+ }
+
+ void SetOverlapBegin(bool overlap = true) {
+ if (has_overlaped_begin_ != overlap) {
+ has_overlaped_begin_ = overlap;
+ }
+ if (GetConjPath()->has_overlaped_end_ != overlap) {
+ GetConjPath()->has_overlaped_end_ = overlap;
+ }
+ }
+
+ void SetOverlapEnd(bool overlap = true) {
+ GetConjPath()->SetOverlapBegin(overlap);
+ }
+
+ const Graph& g_;
+ std::deque<EdgeId> data_;
+ BidirectionalPath* conj_path_;
+    std::deque<size_t> cumulative_len_; // cumulative_len_[i] = length from the beginning of the i-th edge to the end of the path, i.e. L(e_i + ... + e_N)
+    std::deque<Gap> gap_len_; // gap_len_[i] is the gap before the i-th edge: e1 - gap2 - e2 - ... - gapN - eN
+ std::vector<PathListener *> listeners_;
+ const uint64_t id_; //Unique ID
+ float weight_;
+ bool has_overlaped_begin_;
+ bool has_overlaped_end_;
+ bool overlap_;
+ DECL_LOGGER("BidirectionalPath");
+};
+
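+//Scans the path from pos (forward or backward) looking for the edge 'end', giving up after roughly
+//2 * gap of accumulated length. Returns the position of 'end' or -1 if it was not reached.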
+inline int SkipOneGap(EdgeId end, const BidirectionalPath& path, int gap, int pos, bool forward) {
+ size_t len = 0;
+ while (pos < (int) path.Size() && pos >= 0 && end != path.At(pos) && (int) len < 2 * gap) {
+ len += path.graph().length(path.At(pos));
+ forward ? pos++ : pos--;
+ }
+ if (pos < (int) path.Size() && pos >= 0 && end == path.At(pos)) {
+ return pos;
+ }
+ return -1;
+}
+
+inline void SkipGaps(const BidirectionalPath& path1, size_t& cur_pos1, int gap1, const BidirectionalPath& path2, size_t& cur_pos2, int gap2, bool use_gaps,
+ bool forward) {
+ if (use_gaps) {
+ if (gap1 > 0 && gap2 <= 0) {
+ int temp2 = SkipOneGap(path1.At(cur_pos1), path2, gap1, (int) cur_pos2, forward);
+ if (temp2 >= 0) {
+ cur_pos2 = (size_t) temp2;
+ }
+ } else if (gap2 > 0 && gap1 <= 0) {
+ int temp1 = SkipOneGap(path2.At(cur_pos2), path1, gap2, (int) cur_pos1, forward);
+ if (temp1 >= 0) {
+ cur_pos1 = (size_t) temp1;
+ }
+ } else if (gap1 > 0 && gap2 > 0 && gap1 != gap2) {
+ DEBUG("not equal gaps in two paths!!!");
+ }
+ }
+}
+
+
+//Try to ignore multiple loop traversals
+inline size_t FirstNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ int cur_pos1 = (int) pos1;
+ int cur_pos2 = (int) pos2;
+ int gap1 = path1.GapAt(cur_pos1);
+ int gap2 = path2.GapAt(cur_pos2);
+ while (cur_pos1 >= 0 && cur_pos2 >= 0) {
+ if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
+ cur_pos1--;
+ cur_pos2--;
+ } else {
+ DEBUG("Not Equal at " << cur_pos1 << " and " << cur_pos2);
+ return cur_pos1;
+ }
+ if (cur_pos1 >= 0 && cur_pos2 >= 0) {
+ size_t p1 = (size_t) cur_pos1;
+ size_t p2 = (size_t) cur_pos2;
+ SkipGaps(path1, p1, gap1, path2, p2, gap2, use_gaps, false);
+ cur_pos1 = (int) p1;
+ cur_pos2 = (int) p2;
+ gap1 = path1.GapAt(cur_pos1);
+ gap2 = path2.GapAt(cur_pos2);
+ }
+ }
+ DEBUG("Equal!!");
+ return -1UL;
+}
+inline bool EqualBegins(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ DEBUG("Checking for equal begins");
+ return FirstNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
+}
+
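+//Walks both paths forward from the given positions; returns the position in path1 of the first
+//mismatching edge, or -1UL if no mismatch is found before one of the paths ends.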
+inline size_t LastNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ size_t cur_pos1 = pos1;
+ size_t cur_pos2 = pos2;
+ while (cur_pos1 < path1.Size() && cur_pos2 < path2.Size()) {
+ if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
+ cur_pos1++;
+ cur_pos2++;
+ } else {
+ return cur_pos1;
+ }
+ int gap1 = cur_pos1 < path1.Size() ? path1.GapAt(cur_pos1) : 0;
+ int gap2 = cur_pos2 < path2.Size() ? path2.GapAt(cur_pos2) : 0;
+ SkipGaps(path1, cur_pos1, gap1, path2, cur_pos2, gap2, use_gaps, true);
+ }
+ return -1UL;
+}
+
+inline bool EqualEnds(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
+ return LastNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
+}
+
+inline bool PathIdCompare(const BidirectionalPath* p1, const BidirectionalPath* p2) {
+ return p1->GetId() < p2->GetId();
+}
+
+
+
+typedef std::pair<BidirectionalPath*, BidirectionalPath*> PathPair;
+
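+//Orders path pairs by decreasing length of the first path; non-empty paths of equal length are
+//ordered by the id of their front edge.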
+inline bool compare_path_pairs(const PathPair& p1, const PathPair& p2) {
+ if (p1.first->Length() != p2.first->Length() || p1.first->Size() == 0 || p2.first->Size() == 0) {
+ return p1.first->Length() > p2.first->Length();
+ }
+ const Graph& g = p1.first->graph();
+ return g.int_id(p1.first->Front()) < g.int_id(p2.first->Front());
+}
+
+class PathComparator {
+public:
+ bool operator()(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
+ return p1.GetId() < p2.GetId();
+ }
+
+ bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) const {
+ return p1->GetId() < p2->GetId();
+ }
+};
+
+typedef set<BidirectionalPath*, PathComparator> BidirectionalPathSet;
+
+template<class Value>
+using BidirectionalPathMap = map<BidirectionalPath*, Value, PathComparator>;
+
+typedef std::multiset <BidirectionalPath *, PathComparator> BidirectionalPathMultiset;
+
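+//Stores paths together with their conjugates, keeps the two subscribed to each other,
+//and deletes all stored paths in its destructor.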
+class PathContainer {
+
+public:
+
+ typedef std::vector<PathPair> PathContainerT;
+
+ class Iterator : public PathContainerT::iterator {
+ public:
+ Iterator(const PathContainerT::iterator& iter)
+ : PathContainerT::iterator(iter) {
+ }
+ BidirectionalPath* get() const {
+ return this->operator *().first;
+ }
+ BidirectionalPath* getConjugate() const {
+ return this->operator *().second;
+ }
+ };
+
+ class ConstIterator : public PathContainerT::const_iterator {
+ public:
+ ConstIterator(const PathContainerT::const_iterator& iter)
+ : PathContainerT::const_iterator(iter) {
+ }
+ BidirectionalPath* get() const {
+ return this->operator *().first;
+ }
+ BidirectionalPath* getConjugate() const {
+ return this->operator *().second;
+ }
+ };
+
+ PathContainer() {
+ }
+
+ BidirectionalPath& operator[](size_t index) const {
+ return *(data_[index].first);
+ }
+
+ BidirectionalPath* Get(size_t index) const {
+ return data_[index].first;
+ }
+
+ BidirectionalPath* GetConjugate(size_t index) const {
+ return data_[index].second;
+ }
+
+ void DeleteAllPaths() {
+ for (size_t i = 0; i < data_.size(); ++i) {
+ delete data_[i].first;
+ delete data_[i].second;
+ }
+ clear();
+ }
+
+ ~PathContainer() {
+ DeleteAllPaths();
+ }
+
+ size_t size() const {
+ return data_.size();
+ }
+
+ void clear() {
+ data_.clear();
+ }
+
+ void reserve(size_t size) {
+ data_.reserve(size);
+ }
+
+ bool AddPair(BidirectionalPath* p, BidirectionalPath* cp) {
+ p->SetConjPath(cp);
+ cp->SetConjPath(p);
+ p->Subscribe(cp);
+ cp->Subscribe(p);
+ data_.push_back(std::make_pair(p, cp));
+ return true;
+ }
+
+ void SortByLength() {
+ std::stable_sort(data_.begin(), data_.end(), compare_path_pairs);
+ }
+
+ Iterator begin() {
+ return Iterator(data_.begin());
+ }
+
+ Iterator end() {
+ return Iterator(data_.end());
+ }
+
+
+ ConstIterator begin() const {
+ return ConstIterator(data_.begin());
+ }
+
+ ConstIterator end() const {
+ return ConstIterator(data_.end());
+ }
+
+ Iterator erase(Iterator iter) {
+ return Iterator(data_.erase(iter));
+ }
+
+ void print() const {
+ for (size_t i = 0; i < size(); ++i) {
+ Get(i)->Print();
+ GetConjugate(i)->Print();
+ }
+ }
+
+ void FilterEmptyPaths() {
+ DEBUG ("try to delete empty paths");
+ for (Iterator iter = begin(); iter != end();) {
+ if (iter.get()->Size() == 0) {
+ // FIXME: This is trash. PathContainer should own paths
+ delete iter.get();
+ delete iter.getConjugate();
+ iter = erase(iter);
+ } else {
+ ++iter;
+ }
+ }
+ DEBUG("empty paths are removed");
+ }
+
+ void FilterInterstandBulges() {
+        DEBUG("Try to delete paths with interstrand bulges");
+ for (Iterator iter = begin(); iter != end(); ++iter) {
+ if (iter.get()->IsInterstrandBulge()) {
+ iter.get()->PopBack();
+ }
+ if (iter.getConjugate()->IsInterstrandBulge()) {
+ iter.getConjugate()->PopBack();
+ }
+ }
+        DEBUG("deleted paths with interstrand bulges");
+ }
+
+private:
+ std::vector<PathPair> data_;
+
+protected:
+ DECL_LOGGER("BidirectionalPath");
+
+};
+
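+//Greedily aligns path1 (from start_pos1) against path2 (from start_pos2), tolerating up to max_diff
+//of unmatched length; returns the last matched positions in path1 and path2.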
+inline pair<size_t, size_t> ComparePaths(size_t start_pos1, size_t start_pos2, const BidirectionalPath& path1, const BidirectionalPath& path2,
+ size_t max_diff) {
+ path1.Print();
+ path2.Print();
+ if (start_pos1 >= path1.Size() || start_pos2 >= path2.Size()) {
+ return make_pair(start_pos1, start_pos2);
+ }
+ const Graph& g = path1.graph();
+ size_t cur_pos = start_pos1;
+ size_t last2 = start_pos2;
+ size_t last1 = cur_pos;
+ cur_pos++;
+ size_t diff_len = 0;
+ while (cur_pos < path1.Size()) {
+ if (diff_len > max_diff) {
+ return make_pair(last1, last2);
+ }
+ EdgeId e = path1[cur_pos];
+ vector<size_t> poses2 = path2.FindAll(e);
+ bool found = false;
+ for (size_t pos2 = 0; pos2 < poses2.size(); ++pos2) {
+ if (poses2[pos2] > last2) {
+ if (path2.LengthAt(last2) - path2.LengthAt(poses2[pos2]) - g.length(path2.At(last2)) - path2.GapAt(poses2[pos2]) > max_diff) {
+ break;
+ }
+ last2 = poses2[pos2];
+ last1 = cur_pos;
+ DEBUG("found " << cur_pos);
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ diff_len += g.length(e) + path1.GapAt(cur_pos);
+ DEBUG("not found " << cur_pos << " now diff len " << diff_len);
+ } else {
+ diff_len = 0;
+ }
+ cur_pos++;
+ }
+ return make_pair(last1, last2);
+}
+
+inline void DeletePaths(BidirectionalPathSet& paths) {
+ for (auto i = paths.begin(); i != paths.end(); ++i) {
+ delete (*i);
+ }
+}
+
+inline void DeletePaths(vector<BidirectionalPath*>& paths) {
+ for (auto i = paths.begin(); i != paths.end(); ++i) {
+ delete (*i);
+ }
+}
+
+inline void DeleteMapWithPaths(map<EdgeId, BidirectionalPath*> m) {
+ for (auto i = m.begin(); i != m.end(); ++i){
+ delete i->second;
+ }
+}
+
+} // path extend
+
diff --git a/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.cpp b/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.cpp
new file mode 100644
index 0000000..a3a3004
--- /dev/null
+++ b/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.cpp
@@ -0,0 +1,68 @@
+//
+// Created by andrey on 20.01.17.
+//
+
+#include "bidirectional_path_output.hpp"
+
+namespace path_extend {
+
+
+string path_extend::ContigWriter::ToFASTGPathFormat(const BidirectionalPath &path) const {
+ if (path.Empty())
+ return "";
+ string res = ids_.at(path.Front()).short_id_;
+ for (size_t i = 1; i < path.Size(); ++i) {
+ if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i])) {
+ res += ";\n" + ids_.at(path[i]).short_id_;
+ }
+ else {
+ res += "," + ids_.at(path[i]).short_id_;
+ }
+ }
+ return res;
+}
+
+void path_extend::ContigWriter::OutputPaths(const PathContainer &paths,
+ const string &filename_base,
+ bool write_fastg) const {
+ name_generator_->Preprocess(paths);
+ IOContigStorage storage(g_, constructor_, paths);
+
+ INFO("Writing contigs to " << filename_base);
+ io::osequencestream_simple oss(filename_base + ".fasta");
+ std::ofstream os_fastg;
+ if (write_fastg)
+ os_fastg.open((filename_base + ".paths").c_str());
+
+ size_t i = 0;
+ for (const auto& precontig : storage.Storage()) {
+ ++i;
+ std::string contig_id = name_generator_->MakeContigName(i, precontig);
+ oss.set_header(contig_id);
+ oss << precontig.sequence_;
+
+ if (write_fastg) {
+ os_fastg << contig_id << endl;
+ os_fastg << ToFASTGPathFormat(*precontig.path_) << endl;
+ os_fastg << contig_id << "'" << endl;
+ os_fastg << ToFASTGPathFormat(*precontig.path_->GetConjPath()) << endl;
+ }
+ }
+
+ if (write_fastg)
+ os_fastg.close();
+ DEBUG("Contigs written");
+}
+
+
+void path_extend::PathInfoWriter::WritePaths(const PathContainer &paths, const string &filename) const {
+ std::ofstream oss(filename.c_str());
+
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ iter.get()->Print(oss);
+ }
+
+ oss.close();
+}
+
+}
\ No newline at end of file
diff --git a/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp b/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp
new file mode 100644
index 0000000..7de980d
--- /dev/null
+++ b/src/common/assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp
@@ -0,0 +1,60 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+
+#include "io_support.hpp"
+
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+
+class ContigWriter {
+protected:
+ DECL_LOGGER("PathExtendIO")
+
+protected:
+ const Graph& g_;
+ ContigConstructor<Graph> &constructor_;
+ map<EdgeId, ExtendedContigIdT> ids_;
+ shared_ptr<ContigNameGenerator> name_generator_;
+
+ string ToFASTGPathFormat(const BidirectionalPath &path) const;
+
+
+public:
+ ContigWriter(const Graph& g,
+ ContigConstructor<Graph> &constructor,
+ const ConnectedComponentCounter &c_counter,
+ shared_ptr<ContigNameGenerator> name_generator) :
+ g_(g),
+ constructor_(constructor),
+ ids_(),
+ name_generator_(name_generator) {
+ MakeContigIdMap(g_, ids_, c_counter, "NODE");
+ }
+
+ void OutputPaths(const PathContainer &paths,
+ const string &filename_base,
+ bool write_fastg = true) const;
+
+};
+
+
+class PathInfoWriter {
+protected:
+ DECL_LOGGER("PathExtendIO")
+
+public:
+
+ void WritePaths(const PathContainer &paths, const string &filename) const;
+};
+
+}
diff --git a/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp b/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp
new file mode 100644
index 0000000..3d52888
--- /dev/null
+++ b/src/common/assembly_graph/paths/bidirectional_path_io/io_support.cpp
@@ -0,0 +1,177 @@
+//
+// Created by andrey on 23.01.17.
+//
+
+#include "io_support.hpp"
+#include "modules/path_extend/pe_utils.hpp"
+
+namespace path_extend {
+
+void path_extend::TranscriptToGeneJoiner::MakeSet(size_t x) {
+ parents_[x] = x;
+ ranks_[x] = 0;
+}
+
+void path_extend::TranscriptToGeneJoiner::JoinTrees(size_t x, size_t y) {
+ x = FindTree(x);
+ y = FindTree(y);
+ if (x != y) {
+ if (ranks_[x] < ranks_[y])
+ parents_[x] = y;
+ else
+ parents_[y] = x;
+ if (ranks_[x] == ranks_[y])
+ ++ranks_[x];
+ }
+}
+
+void path_extend::TranscriptToGeneJoiner::Init(const PathContainer &paths) {
+ DEBUG("Initializing parents and ranks");
+ parents_.resize(paths.size());
+ ranks_.resize(paths.size());
+
+ size_t path_num = 0;
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter, ++path_num) {
+ path_id_[iter.get()] = path_num;
+ path_id_[iter.getConjugate()] = path_num;
+ MakeSet(path_num);
+ }
+
+ DEBUG("Initialized parents and ranks");
+
+ VERIFY_MSG(path_num == paths.size(), "Path Num " << path_num << " Size " << paths.size())
+}
+
+size_t path_extend::TranscriptToGeneJoiner::FindTree(size_t x) {
+ size_t parent;
+ if (x == parents_[x]) {
+ parent = x;
+ }
+ else {
+ parents_[x] = FindTree(parents_[x]);
+ parent = parents_[x];
+ }
+ return parent;
+}
+
+size_t path_extend::TranscriptToGeneJoiner::GetPathId(BidirectionalPath *path) {
+ return path_id_[path];
+}
+
+void path_extend::TranscriptToGeneJoiner::Construct(const PathContainer &paths) {
+ Init(paths);
+
+ GraphCoverageMap edges_coverage(g_, paths);
+
+ DEBUG("Union trees");
+ //For all edges in coverage map
+ for (auto iterator = edges_coverage.begin(); iterator != edges_coverage.end(); ++iterator) {
+ //Select a path covering an edge
+ EdgeId edge = iterator->first;
+ GraphCoverageMap::MapDataT *edge_paths = iterator->second;
+
+ if (g_.length(edge) > min_edge_len_ && edge_paths->size() > 1) {
+ DEBUG("Long edge " << edge.int_id() << " Paths " << edge_paths->size());
+            //For all other paths covering this edge, join them into a single gene with the first path
+ for (auto it_edge = ++edge_paths->begin(); it_edge != edge_paths->end(); ++it_edge) {
+ size_t first = path_id_[*edge_paths->begin()];
+ size_t next = path_id_[*it_edge];
+ DEBUG("Edge " << edge.int_id() << " First " << first << " Next " << next);
+
+ JoinTrees(first, next);
+ }
+ }
+ }
+}
+
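+//Restores the nucleotide sequence of a path: consecutive edges overlap by k, gaps longer than k are
+//filled with Ns, and trash values trim the sequence at edge junctions.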
+string path_extend::IOContigStorage::ToString(const BidirectionalPath &path) const {
+ stringstream ss;
+ if (path.IsInterstrandBulge() && path.Size() == 1) {
+ ss << constructor_.construct(path.Back()).first.substr(k_, g_.length(path.Back()) - k_);
+ return ss.str();
+ }
+
+ if (!path.Empty()) {
+ ss << constructor_.construct(path[0]).first.substr(0, k_);
+ }
+
+
+ size_t i = 0;
+ while (i < path.Size()) {
+ int gap = i == 0 ? 0 : path.GapAt(i);
+ if (gap > (int) k_) {
+ for (size_t j = 0; j < gap - k_; ++j) {
+ ss << "N";
+ }
+ ss << constructor_.construct(path[i]).first;
+ }
+ else {
+ int overlapLen = (int) k_ - gap;
+ if (overlapLen >= (int) g_.length(path[i]) + (int) k_) {
+ overlapLen -= (int) g_.length(path[i]) + (int) k_;
+ ++i;
+ //skipping overlapping edges
+ while (i < path.Size() && overlapLen >= (int) g_.length(path[i]) + path.GapAt(i)) {
+ overlapLen -= (int) g_.length(path[i]) + path.GapAt(i);
+ ++i;
+ }
+ if (i == path.Size()) {
+ break;
+ }
+
+ overlapLen = overlapLen + (int) k_ - path.GapAt(i);
+
+ if(overlapLen < 0) {
+ for (int j = 0; j < abs(overlapLen); ++j) {
+ ss << "N";
+ }
+ overlapLen = 0;
+ }
+ }
+ auto temp_str = g_.EdgeNucls(path[i]).Subseq(overlapLen).str();
+ if (i != path.Size() - 1) {
+ for (size_t j = 0; j < path.TrashPreviousAt(i + 1); ++j) {
+ temp_str.pop_back();
+ if (temp_str.size() == 0) {
+ break;
+ }
+ }
+ }
+ ss << temp_str;
+ }
+ ++i;
+ }
+ return ss.str();
+}
+
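+//Splits the path at every gap larger than min_gap_ and adds each resulting fragment,
+//together with its conjugate, to result.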
+void path_extend::ScaffoldBreaker::SplitPath(const BidirectionalPath &path, PathContainer &result) const {
+ size_t i = 0;
+
+ while (i < path.Size()) {
+ BidirectionalPath *p = new BidirectionalPath(path.graph(), path[i]);
+ ++i;
+
+ while (i < path.Size() and path.GapAt(i) <= min_gap_) {
+ p->PushBack(path[i], path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
+ ++i;
+ }
+
+ if (i < path.Size()) {
+ DEBUG("split path " << i << " gap " << path.GapAt(i));
+ p->Print();
+ }
+
+ BidirectionalPath *cp = new BidirectionalPath(p->Conjugate());
+ result.AddPair(p, cp);
+ }
+}
+
+void path_extend::ScaffoldBreaker::Break(const PathContainer &paths, PathContainer &result) const {
+ for (auto it = paths.begin(); it != paths.end(); ++it) {
+ SplitPath(*it.get(), result);
+ }
+ result.SortByLength();
+}
+
+}
+
diff --git a/src/common/assembly_graph/paths/bidirectional_path_io/io_support.hpp b/src/common/assembly_graph/paths/bidirectional_path_io/io_support.hpp
new file mode 100644
index 0000000..e46bd42
--- /dev/null
+++ b/src/common/assembly_graph/paths/bidirectional_path_io/io_support.hpp
@@ -0,0 +1,190 @@
+//
+// Created by andrey on 23.01.17.
+//
+
+#pragma once
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "assembly_graph/graph_support/contig_output.hpp"
+#include "assembly_graph/components/connected_component.hpp"
+
+namespace path_extend {
+using namespace debruijn_graph;
+
+
+struct IOContig {
+ std::string sequence_;
+ BidirectionalPath* path_;
+
+ IOContig(const std::string& sequence, BidirectionalPath* path) :
+ sequence_(sequence), path_(path) { }
+};
+
+struct IOContigGreater
+{
+ bool operator()(const IOContig &a, const IOContig &b) const {
+ if (a.sequence_.length() == b.sequence_.length())
+ return math::gr(a.path_->Coverage(), b.path_->Coverage());
+ return a.sequence_.length() > b.sequence_.length();
+ }
+};
+
+class IOContigStorage {
+private:
+ const Graph &g_;
+ ContigConstructor<Graph> &constructor_;
+ size_t k_;
+ vector<IOContig> storage_;
+
+ string ToString(const BidirectionalPath& path) const;
+public:
+ IOContigStorage(const Graph &g, ContigConstructor<Graph> &constructor, const PathContainer &paths):
+ g_(g),
+ constructor_(constructor),
+ k_(g.k()),
+ storage_() {
+
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ BidirectionalPath* path = iter.get();
+ if (path->Length() <= 0)
+ continue;
+ string path_string = ToString(*path);
+ if (path_string.length() >= g.k()) {
+ storage_.emplace_back(path_string, path);
+ }
+ }
+ std::sort(storage_.begin(), storage_.end(), IOContigGreater());
+ }
+
+ const vector<IOContig>& Storage() const {
+ return storage_;
+ }
+};
+
+
+//Finds common long edges in paths and joins the corresponding transcripts into genes
+//Based on disjoint set union (union-find)
+class TranscriptToGeneJoiner {
+private:
+ const Graph &g_;
+ size_t min_edge_len_; //minimal length for joining transcripts into a gene
+
+ map<BidirectionalPath *, size_t, PathComparator> path_id_; //path ids
+    std::vector<size_t> parents_; //node parents in the union-find forest
+ std::vector<size_t> ranks_; //tree depth
+
+
+ void MakeSet(size_t x);
+
+ void JoinTrees(size_t x, size_t y);
+
+ void Init(const PathContainer &paths);
+public:
+ TranscriptToGeneJoiner(const Graph &g, size_t min_edge_len): g_(g), min_edge_len_(min_edge_len) {}
+
+ size_t FindTree(size_t x);
+
+ size_t GetPathId(BidirectionalPath *path);
+
+ void Construct(const PathContainer &paths);
+};
+
+
+
+class ContigNameGenerator {
+public:
+ virtual void Preprocess(const PathContainer& paths) = 0;
+
+ virtual std::string MakeContigName(size_t index, const IOContig &precontig) = 0;
+
+ virtual ~ContigNameGenerator() {
+ }
+};
+
+class DefaultContigNameGenerator: public ContigNameGenerator {
+public:
+ void Preprocess(const PathContainer&) override {}
+
+ std::string MakeContigName(size_t index, const IOContig &precontig) override {
+ return io::MakeContigId(index, precontig.sequence_.length(), precontig.path_->Coverage());
+ }
+};
+
+class PlasmidContigNameGenerator: public ContigNameGenerator {
+ const ConnectedComponentCounter &c_counter_;
+
+public:
+ PlasmidContigNameGenerator(const ConnectedComponentCounter &c_counter): c_counter_(c_counter) {}
+
+ void Preprocess(const PathContainer&) override {}
+
+ std::string MakeContigName(size_t index, const IOContig &precontig) override {
+ EdgeId e = precontig.path_->At(0);
+ size_t component = c_counter_.GetComponent(e);
+ return io::MakeContigComponentId(index, precontig.sequence_.length(), precontig.path_->Coverage(), component);
+ }
+};
+
+class TranscriptNameGenerator: public ContigNameGenerator {
+ TranscriptToGeneJoiner transcript_joiner_;
+
+ unordered_map<size_t, size_t> isoform_num_;
+ unordered_map<size_t, size_t> gene_ids_;
+ size_t gene_num_;
+
+public:
+ TranscriptNameGenerator(const Graph &g, size_t min_edge_len = 300):
+ transcript_joiner_(g, min_edge_len),
+ isoform_num_(),
+ gene_ids_(),
+ gene_num_(0) {
+
+ }
+
+ void Preprocess(const PathContainer& paths) override {
+ transcript_joiner_.Construct(paths);
+ }
+
+ std::string MakeContigName(size_t index, const IOContig &precontig) override {
+ size_t id = transcript_joiner_.GetPathId(precontig.path_);
+ size_t parent_id = transcript_joiner_.FindTree(id);
+ DEBUG("Path " << id << " Parent " << parent_id);
+ if (gene_ids_.find(parent_id) == gene_ids_.end()) {
+ gene_ids_[parent_id] = gene_num_;
+ isoform_num_[parent_id] = 0;
+ gene_num_++;
+ }
+ string contig_id = io::MakeRNAContigId(index, precontig.sequence_.length(), precontig.path_->Coverage(), gene_ids_[parent_id], isoform_num_[parent_id]);
+ isoform_num_[parent_id]++;
+ return contig_id;
+ }
+};
+
+
+inline std::shared_ptr<ContigNameGenerator> MakeContigNameGenerator(config::pipeline_type mode,
+ const conj_graph_pack &gp) {
+ std::shared_ptr<path_extend::ContigNameGenerator> name_generator;
+ if (mode == config::pipeline_type::plasmid)
+ name_generator = make_shared<PlasmidContigNameGenerator>(gp.components);
+ else if (mode == config::pipeline_type::rna)
+ name_generator = make_shared<TranscriptNameGenerator>(gp.g);
+ else
+ name_generator = make_shared<DefaultContigNameGenerator>();
+ return name_generator;
+}
+
+class ScaffoldBreaker {
+private:
+
+ int min_gap_;
+
+ void SplitPath(const BidirectionalPath& path, PathContainer &result) const;
+
+public:
+
+ ScaffoldBreaker(int min_gap): min_gap_(min_gap) {}
+
+ void Break(const PathContainer &paths, PathContainer &result) const;
+};
+
+}
\ No newline at end of file
diff --git a/src/common/assembly_graph/paths/mapping_path.hpp b/src/common/assembly_graph/paths/mapping_path.hpp
new file mode 100644
index 0000000..3551e04
--- /dev/null
+++ b/src/common/assembly_graph/paths/mapping_path.hpp
@@ -0,0 +1,301 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "sequence/sequence.hpp"
+#include "utils/range.hpp"
+
+namespace omnigraph {
+
+/**
+ * This class represents how a certain sequence is mapped to the genome. Needs further adjustment.
+ */
+template<typename ElementId>
+class Path {
+ std::vector<ElementId> sequence_;
+ size_t start_pos_;
+ size_t end_pos_;
+ public:
+ typedef typename vector<ElementId>::const_iterator iterator;
+
+ Path(const vector<ElementId>& sequence, size_t start_pos, size_t end_pos)
+ : sequence_(sequence), start_pos_(start_pos), end_pos_( end_pos) {
+ }
+
+ Path() : sequence_(),
+ start_pos_(-1ul),
+ end_pos_(-1ul) {
+ }
+
+ size_t start_pos() const { return start_pos_; }
+ size_t end_pos() const { return end_pos_; }
+
+ size_t size() const { return sequence_.size(); }
+
+ const std::vector<ElementId>& sequence() const { return sequence_; }
+ ElementId operator[](size_t index) const { return sequence_[index]; }
+
+ iterator begin() const { return sequence_.begin(); }
+ iterator end() const { return sequence_.end(); }
+};
+
+struct MappingRange {
+// on genome/contig/whatever
+ Range initial_range;
+//on edge
+ Range mapped_range;
+
+ MappingRange() {
+ }
+
+ MappingRange(Range initial_range, Range mapped_range)
+ : initial_range(initial_range), mapped_range(mapped_range) {}
+
+ MappingRange(size_t i_start, size_t i_end, size_t m_start, size_t m_end)
+ : initial_range(i_start, i_end), mapped_range(m_start, m_end) {}
+
+ MappingRange Merge(const MappingRange &other) const {
+ return MappingRange(initial_range.Merge(other.initial_range), mapped_range.Merge(other.mapped_range));
+ }
+
+ MappingRange ShiftInitial(int shift) const {
+ MappingRange result(*this);
+ result.initial_range.shift(shift);
+ return result;
+ }
+
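+    //Shifts the mapped range by 'shift' (possibly negative); a start that would become negative is clamped
+    //to zero and the initial range is moved accordingly. Returns an empty MappingRange if nothing remains.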
+ MappingRange Shift(int shift) const {
+ VERIFY(initial_range.end_pos >= initial_range.start_pos);
+ if(empty())
+ return MappingRange();
+ MappingRange result(*this);
+ if(int(result.mapped_range.end_pos) <= -shift)
+ return MappingRange();
+ result.mapped_range.end_pos += shift;
+ if(int(result.mapped_range.start_pos) <= -shift) {
+ result.initial_range.start_pos -= result.mapped_range.start_pos + shift;
+ if(result.initial_range.start_pos >= result.initial_range.end_pos)
+ result.initial_range.start_pos = result.initial_range.end_pos - 1;
+ result.mapped_range.start_pos = 0;
+ } else {
+ result.mapped_range.start_pos += shift;
+ }
+ return result;
+ }
+
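+    //Clips the mapped range so that it fits into [0, length), shrinking the initial range by the same amount.
+    //Returns an empty MappingRange if the mapping starts at or beyond 'length'.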
+ MappingRange Fit(size_t length) const {
+ VERIFY(initial_range.end_pos >= initial_range.start_pos);
+ if(empty())
+ return MappingRange();
+ MappingRange result(*this);
+ if(result.mapped_range.start_pos >= length)
+ return MappingRange();
+ if(result.mapped_range.end_pos >= length) {
+ if(result.initial_range.end_pos + length < result.mapped_range.end_pos)
+ return MappingRange();
+ result.initial_range.end_pos -= result.mapped_range.end_pos - length;
+ result.mapped_range.end_pos = length;
+ }
+ return result;
+ }
+
+ bool empty() const {
+ return initial_range.empty() || mapped_range.empty();
+ }
+
+ bool operator<(const MappingRange &other) const {
+ if(this->initial_range != other.initial_range)
+ return this->initial_range < other.initial_range;
+ return this->mapped_range < other.mapped_range;
+ }
+ MappingRange operator = (const MappingRange & other) {
+ initial_range = other.initial_range;
+ mapped_range = other.mapped_range;
+ return *this;
+ }
+
+ bool Intersect(const MappingRange &other) {
+ return initial_range.Intersect(other.initial_range) && mapped_range.Intersect(other.mapped_range);
+ }
+
+ bool IntersectLeftOf(const MappingRange &other) const {
+ return initial_range.IntersectLeftOf(other.initial_range) && mapped_range.IntersectLeftOf(other.mapped_range);
+ }
+
+ bool StrictlyContinuesWith(const MappingRange &other, size_t max_gap, size_t gap_diff = 0) const {
+ return this->initial_range.end_pos <= other.initial_range.start_pos
+ && this->mapped_range.end_pos <= other.mapped_range.start_pos
+ && other.initial_range.start_pos - this->initial_range.end_pos
+ <= other.mapped_range.start_pos - this->mapped_range.end_pos + gap_diff
+ && other.mapped_range.start_pos - this->mapped_range.end_pos
+ <= other.initial_range.start_pos - this->initial_range.end_pos + gap_diff
+ && other.initial_range.start_pos - this->initial_range.end_pos <= max_gap;
+ }
+
+ bool operator==(const MappingRange &that) const {
+ return initial_range == that.initial_range || mapped_range == that.mapped_range;
+ }
+
+ bool operator!=(const MappingRange &that) const {
+ return !(*this == that);
+ }
+
+};
+
+inline std::ostream& operator<<(std::ostream& os, const MappingRange& map_range) {
+ os << map_range.initial_range << " --> " << map_range.mapped_range;
+ return os;
+}
+
+template<typename ElementId>
+class MappingPath {
+ public:
+ MappingPath() {}
+
+ MappingPath(const ElementId &edge,
+ const MappingRange &range_mapping)
+ : edges_({ edge }),
+ range_mappings_({ range_mapping }) {}
+
+ MappingPath(const std::vector<ElementId>& edges,
+ const std::vector<MappingRange> range_mappings)
+ : edges_(edges),
+ range_mappings_(range_mappings) {}
+
+ size_t size() const { return edges_.size(); }
+
+ size_t empty() const { return edges_.empty(); }
+
+ ElementId edge_at(size_t idx) const {
+ return edges_[idx];
+ };
+
+ MappingRange mapping_at(size_t idx) const {
+ return range_mappings_[idx];
+ };
+
+ std::pair<const ElementId, const MappingRange> operator[](size_t idx) const {
+ return std::make_pair(edges_[idx], range_mappings_[idx]);
+ }
+
+ std::pair<const ElementId, const MappingRange> front() const {
+ return std::make_pair(edges_.front(), range_mappings_.front());
+ }
+
+ std::pair<const ElementId, const MappingRange> back() const {
+ return std::make_pair(edges_.back(), range_mappings_.back());
+ }
+
+ size_t start_pos() const {
+ return range_mappings_.front().mapped_range.start_pos;
+ }
+
+ size_t end_pos() const {
+ return range_mappings_.back().mapped_range.end_pos;
+ }
+
+ Path<ElementId> path() const {
+ if (edges_.size() != 0)
+ return Path<ElementId>(edges_,
+ range_mappings_[0].mapped_range.start_pos,
+ range_mappings_[range_mappings_.size() - 1].mapped_range.end_pos);
+ else
+ return Path<ElementId>();
+ }
+
+ const std::vector<ElementId>& simple_path() const {
+ return edges_;
+ }
+
+ void join(const MappingPath<ElementId>& that, int pos_shift = 0) {
+ for (size_t i = 0; i < that.size(); ++i) {
+ edges_.push_back(that.edges_[i]);
+ range_mappings_.push_back(that.range_mappings_[i].ShiftInitial(pos_shift));
+ }
+ }
+
+ void push_back(ElementId id, MappingRange range) {
+ edges_.push_back(id);
+ range_mappings_.push_back(range);
+ }
+
+ private:
+ std::vector<ElementId> edges_;
+ std::vector<MappingRange> range_mappings_;
+};
+
+template <typename ElementId>
+inline std::ostream& operator<<(std::ostream& os, const MappingPath<ElementId>& mp) {
+ os << "MappingPath ( ";
+ for(size_t i = 0; i < mp.size(); i++) {
+ os << mp[i] << " ";
+ }
+ os << " )";
+ return os;
+}
+
+template<class Graph>
+struct GapDescription {
+ typedef typename Graph::EdgeId EdgeId;
+ EdgeId start, end;
+ Sequence gap_seq;
+ //FIXME discuss using size_t
+ size_t edge_gap_start_position, edge_gap_end_position;
+
+ GapDescription() :
+ start(0),
+ end(0),
+ edge_gap_start_position(0),
+ edge_gap_end_position(0) {
+ }
+
+ GapDescription(EdgeId start_e, EdgeId end_e,
+ const Sequence &gap,
+ size_t gap_start, size_t gap_end) :
+ start(start_e),
+ end(end_e),
+ gap_seq(gap.str()),
+ edge_gap_start_position(gap_start),
+ edge_gap_end_position(gap_end) {
+ }
+
+ GapDescription<Graph> conjugate(const Graph &g) const {
+ GapDescription<Graph> res(
+ g.conjugate(end), g.conjugate(start), !gap_seq,
+ g.length(end) - edge_gap_end_position,
+ g.length(start) - edge_gap_start_position);
+ return res;
+ }
+
+ string str(const Graph &g) const {
+ stringstream s;
+ s << g.int_id(start) << " " << edge_gap_start_position << endl
+ << g.int_id(end) << " " << edge_gap_end_position << endl
+ << gap_seq.str() << endl;
+ return s.str();
+ }
+
+ bool operator<(const GapDescription &b) const {
+ return start < b.start ||
+ (start == b.start && end < b.end) ||
+ (start == b.start && end == b.end &&
+ edge_gap_start_position < b.edge_gap_start_position);
+ }
+
+    bool operator!=(const GapDescription &rhs) const {
+ return start != rhs.start
+ || end != rhs.end
+ || gap_seq != rhs.gap_seq
+ || edge_gap_start_position != rhs.edge_gap_start_position
+ || edge_gap_end_position != rhs.edge_gap_end_position;
+ }
+
+};
+
+
+}
diff --git a/src/common/assembly_graph/paths/path_finders.hpp b/src/common/assembly_graph/paths/path_finders.hpp
new file mode 100644
index 0000000..4cef781
--- /dev/null
+++ b/src/common/assembly_graph/paths/path_finders.hpp
@@ -0,0 +1,124 @@
+#pragma once
+
+#include "assembly_graph/core/directions.hpp"
+
+namespace omnigraph {
+template<class Graph>
+class UniquePathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& graph_;
+public:
+ //todo use length bound if needed
+ UniquePathFinder(const Graph& graph, size_t /*length_bound*/ =
+ std::numeric_limits<size_t>::max())
+ : graph_(graph) {}
+
+ std::vector<EdgeId> operator()(EdgeId e,
+ const AbstractDirection<Graph> &direction) const {
+ std::vector<EdgeId> answer;
+ EdgeId curr = e;
+ answer.push_back(curr);
+ std::set<EdgeId> was;
+ while (direction.CheckUniqueOutgoingEdge(direction.EdgeEnd(curr))) {
+ curr = direction.GetUniqueOutgoingEdge(direction.EdgeEnd(curr));
+ if (was.count(curr) > 0)
+ break;
+ was.insert(curr);
+ answer.push_back(curr);
+ }
+ return answer;
+ }
+
+ std::vector<EdgeId> UniquePathForward(EdgeId e) const {
+ return this->operator()(e, ForwardDirection<Graph>(graph_));
+ }
+
+ std::vector<EdgeId> UniquePathBackward(EdgeId e) const {
+ auto tmp = this->operator()(e, BackwardDirection<Graph>(graph_));
+ return std::vector<EdgeId>(tmp.rbegin(), tmp.rend());
+ }
+
+};
+
+template<class Graph>
+class TrivialPathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+public:
+ TrivialPathFinder(const Graph&, size_t = 0) {}
+
+ std::vector<EdgeId> operator()(EdgeId e, const AbstractDirection<Graph> &) const {
+ return {e};
+ }
+
+};
+
+template<class Graph>
+class PlausiblePathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ //todo remove graph_ field???
+ const Graph& graph_;
+ const size_t length_bound_;
+
+ class DFS {
+ private:
+ const Graph &graph_;
+ const AbstractDirection<Graph> &direction_;
+ const size_t length_bound_;
+
+ std::pair<size_t, EdgeId> find(EdgeId edge, size_t length) {
+ length += graph_.length(edge);
+ VertexId cross = direction_.EdgeEnd(edge);
+ auto result = make_pair(length, edge);
+ if (length < length_bound_
+ && direction_.CheckUniqueIncomingEdge(cross)) {
+ std::vector<EdgeId> outgoing = direction_.OutgoingEdges(cross);
+ for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
+ auto candidate = find(*it, length);
+ if (candidate.first > result.first)
+ result = candidate;
+ }
+ }
+ return result;
+ }
+
+ std::vector<EdgeId> RestoreAnswer(EdgeId start, EdgeId end) {
+ std::vector<EdgeId> result;
+ while (end != start) {
+ result.push_back(end);
+ end = direction_.GetUniqueIncomingEdge(direction_.EdgeStart(end));
+ }
+ result.push_back(start);
+ return std::vector<EdgeId>(result.rbegin(), result.rend());
+ }
+
+ public:
+ DFS(const Graph &graph, const AbstractDirection<Graph> &direction,
+ size_t length_bound)
+ : graph_(graph),
+ direction_(direction),
+ length_bound_(length_bound) {
+ }
+
+ std::vector<EdgeId> find(EdgeId edge) {
+ return RestoreAnswer(edge, find(edge, 0).second);
+ }
+ };
+
+public:
+ PlausiblePathFinder(const Graph& graph, size_t length_bound)
+ : graph_(graph),
+ length_bound_(length_bound) {}
+
+ std::vector<EdgeId> operator()(EdgeId e,
+ const AbstractDirection<Graph> &direction) const {
+ return DFS(graph_, direction, length_bound_).find(e);
+ }
+
+};
+}
\ No newline at end of file
diff --git a/src/common/assembly_graph/paths/path_processor.hpp b/src/common/assembly_graph/paths/path_processor.hpp
new file mode 100644
index 0000000..0408100
--- /dev/null
+++ b/src/common/assembly_graph/paths/path_processor.hpp
@@ -0,0 +1,386 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/standard_base.hpp"
+#include "common/adt/bag.hpp"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
+
+namespace omnigraph {
+
+template<class Graph>
+const string PrintPath(const Graph& g, const vector<typename Graph::EdgeId>& edges) {
+ string delim = "";
+ std::stringstream ss;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ ss << delim << g.str(edges[i]);
+ delim = " -> ";
+ }
+ return ss.str();
+}
+
+
+template<class Graph>
+class PathProcessor {
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef vector<EdgeId> Path;
+ typedef typename DijkstraHelper<Graph>::BoundedDijkstra DijkstraT;
+public:
+ class Callback {
+
+ public:
+ virtual ~Callback() {
+ }
+
+ virtual void HandleReversedPath(const vector<EdgeId>& reversed_path) = 0;
+
+
+ protected:
+ Path ReversePath(const Path& path) const {
+ Path result;
+ for (auto it = path.rbegin(), end = path.rend(); it != end; ++it)
+ result.push_back(*it);
+ return result;
+ }
+ };
+
+private:
+
+ class Traversal {
+ const PathProcessor& outer_;
+ VertexId end_;
+ size_t min_len_;
+ size_t max_len_;
+ Callback& callback_;
+ size_t edge_depth_bound_;
+
+ size_t curr_len_;
+ size_t curr_depth_;
+ size_t call_cnt_;
+ Path reversed_edge_path_;
+ bag<VertexId> vertex_cnts_;
+
+ const Graph& g_;
+ const DijkstraT& dijkstra_;
+
+ void Push(EdgeId e, VertexId start_v) {
+ TRACE("Pushing edge " << g_.str(e));
+ curr_len_ += g_.length(e);
+ curr_depth_++;
+ reversed_edge_path_.push_back(e);
+ vertex_cnts_.put(start_v);
+ }
+
+ void Pop() {
+ VERIFY(!reversed_edge_path_.empty());
+ EdgeId e = reversed_edge_path_.back();
+ size_t len = g_.length(e);
+ VERIFY(curr_len_ >= len);
+
+ TRACE("Popping edge " << g_.str(e));
+ vertex_cnts_.take(g_.EdgeStart(e));
+ reversed_edge_path_.pop_back();
+ curr_len_ -= len;
+ curr_depth_--;
+ }
+
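+        //An edge can be taken only if the Dijkstra distance to its start vertex is known and the length,
+        //depth and per-vertex usage limits would not be exceeded.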
+ bool CanGo(EdgeId e, VertexId start_v) {
+ if (!dijkstra_.DistanceCounted(start_v))
+ return false;
+ if (dijkstra_.GetDistance(start_v) + g_.length(e) + curr_len_ > max_len_)
+ return false;
+ if (curr_depth_ >= edge_depth_bound_)
+ return false;
+ if (vertex_cnts_.mult(start_v) >= PathProcessor::MAX_VERTEX_USAGE)
+ return false;
+ return true;
+ }
+
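+        //Recursive backward DFS from the end vertex towards start_; every traversed path whose length fits
+        //the bounds is reported to the callback. Returns true iff the recursive call limit was exceeded.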
+ bool Go(VertexId v, const size_t min_len) {
+ TRACE("Got to vertex " << g_.str(v));
+ if (++call_cnt_ >= PathProcessor::MAX_CALL_CNT) {
+ TRACE("Maximal count " << MAX_CALL_CNT << " of recursive calls was exceeded!");
+ return true;
+ }
+
+ if (v == outer_.start_ && curr_len_ >= min_len) {
+ //TRACE("New path found: " << PrintPath(g_, path_));
+ callback_.HandleReversedPath(reversed_edge_path_);
+ }
+
+ TRACE("Iterating through incoming edges of vertex " << g_.int_id(v))
+ vector<EdgeId> incoming;
+ incoming.reserve(4);
+ std::copy_if(g_.in_begin(v), g_.in_end(v), std::back_inserter(incoming), [&] (EdgeId e) {
+ return dijkstra_.DistanceCounted(g_.EdgeStart(e));
+ });
+
+ std::sort(incoming.begin(), incoming.end(), [&] (EdgeId e1, EdgeId e2) {
+ return dijkstra_.GetDistance(g_.EdgeStart(e1)) < dijkstra_.GetDistance(g_.EdgeStart(e2));
+ });
+
+ for (EdgeId e : incoming) {
+ VertexId start_v = g_.EdgeStart(e);
+ if (CanGo(e, start_v)) {
+ Push(e, start_v);
+ bool exceeded_limits = Go(start_v, min_len);
+ Pop();
+ if (exceeded_limits)
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public:
+ Traversal(const PathProcessor& outer, VertexId end,
+ size_t min_len, size_t max_len,
+ Callback& callback, size_t edge_depth_bound) :
+ outer_(outer), end_(end),
+ min_len_(min_len), max_len_(max_len),
+ callback_(callback),
+ edge_depth_bound_(edge_depth_bound),
+ curr_len_(0), curr_depth_(0), call_cnt_(0),
+ g_(outer.g_),
+ dijkstra_(outer.dijkstra_) {
+ reversed_edge_path_.reserve(PathProcessor::MAX_CALL_CNT);
+ vertex_cnts_.put(end_);
+ }
+
+ //returns true iff limits were exceeded
+ bool Go() {
+ if (!dijkstra_.DistanceCounted(end_) || dijkstra_.GetDistance(end_) > max_len_) {
+ return false;
+ }
+
+ bool code = Go(end_, min_len_);
+ VERIFY(curr_len_ == 0);
+ VERIFY(curr_depth_ == 0);
+ vertex_cnts_.take(end_);
+ VERIFY(vertex_cnts_.size() == 0);
+ return code;
+ }
+ };
+
+ friend class Traversal;
+
+public:
+
+ PathProcessor(const Graph& g, VertexId start, size_t length_bound) :
+ g_(g),
+ start_(start),
+ dijkstra_(DijkstraHelper<Graph>::CreateBoundedDijkstra(g, length_bound, MAX_DIJKSTRA_VERTICES)) {
+ TRACE("Dijkstra launched");
+ dijkstra_.Run(start);
+ TRACE("Dijkstra finished");
+ }
+
+    // DFS from the end vertex towards the start vertex
+    // Return code: 0 = OK, 1 = DFS limits exceeded, 2 = Dijkstra vertex limit exceeded, 3 = both
+ int Process(VertexId end, size_t min_len, size_t max_len, Callback& callback, size_t edge_depth_bound = -1ul) const {
+ TRACE("Process launched");
+ int error_code = 0;
+
+ if (dijkstra_.VertexLimitExceeded()) {
+ TRACE("dijkstra : vertex limit exceeded");
+ error_code = 2;
+ }
+
+ TRACE("Start vertex is " << g_.str(start_));
+ TRACE("Bounds are " << min_len << " " << max_len);
+ TRACE("End vertex " << g_.str(end));
+
+ Traversal traversal(*this, end, min_len, max_len, callback, edge_depth_bound);
+ error_code |= int(traversal.Go());
+
+ TRACE("Process finished with error code " << error_code);
+ return error_code;
+ }
+
+private:
+ static const size_t MAX_CALL_CNT = 3000;
+ static const size_t MAX_DIJKSTRA_VERTICES = 3000;
+ static const size_t MAX_VERTEX_USAGE = 5;
+
+ const Graph& g_;
+ VertexId start_;
+ DijkstraT dijkstra_;
+
+ DECL_LOGGER("PathProcessor")
+};
+
+template<class Graph>
+int ProcessPaths(const Graph& g, size_t min_len, size_t max_len,
+ typename Graph::VertexId start, typename Graph::VertexId end,
+ typename PathProcessor<Graph>::Callback& callback, size_t max_edge_cnt = -1ul) {
+ PathProcessor<Graph> processor(g, start, max_len);
+ return processor.Process(end, min_len, max_len, callback, max_edge_cnt);
+}
+
+template<class Graph>
+class AdapterCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+ std::function<void(const Path&)> func_;
+ bool reverse_;
+public:
+
+ AdapterCallback(const std::function<void(const Path&)>& func, bool reverse = false) :
+ func_(func), reverse_(reverse) {}
+
+ void HandleReversedPath(const Path& path) override {
+ func_(reverse_ ? this->ReversePath(path) : path);
+ }
+
+};
+
+template<class Graph, class Comparator>
+class BestPathStorage: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+public:
+ BestPathStorage(const Graph& g, Comparator comparator) :
+ g_(g), comparator_(comparator) {
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ if (!best_path_ || comparator_(path, *best_path_))
+ best_path_ = boost::make_optional(path);
+ }
+
+ boost::optional<Path> best_path() const {
+ return best_path_;
+ }
+
+private:
+ const Graph& g_;
+ Comparator comparator_;
+ boost::optional<Path> best_path_;
+};
+
+template<class Graph>
+class PathStorageCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+public:
+ PathStorageCallback(const Graph& g) :
+ g_(g) {
+ }
+
+ void HandleReversedPath(const vector<EdgeId>& path) override {
+ paths_.push_back(this->ReversePath(path));
+ }
+
+ size_t size() const {
+ return paths_.size();
+ }
+
+ const vector<Path>& paths() const {
+ return paths_;
+ }
+
+private:
+ const Graph& g_;
+ vector<Path> paths_;
+};
+
+template<class Graph>
+class NonEmptyPathCounter: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+public:
+ NonEmptyPathCounter(const Graph& g) :
+ g_(g), count_(0) {
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ if (path.size() > 0) {
+ ++count_;
+ paths_.push_back(this->ReversePath(path));
+ }
+ }
+
+ size_t count() const {
+ return count_;
+ }
+
+ const vector<Path>& paths() const {
+ return paths_;
+ }
+
+private:
+ const Graph& g_;
+ size_t count_;
+ vector<Path> paths_;
+};
+
+template<class Graph>
+class VertexLabelerCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef vector<EdgeId> Path;
+
+public:
+ VertexLabelerCallback(const Graph& g) :
+ g_(g), count_(0) {
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ for (EdgeId e : path) {
+ if (path.size() > 0) {
+ vertices_.insert(g_.EdgeStart(e));
+ vertices_.insert(g_.EdgeEnd(e));
+ ++count_;
+ }
+ }
+ }
+
+ const set<VertexId>& vertices() const {
+ return vertices_;
+ }
+
+ size_t count() const {
+ return count_;
+ }
+
+private:
+    const Graph& g_;
+ size_t count_;
+ set<VertexId> vertices_;
+};
+
+template<class Graph>
+class DistancesLengthsCallback: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<EdgeId> Path;
+
+public:
+ DistancesLengthsCallback(const Graph& g) :
+ g_(g) {
+ }
+
+ void HandleReversedPath(const Path& path) override {
+ distances_.insert(CumulativeLength(g_, path));
+ }
+
+ vector<size_t> distances() const {
+ return vector<size_t>(distances_.begin(), distances_.end());
+ }
+
+private:
+ const Graph& g_;
+ set<size_t> distances_;
+
+ DECL_LOGGER("DistancesLengthsCallback");
+};
+
+}
diff --git a/src/common/assembly_graph/paths/path_utils.hpp b/src/common/assembly_graph/paths/path_utils.hpp
new file mode 100644
index 0000000..5ab6b28
--- /dev/null
+++ b/src/common/assembly_graph/paths/path_utils.hpp
@@ -0,0 +1,130 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * path_utils.hpp
+ *
+ */
+
+#pragma once
+
+#include "sequence/sequence.hpp"
+#include "path_processor.hpp"
+#include "mapping_path.hpp"
+#include "assembly_graph/dijkstra/dijkstra_algorithm.hpp"
+
+namespace debruijn_graph {
+
+
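+    //Walks backwards from the start of e2, following the incoming edge whenever it is uniquely determined
+    //by the supplied Dijkstra distances and the distance bound; returns the collected edges in forward order.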
+ template<class Graph>
+ vector<typename Graph::EdgeId> GetCommonPathsEnd(
+ const Graph &g,
+ typename Graph::EdgeId e1,
+ typename Graph::EdgeId e2,
+ size_t min_dist,
+ size_t max_dist,
+ typename omnigraph::DijkstraHelper<Graph>::BoundedDijkstra &dijkstra) {
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ vector<EdgeId> res;
+ VERIFY (min_dist >= g.length(e1));
+ VERIFY (max_dist >= g.length(e1));
+ size_t dist = max_dist - g.length(e1);
+ VertexId cur_vertex = g.EdgeStart(e2);
+ if (!dijkstra.DistanceCounted(cur_vertex))
+ return res;
+ size_t cur_dist;
+ if ((cur_dist = dijkstra.GetDistance(cur_vertex)) > dist)
+ return res;
+ size_t suffix_len = 0;
+ while (cur_dist > 0) {
+ EdgeId prev_edge(0);
+ bool found = false;
+ for (auto edge: g.IncomingEdges(cur_vertex)) {
+ if ((dijkstra.DistanceCounted(g.EdgeStart(edge))) && (
+ suffix_len + g.length(edge) + dijkstra.GetDistance(g.EdgeStart(edge)) <= dist)) {
+ if (found == true) {
+ std::reverse(res.begin(), res.end());
+ return res;
+ } else {
+ found = true;
+ prev_edge = edge;
+ }
+ }
+ }
+ if (!found)
+ return res;
+ else {
+ suffix_len += g.length(prev_edge);
+ VERIFY(cur_dist >= g.length(prev_edge));
+ cur_dist -= g.length(prev_edge);
+ cur_vertex = g.EdgeStart(prev_edge);
+ res.push_back(prev_edge);
+ }
+ }
+ std::reverse(res.begin(), res.end());
+ return res;
+ }
+
+ template<class Graph>
+ vector<vector<typename Graph::EdgeId> > GetAllPathsBetweenEdges(
+ const Graph &g,
+ typename Graph::EdgeId &e1,
+ typename Graph::EdgeId &e2, size_t min_dist,
+ size_t max_dist) {
+ omnigraph::PathStorageCallback<Graph> callback(g);
+ ProcessPaths(g,
+ min_dist,
+ max_dist,
+ g.EdgeEnd(e1), g.EdgeStart(e2),
+ callback);
+ auto paths = callback.paths();
+ return paths;
+ }
+
+ template<class graph_pack>
+ size_t GetAllPathsQuantity(const graph_pack &origin_gp,
+ const typename graph_pack::graph_t::EdgeId &e1,
+ const typename graph_pack::graph_t::EdgeId &e2, double d, double is_var) {
+ omnigraph::PathStorageCallback<typename graph_pack::graph_t> callback(origin_gp.g);
+        omnigraph::PathProcessor<typename graph_pack::graph_t>
+                path_processor(origin_gp.g,
+                               origin_gp.g.EdgeEnd(e1),
+                               (size_t) d - origin_gp.g.length(e1) + size_t(is_var));
+        path_processor.Process(origin_gp.g.EdgeStart(e2),
+                               (size_t) d - origin_gp.g.length(e1) - size_t(is_var),
+                               (size_t) d - origin_gp.g.length(e1) + size_t(is_var),
+                               callback);
+ auto paths = callback.paths();
+        TRACE(e1.int_id() << " " << e2.int_id() << " " << paths.size());
+ return paths.size();
+ }
+
+ template<class Graph>
+ Sequence MergeSequences(const Graph &g,
+ const vector<typename Graph::EdgeId> &continuous_path) {
+ vector<Sequence> path_sequences;
+ path_sequences.push_back(g.EdgeNucls(continuous_path[0]));
+ for (size_t i = 1; i < continuous_path.size(); ++i) {
+ VERIFY(g.EdgeEnd(continuous_path[i - 1]) == g.EdgeStart(continuous_path[i]));
+ path_sequences.push_back(g.EdgeNucls(continuous_path[i]));
+ }
+ return MergeOverlappingSequences(path_sequences, g.k());
+ }
+
+ template<class Graph>
+ Sequence PathSequence(const Graph &g, const omnigraph::Path<typename Graph::EdgeId> &path) {
+ Sequence path_sequence = MergeSequences(g, path.sequence());
+ size_t start = path.start_pos();
+ size_t end = path_sequence.size() - g.length(path[path.size() - 1]) + path.end_pos();
+ return path_sequence.Subseq(start, end);
+ }
+
+
+}
diff --git a/src/common/assembly_graph/stats/picture_dump.hpp b/src/common/assembly_graph/stats/picture_dump.hpp
new file mode 100644
index 0000000..bee431d
--- /dev/null
+++ b/src/common/assembly_graph/stats/picture_dump.hpp
@@ -0,0 +1,455 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "statistics.hpp"
+#include "assembly_graph/core/graph.hpp"
+
+#include "pipeline/graph_pack.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
+#include "pipeline/graphio.hpp"
+//FIXME awful dependency to get write_lib_data
+#include "pipeline/config_struct.hpp"
+#include "visualization/position_filler.hpp"
+
+#include "visualization/visualization.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "io/reads/rc_reader_wrapper.hpp"
+#include "io/reads/delegating_reader_wrapper.hpp"
+#include "io/reads/io_helper.hpp"
+#include "io/reads/wrapper_collection.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "io/dataset_support/dataset_readers.hpp"
+#include "utils/copy_file.hpp"
+
+#include <boost/algorithm/string.hpp>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <cmath>
+
+namespace debruijn_graph {
+
+namespace stats {
+
+template<class Graph, class Index>
+MappingPath<typename Graph::EdgeId>
+FindGenomeMappingPath(const Sequence& genome, const Graph& g,
+ const Index& index,
+ const KmerMapper<Graph>& kmer_mapper) {
+ BasicSequenceMapper<Graph, Index> srt(g, index, kmer_mapper);
+ return srt.MapSequence(genome);
+}
+
+template<class graph_pack>
+MappingPath<typename graph_pack::graph_t::EdgeId>
+FindGenomeMappingPath(const Sequence& genome, const graph_pack& gp) {
+ return FindGenomeMappingPath(genome, gp.g, gp.index, gp.kmer_mapper);
+}
+
+template <class graph_pack>
+shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> DefaultColorer(const graph_pack& gp) {
+ return visualization::graph_colorer::DefaultColorer(gp.g,
+ FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path(),
+ FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path());
+}
+
+template <class graph_pack>
+void CollectContigPositions(graph_pack &gp) {
+ if (!cfg::get().pos.contigs_for_threading.empty() &&
+ path::FileExists(cfg::get().pos.contigs_for_threading))
+ visualization::position_filler::FillPos(gp, cfg::get().pos.contigs_for_threading, "thr_", true);
+
+ if (!cfg::get().pos.contigs_to_analyze.empty() &&
+ path::FileExists(cfg::get().pos.contigs_to_analyze))
+ visualization::position_filler::FillPos(gp, cfg::get().pos.contigs_to_analyze, "anlz_", true);
+}
+
+template<class Graph, class Index>
+class GenomeMappingStat: public AbstractStatCounter {
+ private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ const Index& index_;
+ Sequence genome_;
+ size_t k_;
+ public:
+ GenomeMappingStat(const Graph &graph, const Index &index, GenomeStorage genome, size_t k) :
+ graph_(graph), index_(index), genome_(genome.GetSequence()), k_(k) {}
+
+ virtual ~GenomeMappingStat() {}
+
+ virtual void Count() {
+ INFO("Mapping genome");
+ size_t break_number = 0;
+ size_t covered_kp1mers = 0;
+ size_t fail = 0;
+ if (genome_.size() <= k_)
+ return;
+
+ RtSeq cur = genome_.start<RtSeq>(k_ + 1);
+ cur >>= 0;
+ bool breaked = true;
+ pair<EdgeId, size_t> cur_position;
+ for (size_t cur_nucl = k_; cur_nucl < genome_.size(); cur_nucl++) {
+ cur <<= genome_[cur_nucl];
+ if (index_.contains(cur)) {
+ pair<EdgeId, size_t> next = index_.get(cur);
+ if (!breaked
+ && cur_position.second + 1
+ < graph_.length(cur_position.first)) {
+ if (next.first != cur_position.first
+ || cur_position.second + 1 != next.second) {
+ fail++;
+ }
+ }
+ cur_position = next;
+ covered_kp1mers++;
+ breaked = false;
+ } else {
+ if (!breaked) {
+ breaked = true;
+ break_number++;
+ }
+ }
+ }
+ INFO("Genome mapped");
+ INFO("Genome mapping results:");
+ INFO("Covered k+1-mers:" << covered_kp1mers << " of " << (genome_.size() - k_) << " which is "
+ << (100.0 * (double) covered_kp1mers / (double) (genome_.size() - k_)) << "%");
+ INFO("Covered k+1-mers form " << break_number + 1 << " contigious parts");
+ INFO("Continuity failtures " << fail);
+ }
+};
+
+template<class Graph>
+void WriteErrorLoc(const Graph &g,
+ const string& folder_name,
+ std::shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> genome_colorer,
+ const visualization::graph_labeler::GraphLabeler<Graph>& labeler) {
+ INFO("Writing error localities for graph to folder " << folder_name);
+ auto all = GraphComponent<Graph>::WholeGraph(g);
+ set<typename Graph::EdgeId> edges = genome_colorer->ColoredWith(all.edges().begin(),
+ all.edges().end(), "black");
+ set<typename Graph::VertexId> to_draw;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ to_draw.insert(g.EdgeEnd(*it));
+ to_draw.insert(g.EdgeStart(*it));
+ }
+ shared_ptr<GraphSplitter<Graph>> splitter = StandardSplitter(g, to_draw);
+ visualization::visualization_utils::WriteComponents(g, folder_name, splitter, genome_colorer, labeler);
+ INFO("Error localities written written to folder " << folder_name);
+}
+
+template<class graph_pack>
+void CountStats(const graph_pack& gp) {
+ typedef typename graph_pack::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ INFO("Counting stats");
+ StatList stats;
+ Path<EdgeId> path1 = FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index,
+ gp.kmer_mapper).path();
+ Path<EdgeId> path2 = FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index,
+ gp.kmer_mapper).path();
+ stats.AddStat(new VertexEdgeStat<Graph>(gp.g));
+ stats.AddStat(new BlackEdgesStat<Graph>(gp.g, path1, path2));
+ stats.AddStat(new NStat<Graph>(gp.g, path1, 50));
+ stats.AddStat(new SelfComplementStat<Graph>(gp.g));
+ stats.AddStat(
+ new GenomeMappingStat<Graph, Index>(gp.g, gp.index,
+ gp.genome, gp.k_value));
+ stats.AddStat(new IsolatedEdgesStat<Graph>(gp.g, path1, path2));
+ stats.Count();
+ INFO("Stats counted");
+}
+
+template<class Graph>
+void WriteGraphComponentsAlongGenome(const Graph& g,
+ const visualization::graph_labeler::GraphLabeler<Graph>& labeler,
+ const string& folder,
+ const Path<typename Graph::EdgeId>& path1,
+ const Path<typename Graph::EdgeId>& path2) {
+ INFO("Writing graph components along genome");
+
+ make_dir(folder);
+ visualization::visualization_utils::WriteComponentsAlongPath(g, path1, folder,
+ visualization::graph_colorer::DefaultColorer(g, path1, path2),
+ labeler);
+
+ INFO("Writing graph components along genome finished");
+}
+
+//todo refactoring needed: use graph pack instead!!!
+template<class Graph, class Mapper>
+void WriteGraphComponentsAlongContigs(const Graph& g,
+ Mapper &mapper,
+ const std::string& folder,
+ std::shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> colorer,
+ const visualization::graph_labeler::GraphLabeler<Graph>& labeler) {
+ INFO("Writing graph components along contigs");
+ auto contigs_to_thread = io::EasyStream(cfg::get().pos.contigs_to_analyze, false);
+ contigs_to_thread->reset();
+ io::SingleRead read;
+ while (!contigs_to_thread->eof()) {
+ (*contigs_to_thread) >> read;
+ make_dir(folder + read.name());
+ visualization::visualization_utils::WriteComponentsAlongPath(g, mapper.MapSequence(read.sequence()).simple_path(),
+ folder + read.name() + "/", colorer, labeler);
+ }
+ INFO("Writing graph components along contigs finished");
+}
+
+template<class Graph>
+void WriteKmerComponent(conj_graph_pack &gp, RtSeq const& kp1mer, const std::string& file,
+ std::shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> colorer,
+ const visualization::graph_labeler::GraphLabeler<Graph>& labeler) {
+ if(!gp.index.contains(kp1mer)) {
+ WARN("no such kmer in the graph");
+ return;
+ }
+ VERIFY(gp.index.contains(kp1mer));
+ auto pos = gp.index.get(kp1mer);
+ typename Graph::VertexId v = pos.second * 2 < gp.g.length(pos.first) ? gp.g.EdgeStart(pos.first) : gp.g.EdgeEnd(pos.first);
+ GraphComponent<Graph> component = omnigraph::VertexNeighborhood<Graph>(gp.g, v);
+ visualization::visualization_utils::WriteComponent<Graph>(component, file, colorer, labeler);
+}
+
+inline
+optional<RtSeq> FindCloseKP1mer(const conj_graph_pack &gp,
+ size_t genome_pos, size_t k) {
+ VERIFY(gp.genome.size() > 0);
+ VERIFY(genome_pos < gp.genome.size());
+ static const size_t magic_const = 200;
+ for (size_t diff = 0; diff < magic_const; diff++) {
+ for (int dir = -1; dir <= 1; dir += 2) {
+ size_t pos = (gp.genome.size() - k + genome_pos + dir * diff) % (gp.genome.size() - k);
+ RtSeq kp1mer = gp.kmer_mapper.Substitute(
+ RtSeq (k + 1, gp.genome.GetSequence(), pos));
+ if (gp.index.contains(kp1mer))
+ return optional<RtSeq>(kp1mer);
+ }
+ }
+ return boost::none;
+}
+
+inline
+void PrepareForDrawing(conj_graph_pack &gp) {
+ gp.EnsureDebugInfo();
+ CollectContigPositions(gp);
+}
+
+
+struct detail_info_printer {
+ detail_info_printer(conj_graph_pack &gp,
+ const visualization::graph_labeler::GraphLabeler<Graph>& labeler,
+ const string& folder)
+ : gp_(gp),
+ labeler_(labeler),
+ folder_(folder) {
+ }
+
+ void operator() (config::info_printer_pos pos,
+ const string& folder_suffix = "") {
+ string pos_name = ModeName(pos, config::InfoPrinterPosNames());
+
+ ProduceDetailedInfo(pos_name + folder_suffix, pos);
+ }
+
+ private:
+
+ void ProduceDetailedInfo(const string &pos_name,
+ config::info_printer_pos pos) {
+ using namespace visualization;
+
+ static size_t call_cnt = 0;
+
+ auto it = cfg::get().info_printers.find(pos);
+ VERIFY(it != cfg::get().info_printers.end());
+
+ const config::debruijn_config::info_printer & config = it->second;
+
+ if (config.basic_stats) {
+ VertexEdgeStat<conj_graph_pack::graph_t> stats(gp_.g);
+ INFO("Number of vertices : " << stats.vertices() << ", number of edges : "
+ << stats.edges() << ", sum length of edges : " << stats.edge_length());
+ }
+
+ if (config.save_graph_pack) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(saves_folder);
+ graphio::ConjugateDataPrinter<conj_graph_pack::graph_t> printer(gp_.g);
+ graphio::PrintGraphPack(saves_folder + "graph_pack", printer, gp_);
+ //TODO: separate
+ graphio::PrintClusteredIndices(saves_folder + "graph_pack", printer, gp_.clustered_indices);
+ }
+
+ if (config.save_all) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name);
+ path::make_dirs(saves_folder);
+ string p = saves_folder + "/saves";
+ INFO("Saving current state to " << p);
+
+ debruijn_graph::graphio::PrintAll(p, gp_);
+ debruijn_graph::config::write_lib_data(p);
+ }
+
+ if (config.save_full_graph) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(saves_folder);
+ graphio::ConjugateDataPrinter<conj_graph_pack::graph_t> printer(gp_.g);
+ graphio::PrintBasicGraph(saves_folder + "graph", printer);
+ }
+
+ if (config.lib_info) {
+ string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(saves_folder);
+ config::write_lib_data(saves_folder + "lib_info");
+ }
+
+ if (config.extended_stats) {
+ VERIFY(cfg::get().developer_mode);
+ CountStats(gp_);
+ }
+
+ if (!(config.write_error_loc ||
+ config.write_full_graph ||
+ config.write_full_nc_graph ||
+ config.write_components ||
+ !config.components_for_kmer.empty() ||
+ config.write_components_along_genome ||
+ config.write_components_along_contigs ||
+ !config.components_for_genome_pos.empty())) {
+ return;
+ }
+
+ VERIFY(cfg::get().developer_mode);
+ string pics_folder = path::append_path(path::append_path(folder_, "pictures/"),
+ ToString(call_cnt++, 2) + "_" + pos_name + "/");
+ path::make_dirs(pics_folder);
+ PrepareForDrawing(gp_);
+
+ auto path1 = FindGenomeMappingPath(gp_.genome.GetSequence(), gp_.g, gp_.index,
+ gp_.kmer_mapper).path();
+
+ auto colorer = DefaultColorer(gp_);
+
+ if (config.write_error_loc) {
+ make_dir(pics_folder + "error_loc/");
+ WriteErrorLoc(gp_.g, pics_folder + "error_loc/", colorer, labeler_);
+ }
+
+ if (config.write_full_graph) {
+ visualization_utils::WriteComponent(GraphComponent<Graph>::WholeGraph(gp_.g),
+ pics_folder + "full_graph.dot", colorer, labeler_);
+ }
+
+ if (config.write_full_nc_graph) {
+ visualization_utils::WriteSimpleComponent(GraphComponent<Graph>::WholeGraph(gp_.g),
+ pics_folder + "nc_full_graph.dot", colorer, labeler_);
+ }
+
+ if (config.write_components) {
+ make_dir(pics_folder + "components/");
+ visualization_utils::WriteComponents(gp_.g, pics_folder + "components/",
+ omnigraph::ReliableSplitter<Graph>(gp_.g), colorer, labeler_);
+ }
+
+ if (!config.components_for_kmer.empty()) {
+ string kmer_folder = path::append_path(pics_folder, "kmer_loc/");
+ make_dir(kmer_folder);
+ auto kmer = RtSeq(gp_.k_value + 1, config.components_for_kmer.substr(0, gp_.k_value + 1).c_str());
+ string file_name = path::append_path(kmer_folder, pos_name + ".dot");
+ WriteKmerComponent(gp_, kmer, file_name, colorer, labeler_);
+ }
+
+ if (config.write_components_along_genome) {
+ make_dir(pics_folder + "along_genome/");
+ visualization_utils::WriteComponentsAlongPath
+ (gp_.g, path1.sequence(), pics_folder + "along_genome/", colorer, labeler_);
+ }
+
+ if (config.write_components_along_contigs) {
+ make_dir(pics_folder + "along_contigs/");
+ BasicSequenceMapper<Graph, Index> mapper(gp_.g, gp_.index, gp_.kmer_mapper);
+ WriteGraphComponentsAlongContigs(gp_.g, mapper, pics_folder + "along_contigs/", colorer, labeler_);
+ }
+
+ if (!config.components_for_genome_pos.empty()) {
+ string pos_loc_folder = path::append_path(pics_folder, "pos_loc/");
+ make_dir(pos_loc_folder);
+ vector<string> positions;
+ boost::split(positions, config.components_for_genome_pos,
+ boost::is_any_of(" ,"), boost::token_compress_on);
+ for (auto it = positions.begin(); it != positions.end(); ++it) {
+ boost::optional<RtSeq> close_kp1mer = FindCloseKP1mer(gp_,
+ std::stoi(*it), gp_.k_value);
+ if (close_kp1mer) {
+ string locality_folder = path::append_path(pos_loc_folder, *it + "/");
+ make_dir(locality_folder);
+ WriteKmerComponent(gp_, *close_kp1mer, path::append_path(locality_folder, pos_name + ".dot"), colorer, labeler_);
+ } else {
+ WARN("Failed to find a genome k+1-mer close to position " << *it
+ << " that is present in the graph; the k+1-mer at that position is "
+ << RtSeq(gp_.k_value + 1, gp_.genome.GetSequence(), std::stoi(*it)));
+ }
+ }
+ }
+ }
+
+ conj_graph_pack& gp_;
+ const visualization::graph_labeler::GraphLabeler<Graph>& labeler_;
+ string folder_;
+};
+
+inline
+std::string ConstructComponentName(std::string file_name, size_t cnt) {
+ stringstream ss;
+ ss << cnt;
+ string res = file_name;
+ res.insert(res.length(), ss.str());
+ return res;
+}
+
+template<class Graph>
+double AvgCoverage(const Graph& g,
+ const std::vector<typename Graph::EdgeId>& edges) {
+ double total_cov = 0.;
+ size_t total_length = 0;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ total_cov += g.coverage(*it) * (double) g.length(*it);
+ total_length += g.length(*it);
+ }
+ return total_cov / (double) total_length;
+}
+
+template<class Graph>
+size_t Nx(Graph &g, double percent) {
+ size_t sum_edge_length = 0;
+ vector<size_t> lengths;
+ for (auto iterator = g.ConstEdgeBegin(); !iterator.IsEnd(); ++iterator) {
+ lengths.push_back(g.length(*iterator));
+ sum_edge_length += g.length(*iterator);
+ }
+ sort(lengths.begin(), lengths.end());
+ double len_perc = (1.0 - percent * 0.01) * (double) (sum_edge_length);
+ for (size_t i = 0; i < lengths.size(); i++) {
+ if (lengths[i] >= len_perc)
+ return lengths[i];
+ else
+ len_perc -= (double) lengths[i];
+ }
+ return 0;
+}
+
+}
+}
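
Nx() above walks the sorted edge lengths from the short end until the remaining mass falls below (100 - x)% of the total edge length. For comparison, here is the textbook formulation of the same statistic as a self-contained sketch over a plain vector of lengths; the function name nx_statistic is illustrative, not part of the SPAdes API.

    // Textbook formulation of the Nx statistic: sort lengths in descending
    // order and accumulate until at least x% of the total length is covered;
    // the last length added is Nx.
    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    size_t nx_statistic(std::vector<size_t> lengths, double x) {
        std::sort(lengths.begin(), lengths.end(), std::greater<size_t>());
        size_t total = std::accumulate(lengths.begin(), lengths.end(), size_t(0));
        size_t acc = 0;
        for (size_t len : lengths) {
            acc += len;
            if ((double) acc >= x * 0.01 * (double) total)
                return len;
        }
        return 0;
    }
    // nx_statistic({80, 70, 50, 40, 30, 20}, 50) == 70   (80 + 70 = 150 >= 145)
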
diff --git a/src/common/assembly_graph/stats/statistics.hpp b/src/common/assembly_graph/stats/statistics.hpp
new file mode 100644
index 0000000..cb6e7b4
--- /dev/null
+++ b/src/common/assembly_graph/stats/statistics.hpp
@@ -0,0 +1,273 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/simple_tools.hpp"
+#include "math/xmath.h"
+#include "pipeline/config_struct.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <map>
+
+namespace debruijn_graph {
+namespace stats {
+
+using namespace math;
+using namespace omnigraph;
+
+class AbstractStatCounter {
+public:
+ AbstractStatCounter() {
+ }
+
+ virtual ~AbstractStatCounter() {
+ }
+
+ virtual void Count() = 0;
+ //protected:
+ // DECL_LOGGER("StatCounter")
+};
+
+class StatList : AbstractStatCounter {
+private:
+ vector<AbstractStatCounter *> to_count_;
+public:
+ StatList(vector<AbstractStatCounter *> to_count =
+ vector<AbstractStatCounter *>()) :
+ to_count_(to_count) {
+ }
+
+ virtual ~StatList() {
+ }
+
+ void AddStat(AbstractStatCounter *new_stat) {
+ to_count_.push_back(new_stat);
+ }
+
+ const vector<AbstractStatCounter *> stats() {
+ return to_count_;
+ }
+
+ virtual void Count() {
+ for (size_t i = 0; i < to_count_.size(); i++) {
+ to_count_[i]->Count();
+ }
+ }
+
+ void DeleteStats() {
+ for (size_t i = 0; i < to_count_.size(); i++)
+ delete to_count_[i];
+ to_count_.clear();
+ }
+};
+
+template<class Graph>
+class VertexEdgeStat : public AbstractStatCounter {
+private:
+ const Graph &graph_;
+public:
+ VertexEdgeStat(const Graph &graph) :
+ graph_(graph) {
+ }
+
+ virtual ~VertexEdgeStat() {
+ }
+
+ size_t vertices() {
+ return graph_.size();
+ }
+
+ size_t edges() {
+ size_t edgeNumber = 0;
+ size_t sum_edge_length = 0;
+ for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
+ ++iterator) {
+ edgeNumber++;
+ // if (graph_.coverage(*iterator) > 30) {
+ sum_edge_length += graph_.length(*iterator);
+ // }
+ }
+ return edgeNumber;
+ }
+
+ size_t edge_length() {
+ size_t sum_edge_length = 0;
+ for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
+ ++iterator) {
+ if (graph_.coverage(*iterator) > 30) {
+ sum_edge_length += graph_.length(*iterator);
+ }
+ }
+ return sum_edge_length;
+ }
+
+ virtual void Count() {
+ INFO("Vertex count=" << vertices() << "; Edge count=" << edges());
+ INFO("sum length of edges " << edge_length());
+ }
+};
+
+template<class Graph>
+class BlackEdgesStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ Path<EdgeId> path1_;
+ Path<EdgeId> path2_;
+public:
+ BlackEdgesStat(const Graph &graph, Path<EdgeId> path1, Path<EdgeId> path2) :
+ graph_(graph), path1_(path1), path2_(path2) {
+ }
+
+ virtual ~BlackEdgesStat() {
+ }
+
+ virtual void Count() {
+ size_t black_count = 0;
+ size_t edge_count = 0;
+ const vector <EdgeId> path_edges1 = path1_.sequence();
+ const vector <EdgeId> path_edges2 = path2_.sequence();
+ set <EdgeId> colored_edges;
+ colored_edges.insert(path_edges1.begin(), path_edges1.end());
+ colored_edges.insert(path_edges2.begin(), path_edges2.end());
+ size_t sum_length = 0;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ edge_count++;
+ if (colored_edges.count(*it) == 0) {
+ black_count++;
+ sum_length += graph_.length(*it);
+ }
+ }
+ if (edge_count > 0) {
+ INFO("Error edges count: " << black_count << " which is " <<
+ 100.0 * (double) black_count / (double) edge_count << "% of all edges");
+ INFO("Total length of all black edges: " << sum_length << ". While double genome length is " <<
+ (2 * cfg::get().ds.reference_genome.size()));
+ } else {
+ INFO("Error edges count: " << black_count << " which is 0% of all edges");
+ }
+ }
+};
+
+template<class Graph>
+class NStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ Path<EdgeId> path_;
+ size_t perc_;
+public:
+ NStat(const Graph &graph, Path<EdgeId> path, size_t perc = 50) :
+ graph_(graph), path_(path), perc_(perc) {
+ }
+
+ virtual ~NStat() {
+ }
+
+ virtual void Count() {
+ vector <size_t> lengths;
+ size_t sum_all = 0;
+ for (size_t i = 0; i < path_.size(); i++) {
+ lengths.push_back(graph_.length(path_[i]));
+ sum_all += graph_.length(path_[i]);
+ }
+ sort(lengths.begin(), lengths.end());
+ size_t sum = 0;
+ size_t current = lengths.size();
+ while (current > 0 && (double) sum < (double) perc_ * 0.01 * (double) sum_all) {
+ current--;
+ sum += lengths[current];
+ }
+ if (current < lengths.size())
+ INFO("N" << perc_ << ": " << lengths[current]);
+ }
+};
+
+template<class Graph>
+class IsolatedEdgesStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ set <EdgeId> black_edges_;
+ vector <size_t> lengths;
+public:
+ IsolatedEdgesStat(const Graph &graph, Path<EdgeId> path1,
+ Path<EdgeId> path2) :
+ graph_(graph) {
+ for (auto it = graph.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ black_edges_.insert(*it);
+ }
+ for (size_t i = 0; i < path1.size(); i++) {
+ black_edges_.erase(path1[i]);
+ }
+ for (size_t i = 0; i < path2.size(); i++) {
+ black_edges_.erase(path2[i]);
+ }
+ }
+
+ virtual ~IsolatedEdgesStat() {
+ }
+
+ virtual void Count() {
+ lengths.clear();
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ EdgeId edge = *it;
+ if (graph_.IsDeadEnd(graph_.EdgeEnd(edge))
+ && graph_.IsDeadStart(graph_.EdgeStart(edge))
+ && black_edges_.count(edge) == 0) {
+ lengths.push_back(graph_.length(edge));
+ }
+ }
+ INFO("Isolated not black edges: " << lengths.size());
+ WriteLengths(cfg::get().output_dir, "isolated_edges.txt");
+ }
+
+ void WriteLengths(string folder_name, string file_name) {
+ ofstream os;
+ os.open((folder_name + "/" + file_name).c_str());
+ WriteLengths(os);
+ os.close();
+ }
+
+ void WriteLengths(ostream &os) {
+ sort(lengths.begin(), lengths.end());
+ for (size_t i = 0; i < lengths.size(); i++) {
+ os << lengths[i] << endl;
+ }
+ }
+};
+
+template<class Graph>
+class SelfComplementStat : public AbstractStatCounter {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+public:
+ SelfComplementStat(const Graph &graph) :
+ graph_(graph) {
+ }
+
+ virtual ~SelfComplementStat() {
+ }
+
+ virtual void Count() {
+ size_t sc_number = 0;
+ for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
+ ++iterator)
+ if (graph_.conjugate(*iterator) == (*iterator))
+ sc_number++;
+ // INFO("Self-complement count failed!!! ");
+ INFO("Self-complement count=" << sc_number);
+ }
+};
+}
+}
diff --git a/src/modules/empty.cpp b/src/common/empty.cpp
similarity index 100%
copy from src/modules/empty.cpp
copy to src/common/empty.cpp
diff --git a/src/common/func/func.hpp b/src/common/func/func.hpp
new file mode 100644
index 0000000..a0b130f
--- /dev/null
+++ b/src/common/func/func.hpp
@@ -0,0 +1,25 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <functional>
+
+namespace func {
+
+template<class T>
+std::function<void(T)> CombineCallbacks(const std::function<void(T)>& f1,
+ const std::function<void(T)>& f2) {
+ return [=] (T t) {
+ if (f1)
+ f1(t);
+ if (f2)
+ f2(t);
+ };
+}
+
+}
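
CombineCallbacks() wraps two std::function callbacks into a single one that forwards to whichever of the two is non-empty. A short usage sketch follows; the "func/func.hpp" include path is an assumption about how the header is reachable, not something stated in the diff.

    // Usage sketch for CombineCallbacks: either callback may be empty and is
    // then skipped, so a no-op default combines cleanly with a real handler.
    #include <functional>
    #include <iostream>
    #include "func/func.hpp"   // assumed include path for the header above

    int main() {
        std::function<void(int)> log = [](int x) { std::cout << "saw " << x << "\n"; };
        std::function<void(int)> unused;                  // empty: silently skipped
        auto combined = func::CombineCallbacks<int>(log, unused);
        combined(42);                                     // prints "saw 42"
    }
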
diff --git a/src/common/func/function_traits.hpp b/src/common/func/function_traits.hpp
new file mode 100644
index 0000000..3facd64
--- /dev/null
+++ b/src/common/func/function_traits.hpp
@@ -0,0 +1,71 @@
+#pragma once
+
+#include <functional>
+
+namespace func {
+
+template<class F>
+struct function_traits;
+
+// function pointer
+template<class R, class... Args>
+struct function_traits<R(*)(Args...)> : public function_traits<R(Args...)> {
+};
+
+// member function pointer
+template<class C, class R, class... Args>
+struct function_traits<R(C::*)(Args...)> : public function_traits<R(C &, Args...)> {
+};
+
+// const member function pointer
+template<class C, class R, class... Args>
+struct function_traits<R(C::*)(Args...) const> : public function_traits<R(C &, Args...)> {
+};
+
+// member object pointer
+template<class C, class R>
+struct function_traits<R(C::*)> : public function_traits<R(C &)> {
+};
+
+template<class R, class... Args>
+struct function_traits<R(Args...)> {
+ using return_type = R;
+
+ static constexpr std::size_t arity = sizeof...(Args);
+
+ template<std::size_t N>
+ struct arg {
+ static_assert(N < arity, "invalid argument index");
+ using type = typename std::tuple_element<N, std::tuple<Args...>>::type;
+ };
+};
+
+template<class F>
+struct function_traits<F &> : public function_traits<F> {
+};
+
+template<class F>
+struct function_traits<F &&> : public function_traits<F> {
+};
+
+// functors & default implementation
+template<class F>
+struct function_traits {
+private:
+ using call_type = function_traits<decltype(&F::operator())>;
+
+public:
+ using return_type = typename call_type::return_type;
+
+ // Drop the implicit object argument of operator() from the arity count
+ static constexpr std::size_t arity = call_type::arity - 1;
+
+ template<std::size_t N>
+ struct arg {
+ static_assert(N < arity, "invalid argument index");
+ // Shift by one to skip the implicit object argument of operator()
+ using type = typename call_type::template arg<N + 1>::type;
+ };
+};
+
+} // namespace func
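
function_traits exposes return_type, arity and arg<N>::type for free functions, member pointers and functors, stripping the implicit object argument in the functor case. A compile-time sketch, again assuming the "func/function_traits.hpp" include path:

    // Compile-time sketch exercising function_traits; all checks are static.
    #include <type_traits>
    #include "func/function_traits.hpp"   // assumed include path

    int scale(double x, int factor) { return static_cast<int>(x) * factor; }

    using FT = func::function_traits<decltype(&scale)>;
    static_assert(FT::arity == 2, "two explicit arguments");
    static_assert(std::is_same<FT::return_type, int>::value, "returns int");
    static_assert(std::is_same<FT::arg<0>::type, double>::value, "first argument is double");

    // Functors (lambdas) go through the operator() specialization, which drops
    // the implicit object argument from the arity count.
    auto is_adenine = [](char c) { return c == 'A'; };
    using LT = func::function_traits<decltype(is_adenine)>;
    static_assert(LT::arity == 1, "one argument after dropping the object argument");
    static_assert(std::is_same<LT::return_type, bool>::value, "returns bool");

    int main() { return 0; }
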
diff --git a/src/common/func/pred.hpp b/src/common/func/pred.hpp
new file mode 100644
index 0000000..ebe22cc
--- /dev/null
+++ b/src/common/func/pred.hpp
@@ -0,0 +1,175 @@
+#pragma once
+
+#include "function_traits.hpp"
+
+#include <memory>
+#include <functional>
+
+namespace func {
+
+template<class T>
+class AbstractPredicate {
+public:
+ typedef T checked_type;
+
+ virtual bool Check(T t) const = 0;
+
+ bool operator()(T t) const { return Check(t); }
+
+ virtual ~AbstractPredicate() {}
+};
+
+template<typename T>
+class TypedPredicate {
+ struct TypedPredicateConcept {
+ virtual ~TypedPredicateConcept() { };
+
+ virtual bool operator()(T x) const = 0;
+ };
+
+ template<class P>
+ struct TypedPredicateModel : TypedPredicateConcept {
+ TypedPredicateModel(P p)
+ : data_(std::move(p)) { }
+
+ virtual bool operator()(T x) const override {
+ return data_(x);
+ }
+
+ P data_;
+ };
+
+ std::shared_ptr<const TypedPredicateConcept> self_;
+
+public:
+ typedef T checked_type;
+
+ template<typename P>
+ TypedPredicate(P p)
+ : self_(std::make_shared<TypedPredicateModel<P>>(std::move(p))) { }
+
+ bool operator()(T x) const {
+ return self_->operator()(x);
+ }
+};
+
+template<typename T>
+class AlwaysTrueOperator {
+public:
+ typedef T checked_type;
+
+ bool operator()(T) const {
+ return true;
+ }
+};
+
+template<typename T>
+class AlwaysFalseOperator {
+ typedef T checked_type;
+
+public:
+ bool operator()(T) const {
+ return false;
+ }
+};
+
+template<typename T>
+class AndOperator {
+public:
+ typedef T checked_type;
+
+ AndOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
+ : lhs_(std::move(lhs)),
+ rhs_(std::move(rhs)) { }
+
+ bool operator()(T x) const {
+ return lhs_(x) && rhs_(x);
+ }
+
+private:
+ const TypedPredicate<T> lhs_, rhs_;
+};
+
+template<typename T>
+class OrOperator {
+public:
+ typedef T checked_type;
+
+ OrOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
+ : lhs_(std::move(lhs)), rhs_(std::move(rhs)) { }
+
+ bool operator()(T x) const {
+ return lhs_(x) || rhs_(x);
+ }
+
+private:
+ const TypedPredicate<T> lhs_, rhs_;
+};
+
+template<typename T>
+class NotOperator {
+public:
+ typedef T checked_type;
+
+ NotOperator(const TypedPredicate<T> p)
+ : p_(std::move(p)) { }
+
+ bool operator()(T x) const {
+ return !p_(x);
+ }
+
+private:
+ const TypedPredicate<T> p_;
+};
+
+template<class P,
+ bool = function_traits<P>::arity == 1 &&
+ std::is_same<typename function_traits<P>::return_type, bool>::value>
+struct is_predicate : public std::true_type {
+};
+
+template<class P>
+struct is_predicate<P, false> : public std::false_type {
+};
+
+template<class TP1, class TP2,
+ typename _T1 = typename function_traits<TP1>::template arg<0>::type,
+ typename _T2 = typename function_traits<TP2>::template arg<0>::type,
+ typename =
+ typename std::enable_if<std::is_same<_T1, _T2>::value &&
+ is_predicate<TP1>::value && is_predicate<TP2>::value
+ >::type>
+TypedPredicate<_T1> And(TP1 lhs, TP2 rhs) {
+ return AndOperator<_T1>(lhs, rhs);
+}
+
+template<class TP1, class TP2,
+ typename _T1 = typename function_traits<TP1>::template arg<0>::type,
+ typename _T2 = typename function_traits<TP2>::template arg<0>::type,
+ typename =
+ typename std::enable_if<std::is_same<_T1, _T2>::value &&
+ is_predicate<TP1>::value && is_predicate<TP2>::value
+ >::type>
+TypedPredicate<_T1> Or(TP1 lhs, TP2 rhs) {
+ return OrOperator<_T1>(lhs, rhs);
+}
+
+template<class TP,
+ typename _T = typename function_traits<TP>::template arg<0>::type,
+ typename =
+ typename std::enable_if<is_predicate<TP>::value>::type>
+TypedPredicate<_T> Not(TP p) {
+ return NotOperator<_T>(p);
+}
+
+template<class T>
+TypedPredicate<T> AlwaysTrue() {
+ return AlwaysTrueOperator<T>();
+}
+
+template<class T>
+TypedPredicate<T> AlwaysFalse() {
+ return AlwaysFalseOperator<T>();
+}
+
+} // namespace func
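
TypedPredicate type-erases any unary callable returning bool, and And/Or/Not compose such predicates without exposing their concrete types. A hedged usage sketch, with the include path assumed as above:

    // Usage sketch for the predicate combinators: lambdas are type-erased into
    // TypedPredicate<int> and composed without caring about their concrete types.
    #include <iostream>
    #include "func/pred.hpp"   // assumed include path

    int main() {
        auto positive = [](int x) { return x > 0; };
        auto even     = [](int x) { return x % 2 == 0; };

        func::TypedPredicate<int> positive_even = func::And(positive, even);
        std::cout << positive_even(4) << "\n";               // 1
        std::cout << positive_even(-4) << "\n";              // 0
        std::cout << func::Or(positive, even)(-2) << "\n";   // 1 (even)
        std::cout << func::Not(positive)(-1) << "\n";        // 1
        std::cout << func::AlwaysTrue<int>()(123) << "\n";   // 1
    }
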
diff --git a/src/common/io/CMakeLists.txt b/src/common/io/CMakeLists.txt
new file mode 100644
index 0000000..31fe9f4
--- /dev/null
+++ b/src/common/io/CMakeLists.txt
@@ -0,0 +1,16 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(input CXX)
+
+add_library(input STATIC
+ reads/parser.cpp
+ sam/read.cpp
+ sam/sam_reader.cpp)
+
+target_link_libraries(input BamTools samtools)
+
diff --git a/src/common/io/dataset_support/dataset_readers.hpp b/src/common/io/dataset_support/dataset_readers.hpp
new file mode 100644
index 0000000..4b04751
--- /dev/null
+++ b/src/common/io/dataset_support/dataset_readers.hpp
@@ -0,0 +1,121 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/logger/logger.hpp"
+#include "utils/simple_tools.hpp"
+#include "io/reads/io_helper.hpp"
+#include "pipeline/library.hpp"
+#include "pipeline/config_struct.hpp"
+
+namespace io {
+
+inline
+PairedStreamPtr paired_easy_reader(const SequencingLibrary<debruijn_graph::config::DataSetData> &lib,
+ bool followed_by_rc,
+ size_t insert_size,
+ bool change_read_order = false,
+ bool use_orientation = true,
+ OffsetType offset_type = PhredOffset) {
+ ReadStreamList<PairedRead> streams;
+ for (auto read_pair : lib.paired_reads()) {
+ streams.push_back(PairedEasyStream(read_pair.first, read_pair.second, followed_by_rc, insert_size, change_read_order,
+ use_orientation, lib.orientation(), offset_type));
+ }
+ return MultifileWrap<PairedRead>(streams);
+}
+
+inline
+ReadStreamList<SingleRead> single_easy_readers(const SequencingLibrary<debruijn_graph::config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads,
+ bool handle_Ns = true,
+ OffsetType offset_type = PhredOffset) {
+ ReadStreamList<SingleRead> streams;
+ if (including_paired_reads) {
+ for (const auto& read : lib.reads()) {
+ //do we need input_file function here?
+ streams.push_back(EasyStream(read, followed_by_rc, handle_Ns, offset_type));
+ }
+ } else {
+ for (const auto& read : lib.single_reads()) {
+ streams.push_back(EasyStream(read, followed_by_rc, handle_Ns, offset_type));
+ }
+ }
+ return streams;
+}
+
+inline
+SingleStreamPtr single_easy_reader(const SequencingLibrary<debruijn_graph::config::DataSetData> &lib,
+ bool followed_by_rc,
+ bool including_paired_reads,
+ bool handle_Ns = true,
+ OffsetType offset_type = PhredOffset) {
+ return MultifileWrap<io::SingleRead>(
+ single_easy_readers(lib, followed_by_rc, including_paired_reads, handle_Ns, offset_type));
+}
+
+inline
+PairedStreamPtr paired_easy_reader_for_libs(std::vector<size_t> libs,
+ bool followed_by_rc,
+ size_t insert_size,
+ bool change_read_order = false,
+ bool use_orientation = true,
+ OffsetType offset_type = PhredOffset) {
+ ReadStreamList<io::PairedRead> streams;
+ for (size_t i = 0; i < libs.size(); ++i) {
+ streams.push_back(paired_easy_reader(cfg::get().ds.reads[libs[i]],
+ followed_by_rc, insert_size, change_read_order, use_orientation, offset_type));
+ }
+ return MultifileWrap<PairedRead>(streams);
+}
+
+
+inline
+PairedStreamPtr paired_easy_reader(bool followed_by_rc,
+ size_t insert_size,
+ bool change_read_order = false,
+ bool use_orientation = true,
+ OffsetType offset_type = PhredOffset) {
+
+ std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
+ all_libs[i] = i;
+
+ // FIXME: Should we use only first library?
+ // No, this one is for all libs together
+ return paired_easy_reader_for_libs(all_libs, followed_by_rc, insert_size, change_read_order, use_orientation, offset_type);
+}
+
+
+inline
+SingleStreamPtr single_easy_reader_for_libs(vector<size_t> libs,
+ bool followed_by_rc,
+ bool including_paired_reads,
+ OffsetType offset_type = PhredOffset) {
+ ReadStreamList<SingleRead> streams;
+ for (size_t i = 0; i < libs.size(); ++i) {
+ streams.push_back(single_easy_reader(cfg::get().ds.reads[libs[i]],
+ followed_by_rc, including_paired_reads, /*handle_Ns*/ true, offset_type));
+ }
+ return MultifileWrap<SingleRead>(streams);
+}
+
+inline
+SingleStreamPtr single_easy_reader(bool followed_by_rc,
+ bool including_paired_reads,
+ OffsetType offset_type = PhredOffset) {
+
+ std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
+ for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
+ all_libs[i] = i;
+
+ return single_easy_reader_for_libs(all_libs, followed_by_rc, including_paired_reads, offset_type);
+}
+
+}
diff --git a/src/common/io/dataset_support/read_converter.hpp b/src/common/io/dataset_support/read_converter.hpp
new file mode 100644
index 0000000..6939f1a
--- /dev/null
+++ b/src/common/io/dataset_support/read_converter.hpp
@@ -0,0 +1,279 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * read_converter.hpp
+ *
+ * Created on: Apr 13, 2012
+ * Author: andrey
+ */
+
+#pragma once
+
+#include "io/reads/binary_converter.hpp"
+#include "io/reads/io_helper.hpp"
+#include "dataset_readers.hpp"
+#include "utils/simple_tools.hpp"
+
+#include <fstream>
+
+namespace io {
+
+typedef debruijn_graph::config::dataset dataset;
+typedef debruijn_graph::config::DataSetData DataSetData;
+typedef SequencingLibrary<DataSetData> SequencingLibraryT;
+
+class ReadConverter {
+
+private:
+ const static size_t current_binary_format_version = 11;
+
+ static bool CheckBinaryReadsExist(SequencingLibraryT& lib) {
+ return path::FileExists(lib.data().binary_reads_info.bin_reads_info_file);
+ }
+
+ static bool LoadLibIfExists(SequencingLibraryT& lib) {
+ auto& data = lib.data();
+
+ if (!CheckBinaryReadsExist(lib))
+ return false;
+
+ std::ifstream info;
+ info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::in);
+ DEBUG("Reading binary information file " << data.binary_reads_info.bin_reads_info_file);
+
+ size_t chunk_num = 0;
+ size_t format = 0;
+ size_t lib_index = 0;
+
+ info >> format;
+ if (!info.eof()) {
+ info >> chunk_num;
+ }
+ if (!info.eof()) {
+ info >> lib_index;
+ }
+
+ if (chunk_num != data.binary_reads_info.chunk_num ||
+ format != current_binary_format_version ||
+ lib_index != data.lib_index) {
+ return false;
+ }
+
+ INFO("Binary reads detected");
+ info >> data.read_length;
+ info >> data.read_count;
+ info >> data.total_nucls;
+ data.binary_reads_info.binary_coverted = true;
+
+ info.close();
+ return true;
+ }
+
+ static void ConvertToBinary(SequencingLibraryT& lib) {
+ auto& data = lib.data();
+ std::ofstream info;
+ info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::out);
+ info << "0 0 0";
+ info.close();
+
+ INFO("Converting reads to binary format for library #" << data.lib_index << " (takes a while)");
+ INFO("Converting paired reads");
+ PairedStreamPtr paired_reader = paired_easy_reader(lib, false, 0, false, false);
+ BinaryWriter paired_converter(data.binary_reads_info.paired_read_prefix,
+ data.binary_reads_info.chunk_num,
+ data.binary_reads_info.buffer_size);
+
+ ReadStreamStat paired_stat = paired_converter.ToBinary(*paired_reader, lib.orientation());
+ paired_stat.read_count_ *= 2;
+
+ INFO("Converting single reads");
+
+ SingleStreamPtr single_reader = single_easy_reader(lib, false, false);
+ BinaryWriter single_converter(data.binary_reads_info.single_read_prefix,
+ data.binary_reads_info.chunk_num,
+ data.binary_reads_info.buffer_size);
+ ReadStreamStat single_stat = single_converter.ToBinary(*single_reader);
+
+ paired_stat.merge(single_stat);
+ data.read_length = paired_stat.max_len_;
+ data.read_count = paired_stat.read_count_;
+ data.total_nucls = paired_stat.total_len_;
+
+ info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::out);
+ info << current_binary_format_version << " " <<
+ data.binary_reads_info.chunk_num << " " <<
+ data.lib_index << " " <<
+ data.read_length << " " <<
+ data.read_count << " " <<
+ data.total_nucls << "\n";
+
+ info.close();
+ data.binary_reads_info.binary_coverted = true;
+ }
+
+public:
+ static void ConvertToBinaryIfNeeded(SequencingLibraryT& lib) {
+ if (lib.data().binary_reads_info.binary_coverted && CheckBinaryReadsExist(lib))
+ return;
+
+ if (LoadLibIfExists(lib)) {
+ return;
+ }
+
+ ConvertToBinary(lib);
+ }
+};
+
+
+inline
+BinaryPairedStreams raw_paired_binary_readers(SequencingLibraryT &lib,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+ ReadConverter::ConvertToBinaryIfNeeded(lib);
+ const auto& data = lib.data();
+ VERIFY_MSG(data.binary_reads_info.binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
+
+ ReadStreamList<PairedReadSeq> paired_streams;
+ for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
+ paired_streams.push_back(make_shared<BinaryFilePairedStream>(data.binary_reads_info.paired_read_prefix,
+ i, insert_size));
+ }
+ return apply_paired_wrappers(followed_by_rc, paired_streams);
+}
+
+inline
+BinarySingleStreams raw_single_binary_readers(SequencingLibraryT &lib,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ const auto& data = lib.data();
+ ReadConverter::ConvertToBinaryIfNeeded(lib);
+ VERIFY_MSG(data.binary_reads_info.binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
+
+ BinarySingleStreams single_streams;
+ for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
+ single_streams.push_back(make_shared<BinaryFileSingleStream>(data.binary_reads_info.single_read_prefix, i));
+ }
+ if (including_paired_reads) {
+ BinaryPairedStreams paired_streams;
+ for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
+ paired_streams.push_back(make_shared<BinaryFilePairedStream>(data.binary_reads_info.paired_read_prefix,
+ i, 0));
+ }
+
+ return apply_single_wrappers(followed_by_rc, single_streams, &paired_streams);
+ }
+ else {
+ return apply_single_wrappers(followed_by_rc, single_streams);
+ }
+}
+
+
+inline
+BinaryPairedStreams paired_binary_readers(SequencingLibraryT &lib,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+ return raw_paired_binary_readers(lib, followed_by_rc, insert_size);
+}
+
+
+inline
+BinarySingleStreams single_binary_readers(SequencingLibraryT &lib,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ return raw_single_binary_readers(lib, followed_by_rc, including_paired_reads);
+}
+
+
+inline
+//todo simplify
+BinaryPairedStreams paired_binary_readers_for_libs(dataset& dataset_info,
+ const std::vector<size_t>& libs,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+
+ VERIFY(!libs.empty());
+ size_t chunk_num = dataset_info.reads[libs.front()].data().binary_reads_info.chunk_num;
+
+ std::vector<BinaryPairedStreams> streams(chunk_num);
+ for (size_t i = 0; i < libs.size(); ++i) {
+ VERIFY_MSG(chunk_num == dataset_info.reads[libs[i]].data().binary_reads_info.chunk_num,
+ "Cannot create stream for multiple libraries with different chunk_num");
+ BinaryPairedStreams lib_streams = raw_paired_binary_readers(dataset_info.reads[libs[i]], followed_by_rc, insert_size);
+ for (size_t j = 0; j < chunk_num; ++j) {
+ streams[j].push_back(lib_streams.ptr_at(j));
+ }
+ }
+
+ BinaryPairedStreams joint_streams;
+ for (size_t j = 0; j < chunk_num; ++j) {
+ joint_streams.push_back(MultifileWrap<PairedReadSeq>(streams[j]));
+ }
+ return joint_streams;
+}
+
+inline
+BinarySingleStreams single_binary_readers_for_libs(dataset& dataset_info,
+ const std::vector<size_t>& libs,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ VERIFY(!libs.empty());
+ size_t chunk_num = dataset_info.reads[libs.front()].data().binary_reads_info.chunk_num;
+
+ std::vector<BinarySingleStreams> streams(chunk_num);
+ for (size_t i = 0; i < libs.size(); ++i) {
+ VERIFY_MSG(chunk_num == dataset_info.reads[libs[i]].data().binary_reads_info.chunk_num,
+ "Cannot create stream for multiple libraries with different chunk_num");
+ BinarySingleStreams lib_streams = raw_single_binary_readers(dataset_info.reads[libs[i]], followed_by_rc, including_paired_reads);
+
+ for (size_t j = 0; j < chunk_num; ++j) {
+ streams[j].push_back(lib_streams.ptr_at(j));
+ }
+ }
+
+ BinarySingleStreams joint_streams;
+ for (size_t j = 0; j < chunk_num; ++j) {
+ joint_streams.push_back(MultifileWrap<SingleReadSeq>(streams[j]));
+ }
+ return joint_streams;
+}
+
+inline
+BinaryPairedStreams paired_binary_readers(dataset& dataset_info,
+ bool followed_by_rc,
+ size_t insert_size = 0) {
+
+ std::vector<size_t> all_libs(dataset_info.reads.lib_count());
+ for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
+ all_libs[i] = i;
+ }
+ return paired_binary_readers_for_libs(dataset_info, all_libs, followed_by_rc, insert_size);
+}
+
+inline
+BinarySingleStreams single_binary_readers(dataset& dataset_info,
+ bool followed_by_rc,
+ bool including_paired_reads) {
+ std::vector<size_t> all_libs(dataset_info.reads.lib_count());
+ for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
+ all_libs[i] = i;
+ }
+ return single_binary_readers_for_libs(dataset_info, all_libs, followed_by_rc, including_paired_reads);
+}
+
+inline
+BinarySingleStreamPtr single_binary_multireader(dataset& dataset_info, bool followed_by_rc, bool including_paired_reads) {
+ return MultifileWrap<SingleReadSeq>(single_binary_readers(dataset_info, followed_by_rc, including_paired_reads));
+}
+
+inline
+BinaryPairedStreamPtr paired_binary_multireader(dataset& dataset_info, bool followed_by_rc, size_t insert_size = 0) {
+ return MultifileWrap<PairedReadSeq>(paired_binary_readers(dataset_info, followed_by_rc, insert_size));
+}
+
+
+}
diff --git a/src/common/io/kmers/kmer_iterator.hpp b/src/common/io/kmers/kmer_iterator.hpp
new file mode 100644
index 0000000..07d04a6
--- /dev/null
+++ b/src/common/io/kmers/kmer_iterator.hpp
@@ -0,0 +1,54 @@
+#ifndef __IO_KMER_ITERATOR_HPP__
+#define __IO_KMER_ITERATOR_HPP__
+
+#include "io/kmers/mmapped_reader.hpp"
+#include <string>
+
+namespace io {
+
+template<class Seq>
+using raw_kmer_iterator = MMappedFileRecordArrayIterator<typename Seq::DataType>;
+
+template<class Seq>
+raw_kmer_iterator<Seq> make_kmer_iterator(const std::string &FileName,
+ unsigned K) {
+ return raw_kmer_iterator<Seq>(FileName, Seq::GetDataSize(K));
+}
+
+template<class Seq>
+std::vector<raw_kmer_iterator<Seq>> make_kmer_iterator(const std::string &FileName,
+ size_t K, size_t amount) {
+ std::vector<raw_kmer_iterator<Seq>> res;
+ if (amount == 1) {
+ res.emplace_back(FileName, Seq::GetDataSize(K));
+ return res;
+ }
+
+ // Determine the file size
+ struct stat buf;
+ VERIFY_MSG(stat(FileName.c_str(), &buf) != -1,
+ "stat(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ size_t file_size = buf.st_size;
+
+ // Now start creating the iterators keeping in mind, that offset should be
+ // multiple of page size.
+ size_t chunk = round_up(file_size / amount,
+ getpagesize() * Seq::GetDataSize(K) * sizeof(typename Seq::DataType));
+ size_t offset = 0;
+ if (chunk > file_size)
+ chunk = file_size;
+
+ while (offset < file_size) {
+ res.emplace_back(FileName, Seq::GetDataSize(K),
+ offset,
+ offset + chunk > file_size ? file_size - offset : chunk);
+ offset += chunk;
+ }
+
+ return res;
+}
+
+
+};
+
+#endif
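
make_kmer_iterator() splits the k-mer file into chunks whose sizes are multiples of both the page size and the record size, so every chunk can be mmapped independently and starts on a record boundary. The arithmetic in isolation, as a standalone sketch: chunk_ranges is an illustrative name, and the zero-chunk guard is an addition for very small inputs.

    // Sketch of the chunking arithmetic used by make_kmer_iterator above:
    // split file_size bytes into roughly `amount` chunks, each a multiple of
    // page_size * record_size, returning (offset, length) pairs.
    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    static size_t round_up(size_t value, size_t boundary) {
        return (value + boundary - 1) / boundary * boundary;
    }

    std::vector<std::pair<size_t, size_t>>
    chunk_ranges(size_t file_size, size_t amount, size_t record_size, size_t page_size) {
        size_t chunk = round_up(file_size / amount, page_size * record_size);
        chunk = std::min(chunk, file_size);
        if (chunk == 0)                 // guard for file_size < amount
            chunk = file_size;
        std::vector<std::pair<size_t, size_t>> res;
        for (size_t offset = 0; offset < file_size; offset += chunk)
            res.emplace_back(offset, std::min(chunk, file_size - offset));
        return res;
    }
    // chunk_ranges(10 << 20, 4, 16, 4096) -> four chunks of 2621440 bytes each
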
diff --git a/src/common/io/kmers/mmapped_reader.hpp b/src/common/io/kmers/mmapped_reader.hpp
new file mode 100644
index 0000000..998659f
--- /dev/null
+++ b/src/common/io/kmers/mmapped_reader.hpp
@@ -0,0 +1,396 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_MMAPPED_READER_HPP
+#define HAMMER_MMAPPED_READER_HPP
+
+#include "common/adt/pointer_iterator.hpp"
+#include "common/adt/array_vector.hpp"
+
+#include "utils/verify.hpp"
+
+#include <boost/iterator/iterator_facade.hpp>
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <cstring>
+#include <cerrno>
+
+#include <string>
+#include <algorithm>
+
+class MMappedReader {
+ int StreamFile;
+ bool Unlink;
+ std::string FileName;
+
+ void remap() {
+ VERIFY(BlockSize != FileSize);
+
+ if (MappedRegion)
+ munmap(MappedRegion, BlockSize);
+
+ BlockOffset += BlockSize;
+
+ if (BlockOffset + BlockSize > FileSize)
+ BlockSize = FileSize - BlockOffset;
+
+ // We do not add PROT_WRITE here intentionally - remapping and write access
+ // is pretty error-prone.
+ if (BlockSize)
+ MappedRegion =
+ (uint8_t *) mmap(NULL, BlockSize,
+ PROT_READ, MAP_FILE | MAP_PRIVATE,
+ StreamFile, InitialOffset + BlockOffset);
+ else
+ MappedRegion = NULL;
+ VERIFY_MSG((intptr_t) MappedRegion != -1L,
+ "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ }
+
+ void read_internal(void *buf, size_t amount) {
+ memcpy(buf, MappedRegion + BytesRead - BlockOffset, amount);
+ BytesRead += amount;
+ }
+
+protected:
+ uint8_t *MappedRegion;
+ size_t FileSize, BlockOffset, BytesRead, BlockSize;
+ off_t InitialOffset;
+
+public:
+ MMappedReader()
+ : StreamFile(-1), Unlink(false), FileName(""), MappedRegion(0), FileSize(0),
+ BlockOffset(0), BytesRead(0), BlockSize(0), InitialOffset(0) { }
+
+ MMappedReader(const std::string &filename, bool unlink = false,
+ size_t blocksize = 64 * 1024 * 1024, off_t off = 0, size_t sz = 0)
+ : Unlink(unlink), FileName(filename), BlockSize(blocksize) {
+ struct stat buf;
+
+ InitialOffset = off;
+ FileSize = (sz ? sz : (stat(FileName.c_str(), &buf) != 0 ? 0 : buf.st_size - InitialOffset));
+
+ StreamFile = open(FileName.c_str(), O_RDONLY);
+ VERIFY_MSG(StreamFile != -1,
+ "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno << ". File: " <<
+ FileName);
+
+ if (BlockSize != -1ULL) {
+ size_t PageSize = getpagesize();
+ BlockSize = BlockSize / PageSize * PageSize;
+ } else
+ BlockSize = FileSize;
+
+ if (BlockSize) {
+ MappedRegion =
+ (uint8_t *) mmap(NULL, BlockSize, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE,
+ StreamFile, InitialOffset);
+ VERIFY_MSG((intptr_t) MappedRegion != -1L,
+ "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ } else
+ MappedRegion = NULL;
+
+ BlockOffset = BytesRead = 0;
+ }
+
+ MMappedReader(MMappedReader &&other) {
+ // First, copy out the stuff
+ MappedRegion = other.MappedRegion;
+ FileSize = other.FileSize;
+ BlockOffset = other.BlockOffset;
+ BytesRead = other.BytesRead;
+ BlockSize = other.BlockSize;
+ FileName = std::move(other.FileName);
+ Unlink = other.Unlink;
+ StreamFile = other.StreamFile;
+ InitialOffset = other.InitialOffset;
+
+ // Now, zero out inside other, so we won't do crazy things in the dtor
+ other.StreamFile = -1;
+ other.Unlink = false;
+ other.MappedRegion = 0;
+ }
+
+ MMappedReader &operator=(MMappedReader &&other) {
+ if (this != &other) {
+ // Release own resources, take over other's state, and zero out other
+ // so its destructor does not close/unmap the moved-from handles.
+ if (StreamFile != -1) close(StreamFile);
+ if (MappedRegion) munmap(MappedRegion, BlockSize);
+ MappedRegion = other.MappedRegion; FileSize = other.FileSize;
+ BlockOffset = other.BlockOffset; BytesRead = other.BytesRead;
+ BlockSize = other.BlockSize; FileName = std::move(other.FileName);
+ Unlink = other.Unlink; StreamFile = other.StreamFile;
+ InitialOffset = other.InitialOffset;
+ other.StreamFile = -1; other.Unlink = false; other.MappedRegion = NULL;
+ }
+ return *this;
+ }
+
+ virtual ~MMappedReader() {
+ if (StreamFile != -1)
+ close(StreamFile);
+ if (MappedRegion)
+ munmap(MappedRegion, BlockSize);
+
+ if (Unlink) {
+ int res = unlink(FileName.c_str());
+ VERIFY_MSG(res == 0,
+ "unlink(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ }
+ }
+
+ void read(void *buf, size_t amount) {
+ if (BytesRead + amount < BlockOffset + BlockSize) {
+ // Easy case, no remap is necessary
+ read_internal(buf, amount);
+ return;
+ }
+
+ // Hard case - remapping is necessary. First - finish the current block.
+ size_t ToRead = BlockSize - (BytesRead - BlockOffset);
+ uint8_t *cbuf = (uint8_t *) buf;
+
+ read_internal(cbuf, ToRead);
+ amount -= ToRead;
+ cbuf += ToRead;
+
+ // Next, read as much BlockSize blocks as possible.
+ while (amount >= BlockSize) {
+ remap();
+ read_internal(cbuf, BlockSize);
+ amount -= BlockSize;
+ cbuf += BlockSize;
+ }
+
+ // Finally, remap and read remaining.
+ remap();
+ read_internal(cbuf, amount);
+ }
+
+ void *skip(size_t amount) {
+ // Easy case, no remapping is needed
+ if (BytesRead + amount <= BlockOffset + BlockSize) {
+ void *out = MappedRegion + BytesRead - BlockOffset;
+ BytesRead += amount;
+
+ return out;
+ }
+
+ // Make sure data does not cross the block boundary
+ VERIFY(BytesRead == BlockOffset + BlockSize);
+
+ // Now, remap and read from the beginning of the block
+ remap();
+
+ return skip(amount);
+ }
+
+ bool good() const {
+ return BytesRead < FileSize;
+ }
+
+ size_t size() const { return FileSize; }
+
+ size_t data_size() const { return FileSize; }
+
+ void *data() const { return MappedRegion; }
+};
+
+template<typename T>
+class MMappedRecordReader : public MMappedReader {
+public:
+ typedef pointer_iterator<T> iterator;
+ typedef const pointer_iterator<T> const_iterator;
+
+ MMappedRecordReader(const std::string &FileName, bool unlink = true,
+ size_t blocksize = 64 * 1024 * 1024 / (sizeof(T) * (unsigned) getpagesize()) *
+ (sizeof(T) * (unsigned) getpagesize()),
+ off_t off = 0, size_t sz = 0) :
+ MMappedReader(FileName, unlink, blocksize, off, sz) {
+ VERIFY(FileSize % sizeof(T) == 0);
+ }
+
+ void read(T *el, size_t amount) {
+ MMappedReader::read(el, amount * sizeof(T));
+ }
+
+ size_t size() const { return FileSize / sizeof(T); }
+
+ size_t data_size() const { return FileSize; }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx]; }
+
+ const T &operator[](size_t idx) const { return data()[idx]; }
+
+ iterator begin() { return iterator(data()); }
+
+ const_iterator begin() const { return const_iterator(data()); }
+
+ iterator end() { return iterator(data() + size()); }
+
+ const_iterator end() const { return const_iterator(data() + size()); }
+};
+
+template<class T>
+class MMappedFileRecordIterator :
+ public boost::iterator_facade<MMappedFileRecordIterator<T>,
+ const T,
+ std::input_iterator_tag> {
+public:
+ // Default ctor, used to implement "end" iterator
+ MMappedFileRecordIterator() : good_(false) { }
+
+ MMappedFileRecordIterator(const std::string &FileName)
+ : reader_(FileName, false), good_(true) {
+ reader_.read(&value_, sizeof(value_));
+ }
+
+ MMappedFileRecordIterator(MMappedRecordReader<T> &&reader)
+ : reader_(std::move(reader)), good_(true) {
+ reader_.read(&value_, sizeof(value_));
+ }
+
+ bool good() const {
+ return good_;
+ }
+
+private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ good_ = reader_.good();
+ if (good_)
+ reader_.read(&value_, sizeof(value_));
+ }
+
+ bool equal(const MMappedFileRecordIterator &other) const {
+ // Iterators are equal iff:
+ // 1) They both are not good (at the end of the stream),
+ // or
+ // 2) Has the same mapped region
+ return ((!reader_.good() && !other.reader_.good()) ||
+ reader_.data() == other.reader_.data());
+ }
+
+ const T dereference() const { return value_; }
+
+ T value_;
+ MMappedRecordReader<T> reader_;
+ bool good_;
+};
+
+template<typename T>
+class MMappedRecordArrayReader : public MMappedReader {
+ size_t elcnt_;
+
+public:
+ typedef typename array_vector<T>::iterator iterator;
+ typedef typename array_vector<T>::const_iterator const_iterator;
+
+ MMappedRecordArrayReader(const std::string &FileName,
+ size_t elcnt = 1,
+ bool unlink = true,
+ off_t off = 0, size_t sz = 0) :
+ MMappedReader(FileName, unlink, -1ULL, off, sz), elcnt_(elcnt) {
+ VERIFY(FileSize % (sizeof(T) * elcnt_) == 0);
+ }
+
+ void read(T *el, size_t amount) {
+ MMappedReader::read(el, amount * sizeof(T) * elcnt_);
+ }
+
+ size_t size() const { return FileSize / sizeof(T) / elcnt_; }
+
+ size_t data_size() const { return FileSize; }
+
+ size_t elcnt() const { return elcnt_; }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx * elcnt_]; }
+
+ const T &operator[](size_t idx) const { return data()[idx * elcnt_]; }
+
+ iterator begin() { return iterator(data(), /* size */ elcnt_); }
+
+ const_iterator begin() const { return const_iterator(data(), /* size */ elcnt_); }
+
+ const_iterator cbegin() const { return const_iterator(data(), /* size */ elcnt_); }
+
+ iterator end() { return iterator(data() + size() * elcnt_, elcnt_); }
+
+ const_iterator end() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
+
+ const_iterator cend() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
+};
+
+static inline size_t round_up(size_t value, size_t boundary) {
+ return (value + boundary - 1) / boundary * boundary;
+}
+
+template<class T>
+class MMappedFileRecordArrayIterator :
+ public boost::iterator_facade<MMappedFileRecordArrayIterator<T>,
+ const T *,
+ std::input_iterator_tag,
+ const T *> {
+public:
+ // Default ctor, used to implement "end" iterator
+ MMappedFileRecordArrayIterator() : value_(NULL), array_size_(0), reader_(), good_(false) { }
+
+ MMappedFileRecordArrayIterator(const std::string &FileName,
+ size_t elcnt,
+ off_t offset = 0, size_t filesize = 0)
+ : value_(NULL),
+ array_size_(sizeof(T) * elcnt),
+ reader_(FileName, false,
+ round_up(filesize > 0 ? std::min(size_t(64 * 1024 * 1024), filesize) : 64 * 1024 * 1024,
+ array_size_ * (unsigned) getpagesize()),
+ offset, filesize),
+ good_(false) {
+ increment();
+ }
+
+ MMappedFileRecordArrayIterator(MMappedRecordReader<T> &&reader, size_t elcnt)
+ : value_(NULL), array_size_(sizeof(T) * elcnt), reader_(std::move(reader)), good_(false) {
+ increment();
+ }
+
+ MMappedFileRecordArrayIterator(const MMappedFileRecordArrayIterator &) = delete;
+
+ MMappedFileRecordArrayIterator(MMappedFileRecordArrayIterator &&other)
+ : value_(other.value_), array_size_(other.array_size_),
+ reader_(std::move(other.reader_)), good_(other.good_) { }
+
+ bool good() const { return good_; }
+
+ const MMappedRecordReader<T> &reader() const { return reader_; }
+
+private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ good_ = reader_.good();
+ value_ = (good_ ? (T *) reader_.skip(array_size_) : NULL);
+ }
+
+ bool equal(const MMappedFileRecordArrayIterator &other) const {
+ return value_ == other.value_;
+ }
+
+ const T *dereference() const { return value_; }
+
+ T *value_;
+ size_t array_size_;
+ MMappedRecordReader<T> reader_;
+ bool good_;
+};
+
+#endif // HAMMER_MMAPPED_READER_HPP
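
MMappedReader maps a file in page-aligned blocks and remaps whenever a read crosses a block boundary. Stripped of the remapping logic, the underlying open/fstat/mmap/memcpy/munmap cycle looks like the minimal POSIX sketch below (whole-file mapping, error handling reduced to early returns; not the class's actual interface).

    // Minimal standalone sketch of the mmap-based reading that MMappedReader
    // builds on: map a whole file read-only, copy some bytes out, unmap.
    #include <algorithm>
    #include <cstdio>
    #include <cstring>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv) {
        if (argc != 2) { std::fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }
        int fd = open(argv[1], O_RDONLY);
        if (fd == -1) { std::perror("open"); return 1; }
        struct stat st;
        if (fstat(fd, &st) == -1 || st.st_size == 0) { close(fd); return 1; }
        void *region = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (region == MAP_FAILED) { std::perror("mmap"); close(fd); return 1; }
        // "Read" by copying out of the mapping, as MMappedReader::read_internal does.
        char buf[16] = {0};
        std::memcpy(buf, region, std::min<size_t>(sizeof(buf) - 1, (size_t) st.st_size));
        std::printf("first bytes: %s\n", buf);
        munmap(region, (size_t) st.st_size);
        close(fd);
        return 0;
    }
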
diff --git a/src/common/io/kmers/mmapped_writer.hpp b/src/common/io/kmers/mmapped_writer.hpp
new file mode 100644
index 0000000..9b3b2ce
--- /dev/null
+++ b/src/common/io/kmers/mmapped_writer.hpp
@@ -0,0 +1,191 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_MMAPPED_WRITER_HPP
+#define HAMMER_MMAPPED_WRITER_HPP
+
+#include "common/adt/pointer_iterator.hpp"
+#include "common/adt/array_vector.hpp"
+#include "utils/verify.hpp"
+
+#include <string>
+#include <cerrno>
+#include <cstring>
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <strings.h>
+
+class MMappedWriter {
+ int StreamFile;
+
+ MMappedWriter(const MMappedWriter &) = delete;
+
+protected:
+ uint8_t *MappedRegion;
+ size_t BytesWritten, BytesReserved, FileOffset, BufOffset;
+public:
+ MMappedWriter() = default;
+
+ MMappedWriter(const std::string &FileName) {
+ open(FileName);
+ }
+
+ void open(const std::string &FileName) {
+ StreamFile = ::open(FileName.c_str(), O_RDWR | O_CREAT | O_TRUNC, (mode_t) 0660);
+ VERIFY_MSG(StreamFile != -1,
+ "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+
+ FileOffset = BytesWritten = 0;
+ MappedRegion = NULL;
+ }
+
+ virtual ~MMappedWriter() {
+ if (MappedRegion)
+ munmap(MappedRegion, BytesReserved);
+ close(StreamFile);
+ }
+
+ void write(void *buf, size_t amount) {
+ memcpy(MappedRegion + BufOffset + BytesWritten, buf, amount);
+ BytesWritten += amount;
+ }
+
+ bool good() const {
+ return BytesWritten < BytesReserved;
+ }
+
+ void reserve(size_t amount) {
+ if (MappedRegion) {
+ munmap(MappedRegion, BytesReserved);
+ FileOffset += BytesWritten;
+ MappedRegion = NULL;
+ }
+
+ if (amount == 0)
+ return;
+
+ int res = (int) lseek(StreamFile, amount - 1, SEEK_CUR);
+ VERIFY_MSG(res != -1,
+ "lseek(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ res = (int) ::write(StreamFile, "", 1);
+ VERIFY_MSG(res != -1,
+ "write(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+
+        // mmap(2) requires a page-aligned file offset, so map from the nearest page
+        // boundary below FileOffset and keep the residual as the in-buffer offset.
+ int PageSize = getpagesize();
+ size_t FileOffsetAligned = FileOffset / PageSize * PageSize;
+ size_t Residual = FileOffset - FileOffsetAligned;
+
+ BytesReserved = amount + Residual;
+ BytesWritten = 0;
+ BufOffset = Residual;
+ MappedRegion =
+ (uint8_t *) mmap(NULL, BytesReserved,
+ PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED,
+ StreamFile, FileOffsetAligned);
+ VERIFY_MSG((intptr_t) MappedRegion != -1L,
+ "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
+ }
+
+ size_t size() const { return BytesReserved; }
+};
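+
+// Worked example of the page alignment in reserve() above (illustrative, not part of
+// the upstream file): with a 4096-byte page and FileOffset == 10000, FileOffsetAligned
+// is 8192 and Residual is 1808, so the region is mapped from file offset 8192 and
+// writes start at BufOffset == 1808 inside the mapping.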
+
+template<typename T>
+class MMappedRecordWriter : public MMappedWriter {
+public:
+ typedef pointer_iterator<T> iterator;
+ typedef const pointer_iterator<T> const_iterator;
+
+ MMappedRecordWriter() = default;
+
+ MMappedRecordWriter(const std::string &FileName) :
+ MMappedWriter(FileName) {
+ }
+
+ void write(const T *el, size_t amount) {
+ MMappedWriter::write((void *) el, amount * sizeof(T));
+ }
+
+ void reserve(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T));
+ }
+
+ void resize(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T));
+ }
+
+ size_t size() const { return BytesReserved / sizeof(T); }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx]; }
+
+ const T &operator[](size_t idx) const { return data()[idx]; }
+
+ iterator begin() { return iterator(data()); }
+
+ const_iterator begin() const { return const_iterator(data()); }
+
+ iterator end() { return iterator(data() + size()); }
+
+ const_iterator end() const { return const_iterator(data() + size()); }
+};
+
+template<typename T>
+class MMappedRecordArrayWriter : public MMappedWriter {
+ size_t elcnt_;
+public:
+ typedef typename array_vector<T>::iterator iterator;
+ typedef typename array_vector<T>::const_iterator const_iterator;
+
+ MMappedRecordArrayWriter() = default;
+
+ MMappedRecordArrayWriter(const std::string &FileName,
+ size_t elcnt = 1) :
+ MMappedWriter(FileName), elcnt_(elcnt) { }
+
+ void open(const std::string &FileName,
+ size_t elcnt = 1) {
+ elcnt_ = elcnt;
+ MMappedWriter::open(FileName);
+ }
+
+ void write(const T *el, size_t amount) {
+ MMappedWriter::write((void *) el, amount * sizeof(T) * elcnt_);
+ }
+
+ void reserve(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
+ }
+
+ void resize(size_t amount) {
+ MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
+ }
+
+ size_t size() const { return BytesReserved / sizeof(T) / elcnt_; }
+
+ T *data() { return (T *) MappedRegion; }
+
+ const T *data() const { return (const T *) MappedRegion; }
+
+ T &operator[](size_t idx) { return data()[idx * elcnt_]; }
+
+ const T &operator[](size_t idx) const { return data()[idx * elcnt_]; }
+
+ iterator begin() { return iterator(data(), elcnt_); }
+
+ const_iterator begin() const { return const_iterator(data(), elcnt_); }
+
+ iterator end() { return iterator(data() + size() * elcnt_, elcnt_); }
+
+ const_iterator end() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
+};
+
+#endif // HAMMER_MMAPPED_WRITER_HPP
diff --git a/src/common/io/reads/binary_converter.hpp b/src/common/io/reads/binary_converter.hpp
new file mode 100644
index 0000000..ff427cb
--- /dev/null
+++ b/src/common/io/reads/binary_converter.hpp
@@ -0,0 +1,262 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * binary_io.hpp
+ *
+ * Created on: Apr 12, 2012
+ * Author: andrey
+ */
+
+#ifndef BINARY_IO_HPP_
+#define BINARY_IO_HPP_
+
+#include <fstream>
+
+#include "utils/verify.hpp"
+#include "ireader.hpp"
+#include "single_read.hpp"
+#include "paired_read.hpp"
+#include "pipeline/library.hpp"
+
+namespace io {
+
+template<class Read>
+class ReadBinaryWriter {
+
+public:
+
+ ReadBinaryWriter(LibraryOrientation /*orientation*/ = LibraryOrientation::Undefined) {
+ }
+
+ bool Write(std::ostream& file, const Read& r) const {
+ return r.BinWrite(file);
+ }
+};
+
+template<>
+class ReadBinaryWriter<PairedRead> {
+
+private:
+
+ bool rc1_;
+
+ bool rc2_;
+
+public:
+
+ ReadBinaryWriter(LibraryOrientation orientation) {
+ switch (orientation) {
+ case LibraryOrientation::FF: {
+ rc1_ = false;
+ rc2_ = false;
+ break;
+ }
+ case LibraryOrientation::RR: {
+ rc1_ = true;
+ rc2_ = true;
+ break;
+ }
+ case LibraryOrientation::FR: {
+ rc1_ = false;
+ rc2_ = true;
+ break;
+ }
+ case LibraryOrientation::RF: {
+ rc1_ = true;
+ rc2_ = false;
+ break;
+ }
+ default: {
+ rc1_ = false;
+ rc2_ = false;
+ break;
+ }
+ }
+
+ }
+
+ bool Write(std::ostream& file, const PairedRead& r) const {
+ return r.BinWrite(file, rc1_, rc2_);
+ }
+};
+
+
+class BinaryWriter {
+
+private:
+ const std::string file_name_prefix_;
+
+ size_t file_num_;
+
+ std::vector<std::ofstream*> file_ds_;
+
+ size_t buf_size_;
+
+ template<class Read>
+ void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file, size_t from, size_t to) {
+ for (size_t i = from; i < to; ++i) {
+ read_writer.Write(file, buffer[i]);
+ }
+ }
+
+ template<class Read>
+ void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file) {
+ FlushBuffer(buffer, read_writer, file, 0, buffer.size());
+ }
+
+ template<class Read>
+ ReadStreamStat ToBinary(io::ReadStream<Read>& stream, size_t buf_size,
+ LibraryOrientation orientation) {
+
+ ReadBinaryWriter<Read> read_writer(orientation);
+ size_t buffer_reads = buf_size / (sizeof (Read) * 4);
+ size_t reads_to_flush = buffer_reads * file_num_;
+
+ std::vector< std::vector<Read> > buf(file_num_, std::vector<Read>(buffer_reads) );
+ std::vector< ReadStreamStat > read_stats(file_num_);
+ std::vector< size_t > current_buf_sizes(file_num_, 0);
+ size_t read_count = 0;
+
+ for (size_t i = 0; i < file_num_; ++i) {
+ file_ds_[i]->seekp(0);
+ read_stats[i].write(*file_ds_[i]);
+ }
+
+ size_t buf_index;
+ while (!stream.eof()) {
+ buf_index = read_count % file_num_;
+
+ Read& r = buf[buf_index][current_buf_sizes[buf_index]];
+ stream >> r;
+ read_stats[buf_index].increase(r);
+
+ ++current_buf_sizes[buf_index];
+ VERBOSE_POWER(++read_count, " reads processed");
+
+ if (read_count % reads_to_flush == 0) {
+ for (size_t i = 0; i < file_num_; ++i) {
+ FlushBuffer(buf[i], read_writer, *file_ds_[i]);
+ current_buf_sizes[i] = 0;
+ }
+ }
+ }
+
+ ReadStreamStat result;
+ for (size_t i = 0; i < file_num_; ++i) {
+ buf[i].resize(current_buf_sizes[i]);
+ FlushBuffer(buf[i], read_writer, *file_ds_[i]);
+
+ file_ds_[i]->seekp(0);
+ read_stats[i].write(*file_ds_[i]);
+ result.merge(read_stats[i]);
+ }
+
+ INFO(read_count << " reads written");
+ return result;
+ }
+
+
+ template<class Read>
+ ReadStreamStat ToBinaryForThread(io::ReadStream<Read>& stream, size_t buf_size,
+ size_t thread_num, LibraryOrientation orientation) {
+
+ ReadBinaryWriter<Read> read_writer(orientation);
+ size_t buffer_reads = buf_size / (sizeof (Read) * 4);
+ std::vector<Read> buf(buffer_reads);
+
+ ReadStreamStat stat;
+ file_ds_[thread_num]->seekp(0);
+ stat.write(*file_ds_[thread_num]);
+
+ size_t current = 0;
+
+ while (!stream.eof()) {
+ Read& r = buf[current];
+ stream >> r;
+ stat.increase(r);
+ ++current;
+
+ if (stat.read_count_ % buffer_reads == 0) {
+ FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
+ current = 0;
+ }
+ }
+
+ buf.resize(current);
+ FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
+
+ file_ds_[thread_num]->seekp(0);
+ stat.write(*file_ds_[thread_num]);
+
+ return stat;
+ }
+
+
+public:
+
+ BinaryWriter(const std::string& file_name_prefix, size_t file_num,
+ size_t buf_size):
+ file_name_prefix_(file_name_prefix), file_num_(file_num),
+ file_ds_(), buf_size_(buf_size) {
+
+ std::string fname;
+ for (size_t i = 0; i < file_num_; ++i) {
+ fname = file_name_prefix_ + "_" + ToString(i) + ".seq";
+ file_ds_.push_back(new std::ofstream(fname, std::ios_base::binary));
+ }
+ }
+
+ ~BinaryWriter() {
+ for (size_t i = 0; i < file_num_; ++i) {
+ if (file_ds_[i]->is_open()) {
+ file_ds_[i]->close();
+ }
+ delete file_ds_[i];
+ }
+ }
+
+
+ ReadStreamStat ToBinary(io::ReadStream<io::SingleReadSeq>& stream) {
+ return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinary(io::ReadStream<io::SingleRead>& stream) {
+ return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinary(io::ReadStream<io::PairedReadSeq>& stream) {
+ return ToBinary(stream, buf_size_ / (2 * file_num_), LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinary(io::ReadStream<io::PairedRead>& stream, LibraryOrientation orientation) {
+ return ToBinary(stream, buf_size_ / (2 * file_num_), orientation);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleReadSeq>& stream, size_t thread_num) {
+ return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleRead>& stream, size_t thread_num) {
+ return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedReadSeq>& stream, size_t thread_num) {
+ return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, LibraryOrientation::Undefined);
+ }
+
+ ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedRead>& stream, size_t thread_num, LibraryOrientation orientation) {
+ return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, orientation);
+ }
+
+};
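+
+// Illustrative usage sketch (not part of the upstream file): split a stream of single
+// reads into 16 binary bucket files sharing a common prefix. The prefix, bucket count
+// and buffer size below are placeholders, not values taken from the pipeline.
+inline ReadStreamStat ToBinaryExample(ReadStream<SingleRead>& stream) {
+    BinaryWriter writer("tmp/reads_binary", /*file_num*/ 16, /*buf_size*/ 1 << 26);
+    return writer.ToBinary(stream);
+}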
+
+
+}
+
+
+#endif /* BINARY_IO_HPP_ */
diff --git a/src/common/io/reads/binary_streams.hpp b/src/common/io/reads/binary_streams.hpp
new file mode 100644
index 0000000..9769b15
--- /dev/null
+++ b/src/common/io/reads/binary_streams.hpp
@@ -0,0 +1,140 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <fstream>
+
+#include "utils/verify.hpp"
+#include "ireader.hpp"
+#include "single_read.hpp"
+#include "paired_read.hpp"
+
+namespace io {
+
+// == Deprecated classes ==
+// Use FileReadStream and InsertSizeModifying instead
+
+class BinaryFileSingleStream: public PredictableReadStream<SingleReadSeq> {
+private:
+ std::ifstream stream_;
+ ReadStreamStat read_stat_;
+ size_t current_;
+
+public:
+
+ BinaryFileSingleStream(const std::string& file_name_prefix, size_t file_num) {
+ std::string fname;
+ fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
+ stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
+
+ reset();
+ }
+
+ virtual bool is_open() {
+ return stream_.is_open();
+ }
+
+ virtual bool eof() {
+ return current_ == read_stat_.read_count_;
+ }
+
+ virtual BinaryFileSingleStream& operator>>(SingleReadSeq& read) {
+ read.BinRead(stream_);
+ VERIFY(current_ < read_stat_.read_count_);
+
+ ++current_;
+ return *this;
+ }
+
+ virtual void close() {
+ current_ = 0;
+ stream_.close();
+ }
+
+ virtual void reset() {
+ stream_.clear();
+ stream_.seekg(0);
+ VERIFY(stream_.good());
+ read_stat_.read(stream_);
+ current_ = 0;
+ }
+
+ virtual size_t size() const {
+ return read_stat_.read_count_;
+ }
+
+ virtual ReadStreamStat get_stat() const {
+ return read_stat_;
+ }
+
+};
+
+class BinaryFilePairedStream: public PredictableReadStream<PairedReadSeq> {
+
+private:
+ std::ifstream stream_;
+
+ size_t insert_size_;
+
+ ReadStreamStat read_stat_;
+
+ size_t current_;
+
+
+public:
+
+    BinaryFilePairedStream(const std::string& file_name_prefix, size_t file_num, size_t insert_size): stream_(), insert_size_(insert_size) {
+ std::string fname;
+ fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
+ stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
+
+ reset();
+ }
+
+ virtual bool is_open() {
+ return stream_.is_open();
+ }
+
+ virtual bool eof() {
+ return current_ >= read_stat_.read_count_;
+ }
+
+ virtual BinaryFilePairedStream& operator>>(PairedReadSeq& read) {
+ read.BinRead(stream_, insert_size_);
+ VERIFY(current_ < read_stat_.read_count_);
+
+ ++current_;
+ return *this;
+ }
+
+ virtual void close() {
+ current_ = 0;
+ stream_.close();
+ }
+
+
+ virtual void reset() {
+ stream_.clear();
+ stream_.seekg(0);
+ VERIFY(stream_.good());
+ read_stat_.read(stream_);
+ current_ = 0;
+ }
+
+ virtual size_t size() const {
+ return read_stat_.read_count_;
+ }
+
+ ReadStreamStat get_stat() const {
+ ReadStreamStat stat = read_stat_;
+ stat.read_count_ *= 2;
+ return stat;
+ }
+};
+
+}
diff --git a/src/common/io/reads/careful_filtering_reader_wrapper.hpp b/src/common/io/reads/careful_filtering_reader_wrapper.hpp
new file mode 100644
index 0000000..cd7771a
--- /dev/null
+++ b/src/common/io/reads/careful_filtering_reader_wrapper.hpp
@@ -0,0 +1,183 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+//todo rename file
+#include "io/reads/delegating_reader_wrapper.hpp"
+#include "pipeline/library.hpp"
+
+namespace io {
+
+const size_t none = -1ul;
+
+inline std::pair<size_t, size_t> LongestValidCoords(const SingleRead& r) {
+ size_t best_len = 0;
+ size_t best_pos = none;
+ size_t pos = none;
+ std::string seq = r.GetSequenceString();
+ for (size_t i = 0; i <= seq.size(); ++i) {
+ if (i < seq.size() && is_nucl(seq[i])) {
+ if (pos == none) {
+ pos = i;
+ }
+ } else {
+ if (pos != none) {
+ size_t len = i - pos;
+ if (len > best_len) {
+ best_len = len;
+ best_pos = pos;
+ }
+ }
+ pos = none;
+ }
+ }
+ if (best_len == 0) {
+ return std::make_pair(0, 0);
+ }
+ return std::make_pair(best_pos, best_pos + best_len);
+}
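+
+// Worked example (illustrative, not part of the upstream file): for "ACGNNACGTA" the
+// longest run of valid nucleotides is "ACGTA" at positions [5, 10), so
+// LongestValidCoords returns the pair {5, 10}.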
+
+inline SingleRead LongestValid(const SingleRead& r,
+ bool /*use_orientation*/ = false,
+ LibraryOrientation /*orientation*/ = LibraryOrientation::FR) {
+
+ std::pair<size_t, size_t> p = LongestValidCoords(r);
+ return r.Substr(p.first, p.second);
+}
+
+inline PairedRead LongestValid(const PairedRead& r,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::FR) {
+ std::pair<size_t, size_t> c1 = LongestValidCoords(r.first());
+ std::pair<size_t, size_t> c2 = LongestValidCoords(r.second());
+ size_t len1 = c1.second - c1.first;
+ size_t len2 = c2.second - c2.first;
+ if (len1 == 0 || len2 == 0) {
+ return PairedRead();
+ }
+ if (len1 == r.first().size() && len2 == r.second().size()) {
+ return r;
+ }
+
+ size_t is;
+ if (!use_orientation) {
+ is = r.insert_size() - c1.first - r.second().size() + c2.second;
+ }
+ else {
+ switch (orientation) {
+ case LibraryOrientation::FF: {
+ is = r.insert_size() - c1.first - r.second().size() + c2.second;
+ break;
+ }
+ case LibraryOrientation::RR: {
+ is = r.insert_size() - r.first().size() + c1.second - c2.first;
+ break;
+ }
+ case LibraryOrientation::FR: {
+ is = r.insert_size() - c1.first - c2.first;
+ break;
+ }
+ case LibraryOrientation::RF: {
+ is = r.insert_size() - r.first().size() + c1.second - r.second().size() + c2.second;
+ break;
+ }
+ default: {
+ is = r.insert_size() - c1.first - r.second().size() + c2.second;
+ break;
+ }
+ }
+ }
+
+ return PairedRead(r.first().Substr(c1.first, c1.second), r.second().Substr(c2.first, c2.second), is);
+}
+
+
+//todo rewrite without eof
+template<typename ReadType>
+class CarefulFilteringWrapper : public DelegatingWrapper<ReadType> {
+ typedef DelegatingWrapper<ReadType> base;
+public:
+ /*
+     * Constructor.
+     *
+     * @param reader_ptr Pointer to any other read stream (child of ReadStream).
+ */
+ CarefulFilteringWrapper(typename base::ReadStreamPtrT reader_ptr,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) :
+ base(reader_ptr),
+ eof_(false),
+ use_orientation_(use_orientation),
+ orientation_(orientation) {
+ StepForward();
+ }
+
+ /* virtual */ bool eof() {
+ return eof_;
+ }
+
+ /*
+ * Read SingleRead from stream.
+ *
+     * @param read The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ CarefulFilteringWrapper& operator>>(ReadType& read) {
+ read = next_read_;
+ StepForward();
+ return *this;
+ }
+
+ /* virtual */
+ void reset() {
+ base::reset();
+ eof_ = false;
+ StepForward();
+ }
+
+private:
+ bool eof_;
+ bool use_orientation_;
+ LibraryOrientation orientation_;
+ ReadType next_read_;
+
+ /*
+ * Read next valid read in the stream.
+ */
+ void StepForward() {
+ while (!base::eof()) {
+ base::operator >>(next_read_);
+ next_read_ = LongestValid(next_read_, use_orientation_, orientation_);
+ if (next_read_.IsValid()) {
+ return;
+ }
+ }
+ eof_ = true;
+ }
+};
+
+template<class ReadType>
+std::shared_ptr<ReadStream<ReadType>> CarefulFilteringWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) {
+ //return reader_ptr = make_shared<CarefulFilteringWrapper<ReadType>>(reader_ptr, false, LibraryOrientation::Undefined);
+ return std::shared_ptr<CarefulFilteringWrapper<ReadType> >(
+ new CarefulFilteringWrapper<ReadType>(reader_ptr, use_orientation, orientation));
+}
+
+template<class ReadType>
+ReadStreamList<ReadType> CarefulFilteringWrap(const ReadStreamList<ReadType>& readers,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) {
+ ReadStreamList<ReadType> answer;
+ for (size_t i = 0; i < readers.size(); ++i) {
+ answer.push_back(CarefulFilteringWrap<ReadType>(readers.ptr_at(i), use_orientation, orientation));
+ }
+ return answer;
+}
+
+}
diff --git a/src/modules/io/reads_io/converting_reader_wrapper.hpp b/src/common/io/reads/converting_reader_wrapper.hpp
similarity index 100%
rename from src/modules/io/reads_io/converting_reader_wrapper.hpp
rename to src/common/io/reads/converting_reader_wrapper.hpp
diff --git a/src/modules/io/reads_io/delegating_reader_wrapper.hpp b/src/common/io/reads/delegating_reader_wrapper.hpp
similarity index 100%
rename from src/modules/io/reads_io/delegating_reader_wrapper.hpp
rename to src/common/io/reads/delegating_reader_wrapper.hpp
diff --git a/src/common/io/reads/fasta_fastq_gz_parser.hpp b/src/common/io/reads/fasta_fastq_gz_parser.hpp
new file mode 100644
index 0000000..d976577
--- /dev/null
+++ b/src/common/io/reads/fasta_fastq_gz_parser.hpp
@@ -0,0 +1,165 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file fasta_fastq_gz_parser.hpp
+ * @author Mariya Fomkina
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * FastaFastqGzParser is the parser stream that reads data from .fastq.gz
+ * files.
+ */
+
+#ifndef COMMON_IO_FASTAFASTQGZPARSER_HPP
+#define COMMON_IO_FASTAFASTQGZPARSER_HPP
+
+#include <zlib.h>
+#include <string>
+#include "kseq/kseq.h"
+#include "utils/verify.hpp"
+#include "single_read.hpp"
+#include "io/reads/parser.hpp"
+#include "sequence/quality.hpp"
+#include "sequence/nucl.hpp"
+
+namespace io {
+
+namespace fastafastqgz {
+// STEP 1: declare the type of file handler and the read() function
+KSEQ_INIT(gzFile, gzread)
+}
+
+class FastaFastqGzParser: public Parser {
+public:
+ /*
+     * Constructor.
+     *
+     * @param filename The name of the file to be opened.
+     * @param offset_type The offset type of the read quality values.
+ */
+ FastaFastqGzParser(const std::string& filename, OffsetType offset_type =
+ PhredOffset) :
+ Parser(filename, offset_type), fp_(), seq_(NULL) {
+ open();
+ }
+
+ /*
+ * Default destructor.
+ */
+ /* virtual */
+ ~FastaFastqGzParser() {
+ close();
+ }
+
+ /*
+ * Read SingleRead from stream.
+ *
+ * @param read The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */
+ FastaFastqGzParser& operator>>(SingleRead& read) {
+ if (!is_open_ || eof_) {
+ return *this;
+ }
+ //todo offset_type_ should be used in future
+ if (seq_->qual.s) {
+ read = SingleRead(seq_->name.s, seq_->seq.s, seq_->qual.s, offset_type_);
+ } else {
+ read = SingleRead(seq_->name.s, seq_->seq.s);
+// size_t len = strlen(seq_->seq.s);
+// char* qual = (char*) malloc(len + 1);
+// char q = '\2' + 64;
+// for (size_t i = 0; i < len; ++i) {
+// qual[i] = q;
+// }
+// qual[len] = '\0';
+// read.SetAll(seq_->name.s, seq_->seq.s, qual, SolexaOffset);
+// free(qual);
+ }
+ ReadAhead();
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */
+ void close() {
+ if (is_open_) {
+ // STEP 5: destroy seq
+ fastafastqgz::kseq_destroy(seq_);
+ // STEP 6: close the file handler
+ gzclose(fp_);
+ is_open_ = false;
+ eof_ = true;
+ }
+ }
+
+private:
+ /*
+ * @variable File that is associated with gzipped data file.
+ */
+ gzFile fp_;
+ /*
+ * @variable Data element that stores last SingleRead got from
+ * stream.
+ */
+ fastafastqgz::kseq_t* seq_;
+
+ /*
+ * Open a stream.
+ */
+ /* virtual */
+ void open() {
+ // STEP 2: open the file handler
+ fp_ = gzopen(filename_.c_str(), "r");
+ if (!fp_) {
+ is_open_ = false;
+ return;
+ }
+ // STEP 3: initialize seq
+ seq_ = fastafastqgz::kseq_init(fp_);
+ eof_ = false;
+ is_open_ = true;
+ ReadAhead();
+ }
+
+ /*
+ * Read next SingleRead from file.
+ */
+ void ReadAhead() {
+ VERIFY(is_open_);
+ VERIFY(!eof_);
+ if (fastafastqgz::kseq_read(seq_) < 0) {
+ eof_ = true;
+ }
+ }
+
+ /*
+ * Hidden copy constructor.
+ */
+ FastaFastqGzParser(const FastaFastqGzParser& parser);
+ /*
+ * Hidden assign operator.
+ */
+ void operator=(const FastaFastqGzParser& parser);
+};
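+
+// Illustrative usage sketch (not part of the upstream file): count the records in a
+// plain or gzipped FASTA/FASTQ file using the parser above. The file name is a
+// placeholder.
+inline size_t CountFastaFastqRecords(const std::string &filename) {
+    FastaFastqGzParser parser(filename);
+    SingleRead read;
+    size_t count = 0;
+    while (parser.is_open() && !parser.eof()) {
+        parser >> read;
+        ++count;
+    }
+    return count;
+}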
+
+}
+
+#endif /* COMMON_IO_FASTAFASTQGZPARSER_HPP */
diff --git a/src/common/io/reads/file_reader.hpp b/src/common/io/reads/file_reader.hpp
new file mode 100644
index 0000000..49037d6
--- /dev/null
+++ b/src/common/io/reads/file_reader.hpp
@@ -0,0 +1,129 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+
+* FileReadStream reads SingleReads from a single file through a Parser object.
+* Paired-read streams (see paired_readers.hpp) combine two such streams with the
+* insert size information to produce PairedReads.
+*/
+
+#pragma once
+
+#include "ireader.hpp"
+#include "single_read.hpp"
+#include "parser.hpp"
+#include "utils/path_helper.hpp"
+
+namespace io {
+
+class FileReadStream : public ReadStream<SingleRead> {
+public:
+ /*
+     * Constructor.
+     *
+     * @param filename The name of the file to be opened.
+     * @param offset_type The offset type of the read quality values.
+ */
+ explicit FileReadStream(const std::string &filename,
+ OffsetType offset_type = PhredOffset)
+ : filename_(filename), offset_type_(offset_type), parser_(NULL) {
+ path::CheckFileExistenceFATAL(filename_);
+ parser_ = SelectParser(filename_, offset_type_);
+ }
+
+ /*
+ * Default destructor.
+ */
+ /* virtual */ ~FileReadStream() {
+ close();
+ delete parser_;
+ }
+
+ /*
+ * Check whether the stream is opened.
+ *
+     * @return true if the stream is opened and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ if (parser_ != NULL) {
+ return parser_->is_open();
+ } else {
+ return false;
+ }
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+ if (parser_ != NULL) {
+ return parser_->eof();
+ } else {
+ return true;
+ }
+ }
+
+ /*
+ * Read SingleRead from stream.
+ *
+ * @param singleread The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ FileReadStream &operator>>(SingleRead &singleread) {
+ if (parser_ != NULL) {
+ (*parser_) >> singleread;
+ }
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */ void close() {
+ if (parser_ != NULL) {
+ parser_->close();
+ }
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */ void reset() {
+ if (parser_ != NULL) {
+ parser_->reset();
+ }
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+private:
+ /*
+ * @variable The name of the file which stream reads from.
+ */
+ std::string filename_;
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+ /*
+ * @variable Internal stream that reads from file.
+ */
+ Parser *parser_;
+
+};
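+
+// Illustrative usage sketch (not part of the upstream file): open a FASTA/FASTQ file
+// with the default Phred quality offset and accumulate per-stream statistics. The
+// file name is a placeholder.
+inline ReadStreamStat CollectFileReadStats(const std::string &filename) {
+    FileReadStream stream(filename, PhredOffset);
+    ReadStreamStat stat;
+    SingleRead read;
+    while (!stream.eof()) {
+        stream >> read;
+        stat.increase(read);
+    }
+    return stat;
+}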
+
+}
diff --git a/src/modules/io/reads_io/filtering_reader_wrapper.hpp b/src/common/io/reads/filtering_reader_wrapper.hpp
similarity index 100%
rename from src/modules/io/reads_io/filtering_reader_wrapper.hpp
rename to src/common/io/reads/filtering_reader_wrapper.hpp
diff --git a/src/common/io/reads/io_helper.hpp b/src/common/io/reads/io_helper.hpp
new file mode 100644
index 0000000..7eea77c
--- /dev/null
+++ b/src/common/io/reads/io_helper.hpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "read_stream_vector.hpp"
+#include "single_read.hpp"
+#include "paired_read.hpp"
+#include "file_reader.hpp"
+#include "paired_readers.hpp"
+#include "binary_streams.hpp"
+#include "multifile_reader.hpp"
+#include "converting_reader_wrapper.hpp"
+#include "careful_filtering_reader_wrapper.hpp"
+#include "rc_reader_wrapper.hpp"
+
+namespace io {
+ typedef ReadStream<SingleRead> SingleStream;
+ typedef std::shared_ptr<SingleStream> SingleStreamPtr;
+ typedef ReadStreamList<SingleRead> SingleStreams;
+
+ typedef ReadStream<PairedRead> PairedStream;
+ typedef std::shared_ptr<PairedStream> PairedStreamPtr;
+ typedef ReadStreamList<PairedRead> PairedStreams;
+
+ typedef ReadStream<SingleReadSeq> BinarySingleStream;
+ typedef std::shared_ptr<BinarySingleStream> BinarySingleStreamPtr;
+ typedef ReadStreamList<SingleReadSeq> BinarySingleStreams;
+
+ typedef ReadStream<PairedReadSeq> BinaryPairedStream;
+ typedef std::shared_ptr<BinaryPairedStream> BinaryPairedStreamPtr;
+ typedef ReadStreamList<PairedReadSeq> BinaryPairedStreams;
+
+ inline BinarySingleStreams apply_single_wrappers(bool followed_by_rc,
+ BinarySingleStreams& single_readers,
+ BinaryPairedStreams* paired_readers = 0) {
+ VERIFY(single_readers.size() != 0);
+ BinarySingleStreams readers = single_readers;
+
+ if (paired_readers != 0) {
+ VERIFY(single_readers.size() == paired_readers->size());
+ BinarySingleStreams squashed_paired = SquashingWrap<PairedReadSeq>(*paired_readers);
+ readers = WrapPairsInMultifiles<SingleReadSeq>(squashed_paired, readers);
+ }
+
+ if (followed_by_rc) {
+ readers = RCWrap<SingleReadSeq>(readers);
+ }
+ return readers;
+ }
+
+ //todo make deprecated
+ inline BinaryPairedStreams apply_paired_wrappers(bool followed_by_rc,
+ BinaryPairedStreams& readers) {
+ VERIFY(readers.size() != 0);
+ if (followed_by_rc) {
+ return RCWrap<PairedReadSeq>(readers);
+ } else {
+ return readers;
+ }
+ }
+
+ inline SingleStreamPtr EasyStream(const std::string& filename, bool followed_by_rc,
+ bool handle_Ns = true, OffsetType offset_type = PhredOffset) {
+ SingleStreamPtr reader = make_shared<FileReadStream>(filename, offset_type);
+ if (handle_Ns) {
+ reader = CarefulFilteringWrap<SingleRead>(reader);
+ }
+ if (followed_by_rc) {
+ reader = RCWrap<SingleRead>(reader);
+ }
+ return reader;
+ }
+
+ inline PairedStreamPtr WrapPairedStream(PairedStreamPtr reader,
+ bool followed_by_rc,
+ bool use_orientation = false,
+ LibraryOrientation orientation = LibraryOrientation::Undefined) {
+ PairedStreamPtr answer = reader;
+ answer = CarefulFilteringWrap<PairedRead>(answer, use_orientation, orientation);
+ if (followed_by_rc) {
+ answer = RCWrap<PairedRead>(answer);
+ }
+ return answer;
+
+ }
+
+ inline PairedStreamPtr PairedEasyStream(const std::string& filename1, const std::string& filename2,
+ bool followed_by_rc, size_t insert_size, bool change_read_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset) {
+ PairedStreamPtr reader = make_shared<SeparatePairedReadStream>(filename1, filename2, insert_size,
+ change_read_order, use_orientation,
+ orientation, offset_type);
+        // Use orientation for insert size calculation if it is not handled by the orientation changer
+ return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
+ }
+
+ inline PairedStreamPtr PairedEasyStream(const std::string& filename, bool followed_by_rc,
+ size_t insert_size, bool change_read_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset) {
+ PairedStreamPtr reader = make_shared<InterleavingPairedReadStream>(filename, insert_size, change_read_order,
+ use_orientation, orientation, offset_type);
+        // Use orientation for insert size calculation if it is not handled by the orientation changer
+ return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
+ }
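+
+    // Illustrative usage sketch (not part of the upstream file): build the two most
+    // common stream configurations. File names and the insert size are placeholders.
+    inline void EasyStreamUsageExample() {
+        SingleStreamPtr single = EasyStream("reads.fastq", /*followed_by_rc*/ true);
+        PairedStreamPtr paired = PairedEasyStream("left.fastq", "right.fastq",
+                                                  /*followed_by_rc*/ false, /*insert_size*/ 250);
+        SingleRead sr;
+        PairedRead pr;
+        while (!single->eof()) *single >> sr;
+        while (!paired->eof()) *paired >> pr;
+    }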
+}
diff --git a/src/common/io/reads/ireader.hpp b/src/common/io/reads/ireader.hpp
new file mode 100644
index 0000000..252bb5e
--- /dev/null
+++ b/src/common/io/reads/ireader.hpp
@@ -0,0 +1,117 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+//todo rename to reader
+#pragma once
+
+#include <boost/noncopyable.hpp>
+#include "utils/standard_base.hpp"
+
+namespace io {
+
+struct ReadStreamStat {
+ size_t read_count_;
+ size_t max_len_;
+ uint64_t total_len_;
+
+
+ ReadStreamStat(): read_count_(0), max_len_(0), total_len_(0) { }
+
+ void write(std::ostream& stream) const {
+ stream.write((const char *) &read_count_, sizeof(read_count_));
+ stream.write((const char *) &max_len_, sizeof(max_len_));
+ stream.write((const char *) &total_len_, sizeof(total_len_));
+ }
+
+ void read(std::istream& stream) {
+ stream.read((char *) &read_count_, sizeof(read_count_));
+ stream.read((char *) &max_len_, sizeof(max_len_));
+ stream.read((char *) &total_len_, sizeof(total_len_));
+ }
+
+ template<class Read>
+ void increase(const Read& read) {
+ size_t len = read.size();
+
+ ++read_count_;
+ if (max_len_ < len) {
+ max_len_ = len;
+ }
+ total_len_ += read.nucl_count();
+ }
+
+ void merge(const ReadStreamStat& stat) {
+ read_count_ += stat.read_count_;
+ if (max_len_ < stat.max_len_) {
+ max_len_ = stat.max_len_;
+ }
+ total_len_ += stat.total_len_;
+ }
+
+ bool valid() const {
+ return read_count_ != 0;
+ }
+
+};
+
+/**
+ * Reader is the interface for all other readers and reader wrappers.
+ */
+template<typename ReadType>
+class ReadStream: boost::noncopyable {
+ public:
+ typedef ReadType ReadT;
+
+ /*
+ * Default destructor.
+ */
+ virtual ~ReadStream() {}
+
+ /*
+ * Check whether the stream is opened.
+ *
+ * @return true if the stream is opened and false otherwise.
+ */
+ virtual bool is_open() = 0;
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of the stream is reached and false
+ * otherwise.
+ */
+ virtual bool eof() = 0;
+
+ /*
+ * Read SingleRead or PairedRead from stream (according to ReadType).
+ *
+ * @param read The SingleRead or PairedRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ virtual ReadStream& operator>>(ReadType& read) = 0;
+
+ /*
+ * Close the stream.
+ */
+ virtual void close() = 0;
+
+ /*
+ * Close the stream and open it again.
+ */
+ virtual void reset() = 0;
+
+ virtual ReadStreamStat get_stat() const = 0;
+
+};
+
+template<class Read>
+class PredictableReadStream: public ReadStream<Read> {
+public:
+ virtual size_t size() const = 0;
+};
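+
+// Illustrative sketch (not part of the upstream file): a generic consumer written only
+// against the ReadStream interface above. It assumes ReadType is default-constructible
+// and provides the size()/nucl_count() accessors used by ReadStreamStat::increase.
+template<class ReadType>
+ReadStreamStat CollectStreamStat(ReadStream<ReadType> &stream) {
+    ReadStreamStat stat;
+    ReadType read;
+    while (!stream.eof()) {
+        stream >> read;
+        stat.increase(read);
+    }
+    return stat;
+}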
+
+}
diff --git a/src/common/io/reads/ireadstream.hpp b/src/common/io/reads/ireadstream.hpp
new file mode 100644
index 0000000..e9f4089
--- /dev/null
+++ b/src/common/io/reads/ireadstream.hpp
@@ -0,0 +1,168 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+* ifastqstream.hpp
+*
+* Created on: 03.03.2011
+* Author: vyahhi
+*/
+
+#ifndef IREADSTREAM_HPP_
+#define IREADSTREAM_HPP_
+
+#include "kseq/kseq.h"
+#include <zlib.h>
+#include "utils/verify.hpp"
+#include "read.hpp"
+#include "sequence/nucl.hpp"
+
+// STEP 1: declare the type of file handler and the read() function
+KSEQ_INIT(gzFile, gzread)
+
+/*
+* Read name, seq and qual strings from FASTQ data (one by one)
+*/
+//fixme deprecated!!! remove usages!
+class ireadstream {
+
+public:
+typedef Read ReadT;
+
+ireadstream(const std::string &filename) : filename_(filename), offset_(Read::PHRED_OFFSET) {
+ is_open_ = open(filename);
+}
+
+ireadstream(const std::string &filename, int offset) : filename_(filename), offset_(offset) {
+ is_open_ = open(filename);
+}
+
+virtual ~ireadstream() {
+ close();
+}
+
+bool is_open() const {
+ return is_open_;
+}
+
+bool eof() const {
+ return eof_;
+}
+
+static std::vector <Read> *readAll(std::string filename, int cnt = -1) {
+ ireadstream irs(filename);
+ VERIFY(irs.is_open());
+ std::vector <Read> *res = new std::vector<Read>();
+ Read r;
+ while (cnt-- && irs.is_open() && !irs.eof()) {
+ irs >> r;
+ if (!r.isValid()) {
+ cnt++;
+ continue;
+ }
+ res->push_back(r);
+ }
+ irs.close();
+ return res;
+}
+
+static void readAllNoValidation(std::vector <Read> *res, std::string filename, uint64_t *totalsize,
+ int qvoffset = Read::PHRED_OFFSET, int trim_quality = -1, int cnt = -1) {
+ ireadstream irs(filename, qvoffset);
+ VERIFY(irs.is_open());
+ *totalsize = 0;
+ Read r;
+ while (cnt-- && irs.is_open() && !irs.eof()) {
+ irs >> r;
+ size_t read_size = r.trimNsAndBadQuality(trim_quality);
+ res->push_back(r);
+ *totalsize += read_size;
+ }
+ irs.close();
+}
+
+ireadstream &operator>>(Read &r) {
+ VERIFY(is_open());
+ VERIFY(!eof());
+ if (!is_open() || eof()) {
+ return *this;
+ }
+ r.setName(seq_->name.s);
+ if (seq_->qual.s) {
+ r.setQuality(seq_->qual.s, offset_);
+ }
+ r.setSequence(seq_->seq.s);
+ read_ahead(); // make actual read for the next result
+ return *this;
+}
+
+void close() {
+ if (is_open()) {
+ kseq_destroy(seq_); // STEP 5: destroy seq
+ gzclose(fp_); // STEP 6: close the file handler
+ is_open_ = false;
+ }
+}
+
+void reset() {
+ close();
+ open(filename_);
+}
+
+private:
+std::string filename_;
+gzFile fp_;
+kseq_t *seq_;
+bool is_open_;
+bool eof_;
+int offset_;
+
+/*
+ * Open the input file with FASTQ reads;
+ * return true if the file was opened, false otherwise.
+ */
+bool open(std::string filename) {
+ fp_ = gzopen(filename.c_str(), "r"); // STEP 2: open the file handler
+ if (!fp_) {
+ return false;
+ }
+ is_open_ = true;
+ seq_ = kseq_init(fp_); // STEP 3: initialize seq
+ eof_ = false;
+ read_ahead();
+ return true;
+}
+
+void read_ahead() {
+ VERIFY(is_open());
+ VERIFY(!eof());
+ if (kseq_read(seq_) < 0) {
+ eof_ = true;
+ }
+}
+};
+
+//return -1 if failed to determine offset
+inline int determine_offset(const std::string &filename) {
+ireadstream stream(filename, 0);
+size_t count = 0;
+Read r;
+while (!stream.eof() && count++ < 10000) {
+ stream >> r;
+ std::string q_str = r.getQualityString();
+ for (size_t i = 0; i < q_str.size(); ++i) {
+ int q_val = q_str[i];
+ if (q_val < ';')
+ return 33;
+ if (q_val > 'K')
+ return 64;
+ }
+}
+return -1;
+}
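+
+// Rationale for the thresholds above (explanatory note, not in the original file): with
+// a +33 offset, quality values 0..41 map to characters '!'(33)..'J'(74), so a character
+// above 'K'(75) can only come from a +64 file; with a +64 offset the lowest possible
+// character is ';'(59, Solexa quality -5), so a character below ';' implies +33. If
+// neither bound is crossed within the first 10000 reads, -1 is returned.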
+
+#endif /* IREADSTREAM_HPP_ */
diff --git a/src/common/io/reads/modifying_reader_wrapper.hpp b/src/common/io/reads/modifying_reader_wrapper.hpp
new file mode 100644
index 0000000..ec4a137
--- /dev/null
+++ b/src/common/io/reads/modifying_reader_wrapper.hpp
@@ -0,0 +1,115 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/verify.hpp"
+#include "delegating_reader_wrapper.hpp"
+#include "single_read.hpp"
+#include "paired_readers.hpp"
+
+#include <memory>
+
+namespace io {
+
+class SequenceModifier {
+public:
+ virtual ~SequenceModifier() {}
+
+ SingleRead Modify(const SingleRead& read) {
+ return SingleRead(read.name(), Modify(read.sequence()).str());
+ }
+
+ SingleReadSeq Modify(const SingleReadSeq& read) {
+ return SingleReadSeq(Modify(read.sequence()));
+ }
+
+ virtual Sequence Modify(const Sequence& s) = 0;
+};
+
+class TrivialModifier : public SequenceModifier {
+public:
+
+ virtual Sequence Modify(const Sequence& s) {
+ return s;
+ }
+};
+
+/**
+ * Attention!!! this class clears quality!!!
+ */
+template<class ReadType>
+class ModifyingWrapper;
+
+template<>
+class ModifyingWrapper<SingleRead>: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ read = modifier_->Modify(read);
+ return *this;
+ }
+};
+
+template<>
+class ModifyingWrapper<PairedRead>: public DelegatingWrapper<PairedRead> {
+ typedef DelegatingWrapper<PairedRead> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(PairedRead& read) {
+ this->reader() >> read;
+ read = PairedRead(modifier_->Modify(read.first()),
+ modifier_->Modify(read.second()),
+ read.insert_size());
+ return *this;
+ }
+};
+
+template<>
+class ModifyingWrapper<SingleReadSeq>: public DelegatingWrapper<SingleReadSeq> {
+ typedef DelegatingWrapper<SingleReadSeq> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(SingleReadSeq& read) {
+ this->reader() >> read;
+ read = modifier_->Modify(read.sequence());
+ return *this;
+ }
+};
+
+template<>
+class ModifyingWrapper<PairedReadSeq>: public DelegatingWrapper<PairedReadSeq> {
+ typedef DelegatingWrapper<PairedReadSeq> base;
+ std::shared_ptr<SequenceModifier> modifier_;
+
+public:
+ ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
+ base(reader), modifier_(modifier) {}
+
+ ModifyingWrapper& operator>>(PairedReadSeq& read) {
+ this->reader() >> read;
+ read = PairedReadSeq(modifier_->Modify(read.first().sequence())
+ , SingleReadSeq(modifier_->Modify(read.second())), read.insert_size());
+ return *this;
+ }
+};
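+
+// Illustrative usage sketch (not part of the upstream file): wrap a single-read stream
+// with the trivial modifier defined above, assuming ReadStreamPtrT is a shared_ptr to
+// ReadStream<SingleRead>. As warned earlier, the wrapper rebuilds reads from the
+// modified sequences and therefore drops their quality strings.
+inline std::shared_ptr<ReadStream<SingleRead>> TrivialModifyingWrapExample(
+        std::shared_ptr<ReadStream<SingleRead>> reader) {
+    return std::make_shared<ModifyingWrapper<SingleRead>>(reader,
+                                                          std::make_shared<TrivialModifier>());
+}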
+
+}
diff --git a/src/modules/io/reads_io/mpmc_bounded.hpp b/src/common/io/reads/mpmc_bounded.hpp
similarity index 100%
rename from src/modules/io/reads_io/mpmc_bounded.hpp
rename to src/common/io/reads/mpmc_bounded.hpp
diff --git a/src/modules/io/reads_io/multifile_reader.hpp b/src/common/io/reads/multifile_reader.hpp
similarity index 100%
rename from src/modules/io/reads_io/multifile_reader.hpp
rename to src/common/io/reads/multifile_reader.hpp
diff --git a/src/modules/io/reads_io/orientation.hpp b/src/common/io/reads/orientation.hpp
similarity index 100%
rename from src/modules/io/reads_io/orientation.hpp
rename to src/common/io/reads/orientation.hpp
diff --git a/src/common/io/reads/osequencestream.hpp b/src/common/io/reads/osequencestream.hpp
new file mode 100644
index 0000000..9545f8c
--- /dev/null
+++ b/src/common/io/reads/osequencestream.hpp
@@ -0,0 +1,381 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * oreadstream.hpp
+ *
+ * Created on: 23.06.2011
+ * Author: vyahhi
+ */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include <vector>
+#include "single_read.hpp"
+#include "paired_read.hpp"
+
+namespace io {
+
+inline std::string MakeContigId(size_t number, size_t length, const std::string& prefix = "NODE") {
+ return prefix + "_" + ToString(number) + "_length_" + ToString(length);
+}
+
+inline std::string MakeContigId(size_t number, size_t length, double coverage, const std::string& prefix = "NODE") {
+ return MakeContigId(number, length, prefix) + "_cov_" + ToString(coverage);
+}
+
+inline std::string MakeContigId(size_t number, size_t length, double coverage, size_t id, const std::string& prefix = "NODE") {
+ return MakeContigId(number, length, coverage, prefix) + "_ID_" + ToString(id);
+}
+
+inline std::string MakeRNAContigId(size_t number, size_t length, double coverage, size_t gene_id, size_t isoform_id, const std::string& prefix = "NODE") {
+ return MakeContigId(number, length, coverage, prefix) + "_g" + ToString(gene_id) + "_i" + ToString(isoform_id);
+}
+
+inline std::string MakeContigComponentId(size_t number, size_t length, double coverage, size_t component_id, const std::string& prefix = "NODE") {
+ return MakeContigId(number, length, coverage, prefix) + "_component_" + ToString(component_id);
+}
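+
+// Worked example (illustrative, not part of the upstream file): MakeContigId(1, 24705)
+// yields "NODE_1_length_24705"; the coverage/ID variants append "_cov_<coverage>" and
+// "_ID_<id>" to that base, producing the Velvet-style headers written by the streams below.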
+
+class osequencestream {
+protected:
+ std::ofstream ofstream_;
+
+ size_t id_;
+
+ void write_str(const std::string& s) {
+ size_t cur = 0;
+ while (cur < s.size()) {
+ ofstream_ << s.substr(cur, 60) << std::endl;
+ cur += 60;
+ }
+ }
+
+ virtual void write_header(const std::string& s) {
+ // Velvet format: NODE_1_length_24705_cov_358.255249
+ ofstream_ << ">" << MakeContigId(id_++, s.size()) << std::endl;
+ }
+
+public:
+ osequencestream(const std::string& filename): id_(1) {
+ ofstream_.open(filename.c_str());
+ }
+
+
+ virtual ~osequencestream() {
+ ofstream_.close();
+ }
+
+ virtual osequencestream& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ virtual osequencestream& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+ /**
+     * Uses a different way of making headers (the read name is written verbatim).
+     * Does not increase the id counter; don't mix it with the other output methods!
+ */
+ virtual osequencestream& operator<<(const SingleRead& read) {
+ ofstream_ << ">" << read.name() << std::endl;
+ size_t cur = 0;
+ std::string s = read.GetSequenceString();
+ while (cur < s.size()) {
+ ofstream_ << s.substr(cur, 60) << std::endl;
+ cur += 60;
+ }
+ return *this;
+ }
+};
+
+
+
+
+
+
+class PairedOutputSequenceStream {
+protected:
+ std::ofstream ofstreaml_;
+ std::ofstream ofstreamr_;
+
+ static void write(const SingleRead& read, std::ofstream& stream) {
+ stream << ">" << read.name() << std::endl;
+ size_t cur = 0;
+ std::string s = read.GetSequenceString();
+ while (cur < s.size()) {
+ stream << s.substr(cur, 60) << std::endl;
+ cur += 60;
+ }
+ }
+
+public:
+ PairedOutputSequenceStream(const std::string& filename1, const std::string &filename2) {
+ ofstreaml_.open(filename1);
+ ofstreamr_.open(filename2);
+ }
+
+ virtual ~PairedOutputSequenceStream() {
+ ofstreaml_.close();
+ ofstreamr_.close();
+ }
+
+ PairedOutputSequenceStream& operator<<(const PairedRead& read) {
+ write(read.first(), ofstreaml_);
+ write(read.second(), ofstreamr_);
+ return *this;
+ }
+};
+
+
+class osequencestream_cov: public osequencestream {
+protected:
+ double coverage_;
+
+ virtual void write_header(const std::string& s) {
+ // Velvet format: NODE_1_length_24705_cov_358.255249
+ ofstream_ << ">" << MakeContigId(id_++, s.size(), coverage_) << std::endl;
+ }
+
+
+public:
+ osequencestream_cov(const std::string& filename)
+ : osequencestream(filename), coverage_(0.) { }
+
+ virtual ~osequencestream_cov() {
+ ofstream_.close();
+ }
+
+ osequencestream_cov& operator<<(double coverage) {
+ coverage_ = coverage;
+ return *this;
+ }
+
+ osequencestream_cov& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_cov& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+
+class osequencestream_simple: public osequencestream {
+protected:
+ std::string header_;
+
+ double cov_;
+
+ virtual void write_header(const std::string& /*s*/) {
+ ofstream_ << ">" << header_ << std::endl;
+ }
+
+public:
+ osequencestream_simple(const std::string& filename)
+ : osequencestream(filename), header_("") { }
+
+ virtual ~osequencestream_simple() {
+ ofstream_.close();
+ }
+
+ void set_header(const std::string &header) {
+ header_ = header;
+ }
+
+ osequencestream_simple& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_simple& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+class osequencestream_with_id: public osequencestream {
+protected:
+ size_t uid_;
+
+ double cov_;
+
+ virtual void write_header(const std::string& s) {
+ ofstream_ << ">" << GetId(s) << std::endl;
+ id_++;
+ }
+
+public:
+ osequencestream_with_id(const std::string& filename)
+ : osequencestream(filename), uid_(0), cov_(0.0) { }
+
+ virtual ~osequencestream_with_id() {
+ ofstream_.close();
+ }
+
+ std::string GetId(const std::string& s) const {
+ return MakeContigId(id_, s.size(), cov_, uid_);
+ }
+
+ void setCoverage(double c) {
+ cov_ = c;
+ }
+
+ void setID(size_t uid) {
+ uid_ = uid;
+ }
+
+ osequencestream_with_id& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_with_id& operator<<(double coverage) {
+ cov_ = coverage;
+ return *this;
+ }
+
+ osequencestream_with_id& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+class osequencestream_with_manual_node_id: public osequencestream_with_id {
+ bool is_id_set_;
+ virtual void write_header(const std::string& s) {
+        // for manual NODE ID setting, osequencestream needs to check that the node ID was really set manually
+ if (!is_id_set_) {
+ WARN ("NODE ID is not set manually, setting to 0");
+ id_ = 0;
+ }
+ ofstream_ << ">" << MakeContigId(id_, s.size(), cov_, uid_) << std::endl;
+ is_id_set_ = false;
+ }
+
+public:
+//unfortunately constructor inheritance is supported only since g++4.8
+ osequencestream_with_manual_node_id(const std::string& filename): osequencestream_with_id(filename) {
+ is_id_set_ = false;
+ }
+
+ void setNodeID(int id) {
+ id_ = id;
+ is_id_set_ = true;
+ }
+
+ osequencestream_with_manual_node_id& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_with_manual_node_id& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+
+};
+
+
+class osequencestream_with_data_for_scaffold: public osequencestream_with_id {
+protected:
+ std::ofstream scstream_;
+
+ virtual void write_header(const std::string& s) {
+ scstream_ << id_ << "\tNODE_" << id_ << "\t" << s.size() << "\t" << (int) round(cov_) << std::endl;
+ ofstream_ << ">" << MakeContigId(id_++, s.size(), cov_, uid_) << std::endl;
+ }
+
+public:
+ osequencestream_with_data_for_scaffold(const std::string& filename): osequencestream_with_id(filename) {
+ id_ = 1;
+ std::string sc_filename = filename + ".info";
+ scstream_.open(sc_filename.c_str());
+ }
+
+ virtual ~osequencestream_with_data_for_scaffold() {
+ ofstream_.close();
+ scstream_.close();
+ }
+
+ osequencestream_with_data_for_scaffold& operator<<(const std::string& s) {
+ write_header(s);
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_with_data_for_scaffold& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+};
+
+class osequencestream_for_fastg: public osequencestream_with_id {
+protected:
+ std::string header_;
+
+ virtual void write_header(const std::string& s) {
+ ofstream_ << ">" << s;
+ }
+
+public:
+ osequencestream_for_fastg(const std::string& filename):
+ osequencestream_with_id(filename) {
+ id_ = 1;
+ }
+
+ virtual ~osequencestream_for_fastg() {
+ ofstream_.close();
+ }
+
+ void set_header(const std::string& h) {
+ header_= h;
+ }
+
+ osequencestream_for_fastg& operator<<(const std::set<std::string>& s) {
+ write_header(header_);
+ if (s.size() > 0) {
+ auto iter = s.begin();
+ ofstream_ << ":" << *iter;
+ ++iter;
+ while (iter != s.end()) {
+ ofstream_ << "," << *iter;
+ ++iter;
+ }
+ }
+ ofstream_ << ";" << std::endl;
+ return *this;
+ }
+
+ osequencestream_for_fastg& operator<<(const std::string& s) {
+ write_str(s);
+ return *this;
+ }
+
+ osequencestream_for_fastg& operator<<(const Sequence& seq) {
+ std::string s = seq.str();
+ return operator <<(s);
+ }
+
+};
+
+}
diff --git a/src/modules/io/reads/paired_read.hpp b/src/common/io/reads/paired_read.hpp
similarity index 100%
rename from src/modules/io/reads/paired_read.hpp
rename to src/common/io/reads/paired_read.hpp
diff --git a/src/common/io/reads/paired_readers.hpp b/src/common/io/reads/paired_readers.hpp
new file mode 100644
index 0000000..8aaa861
--- /dev/null
+++ b/src/common/io/reads/paired_readers.hpp
@@ -0,0 +1,252 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <string>
+#include "ireader.hpp"
+#include "paired_read.hpp"
+#include "file_reader.hpp"
+#include "orientation.hpp"
+
+namespace io {
+
+class SeparatePairedReadStream : public ReadStream<PairedRead> {
+ public:
+ /*
+   * Constructor.
+   *
+   * @param filename1, filename2 The names of the two files to be opened.
+   * @param insert_size The insert size assigned to the resulting PairedReads.
+   * @param offset_type The offset type of the read quality values.
+ */
+ explicit SeparatePairedReadStream(const std::string& filename1, const std::string& filename2,
+ size_t insert_size, bool change_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset)
+ : insert_size_(insert_size),
+ change_order_(change_order),
+ use_orientation_(use_orientation),
+ changer_(GetOrientationChanger<PairedRead>(orientation)),
+ offset_type_(offset_type),
+ first_(new FileReadStream(filename1, offset_type_)),
+ second_(new FileReadStream(filename2, offset_type_)),
+ filename1_(filename1),
+ filename2_(filename2){}
+
+ /*
+ * Check whether the stream is opened.
+ *
+   * @return true if the stream is opened and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ return first_->is_open() && second_->is_open();
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+
+ if (first_->eof() != second_->eof()) {
+ if (first_->eof()) {
+ ERROR("The number of right read-pairs is larger than the number of left read-pairs");
+ } else {
+ ERROR("The number of left read-pairs is larger than the number of right read-pairs");
+ }
+ FATAL_ERROR("Unequal number of read-pairs detected in the following files: " << filename1_ << " " << filename2_ << "");
+ }
+ return first_->eof();
+ }
+
+ /*
+ * Read PairedRead from stream.
+ *
+ * @param pairedread The PairedRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ SeparatePairedReadStream& operator>>(PairedRead& pairedread) {
+ SingleRead sr1, sr2;
+ (*first_) >> sr1;
+ (*second_) >> sr2;
+
+ if (use_orientation_) {
+ pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
+ }
+ else {
+ pairedread = PairedRead(sr1, sr2, insert_size_);
+ }
+
+ if (change_order_) {
+ pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
+ }
+
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */ void close() {
+ first_->close();
+ second_->close();
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */ void reset() {
+ first_->reset();
+ second_->reset();
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+ private:
+
+ size_t insert_size_;
+
+ bool change_order_;
+
+ bool use_orientation_;
+
+ std::unique_ptr<OrientationChanger<PairedRead>> changer_;
+
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+
+ /*
+ * @variable The first stream (reads from first file).
+ */
+ std::unique_ptr<ReadStream<SingleRead>> first_;
+ /*
+ * @variable The second stream (reads from second file).
+ */
+ std::unique_ptr<ReadStream<SingleRead>> second_;
+
+ //Only for providing information about error for users
+ std::string filename1_;
+ std::string filename2_;
+};
+
+class InterleavingPairedReadStream : public ReadStream<PairedRead> {
+ public:
+ /*
+   * Constructor.
+   *
+   * @param filename The name of the interleaved file to be opened.
+   * @param insert_size The insert size assigned to the resulting PairedReads.
+   * @param offset_type The offset type of the read quality values.
+ */
+ explicit InterleavingPairedReadStream(const std::string& filename, size_t insert_size, bool change_order = false,
+ bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
+ OffsetType offset_type = PhredOffset)
+ : filename_(filename), insert_size_(insert_size),
+ change_order_(change_order),
+ use_orientation_(use_orientation),
+ changer_(GetOrientationChanger<PairedRead>(orientation)),
+ offset_type_(offset_type),
+ single_(new FileReadStream(filename_, offset_type_)) {}
+
+ /*
+ * Check whether the stream is opened.
+ *
+     * @return true if the stream is open and false otherwise.
+ */
+ /* virtual */ bool is_open() {
+ return single_->is_open();
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ /* virtual */ bool eof() {
+ return single_->eof();
+ }
+
+ /*
+ * Read PairedRead from stream.
+ *
+ * @param pairedread The PairedRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ /* virtual */ InterleavingPairedReadStream& operator>>(PairedRead& pairedread) {
+ SingleRead sr1, sr2;
+ (*single_) >> sr1;
+ (*single_) >> sr2;
+
+ if (use_orientation_) {
+ pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
+ }
+ else {
+ pairedread = PairedRead(sr1, sr2, insert_size_);
+ }
+
+ if (change_order_) {
+ pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
+ }
+
+ return *this;
+ }
+
+ /*
+ * Close the stream.
+ */
+ /* virtual */ void close() {
+ single_->close();
+ }
+
+ /*
+ * Close the stream and open it again.
+ */
+ /* virtual */ void reset() {
+ single_->reset();
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+ private:
+ /*
+     * @variable The name of the file the stream reads from.
+ */
+ std::string filename_;
+
+ size_t insert_size_;
+
+ bool change_order_;
+
+ bool use_orientation_;
+
+ std::unique_ptr<OrientationChanger<PairedRead>> changer_;
+
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+
+ /*
+ * @variable The single read stream.
+ */
+ std::unique_ptr<ReadStream<SingleRead>> single_;
+
+};
+}
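For illustration, a minimal sketch of how the two paired-read streams above are typically driven; the header path, file names and insert size are placeholders and not part of the commit (io::PairedRead is assumed to be default-constructible, as its use in operator>> above suggests):

    #include "io/reads/paired_readers.hpp"   // assumed header for the classes above

    size_t CountPairs() {
        io::SeparatePairedReadStream stream("left.fastq", "right.fastq", 250 /*insert size*/);
        io::PairedRead pr;
        size_t n = 0;
        while (!stream.eof()) {
            stream >> pr;          // one read from each file, orientation applied
            ++n;
        }
        stream.close();
        return n;
    }

InterleavingPairedReadStream is used the same way, with a single interleaved file instead of two.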
diff --git a/src/common/io/reads/parser.cpp b/src/common/io/reads/parser.cpp
new file mode 100644
index 0000000..3a5ef81
--- /dev/null
+++ b/src/common/io/reads/parser.cpp
@@ -0,0 +1,90 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file parser.cpp
+ * @author Mariya Fomkina
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * Parser is the parent class for all streams that read data from
+ * different file types (fastq, fasta, sam etc).
+ * This file contains functions that are used to select exact parser
+ * according to extension.
+ */
+
+#include "single_read.hpp"
+#include "fasta_fastq_gz_parser.hpp"
+#include "parser.hpp"
+#include "sam/bam_parser.hpp"
+#include "utils/standard_base.hpp"
+
+
+namespace io {
+
+/*
+ * Get extension from filename.
+ *
+ * @param filename The name of the file to read from.
+ *
+ * @return File extension (e.g. "fastq", "fastq.gz").
+ */
+std::string GetExtension(const std::string& filename) {
+ std::string name = filename;
+ size_t pos = name.find_last_of(".");
+ std::string ext = "";
+ if (pos != std::string::npos) {
+ ext = name.substr(name.find_last_of(".") + 1);
+ if (ext == "gz") {
+ ext = name.substr(name.find_last_of
+ (".", name.find_last_of(".") - 1) + 1);
+ }
+ }
+ return ext;
+}
+
+/*
+ * Select parser type according to file extension.
+ *
+ * @param filename The name of the file to be opened.
+ * @param offset_type The offset type of the read quality.
+ *
+ * @return Pointer to a newly allocated parser for the given filename and
+ * offset.
+ */
+Parser* SelectParser(const std::string& filename,
+ OffsetType offset_type /*= PhredOffset*/) {
+ std::string ext = GetExtension(filename);
+ if (ext == "bam")
+ return new BAMParser(filename, offset_type);
+
+ return new FastaFastqGzParser(filename, offset_type);
+ /*
+ if ((ext == "fastq") || (ext == "fastq.gz") ||
+ (ext == "fasta") || (ext == "fasta.gz") ||
+ (ext == "fa") || (ext == "fq.gz") ||
+ (ext == "fq") || (ext == "fa.gz") ||
+ (ext == "seq") || (ext == "seq.gz")) {
+ return new FastaFastqGzParser(filename, offset_type);
+ }
+
+    ERROR("Unknown file extension in input!");
+ return NULL; */
+}
+
+void first_fun(int) {
+}
+
+}
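For illustration, the extension-based dispatch above in use; the caller owns the returned parser, and the file names are placeholders:

    io::SingleRead r;
    io::Parser* parser = io::SelectParser("sample.fastq.gz");   // -> FastaFastqGzParser
    while (!parser->eof())
        (*parser) >> r;
    parser->close();
    delete parser;

    io::Parser* bam = io::SelectParser("sample.bam");           // -> BAMParser
    delete bam;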
diff --git a/src/common/io/reads/parser.hpp b/src/common/io/reads/parser.hpp
new file mode 100644
index 0000000..030a985
--- /dev/null
+++ b/src/common/io/reads/parser.hpp
@@ -0,0 +1,145 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+* @file parser.hpp
+* @author Mariya Fomkina
+* @version 1.0
+*
+* @section LICENSE
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of
+* the License, or (at your option) any later version.
+*
+* @section DESCRIPTION
+*
+* Parser is the parent class for all streams that read data from
+* different file types (fastq, fasta, sam etc).
+*/
+
+#ifndef COMMON_IO_PARSER_HPP
+#define COMMON_IO_PARSER_HPP
+
+#include <string>
+#include "single_read.hpp"
+
+namespace io {
+
+class Parser {
+public:
+ /*
+ * Default constructor.
+ *
+ * @param filename The name of the file to be opened.
+ * @param offset The offset of the read quality.
+ */
+ Parser(const std::string &filename,
+ OffsetType offset_type = PhredOffset)
+ : filename_(filename), offset_type_(offset_type),
+ is_open_(false), eof_(true) { }
+
+ /*
+ * Default destructor.
+ */
+ virtual ~Parser() { }
+
+ /*
+ * Check whether the stream is opened.
+ *
+     * @return true if the stream is open and false otherwise.
+ */
+ virtual bool is_open() const {
+ return is_open_;
+ }
+
+ /*
+ * Check whether we've reached the end of stream.
+ *
+ * @return true if the end of stream is reached and false
+ * otherwise.
+ */
+ virtual bool eof() const {
+ return eof_;
+ }
+
+ /*
+ * Read SingleRead from stream.
+ *
+ * @param read The SingleRead that will store read data.
+ *
+ * @return Reference to this stream.
+ */
+ virtual Parser &operator>>(SingleRead &read) = 0;
+
+ /*
+ * Close the stream.
+ */
+ virtual void close() = 0;
+
+ /*
+ * Close the stream and open it again.
+ */
+ void reset() {
+ close();
+ open();
+ }
+
+protected:
+ /*
+     * @variable The name of the file the stream reads from.
+ */
+ std::string filename_;
+ /*
+ * @variable Quality offset type.
+ */
+ OffsetType offset_type_;
+ /*
+ * @variable Flag that shows whether the stream is opened.
+ */
+ bool is_open_;
+ /*
+ * @variable Flag that shows whether the end of the stream is
+ * reached.
+ */
+ bool eof_;
+
+private:
+ /*
+ * Open a stream.
+ */
+ virtual void open() = 0;
+};
+
+/*
+* Get extension from filename.
+*
+* @param filename The name of the file to read from.
+*
+* @return File extension (e.g. "fastq", "fastq.gz").
+*/
+std::string GetExtension(const std::string &filename);
+
+/*
+* Select parser type according to file extension.
+*
+* @param filename The name of the file to be opened.
+* @param offset_type The offset type of the read quality.
+*
+* @return Pointer to a newly allocated parser for the given filename and
+* offset.
+*/
+Parser *SelectParser(const std::string &filename,
+ OffsetType offset_type = PhredOffset);
+
+//todo delete???
+void first_fun(int);
+
+}
+
+#endif /* COMMON_IO_PARSER_HPP */
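A sketch of the minimal surface a concrete Parser subclass has to implement, mirroring the open-in-constructor pattern of the BAMParser added later in this commit; the one-record in-memory source is purely illustrative:

    class OneReadParser : public io::Parser {
    public:
        OneReadParser(const std::string& filename,
                      io::OffsetType offset_type = io::PhredOffset)
                : io::Parser(filename, offset_type) { open(); }
        ~OneReadParser() { close(); }

        OneReadParser& operator>>(io::SingleRead& read) override {
            if (is_open_ && !eof_) {
                read = io::SingleRead(filename_, "ACGT");  // dummy record
                eof_ = true;                               // then report end of stream
            }
            return *this;
        }

        void close() override { is_open_ = false; eof_ = true; }

    private:
        void open() override { is_open_ = true; eof_ = false; }
    };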
diff --git a/src/modules/io/reads_io/rc_reader_wrapper.hpp b/src/common/io/reads/rc_reader_wrapper.hpp
similarity index 100%
rename from src/modules/io/reads_io/rc_reader_wrapper.hpp
rename to src/common/io/reads/rc_reader_wrapper.hpp
diff --git a/src/common/io/reads/read.hpp b/src/common/io/reads/read.hpp
new file mode 100644
index 0000000..913a6f3
--- /dev/null
+++ b/src/common/io/reads/read.hpp
@@ -0,0 +1,244 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * read.hpp
+ *
+ * Created on: 29.03.2011
+ * Author: vyahhi
+ */
+
+#ifndef READ_HPP_
+#define READ_HPP_
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include "utils/verify.hpp"
+#include "sequence/quality.hpp"
+#include "sequence/sequence.hpp"
+#include "sequence/nucl.hpp"
+#include "sequence/sequence_tools.hpp"
+#include "utils/simple_tools.hpp"
+
+//fixme deprecated!!! used in hammer!
+class Read {
+public:
+ static const int PHRED_OFFSET = 33;
+
+ bool isValid() const {
+ return valid_;
+ }
+
+ Sequence getSequence() const {
+ VERIFY(valid_);
+ return Sequence(seq_);
+ }
+
+ Sequence getSubSequence(size_t start, size_t length) const __attribute__ ((deprecated)) {
+ VERIFY(length > 0 && start + length <= seq_.size());
+ return Sequence(seq_.substr(start, length));
+ }
+
+ Quality getQuality() const {
+ VERIFY(valid_);
+ return Quality(qual_);
+ }
+
+ const std::string &getSequenceString() const {
+ return seq_;
+ }
+
+ const std::string &getQualityString() const {
+ return qual_;
+ }
+
+ std::string getPhredQualityString(int offset = PHRED_OFFSET) const {
+ std::string res = qual_;
+ for (size_t i = 0; i < res.size(); ++i) {
+ res[i] = (char) (res[i] + offset);
+ }
+ return res;
+ }
+
+ const std::string &getName() const {
+ return name_;
+ }
+
+ size_t size() const {
+ return seq_.size();
+ }
+
+ char operator[](size_t i) const {
+ VERIFY(is_nucl(seq_[i]));
+ return dignucl(seq_[i]);
+ }
+
+ /**
+ * trim read
+ * @param ltrim first good base
+ * @param rtrim last good base
+ * @return whether there is anything left
+ */
+ bool trimLeftRight(int ltrim, int rtrim) {
+ if (ltrim >= (int) seq_.size() || rtrim < 0 || rtrim < ltrim) {
+ seq_ = "";
+ qual_ = "";
+ valid_ = false;
+ return 0;
+ }
+ bool donesomething = false;
+ if (ltrim > 0) {
+ ltrim_ += ltrim;
+ seq_.erase(0, ltrim);
+ qual_.erase(0, ltrim);
+ donesomething = true;
+ }
+ if (rtrim - ltrim + 1 < (int) seq_.size() && rtrim < (int) seq_.size() - ltrim - 1) {
+ rtrim_ -= ((int) seq_.size() - (rtrim - ltrim + 1));
+ seq_.erase(rtrim - ltrim + 1, std::string::npos);
+ qual_.erase(rtrim - ltrim + 1, std::string::npos);
+ donesomething = true;
+ }
+ if (donesomething) valid_ = updateValid();
+ return true;
+ }
+
+ size_t trimNsAndBadQuality(int threshold) {
+ int start = 0;
+ for (; start < (int) seq_.size(); ++start) {
+ if (seq_[start] != 'N' && (int) qual_[start] > threshold) break;
+ }
+ int end = 0;
+ for (end = (int) seq_.size() - 1; end > -1; --end) {
+ if (seq_[end] != 'N' && (int) qual_[end] > threshold) break;
+ }
+ if (!trimLeftRight(start, end)) return 0;
+ else return seq_.size();
+ }
+
+ /**
+ * @param k k as in k-mer
+ * @param start start point
+ * @return the first starting point of a valid k-mer >=start; return -1 if no such place exists
+ */
+ size_t firstValidKmer(size_t start, size_t k) const __attribute__ ((deprecated)) {
+ size_t curHypothesis = start;
+ size_t i = start;
+ for (; i < seq_.size(); ++i) {
+ if (i >= k + curHypothesis)
+ return curHypothesis;
+ if (!is_nucl(seq_[i])) {
+ curHypothesis = i + 1;
+ }
+ }
+ if (i >= k + curHypothesis) {
+ return curHypothesis;
+ }
+ return -1ULL;
+ }
+
+ void setSequence(const char *s, bool preserve_trimming = false) {
+ seq_ = s;
+ if (!preserve_trimming) {
+ ltrim_ = 0;
+ rtrim_ = initial_size_ = (int) seq_.size();
+ }
+ valid_ = updateValid();
+ }
+
+ void setQuality(const char *s, int offset = PHRED_OFFSET) {
+ qual_ = s;
+ for (size_t i = 0; i < qual_.size(); ++i) {
+ qual_[i] = (char) (qual_[i] - offset);
+ }
+ }
+
+ void setName(const char *s) {
+ name_ = s;
+ }
+
+ Read()
+ : valid_(false), ltrim_(0), rtrim_(0), initial_size_(0) {
+ ;
+ }
+
+ Read(const std::string &name, const std::string &seq, const std::string &qual) :
+ name_(name), seq_(seq), qual_(qual) { // for test only!
+ ltrim_ = 0;
+ initial_size_ = rtrim_ = (int) seq_.size();
+ valid_ = updateValid();
+ }
+
+ int ltrim() const { return ltrim_; }
+
+ void set_ltrim(unsigned val) { ltrim_ = val; };
+
+ int rtrim() const { return rtrim_; }
+
+ int initial_size() const { return initial_size_; }
+
+private:
+ std::string name_;
+ std::string seq_;
+ std::string qual_;
+ bool valid_;
+ int ltrim_;
+ int rtrim_;
+ int initial_size_;
+
+ friend class ireadstream;
+
+ friend uint32_t TrimBadQuality(Read *, int);
+
+ bool updateValid() const {
+ if (seq_.size() == 0) {
+ return false;
+ }
+ for (size_t i = 0; i < seq_.size(); ++i) {
+ if (!is_nucl(seq_[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ Read operator!() const {
+ std::string newName;
+ if (name_ == "" || name_[0] != '!') {
+ newName = '!' + name_;
+ } else {
+ newName = name_.substr(1, name_.length());
+ }
+ return Read(newName, ReverseComplement(seq_), Reverse(qual_));
+ }
+
+ void print(std::ostream &outf, int offset) const {
+ outf << "@" << name_.c_str() << "\n";
+ for (int i = 0; i < ltrim_; ++i) outf << "N";
+ outf << seq_.c_str();
+ for (int i = 0; i < initial_size_ - rtrim_; ++i) outf << "N";
+ outf << "\n" << "+" << name_.c_str();
+ if (ltrim_ > 0) outf << " ltrim=" << ltrim_;
+ if (rtrim_ < initial_size_)
+ outf << " rtrim=" << (initial_size_ - rtrim_);
+ outf << "\n";
+ char badq = (char) (offset + 2);
+ for (int i = 0; i < ltrim_; ++i) outf << badq;
+ outf << getPhredQualityString(offset).c_str();
+ for (int i = 0; i < initial_size_ - rtrim_; ++i) outf << badq;
+ outf << "\n";
+ }
+};
+
+// todo: put this to *.cpp
+//ostream& operator<<(ostream& os, const Read& read) {
+// return os << read.getSequenceString();
+//}
+
+#endif /* READ_HPP_ */
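A small illustration of the legacy Read interface above; qualities are stored offset-free, so getPhredQualityString() re-adds PHRED_OFFSET on output:

    Read r("frag", "ACCT", std::string(4, char(40)));  // test-only constructor from above
    // r.getPhredQualityString() == "IIII"             (40 + 33 = 'I')
    Read rc = !r;                                      // reverse complement, name "!frag"
    // rc.getSequenceString() == "AGGT"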
diff --git a/src/common/io/reads/read_processor.hpp b/src/common/io/reads/read_processor.hpp
new file mode 100644
index 0000000..a8d060b
--- /dev/null
+++ b/src/common/io/reads/read_processor.hpp
@@ -0,0 +1,209 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __HAMMER_READ_PROCESSOR_HPP__
+#define __HAMMER_READ_PROCESSOR_HPP__
+
+#include "io/reads/mpmc_bounded.hpp"
+
+#include "utils/openmp_wrapper.h"
+
+#pragma GCC diagnostic push
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wunused-private-field"
+#endif
+namespace hammer {
+class ReadProcessor {
+ static size_t constexpr cacheline_size = 64;
+ typedef char cacheline_pad_t[cacheline_size];
+
+ unsigned nthreads_;
+ cacheline_pad_t pad0;
+ size_t read_;
+ cacheline_pad_t pad1;
+ size_t processed_;
+ cacheline_pad_t pad2;
+
+private:
+ template<class Reader, class Op>
+ bool RunSingle(Reader &irs, Op &op) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
+ while (!irs.eof()) {
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
+ read_ += 1;
+
+ processed_ += 1;
+ if (op(std::move(r))) // Pass ownership of read down to processor
+ return true;
+ }
+
+ return false;
+ }
+
+ template<class Reader, class Op, class Writer>
+ void RunSingle(Reader &irs, Op &op, Writer &writer) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
+ while (!irs.eof()) {
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
+ read_ += 1;
+
+ auto res = op(std::move(r)); // Pass ownership of read down to processor
+ processed_ += 1;
+
+ if (res)
+ writer << *res;
+ }
+ }
+
+public:
+ ReadProcessor(unsigned nthreads)
+ : nthreads_(nthreads), read_(0), processed_(0) { }
+
+ size_t read() const { return read_; }
+
+ size_t processed() const { return processed_; }
+
+ template<class Reader, class Op>
+ bool Run(Reader &irs, Op &op) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
+ if (nthreads_ < 2)
+ return RunSingle(irs, op);
+
+ // Round nthreads to next power of two
+ unsigned bufsize = nthreads_ - 1;
+ bufsize = (bufsize >> 1) | bufsize;
+ bufsize = (bufsize >> 2) | bufsize;
+ bufsize = (bufsize >> 4) | bufsize;
+ bufsize = (bufsize >> 8) | bufsize;
+ bufsize = (bufsize >> 16) | bufsize;
+ bufsize += 1;
+
+ mpmc_bounded_queue<ReadPtr> in_queue(2 * bufsize);
+
+ bool stop = false;
+# pragma omp parallel shared(in_queue, irs, op, stop) num_threads(nthreads_)
+ {
+# pragma omp master
+ {
+ while (!irs.eof()) {
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
+# pragma omp atomic
+ read_ += 1;
+
+ while (!in_queue.enqueue(std::move(r)))
+ sched_yield();
+
+# pragma omp flush (stop)
+ if (stop)
+ break;
+ }
+
+ in_queue.close();
+ }
+
+ while (1) {
+ ReadPtr r;
+
+ if (!in_queue.wait_dequeue(r))
+ break;
+
+# pragma omp atomic
+ processed_ += 1;
+
+ bool res = op(std::move(r));
+ if (res) {
+# pragma omp atomic
+ stop |= res;
+ }
+ }
+ }
+
+# pragma omp flush(stop)
+ return stop;
+ }
+
+ template<class Reader, class Op, class Writer>
+ void Run(Reader &irs, Op &op, Writer &writer) {
+ using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
+
+ if (nthreads_ < 2) {
+ RunSingle(irs, op, writer);
+ return;
+ }
+
+ // Round nthreads to next power of two
+ unsigned bufsize = nthreads_ - 1;
+ bufsize = (bufsize >> 1) | bufsize;
+ bufsize = (bufsize >> 2) | bufsize;
+ bufsize = (bufsize >> 4) | bufsize;
+ bufsize = (bufsize >> 8) | bufsize;
+ bufsize = (bufsize >> 16) | bufsize;
+ bufsize += 1;
+
+ mpmc_bounded_queue<ReadPtr> in_queue(bufsize), out_queue(2 * bufsize);
+# pragma omp parallel shared(in_queue, out_queue, irs, op, writer) num_threads(nthreads_)
+ {
+# pragma omp master
+ {
+ while (!irs.eof()) {
+ ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
+ irs >> *r;
+
+ // First, try to provide read to the queue. If it's full, never mind.
+ bool status = in_queue.enqueue(std::move(r));
+
+ // Flush down the output queue
+ ReadPtr outr;
+ while (out_queue.dequeue(outr))
+ writer << *outr;
+
+ // If the input queue was originally full, wait until we can insert
+ // the read once again.
+ if (!status)
+ while (!in_queue.enqueue(std::move(r)))
+ sched_yield();
+ }
+
+ in_queue.close();
+
+ // Flush down the output queue while in master threads.
+ ReadPtr outr;
+ while (out_queue.dequeue(outr))
+ writer << *outr;
+ }
+
+ while (1) {
+ ReadPtr r;
+
+ if (!in_queue.wait_dequeue(r))
+ break;
+
+ auto res = op(std::move(r));
+ if (res)
+ while (!out_queue.enqueue(std::move(res)))
+ sched_yield();
+ }
+ }
+
+ // Flush down the output queue
+ ReadPtr outr;
+ while (out_queue.dequeue(outr))
+ writer << *outr;
+ }
+};
+
+#pragma GCC diagnostic pop
+
+}
+
+#endif // __HAMMER_READ_PROCESSOR_HPP__
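The shift/OR cascade above rounds nthreads_ up to the next power of two for use as the queue size base (e.g. 6 threads give a buffer of 8). A hedged sketch of the bool-returning Run(); it assumes the reader exposes a ReadT typedef plus the usual eof()/operator>>, as io::ReadStream does:

    #include <atomic>
    #include <memory>

    template<class Reader>
    size_t CountLongReads(Reader &stream, unsigned nthreads) {
        std::atomic<size_t> n_long{0};
        auto op = [&](std::unique_ptr<typename Reader::ReadT> r) {
            if (r->size() > 100)
                n_long.fetch_add(1, std::memory_order_relaxed);
            return false;                 // never request an early stop
        };
        hammer::ReadProcessor rp(nthreads);
        rp.Run(stream, op);               // falls back to RunSingle() if nthreads < 2
        return n_long.load();
    }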
diff --git a/src/common/io/reads/read_stream_vector.hpp b/src/common/io/reads/read_stream_vector.hpp
new file mode 100644
index 0000000..734c451
--- /dev/null
+++ b/src/common/io/reads/read_stream_vector.hpp
@@ -0,0 +1,137 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "ireader.hpp"
+#include <vector>
+
+namespace io {
+//todo rename file
+
+//todo check destroy_readers logic and usages
+template<class ReadType>
+class ReadStreamList {
+public:
+ typedef ReadType ReadT;
+ typedef ReadStream<ReadType> ReaderT;
+ typedef std::shared_ptr<ReaderT> ReaderPtrT;
+
+private:
+ std::vector<ReaderPtrT> readers_;
+
+public:
+
+ explicit ReadStreamList(const std::vector<ReaderPtrT> &readers) : readers_(readers) {
+ }
+
+ ReadStreamList() {
+ }
+
+ explicit ReadStreamList(ReaderT *reader_ptr) : readers_(1, ReaderPtrT(reader_ptr)) {
+ }
+
+ explicit ReadStreamList(ReaderPtrT reader_ptr) : readers_(1, reader_ptr) {
+ }
+
+ explicit ReadStreamList(size_t size) : readers_(size) {
+ }
+
+ //todo use boost iterator facade
+ class iterator : public std::iterator<std::input_iterator_tag, ReaderT> {
+ typedef typename std::vector<ReaderPtrT>::iterator vec_it;
+ vec_it it_;
+ public:
+
+ iterator(vec_it it) : it_(it) {
+ }
+
+ void operator++() {
+ ++it_;
+ }
+
+ bool operator==(const iterator &that) {
+ return it_ == that.it_;
+ }
+
+ bool operator!=(const iterator &that) {
+ return it_ != that.it_;
+ }
+
+ ReaderT &operator*() {
+ return *(*it_);
+ }
+ };
+
+ ReaderT &operator[](size_t i) {
+ return *readers_.at(i);
+ }
+
+ ReaderPtrT &ptr_at(size_t i) {
+ return readers_.at(i);
+ }
+
+ ReaderT &back() {
+ return *readers_.back();
+ }
+
+ size_t size() const {
+ return readers_.size();
+ }
+
+ bool eof() const {
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ if (!readers_[i]->eof()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ iterator begin() {
+ return iterator(readers_.begin());
+ }
+
+ iterator end() {
+ return iterator(readers_.end());
+ }
+
+ void push_back(ReaderT *reader_ptr) {
+ readers_.push_back(ReaderPtrT(reader_ptr));
+ }
+
+ void push_back(ReaderPtrT reader_ptr) {
+ readers_.push_back(reader_ptr);
+ }
+
+ void reset() {
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ readers_[i]->reset();
+ }
+ }
+
+ void close() {
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ readers_[i]->close();
+ }
+ }
+
+ void clear() {
+ readers_.clear();
+ }
+
+ ReadStreamStat get_stat() const {
+ ReadStreamStat stat;
+ for (size_t i = 0; i < readers_.size(); ++i) {
+ stat.merge(readers_[i]->get_stat());
+ }
+ return stat;
+ }
+
+};
+
+}
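A short sketch of ReadStreamList in use together with the VectorReadStream added later in this commit (both headers assumed included; the read contents are illustrative):

    io::ReadStreamList<io::SingleRead> streams;
    streams.push_back(new io::VectorReadStream<io::SingleRead>(io::SingleRead("r1", "ACGT")));
    streams.push_back(new io::VectorReadStream<io::SingleRead>(io::SingleRead("r2", "TTGA")));

    io::SingleRead r;
    for (auto &s : streams)        // the iterator dereferences to a ReadStream<SingleRead>&
        while (!s.eof())
            s >> r;
    streams.close();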
diff --git a/src/common/io/reads/sequence_reader.hpp b/src/common/io/reads/sequence_reader.hpp
new file mode 100644
index 0000000..86daf5d
--- /dev/null
+++ b/src/common/io/reads/sequence_reader.hpp
@@ -0,0 +1,77 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads/ireader.hpp"
+#include "common/basic/reads/single_read.hpp"
+
+namespace io {
+
+//todo merge with VectorReader
+template<class ReadType>
+class SequenceReadStream : public ReadStream<ReadType> {
+public:
+ explicit SequenceReadStream(const Sequence &sequence, const std::string &name = "")
+ : sequence_(sequence),
+ name_(name),
+ opened_(true),
+ eof_(false) {
+ }
+
+ virtual ~SequenceReadStream() {
+ }
+
+ virtual bool is_open() {
+ return opened_;
+ }
+
+ virtual bool eof() {
+ return eof_;
+ }
+
+ virtual void close() {
+ opened_ = false;
+ }
+
+ void reset() {
+ eof_ = false;
+ opened_ = true;
+ }
+
+ ReadStreamStat get_stat() const {
+ return ReadStreamStat();
+ }
+
+ SequenceReadStream &operator>>(ReadType &read);
+
+private:
+ Sequence sequence_;
+ std::string name_;
+ bool opened_;
+ bool eof_;
+};
+
+template<>
+SequenceReadStream<SingleRead> &SequenceReadStream<SingleRead>::operator>>(SingleRead &read) {
+ if (!eof_) {
+ read = SingleRead(name_, sequence_.str());
+ eof_ = true;
+ }
+ return *this;
+}
+
+template<>
+SequenceReadStream<SingleReadSeq> &SequenceReadStream<SingleReadSeq>::operator>>(SingleReadSeq &read) {
+ if (!eof_) {
+ read = SingleReadSeq(sequence_);
+ eof_ = true;
+ }
+ return *this;
+}
+
+}
diff --git a/src/common/io/reads/single_read.hpp b/src/common/io/reads/single_read.hpp
new file mode 100644
index 0000000..15bac77
--- /dev/null
+++ b/src/common/io/reads/single_read.hpp
@@ -0,0 +1,336 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/verify.hpp"
+#include "sequence/quality.hpp"
+#include "sequence/sequence.hpp"
+#include "sequence/nucl.hpp"
+#include "sequence/sequence_tools.hpp"
+#include "utils/simple_tools.hpp"
+
+#include <string>
+
+namespace io {
+
+/*
+* This enumeration describes the quality offset type.
+* UnknownOffset is equal to "offset = 0".
+* PhredOffset is equal to "offset = 33".
+* SolexaOffset is equal to "offset = 64".
+*/
+enum OffsetType {
+ UnknownOffset = 0,
+ PhredOffset = 33,
+ SolexaOffset = 64
+};
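For example, with PhredOffset the FASTQ quality character 'I' (ASCII 73) decodes to 73 - 33 = 40; under SolexaOffset the same score would be encoded as 'h' (40 + 64 = 104), and UnknownOffset leaves the characters unchanged.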
+
+//todo extract code about offset from here
+
+typedef uint16_t SequenceOffsetT;
+
+
+class SingleRead {
+public:
+
+ static std::string EmptyQuality(const std::string &seq) {
+ return std::string(seq.size(), (char) 33);
+ }
+
+ static const int BAD_QUALITY_THRESHOLD = 2;
+
+ SingleRead() :
+ name_(""), seq_(""), qual_(""), left_offset_(0), right_offset_(0), valid_(false) {
+ DEBUG(name_ << " created");
+ }
+
+ SingleRead(const std::string &name, const std::string &seq,
+ const std::string &qual, OffsetType offset,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
+ Init();
+ DEBUG(name_ << " created");
+ for (size_t i = 0; i < qual_.size(); ++i) {
+ qual_[i] = (char) (qual_[i] - offset);
+ }
+ }
+
+ SingleRead(const std::string &name, const std::string &seq,
+ const std::string &qual,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
+ DEBUG(name_ << " created");
+ Init();
+ }
+
+ SingleRead(const std::string &name, const std::string &seq,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ name_(name), seq_(seq), qual_(EmptyQuality(seq_)), left_offset_(left_offset),
+ right_offset_(right_offset) {
+ DEBUG(name_ << " created");
+ Init();
+ }
+
+ bool IsValid() const {
+ return valid_;
+ }
+
+ Sequence sequence(bool rc = false) const {
+ VERIFY(valid_);
+ return Sequence(seq_, rc);
+ }
+
+ Quality quality() const {
+ VERIFY(valid_);
+ return Quality(qual_);
+ }
+
+ const std::string &name() const {
+ return name_;
+ }
+
+ size_t size() const {
+ return seq_.size();
+ }
+
+ size_t nucl_count() const {
+ return size();
+ }
+
+ const std::string &GetSequenceString() const {
+ return seq_;
+ }
+
+ const std::string &GetQualityString() const {
+ return qual_;
+ }
+
+ std::string GetPhredQualityString() const {
+ int offset = PhredOffset;
+ std::string res = qual_;
+ for (size_t i = 0; i < res.size(); ++i) {
+ res[i] = (char) (res[i] + offset);
+ }
+ return res;
+ }
+
+ /*
+     * Return the ith nucleotide of the SingleRead sequence in numeric form
+     * (0, 1, 2 or 3).
+ *
+ * @param i Nucleotide index.
+ * @return Nucleotide on ith position of SingleRead sequence.
+ */
+ char operator[](size_t i) const {
+ VERIFY(is_nucl(seq_[i]));
+ return dignucl(seq_[i]);
+ }
+
+ SingleRead operator!() const {
+ std::string new_name;
+ if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
+ new_name = name_.substr(0, name_.length() - 3);
+ } else {
+ new_name = name_ + "_RC";
+ }
+ // TODO make naming nicer
+ // if (name_ == "" || name_[0] != '!') {
+ // new_name = '!' + name_;
+ // } else {
+ // new_name = name_.substr(1, name_.length());
+ // }
+ return SingleRead(new_name, ReverseComplement(seq_), Reverse(qual_), right_offset_, left_offset_);
+ }
+
+ SingleRead Substr(size_t from, size_t to) const {
+ VERIFY(from <= to && to <= size());
+ size_t len = to - from;
+ if (len == size()) {
+ return *this;
+ }
+ if (len == 0) {
+ return SingleRead();
+ }
+ return SubstrStrict(from, to);
+ }
+
+ bool operator==(const SingleRead &singleread) const {
+ return seq_ == singleread.seq_;
+ }
+
+ void ChangeName(const std::string &new_name) {
+ name_ = new_name;
+ }
+
+ static bool IsValid(const std::string &seq) {
+ for (size_t i = 0; i < seq.size(); ++i) {
+ if (!is_nucl(seq[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ SequenceOffsetT GetLeftOffset() const {
+ return left_offset_;
+ }
+
+ SequenceOffsetT GetRightOffset() const {
+ return right_offset_;
+ }
+
+ bool BinWrite(std::ostream &file, bool rc = false) const {
+ sequence(rc).BinWrite(file);
+ if (rc) {
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ } else {
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ }
+ return !file.fail();
+ }
+
+
+ void print_size() const {
+ std::cerr << size() << std::endl;
+ }
+
+
+private:
+ /*
+ * @variable The name of SingleRead in input file.
+ */
+ std::string name_;
+ /*
+ * @variable The sequence of nucleotides.
+ */
+ std::string seq_;
+ /*
+ * @variable The quality of SingleRead.
+ */
+ std::string qual_;
+    //Left and right offsets with respect to original sequence
+    SequenceOffsetT left_offset_;
+
+    SequenceOffsetT right_offset_;
+
+    /*
+     * @variable The flag of SingleRead correctness.
+     */
+    bool valid_;
+
+ void Init() {
+ VERIFY(seq_.size() == qual_.size());
+ valid_ = SingleRead::IsValid(seq_);
+ }
+
+ SingleRead SubstrStrict(size_t from, size_t to) const {
+ size_t len = to - from;
+ // return SingleRead(name_, seq_.substr(from, len), qual_.substr(from, len));
+ // TODO remove naming?
+ std::string new_name;
+ if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
+ new_name = name_.substr(0, name_.length() - 3) + "_SUBSTR(" + ToString(size() - to) + "," +
+ ToString(size() - from) + ")" + "_RC";
+ } else {
+ new_name = name_ + "_SUBSTR(" + ToString(from) + "," + ToString(to) + ")";
+ }
+ return SingleRead(new_name, seq_.substr(from, len), qual_.substr(from, len),
+ SequenceOffsetT(from + (size_t) left_offset_),
+ SequenceOffsetT(size() - to + (size_t) right_offset_));
+ }
+
+
+};
+
+inline std::ostream &operator<<(std::ostream &os, const SingleRead &read) {
+ os << "Single read name=" << read.name() << " sequence=" << read.GetSequenceString() << std::endl;
+ return os;
+}
+
+class SingleReadSeq {
+
+public:
+ SingleReadSeq(const Sequence &s,
+ SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
+ seq_(s), left_offset_(left_offset), right_offset_(right_offset) {
+ }
+
+ SingleReadSeq() : seq_(), left_offset_(0), right_offset_(0) {
+ }
+
+ bool BinRead(std::istream &file) {
+ seq_.BinRead(file);
+ file.read((char *) &left_offset_, sizeof(left_offset_));
+ file.read((char *) &right_offset_, sizeof(right_offset_));
+ return !file.fail();
+ }
+
+ bool BinWrite(std::ostream &file, bool rc = false) const {
+ if (rc)
+ (!seq_).BinWrite(file);
+ else
+ seq_.BinWrite(file);
+ if (rc) {
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ } else {
+ file.write((const char *) &left_offset_, sizeof(left_offset_));
+ file.write((const char *) &right_offset_, sizeof(right_offset_));
+ }
+ return !file.fail();
+ }
+
+ // SingleReadSeq(std::istream& file): seq_(file, true) {
+ // }
+
+ bool operator==(const SingleReadSeq &singleread) const {
+ return seq_ == singleread.seq_;
+ }
+
+ const Sequence sequence() const {
+ return seq_;
+ }
+
+ size_t size() const {
+ return seq_.size();
+ }
+
+ size_t nucl_count() const {
+ return size();
+ }
+
+ SingleReadSeq operator!() const {
+ return SingleReadSeq(!seq_);
+ }
+
+ SequenceOffsetT GetLeftOffset() const {
+ return left_offset_;
+ }
+
+ SequenceOffsetT GetRightOffset() const {
+ return right_offset_;
+ }
+
+private:
+ Sequence seq_;
+
+ //Left and right offsets with respect to original sequence
+ SequenceOffsetT left_offset_;
+
+ SequenceOffsetT right_offset_;
+};
+
+inline std::ostream &operator<<(std::ostream &os, const SingleReadSeq &read) {
+ os << "Single read sequence=" << read.sequence() << std::endl;
+ return os;
+}
+
+}
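A brief sketch of the SingleRead API above; the values in the comments follow directly from the code:

    io::SingleRead r("r1", "ACGTNACGT");       // no quality given: EmptyQuality() is used
    // r.IsValid() == false because of the 'N'
    io::SingleRead s = r.Substr(0, 4);         // "ACGT", renamed "r1_SUBSTR(0,4)"
    io::SingleRead rc = !s;                    // reverse complement, renamed "..._RC"
    // rc.GetSequenceString() == "ACGT"        (ACGT is its own reverse complement)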
diff --git a/src/common/io/reads/splitting_wrapper.hpp b/src/common/io/reads/splitting_wrapper.hpp
new file mode 100644
index 0000000..6665623
--- /dev/null
+++ b/src/common/io/reads/splitting_wrapper.hpp
@@ -0,0 +1,76 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "single_read.hpp"
+#include "read_stream_vector.hpp"
+#include "delegating_reader_wrapper.hpp"
+
+namespace io {
+
+class SplittingWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+private:
+ std::vector<SingleRead> buffer_;
+ size_t buffer_position_;
+
+ void FillBuffer(SingleRead& tmp_read) {
+ buffer_.clear();
+ for (size_t i = 0; i < tmp_read.size(); ++i) {
+ size_t j = i;
+ while (j < tmp_read.size() && is_nucl(tmp_read.GetSequenceString()[j])) {
+ j++;
+ }
+ if (j > i) {
+ buffer_.push_back(tmp_read.Substr(i, j));
+ i = j - 1;
+ }
+ }
+ buffer_position_ = 0;
+ }
+
+ bool Skip() {
+ while (!this->reader().eof() && buffer_position_ == buffer_.size()) {
+ SingleRead tmp_read;
+ this->reader() >> tmp_read;
+ FillBuffer(tmp_read);
+ }
+ return buffer_position_ != buffer_.size();
+ }
+
+public:
+
+ explicit SplittingWrapper(base::ReadStreamPtrT reader) :
+ base(reader), buffer_position_(0) {
+ }
+
+ /* virtual */
+ SplittingWrapper& operator>>(SingleRead& read) {
+ Skip();
+ read = buffer_[buffer_position_];
+ buffer_position_++;
+ return *this;
+ }
+
+ //todo fix needed!!! seems that eof can't be called multiple times in a row!!!
+ /* virtual */ bool eof() {
+ return !Skip();
+ }
+};
+
+inline std::shared_ptr<ReadStream<SingleRead>> SplittingWrap(std::shared_ptr<ReadStream<SingleRead>> reader_ptr) {
+ return std::make_shared<SplittingWrapper>(reader_ptr);
+}
+
+inline ReadStreamList<SingleRead> SplittingWrap(ReadStreamList<SingleRead>& readers) {
+ ReadStreamList<SingleRead> answer;
+ for (size_t i = 0; i < readers.size(); ++i) {
+ answer.push_back(SplittingWrap(readers.ptr_at(i)));
+ }
+ return answer;
+}
+}
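SplittingWrapper emits the maximal runs of valid nucleotides of each underlying read. A sketch using the VectorReadStream from vector_reader.hpp below:

    auto raw = std::make_shared<io::VectorReadStream<io::SingleRead>>(
            io::SingleRead("r1", "ACGTNNACG"));
    auto wrapped = io::SplittingWrap(raw);

    io::SingleRead piece;
    while (!wrapped->eof())
        (*wrapped) >> piece;      // yields "ACGT" and then "ACG"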
diff --git a/src/common/io/reads/vector_reader.hpp b/src/common/io/reads/vector_reader.hpp
new file mode 100644
index 0000000..74dfc7f
--- /dev/null
+++ b/src/common/io/reads/vector_reader.hpp
@@ -0,0 +1,61 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "io/reads/ireadstream.hpp"
+namespace io {
+
+/**
+ * Use vector<T> as input-stream with operator>>(T& t)
+ */
+template <typename T>
+class VectorReadStream : public ReadStream<T> {
+ std::vector<T> data_;
+ size_t pos_;
+ bool closed_;
+public:
+ VectorReadStream(const std::vector<T>& data) : data_(data), pos_(0), closed_(false) {
+
+ }
+
+ VectorReadStream(const T& item) : data_({item}), pos_(0), closed_(false) {
+
+ }
+
+ virtual bool eof() /*const */{
+ return pos_ == data_.size();
+ }
+
+ VectorReadStream<T>& operator>>(T& t) {
+ VERIFY(!eof());
+ t = data_[pos_++];
+ return *this;
+ }
+
+ void close() {
+ closed_ = true;
+ }
+
+ virtual bool is_open() /*const */{
+ return !closed_;
+ }
+
+ void reset() {
+ pos_ = 0;
+ }
+
+ ReadStreamStat get_stat() const {
+ //todo
+ ReadStreamStat stat;
+ stat.read_count_ = data_.size();
+
+ return stat;
+ }
+
+};
+
+}
diff --git a/src/common/io/reads/wrapper_collection.hpp b/src/common/io/reads/wrapper_collection.hpp
new file mode 100644
index 0000000..1f6c405
--- /dev/null
+++ b/src/common/io/reads/wrapper_collection.hpp
@@ -0,0 +1,115 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "single_read.hpp"
+#include "delegating_reader_wrapper.hpp"
+
+namespace io {
+
+//todo refactor!!!
+class IdSettingReaderWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+ size_t next_id_;
+public:
+ IdSettingReaderWrapper(base::ReadStreamPtrT reader, size_t start_id = 0) :
+ base(reader), next_id_(start_id) {
+ }
+
+ /* virtual */
+ IdSettingReaderWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ read.ChangeName(ToString(next_id_++));
+ return *this;
+ }
+};
+
+class PrefixAddingReaderWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+ std::string prefix_;
+public:
+ PrefixAddingReaderWrapper(base::ReadStreamPtrT reader,
+ const std::string& prefix) :
+ base(reader), prefix_(prefix) {
+ }
+
+ /* virtual */
+ PrefixAddingReaderWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ read.ChangeName(prefix_ + read.name());
+ return *this;
+ }
+};
+
+//fixme currently leads to long stretches of ACGTACGT...
+class FixingWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+
+ io::SingleRead MakeValid(const io::SingleRead& read) const {
+ std::string str = read.GetSequenceString();
+ for (size_t i = 0; i < str.length(); ++i) {
+ if (!is_nucl(str[i]))
+ str[i] = nucl(char(i % 4));
+ }
+ return io::SingleRead(read.name(), str);
+ }
+
+public:
+ FixingWrapper(base::ReadStreamPtrT reader) :
+ base(reader) {
+ }
+
+ /* virtual */
+ FixingWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ if (!read.IsValid()) {
+ TRACE("Read " << read.name() << " was invalid. Fixing");
+ read = MakeValid(read);
+ VERIFY(read.IsValid());
+ }
+ return *this;
+ }
+
+private:
+ DECL_LOGGER("FixingWrapper");
+};
+
+class NonNuclCollapsingWrapper: public DelegatingWrapper<SingleRead> {
+ typedef DelegatingWrapper<SingleRead> base;
+
+ io::SingleRead MakeValid(const io::SingleRead& read) const {
+ std::string str = read.GetSequenceString();
+ std::stringstream ss;
+ for (size_t i = 0; i < read.size(); ++i) {
+ if (is_nucl(str[i]))
+ ss << str[i];
+ }
+ return io::SingleRead(read.name(), ss.str());
+ }
+
+public:
+ NonNuclCollapsingWrapper(base::ReadStreamPtrT reader) :
+ base(reader) {
+ }
+
+ /* virtual */
+ NonNuclCollapsingWrapper& operator>>(SingleRead& read) {
+ this->reader() >> read;
+ if (!read.IsValid()) {
+ TRACE("Read " << read.name() << " was invalid. Collapsing non-nucls");
+ read = MakeValid(read);
+ VERIFY(read.IsValid());
+ }
+ return *this;
+ }
+
+private:
+ DECL_LOGGER("NonNuclCollapsingWrapper");
+};
+
+}
diff --git a/src/common/io/sam/bam_parser.hpp b/src/common/io/sam/bam_parser.hpp
new file mode 100644
index 0000000..74de549
--- /dev/null
+++ b/src/common/io/sam/bam_parser.hpp
@@ -0,0 +1,67 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef COMMON_IO_BAMPARSER_HPP
+#define COMMON_IO_BAMPARSER_HPP
+
+#include "reads/single_read.hpp"
+#include "io/reads/parser.hpp"
+#include "sequence/quality.hpp"
+#include "sequence/nucl.hpp"
+#include "utils/verify.hpp"
+
+#include "bamtools/api/BamReader.h"
+
+#include <string>
+
+namespace io {
+
+class BAMParser: public Parser {
+public:
+ BAMParser(const std::string& filename, OffsetType offset_type = PhredOffset)
+ : Parser(filename, offset_type) {
+ open();
+ }
+
+ ~BAMParser() {
+ close();
+ }
+
+ BAMParser& operator>>(SingleRead& read) {
+ if (!is_open_ || eof_)
+ return *this;
+
+ read = SingleRead(seq_.Name, seq_.QueryBases, seq_.Qualities, offset_type_);
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+
+ return *this;
+ }
+
+ void close() {
+ reader_.Close();
+ is_open_ = false;
+ eof_ = true;
+ }
+
+private:
+ BamTools::BamReader reader_;
+ BamTools::BamAlignment seq_;
+
+ void open() {
+ reader_.Open(filename_);
+ is_open_ = true;
+
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+ }
+
+ BAMParser(const BAMParser& parser);
+ void operator=(const BAMParser& parser);
+};
+
+}
+
+#endif /* COMMON_IO_BAMPARSER_HPP */
diff --git a/src/common/io/sam/bam_reader.hpp b/src/common/io/sam/bam_reader.hpp
new file mode 100644
index 0000000..d7f8947
--- /dev/null
+++ b/src/common/io/sam/bam_reader.hpp
@@ -0,0 +1,107 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+//todo rename to reader
+#pragma once
+
+#include "io/reads/ireader.hpp"
+#include "io/reads/single_read.hpp"
+
+#include <bamtools/api/BamReader.h>
+
+namespace io {
+class BamRead : public BamTools::BamAlignment {
+public:
+ BamRead() { }
+
+ BamRead(const BamTools::BamAlignment &other)
+ : BamTools::BamAlignment(other) { }
+
+ const std::string &name() const {
+ return Name;
+ }
+
+ size_t size() const {
+ return Length;
+ }
+
+ size_t nucl_count() const {
+ return size();
+ }
+
+ const std::string &GetSequenceString() const {
+ return QueryBases;
+ }
+
+ std::string GetPhredQualityString() const {
+ return Qualities;
+ }
+
+ operator io::SingleRead() {
+ // not including quality is intentional:
+ // during read correction bases might be inserted/deleted,
+ // and base qualities for them are not calculated
+ return io::SingleRead(name(), GetSequenceString());
+ }
+
+ char operator[](size_t i) const {
+ VERIFY(is_nucl(QueryBases[i]));
+ return dignucl(QueryBases[i]);
+ }
+};
+
+class UnmappedBamStream : public ReadStream<BamRead> {
+public:
+ UnmappedBamStream(const std::string &filename)
+ : filename_(filename) {
+ open();
+ }
+
+ virtual ~UnmappedBamStream() { }
+
+ bool is_open() { return is_open_; }
+
+ bool eof() { return eof_; }
+
+ UnmappedBamStream &operator>>(BamRead &read) {
+ if (!is_open_ || eof_)
+ return *this;
+
+ read = seq_;
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+
+ return *this;
+ }
+
+ void close() {
+ reader_.Close();
+ is_open_ = false;
+ eof_ = true;
+ }
+
+ void reset() {
+ close();
+ open();
+ }
+
+ ReadStreamStat get_stat() const { return ReadStreamStat(); }
+
+private:
+ BamTools::BamReader reader_;
+ BamTools::BamAlignment seq_;
+ std::string filename_;
+ bool is_open_;
+ bool eof_;
+
+ void open() {
+ reader_.Open(filename_);
+ is_open_ = true;
+
+ eof_ = (false == reader_.GetNextAlignment(seq_));
+ }
+
+};
+}
diff --git a/src/common/io/sam/read.cpp b/src/common/io/sam/read.cpp
new file mode 100644
index 0000000..de65d03
--- /dev/null
+++ b/src/common/io/sam/read.cpp
@@ -0,0 +1,42 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <io/sam/read.hpp>
+
+using namespace std;
+
+namespace sam_reader {
+
+string SingleSamRead::cigar() const {
+ uint32_t *cigar = bam1_cigar(data_);
+ string res;
+ res.reserve(data_->core.n_cigar);
+ for (size_t k = 0; k < data_->core.n_cigar; ++k) {
+ res += std::to_string(bam_cigar_oplen(cigar[k]));
+ res += bam_cigar_opchr(cigar[k]);
+
+ }
+ return res;
+}
+
+string SingleSamRead::name() const {
+ string res(bam1_qname(data_));
+ return res;
+}
+
+string SingleSamRead::seq() const {
+ string res = "";
+ auto b = bam1_seq(data_);
+ for (int k = 0; k < data_->core.l_qseq; ++k) {
+ res += bam_nt16_rev_table[bam1_seqi(b, k)];
+ }
+ return res;
+}
+
+
+}
+;
diff --git a/src/modules/io/sam_io/read.hpp b/src/common/io/sam/read.hpp
similarity index 100%
rename from src/modules/io/sam_io/read.hpp
rename to src/common/io/sam/read.hpp
diff --git a/src/common/io/sam/sam_reader.cpp b/src/common/io/sam/sam_reader.cpp
new file mode 100644
index 0000000..63a1cf8
--- /dev/null
+++ b/src/common/io/sam/sam_reader.cpp
@@ -0,0 +1,73 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <io/sam/read.hpp>
+#include <io/sam/sam_reader.hpp>
+
+namespace sam_reader {
+
+bool MappedSamStream::eof() const {
+ return eof_;
+}
+
+bool MappedSamStream::is_open() const {
+ return is_open_;
+}
+
+MappedSamStream& MappedSamStream::operator>>(SingleSamRead& read) {
+ if (!is_open_ || eof_)
+ return *this;
+ read.set_data(seq_);
+ int tmp = samread(reader_, seq_);
+ eof_ = (0 >= tmp);
+ return *this;
+}
+
+MappedSamStream& MappedSamStream::operator >>(PairedSamRead& read) {
+ TRACE("starting process paired read");
+ SingleSamRead r1;
+ MappedSamStream::operator >>(r1);
+ SingleSamRead r2;
+ MappedSamStream::operator >>(r2);
+
+ read = PairedSamRead(r1, r2);
+ TRACE(r1.seq());
+ TRACE(r2.seq());
+ TRACE(r1.name());
+ return *this;
+}
+
+const char* MappedSamStream::get_contig_name(int i) const {
+ VERIFY(i < reader_->header->n_targets);
+ return (reader_->header->target_name[i]);
+}
+
+void MappedSamStream::close() {
+ samclose(reader_);
+ is_open_ = false;
+ eof_ = true;
+ bam_destroy1(seq_);
+}
+
+void MappedSamStream::reset() {
+ close();
+ open();
+}
+
+void MappedSamStream::open() {
+ if ((reader_ = samopen(filename_.c_str(), "r", NULL)) == NULL) {
+        WARN("Failed to open SAM file " << filename_);
+ is_open_ = false;
+ eof_ = true;
+ } else {
+ is_open_ = true;
+ int tmp = samread(reader_, seq_);
+ eof_ = (0 >= tmp);
+ }
+}
+
+}
diff --git a/src/common/io/sam/sam_reader.hpp b/src/common/io/sam/sam_reader.hpp
new file mode 100644
index 0000000..e37df7c
--- /dev/null
+++ b/src/common/io/sam/sam_reader.hpp
@@ -0,0 +1,49 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "read.hpp"
+
+#include "utils/logger/log_writers.hpp"
+
+#include <samtools/sam.h>
+#include <samtools/bam.h>
+
+#include <string>
+
+namespace sam_reader {
+
+class MappedSamStream {
+public:
+ MappedSamStream(const std::string &filename)
+ : filename_(filename) {
+ open();
+ }
+
+ virtual ~MappedSamStream() {
+ }
+
+ bool is_open() const;
+ bool eof() const;
+ MappedSamStream& operator >>(SingleSamRead& read);
+ MappedSamStream& operator >>(PairedSamRead& read);
+ const char* get_contig_name(int i) const;
+ void close();
+ void reset();
+
+private:
+ samfile_t *reader_;
+ bam1_t *seq_ = bam_init1();
+ std::string filename_;
+ bool is_open_;
+ bool eof_;
+
+ void open();
+};
+
+}
+;
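A sketch of MappedSamStream in use; the SAM path is a placeholder, and SingleSamRead comes from the renamed read.hpp:

    sam_reader::MappedSamStream in("alignments.sam");
    sam_reader::SingleSamRead rec;
    while (!in.eof()) {
        in >> rec;
        // rec.name(), rec.seq() and rec.cigar() as implemented in read.cpp above
    }
    in.close();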
diff --git a/src/common/math/smooth.hpp b/src/common/math/smooth.hpp
new file mode 100644
index 0000000..be12a3e
--- /dev/null
+++ b/src/common/math/smooth.hpp
@@ -0,0 +1,189 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <cmath>
+#include <vector>
+
+namespace math {
+
+template<typename T>
+static T MedianOf3(T u, T v, T w) {
+ /* Median(u,v,w): */
+ if ((u <= v && v <= w) ||
+ (u >= v && v >= w))
+ return v;
+ if ((u <= w && w <= v) ||
+ (u >= w && w >= v))
+ return w;
+
+ /* else */ return u;
+}
+
+/* Return (Index-1) of median(u,v,w) , i.e.,
+-1 : u
+0 : v
+1 : w
+*/
+template<typename T>
+static int IndexOfMedianOf3(T u, T v, T w) {
+ if ((u <= v && v <= w) ||
+ (u >= v && v >= w))
+ return 0;
+ if ((u <= w && w <= v) ||
+ (u >= w && w >= v))
+ return 1;
+
+ /* else */ return -1;
+}
+
+enum class SmoothEndRule {
+ No,
+ Copy,
+ Tukey
+};
+
+template<typename T>
+static bool SmoothEndStep(const T *x, T *y, size_t n, SmoothEndRule end_rule) {
+ switch (end_rule) {
+ default:
+ case SmoothEndRule::No:
+ return false;
+ case SmoothEndRule::Copy:
+ y[0] = x[0];
+ y[n - 1] = x[n - 1];
+ return false;
+ case SmoothEndRule::Tukey: {
+ bool chg = false;
+ y[0] = MedianOf3(3 * y[1] - 2 * y[2], x[0], y[1]);
+ chg = chg || (y[0] != x[0]);
+ y[n - 1] = MedianOf3(y[n - 2], x[n - 1], 3 * y[n - 2] - 2 * y[n - 3]);
+ chg = chg || (y[n - 1] != x[n - 1]);
+ return chg;
+ }
+ }
+}
+
+template<typename T>
+static bool Smooth3(const T *x, T *y, size_t n, SmoothEndRule end_rule) {
+ // y[] := Running Median of three (x) = "3 (x[])" with "copy ends"
+ // --- return chg := ( y != x )
+ bool chg = false;
+
+ for (size_t i = 1; i < n - 1; i++) {
+ int j = IndexOfMedianOf3(x[i - 1], x[i], x[i + 1]);
+ y[i] = x[(int) i + j];
+ chg = chg || j;
+ }
+
+ chg |= SmoothEndStep(x, y, n, end_rule);
+
+ return chg;
+}
+
+template<typename T>
+static size_t Smooth3R(const T *x, T *y, T *z, size_t n, SmoothEndRule end_rule) {
+ // y[] := "3R"(x) ; 3R = Median of three, repeated until convergence
+ size_t iter;
+ bool chg;
+
+ iter = chg = Smooth3(x, y, n, SmoothEndRule::Copy);
+
+ while (chg) {
+ if ((chg = Smooth3(y, z, n, SmoothEndRule::No))) {
+ iter += 1;
+ for (size_t i = 1; i < n - 1; i++)
+ y[i] = z[i];
+ }
+ }
+
+ chg |= SmoothEndStep(x, y, n, end_rule);
+
+ return (iter ? iter : chg);
+ /* = 0 <==> only one "3" w/o any change
+ = 1 <==> either ["3" w/o change + endchange]
+ or [two "3"s, 2nd w/o change ] */
+}
+
+template<typename T>
+static bool SplitTest(const T *x, size_t i) {
+ // Split test:
+ // Are we at a /-\ or \_/ location => split should be made ?
+
+ if (x[i] != x[i + 1])
+ return false;
+
+ if ((x[i - 1] <= x[i] && x[i + 1] <= x[i + 2]) ||
+ (x[i - 1] >= x[i] && x[i + 1] >= x[i + 2]))
+ return false;
+
+ /* else */ return true;
+}
+
+template<typename T>
+static bool SmoothSplit3(const T *x, T *y, size_t n, bool do_ends) {
+ // y[] := S(x[]) where S() = "sm_split3"
+ bool chg = false;
+
+ for (size_t i = 0; i < n; i++)
+ y[i] = x[i];
+
+ if (do_ends && SplitTest(x, 1)) {
+ chg = true;
+ y[1] = x[0];
+ y[2] = MedianOf3(x[2], x[3], 3 * x[3] - 2 * x[4]);
+ }
+
+ for (size_t i = 2; i < n - 3; i++) {
+ if (SplitTest(x, i)) {
+ int j;
+ // plateau at x[i] == x[i+1]
+
+ // at left:
+ if (-1 < (j = IndexOfMedianOf3(x[i], x[i - 1], 3 * x[i - 1] - 2 * x[i - 2]))) {
+ y[i] = (j == 0 ? x[i - 1] : 3 * x[i - 1] - 2 * x[i - 2]);
+ chg = (y[i] != x[i]);
+ }
+
+ // at right:
+ if (-1 < (j = IndexOfMedianOf3(x[i + 1], x[i + 2], 3 * x[i + 2] - 2 * x[i + 3]))) {
+ y[i + 1] = (j == 0 ? x[i + 2] : 3 * x[i + 2] - 2 * x[i + 3]);
+ chg = (y[i + 1] != x[i + 1]);
+ }
+ }
+ }
+
+ if (do_ends && SplitTest(x, n - 3)) {
+ chg = true;
+ y[n - 2] = x[n - 1];
+ y[n - 3] = MedianOf3(x[n - 3], x[n - 4], 3 * x[n - 4] - 2 * x[n - 5]);
+ }
+
+ return chg;
+}
+
+template<typename T>
+size_t Smooth3RS3R(std::vector <T> &y, const std::vector <T> &x,
+ SmoothEndRule end_rule = SmoothEndRule::Tukey, bool split_ends = false) {
+ // y[1:n] := "3R S 3R"(x[1:n]); z = "work";
+ size_t iter;
+ bool chg;
+ size_t n = x.size();
+
+ y.resize(n);
+ std::vector <T> z(n), w(n);
+
+ iter = Smooth3R(&x[0], &y[0], &z[0], n, end_rule);
+ chg = SmoothSplit3(&y[0], &z[0], n, split_ends);
+ if (chg)
+ iter += Smooth3R(&z[0], &y[0], &w[0], n, end_rule);
+
+ /* else y == z already */
+ return (iter + chg);
+}
+
+}
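A small sketch of the 3RS3R smoother above on a numeric series; the data are made up:

    std::vector<double> noisy = {1, 1, 9, 1, 1, 2, 2, 7, 2, 2};
    std::vector<double> smoothed;
    size_t iterations = math::Smooth3RS3R(smoothed, noisy);  // Tukey end rule by default
    // isolated spikes (9 and 7) are replaced by running medians of three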
diff --git a/src/modules/math/xmath.h b/src/common/math/xmath.h
similarity index 100%
rename from src/modules/math/xmath.h
rename to src/common/math/xmath.h
diff --git a/src/common/modules/CMakeLists.txt b/src/common/modules/CMakeLists.txt
new file mode 100644
index 0000000..fbd848b
--- /dev/null
+++ b/src/common/modules/CMakeLists.txt
@@ -0,0 +1,13 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(modules CXX)
+
+add_library(modules STATIC
+ genome_consistance_checker.cpp alignment/bwa_index.cpp)
+target_link_libraries(modules bwa)
+
diff --git a/src/common/modules/alignment/bwa_index.cpp b/src/common/modules/alignment/bwa_index.cpp
new file mode 100644
index 0000000..9973477
--- /dev/null
+++ b/src/common/modules/alignment/bwa_index.cpp
@@ -0,0 +1,327 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "bwa_index.hpp"
+
+#include "bwa/bwa.h"
+#include "bwa/bwamem.h"
+#include "bwa/utils.h"
+#include "kseq/kseq.h"
+
+#include <string>
+#include <memory>
+
+// all of the bwa and kseq handling is kept local to this translation unit;
+// that was the easiest way to avoid clashes with the klib macros
+
+#define MEM_F_SOFTCLIP 0x200
+
+#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
+#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
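These macros pack the reference two bits per base, four bases per byte, with base 0 of each byte stored in the two most significant bits: for position l, byte l>>2 is addressed and ((~l&3)<<1) selects the bit pair, so _set_pac(pac, 0, 2) amounts to pac[0] |= 2 << 6.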
+extern "C" {
+int is_bwt(uint8_t *T, int n);
+};
+
+namespace alignment {
+
+BWAIndex::BWAIndex(const debruijn_graph::Graph& g)
+ : g_(g),
+ memopt_(mem_opt_init(), free),
+ idx_(nullptr, bwa_idx_destroy) {
+ memopt_->flag |= MEM_F_SOFTCLIP;
+ Init();
+}
+
+BWAIndex::~BWAIndex() {}
+
+// modified from bwa (heng li)
+static uint8_t* seqlib_add1(const kstring_t *seq, const kstring_t *name,
+ bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) {
+ bntann1_t *p;
+ int lasts;
+ if (bns->n_seqs == *m_seqs) {
+ *m_seqs <<= 1;
+ bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t));
+ }
+ p = bns->anns + bns->n_seqs;
+ p->name = strdup((char*)name->s);
+  p->anno = strdup("(null)");
+ p->gi = 0; p->len = seq->l;
+ p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
+ p->n_ambs = 0;
+ for (size_t i = lasts = 0; i < seq->l; ++i) {
+ int c = nst_nt4_table[(int)seq->s[i]];
+ if (c >= 4) { // N
+ if (lasts == seq->s[i]) { // contiguous N
+ ++(*q)->len;
+ } else {
+ if (bns->n_holes == *m_holes) {
+ (*m_holes) <<= 1;
+ bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t));
+ }
+ *q = bns->ambs + bns->n_holes;
+ (*q)->len = 1;
+ (*q)->offset = p->offset + i;
+ (*q)->amb = seq->s[i];
+ ++p->n_ambs;
+ ++bns->n_holes;
+ }
+ }
+ lasts = seq->s[i];
+ { // fill buffer
+ if (c >= 4) c = lrand48()&3;
+ if (bns->l_pac == *m_pac) { // double the pac size
+ *m_pac <<= 1;
+ pac = (uint8_t*)realloc(pac, *m_pac/4);
+ memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4);
+ }
+ _set_pac(pac, bns->l_pac, c);
+ ++bns->l_pac;
+ }
+ }
+ ++bns->n_seqs;
+
+ return pac;
+}
+
+static uint8_t* seqlib_make_pac(const debruijn_graph::Graph &g,
+ const std::vector<debruijn_graph::EdgeId> &ids,
+ bool for_only) {
+ bntseq_t * bns = (bntseq_t*)calloc(1, sizeof(bntseq_t));
+ uint8_t *pac = 0;
+ int32_t m_seqs, m_holes;
+ int64_t m_pac, l;
+ bntamb1_t *q;
+
+ bns->seed = 11; // fixed seed for random generator
+ m_seqs = m_holes = 8; m_pac = 0x10000;
+ bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t));
+ bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t));
+ pac = (uint8_t*) calloc(m_pac/4, 1);
+ q = bns->ambs;
+
+ // move through the sequences
+ // FIXME: not kstring is required
+ for (auto e : ids) {
+ std::string ref = std::to_string(g.int_id(e));
+ std::string seq = g.EdgeNucls(e).str();
+
+ // make the ref name kstring
+ kstring_t * name = (kstring_t*)malloc(1 * sizeof(kstring_t));
+ name->l = ref.length() + 1;
+ name->m = ref.length() + 3;
+ name->s = (char*)calloc(name->m, sizeof(char));
+ memcpy(name->s, ref.c_str(), ref.length()+1);
+
+ // make the sequence kstring
+ kstring_t * t = (kstring_t*)malloc(sizeof(kstring_t));
+ t->l = seq.length();
+ t->m = seq.length() + 2;
+ //t->s = (char*)calloc(v[k].Seq.length(), sizeof(char));
+ t->s = (char*)malloc(t->m);
+ memcpy(t->s, seq.c_str(), seq.length());
+
+ // make the forward only pac
+ pac = seqlib_add1(t, name, bns, pac, &m_pac, &m_seqs, &m_holes, &q);
+
+ // clear it out
+ free(name->s);
+ free(name);
+ free(t->s);
+ free(t);
+ }
+
+ if (!for_only) {
+ // add the reverse complemented sequence
+ m_pac = (bns->l_pac * 2 + 3) / 4 * 4;
+ pac = (uint8_t*)realloc(pac, m_pac/4);
+ memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4);
+ for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac)
+ _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l));
+ }
+
+ bns_destroy(bns);
+
+ return pac;
+}
+
+static bwt_t *seqlib_bwt_pac2bwt(const uint8_t *pac, size_t bwt_seq_lenr) {
+ bwt_t *bwt;
+ ubyte_t *buf;
+ int i;
+
+ // initialization
+ bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+ bwt->seq_len = bwt_seq_lenr; //bwa_seq_len(fn_pac); //dummy
+ bwt->bwt_size = (bwt->seq_len + 15) >> 4;
+
+ // prepare sequence
+ //pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
+ //buf2 = (ubyte_t*)calloc(pac_size, 1);
+ //err_fread_noeof(buf2, 1, pac_size, fp);
+ //err_fclose(fp);
+ memset(bwt->L2, 0, 5 * 4);
+ buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
+ for (i = 0; i < (int)bwt->seq_len; ++i) {
+ buf[i] = pac[i>>2] >> ((3 - (i&3)) << 1) & 3;
+ ++bwt->L2[1+buf[i]];
+ }
+ for (i = 2; i <= 4; ++i)
+ bwt->L2[i] += bwt->L2[i-1];
+ //free(buf2);
+
+ // Burrows-Wheeler Transform
+ bwt->primary = is_bwt(buf, bwt->seq_len);
+ bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
+ for (i = 0; i < (int)bwt->seq_len; ++i)
+ bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
+ free(buf);
+ return bwt;
+}
+
+static bntann1_t* seqlib_add_to_anns(const std::string& name, const std::string& seq, bntann1_t* ann, size_t offset) {
+ ann->offset = offset;
+ ann->name = (char*)malloc(name.length()+1); // +1 for \0
+ strncpy(ann->name, name.c_str(), name.length()+1);
+ ann->anno = (char*)malloc(7);
+ strcpy(ann->anno, "(null)\0");
+ ann->len = seq.length();
+ ann->n_ambs = 0; // number of "holes"
+ ann->gi = 0; // gi?
+ ann->is_alt = 0;
+
+ return ann;
+}
+
+void BWAIndex::Init() {
+ idx_.reset((bwaidx_t*)calloc(1, sizeof(bwaidx_t)));
+ ids_.clear();
+
+ for (auto it = g_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ ids_.push_back(*it);
+ }
+
+ // construct the forward-only pac
+ uint8_t* fwd_pac = seqlib_make_pac(g_, ids_, true); //true->for_only
+
+ // construct the forward-reverse pac ("packed" 2 bit sequence)
+ uint8_t* pac = seqlib_make_pac(g_, ids_, false); // don't write it out, because it is only used to make the BWT
+
+ size_t tlen = 0;
+ for (auto e : ids_)
+ tlen += g_.EdgeNucls(e).size();
+
+#ifdef DEBUG_BWATOOLS
+ std::cerr << "ref seq length: " << tlen << std::endl;
+#endif
+
+ // make the bwt
+ bwt_t *bwt;
+ bwt = seqlib_bwt_pac2bwt(pac, tlen*2); // *2 for fwd and rev
+ bwt_bwtupdate_core(bwt);
+ free(pac); // done with fwd-rev pac
+
+ // construct the SA from the BWT and occ arrays; adds it to the bwt struct
+ bwt_cal_sa(bwt, 32);
+ bwt_gen_cnt_table(bwt);
+
+ // make the bns
+ bntseq_t * bns = (bntseq_t*) calloc(1, sizeof(bntseq_t));
+ bns->l_pac = tlen;
+ bns->n_seqs = ids_.size();
+ bns->seed = 11;
+ bns->n_holes = 0;
+
+ // make the anns
+ // FIXME: Do we really need this?
+ bns->anns = (bntann1_t*)calloc(ids_.size(), sizeof(bntann1_t));
+ size_t offset = 0, k = 0;
+ for (auto e: ids_) {
+ std::string name = std::to_string(g_.int_id(e));
+ std::string seq = g_.EdgeNucls(e).str();
+ seqlib_add_to_anns(name, seq, &bns->anns[k++], offset);
+ offset += seq.length();
+ }
+
+ // ambs is "holes", like N bases
+ bns->ambs = 0; //(bntamb1_t*)calloc(1, sizeof(bntamb1_t));
+
+ // make the in-memory idx struct
+ idx_->bwt = bwt;
+ idx_->bns = bns;
+ idx_->pac = fwd_pac;
+}
+
+omnigraph::MappingPath<debruijn_graph::EdgeId> BWAIndex::AlignSequence(const Sequence &sequence) const {
+ omnigraph::MappingPath<debruijn_graph::EdgeId> res;
+
+ if (!idx_) return res;
+
+ std::string seq = sequence.str();
+ mem_alnreg_v ar = mem_align1(memopt_.get(), idx_->bwt, idx_->bns, idx_->pac,
+ seq.length(), seq.data());
+ for (size_t i = 0; i < ar.n; ++i) {
+ const mem_alnreg_t &a = ar.a[i];
+ if (a.secondary >= 0) continue; // skip secondary alignments
+// if (a.qe - a.qb < g_.k()) continue; // skip short alignments
+// if (a.re - a.rb < g_.k()) continue;
+ int is_rev = 0;
+ size_t pos = bns_depos(idx_->bns, a.rb < idx_->bns->l_pac? a.rb : a.re - 1, &is_rev) - idx_->bns->anns[a.rid].offset;
+/* fprintf(stderr, "%zu: [%lld, %lld]\t[%d, %d] %c %d %s %ld %zu\n",
+ i,
+ a.rb, a.re, a.qb, a.qe,
+ "+-"[is_rev], a.rid,
+ idx_->bns->anns[a.rid].name, g_.int_id(ids_[a.rid]), pos);
+*/
+ size_t initial_range_end = a.qe;
+ size_t mapping_range_end = pos + a.re - a.rb;
+ size_t read_length = seq.length();
+ // we have to reduce the ranges to k-mer-based coordinates
+ if (pos + (a.re - a.rb) >= g_.length(ids_[a.rid]) ){
+ if (a.qe > g_.k() + a.qb)
+ initial_range_end -= g_.k();
+ else continue;
+ if (a.re > g_.k() + a.rb)
+ mapping_range_end -= g_.k();
+ else continue;
+ if (read_length >= g_.k())
+ read_length -= g_.k();
+ else continue;
+ }
+ // FIXME: Check this!
+ if (!is_rev) {
+ res.push_back(ids_[a.rid],
+ { { (size_t)a.qb, initial_range_end },
+ { pos, mapping_range_end}});
+ } else {
+// fprintf (stderr,"%d %d %d\n", a.qb, a.qe - g_.k(), seq.length() - g_.k());
+
+// fprintf (stderr,"%d %d %d\n", pos, pos + a.re - a.rb , g_.length(ids_[a.rid]) );
+
+ res.push_back(g_.conjugate(ids_[a.rid]),
+ { omnigraph::Range(a.qb, initial_range_end).Invert(read_length),
+ omnigraph::Range(pos, mapping_range_end ).Invert(g_.length(ids_[a.rid])) });
+
+ }
+
+#if 0
+ mem_aln_t aln = mem_reg2aln(memopt_.get(), idx_->bns, idx_->pac, seq.length(), seq.c_str(), &a);
+
+ // print alignment
+ printf("\t%c\t%s\t%ld %ld %ld\t%d\t", "+-"[aln.is_rev], idx_->bns->anns[aln.rid].name, aln.rid, g_.int_id(ids_[aln.rid]), (long)aln.pos, aln.mapq);
+ for (int k = 0; k < aln.n_cigar; ++k) // print CIGAR
+ printf("%d%c", aln.cigar[k]>>4, "MIDSH"[aln.cigar[k]&0xf]);
+ printf("\t%d\n", aln.NM); // print edit distance
+ free(aln.cigar);
+#endif
+
+ }
+ free(ar.a);
+
+ return res;
+}
+
+}
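
A minimal usage sketch for the BWAIndex class added above. It assumes a
debruijn_graph::Graph built elsewhere in the pipeline, it only compiles inside the
SPAdes source tree, and the helper name is illustrative rather than part of this commit:

    #include "common/modules/alignment/bwa_index.hpp"

    // Align one read against the edges of an already-constructed assembly graph.
    omnigraph::MappingPath<debruijn_graph::EdgeId>
    MapWithBWA(const debruijn_graph::Graph &g, const Sequence &read) {
        alignment::BWAIndex index(g);       // builds the in-memory BWT/SA over all edges
        return index.AlignSequence(read);   // secondary alignments are skipped internally
    }
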
diff --git a/src/common/modules/alignment/bwa_index.hpp b/src/common/modules/alignment/bwa_index.hpp
new file mode 100644
index 0000000..8bc7037
--- /dev/null
+++ b/src/common/modules/alignment/bwa_index.hpp
@@ -0,0 +1,44 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/core/graph.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+
+extern "C" {
+struct bwaidx_s;
+typedef struct bwaidx_s bwaidx_t;
+
+struct mem_opt_s;
+typedef struct mem_opt_s mem_opt_t;
+};
+
+namespace alignment {
+
+class BWAIndex {
+ public:
+ // bwaidx_t / mem_opt_t are incomplete types here, therefore we need to
+ // define the ctor and dtor out of line.
+ BWAIndex(const debruijn_graph::Graph& g);
+ ~BWAIndex();
+
+ omnigraph::MappingPath<debruijn_graph::EdgeId> AlignSequence(const Sequence &sequence) const;
+ private:
+ void Init();
+
+ const debruijn_graph::Graph& g_;
+
+ // Store the options in memory
+ std::unique_ptr<mem_opt_t, void(*)(void*)> memopt_;
+
+ // hold the full index structure
+ std::unique_ptr<bwaidx_t, void(*)(bwaidx_t*)> idx_;
+
+ std::vector<debruijn_graph::EdgeId> ids_;
+};
+
+}
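
The two smart pointers above rely on function-pointer deleters so that bwaidx_t and
mem_opt_t can stay incomplete in this header, with the constructor and destructor
defined out of line in bwa_index.cpp. The same idiom, shown self-contained with a
standard C resource (the file name is arbitrary):

    #include <cstdio>
    #include <memory>

    int main() {
        // unique_ptr with a function-pointer deleter: the resource is released by a
        // named C function, so no C++ destructor of the pointee is ever needed.
        std::unique_ptr<std::FILE, int (*)(std::FILE *)> f(std::fopen("example.txt", "w"),
                                                           std::fclose);
        if (f)
            std::fputs("hello\n", f.get());
        return 0;   // std::fclose runs automatically when f goes out of scope
    }
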
diff --git a/src/common/modules/alignment/bwa_sequence_mapper.hpp b/src/common/modules/alignment/bwa_sequence_mapper.hpp
new file mode 100644
index 0000000..62a8542
--- /dev/null
+++ b/src/common/modules/alignment/bwa_sequence_mapper.hpp
@@ -0,0 +1,35 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "sequence_mapper.hpp"
+#include "bwa_index.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+
+namespace alignment {
+
+template<class Graph>
+class BWAReadMapper: public debruijn_graph::AbstractSequenceMapper<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ using debruijn_graph::AbstractSequenceMapper<Graph>::g_;
+public:
+ BWAReadMapper(const Graph& g)
+ : debruijn_graph::AbstractSequenceMapper<Graph>(g),
+ index_(g) {}
+
+ omnigraph::MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
+ return index_.AlignSequence(sequence);
+ }
+
+ ~BWAReadMapper() {
+ }
+
+ BWAIndex index_;
+};
+
+}
+
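
A hedged usage sketch for BWAReadMapper, which simply forwards MapSequence to the
BWAIndex it owns (graph construction and read I/O are assumed to happen elsewhere):

    #include "common/modules/alignment/bwa_sequence_mapper.hpp"

    #include <vector>

    void MapReads(const debruijn_graph::Graph &g, const std::vector<Sequence> &reads) {
        alignment::BWAReadMapper<debruijn_graph::Graph> mapper(g);
        for (const auto &r : reads) {
            auto path = mapper.MapSequence(r);   // omnigraph::MappingPath<EdgeId>
            // ... hand 'path' to a listener, statistics collector, etc.
        }
    }
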
diff --git a/src/common/modules/alignment/edge_index.hpp b/src/common/modules/alignment/edge_index.hpp
new file mode 100644
index 0000000..da84b58
--- /dev/null
+++ b/src/common/modules/alignment/edge_index.hpp
@@ -0,0 +1,103 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "common/assembly_graph/core/graph.hpp"
+#include "common/assembly_graph/core/action_handlers.hpp"
+#include "utils/indices/edge_info_updater.hpp"
+#include "edge_index_refiller.hpp"
+
+namespace debruijn_graph {
+
+/**
+ * EdgeIndex is a structure to store info about location of certain k-mers in graph. It delegates all
+ * container procedures to inner_index_ and all handling procedures to
+ * renewer_ which is DataHashRenewer.
+ */
+template<class Graph>
+class EdgeIndex: public omnigraph::GraphActionHandler<Graph> {
+
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ using InnerIndex = KmerFreeEdgeIndex<Graph, DefaultStoring>;
+ typedef Graph GraphT;
+ typedef typename InnerIndex::KMer KMer;
+ typedef typename InnerIndex::KMerIdx KMerIdx;
+ typedef typename InnerIndex::KmerPos Value;
+
+private:
+ InnerIndex inner_index_;
+ EdgeInfoUpdater<InnerIndex, Graph> updater_;
+ EdgeIndexRefiller refiller_;
+ bool delete_index_;
+
+public:
+ EdgeIndex(const Graph& g, const std::string &workdir)
+ : omnigraph::GraphActionHandler<Graph>(g, "EdgeIndex"),
+ inner_index_(g, workdir),
+ updater_(g, inner_index_),
+ delete_index_(true) {
+ }
+
+ virtual ~EdgeIndex() {
+ TRACE("~EdgeIndex OK")
+ }
+
+ InnerIndex &inner_index() {
+ return inner_index_;
+ }
+
+ size_t k() const {
+ return inner_index_.k();
+ }
+
+ const InnerIndex &inner_index() const {
+ VERIFY(this->IsAttached());
+ return inner_index_;
+ }
+
+ void HandleAdd(EdgeId e) override {
+ updater_.UpdateKmers(e);
+ }
+
+ void HandleDelete(EdgeId e) override {
+ updater_.DeleteKmers(e);
+ }
+
+ bool contains(const KMer& kmer) const {
+ VERIFY(this->IsAttached());
+ return inner_index_.contains(inner_index_.ConstructKWH(kmer));
+ }
+
+ const pair<EdgeId, size_t> get(const KMer& kmer) const {
+ VERIFY(this->IsAttached());
+ auto kwh = inner_index_.ConstructKWH(kmer);
+ if (!inner_index_.contains(kwh)) {
+ return make_pair(EdgeId(0), -1u);
+ } else {
+ EdgeInfo<EdgeId> entry = inner_index_.get_value(kwh);
+ return std::make_pair(entry.edge_id, (size_t)entry.offset);
+ }
+ }
+
+ void Refill() {
+ clear();
+ refiller_.Refill(inner_index_, this->g());
+ INFO("Index refilled");
+ }
+
+ void Update() {
+ updater_.UpdateAll();
+ }
+
+ void clear() {
+ inner_index_.clear();
+ }
+
+};
+}
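
A short sketch of the lookup interface above; it assumes an attached EdgeIndex that
has already been filled via Refill()/Update(), and the helper is illustrative only:

    #include <utility>

    template<class Graph>
    std::pair<typename Graph::EdgeId, size_t>
    LookupKmer(const debruijn_graph::EdgeIndex<Graph> &idx,
               const typename debruijn_graph::EdgeIndex<Graph>::KMer &kmer) {
        typedef typename Graph::EdgeId EdgeId;
        if (!idx.contains(kmer))                      // k-mer absent from every edge
            return std::make_pair(EdgeId(0), size_t(-1));
        return idx.get(kmer);                         // (edge holding the k-mer, offset on it)
    }
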
diff --git a/src/common/modules/alignment/edge_index_refiller.cpp b/src/common/modules/alignment/edge_index_refiller.cpp
new file mode 100644
index 0000000..c03c5ad
--- /dev/null
+++ b/src/common/modules/alignment/edge_index_refiller.cpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/indices/edge_index_builders.hpp"
+#include "utils/indices/edge_multi_index.hpp"
+#include "core/graph.hpp"
+
+#include "edge_index_refiller.hpp"
+
+namespace debruijn_graph {
+
+using EdgeIndex = KmerFreeEdgeIndex<ConjugateDeBruijnGraph>;
+
+template<>
+void EdgeIndexRefiller::Refill(EdgeIndex &index,
+ const ConjugateDeBruijnGraph &g) {
+ typedef typename EdgeIndexHelper<EdgeIndex>::GraphPositionFillingIndexBuilderT IndexBuilder;
+ IndexBuilder().BuildIndexFromGraph(index, g);
+}
+
+using PacIndex = DeBruijnEdgeMultiIndex<ConjugateDeBruijnGraph::EdgeId>;
+
+template<>
+void EdgeIndexRefiller::Refill(PacIndex &index,
+ const ConjugateDeBruijnGraph &g) {
+ typedef typename debruijn_graph::EdgeIndexHelper<PacIndex>::GraphPositionFillingIndexBuilderT Builder;
+ Builder().BuildIndexFromGraph(index, g);
+}
+
+}
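
The two explicit specializations above let one Refill entry point rebuild either the
plain edge index or the PacBio multi-index. A hedged call-site sketch (only the index
types specialized in this file are supported):

    template<class Index, class Graph>
    void RebuildIndex(Index &index, const Graph &g) {
        // Dispatches to the matching EdgeIndexRefiller::Refill specialization above.
        debruijn_graph::EdgeIndexRefiller().Refill(index, g);
    }
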
diff --git a/src/modules/assembly_graph/graph_alignment/edge_index_refiller.hpp b/src/common/modules/alignment/edge_index_refiller.hpp
similarity index 100%
rename from src/modules/assembly_graph/graph_alignment/edge_index_refiller.hpp
rename to src/common/modules/alignment/edge_index_refiller.hpp
diff --git a/src/common/modules/alignment/kmer_map.hpp b/src/common/modules/alignment/kmer_map.hpp
new file mode 100644
index 0000000..478461b
--- /dev/null
+++ b/src/common/modules/alignment/kmer_map.hpp
@@ -0,0 +1,151 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __KMER_MAP_HPP__
+#define __KMER_MAP_HPP__
+
+#include "sequence/rtseq.hpp"
+
+#include <htrie/hat-trie.h>
+#include <boost/iterator/iterator_facade.hpp>
+
+namespace debruijn_graph {
+class KMerMap {
+ typedef RtSeq Kmer;
+ typedef RtSeq Seq;
+ typedef typename Seq::DataType RawSeqData;
+
+ value_t* internal_tryget(const Kmer &key) const {
+ return hattrie_tryget(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ value_t* internal_get(const Kmer &key) const {
+ return hattrie_get(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ int internal_erase(const Kmer &key) {
+ return hattrie_del(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ class iterator : public boost::iterator_facade<iterator,
+ const std::pair<Kmer, Seq>,
+ std::forward_iterator_tag,
+ const std::pair<Kmer, Seq>> {
+ public:
+ iterator(unsigned k, hattrie_iter_t *start = nullptr)
+ : k_(k), iter_(start, [](hattrie_iter_t *p) { hattrie_iter_free(p); }) {}
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ hattrie_iter_next(iter_.get());
+ }
+
+ bool equal(const iterator &other) const {
+ // Special case: NULL and finished are equal
+ if (iter_.get() == nullptr || hattrie_iter_finished(iter_.get()))
+ return other.iter_.get() == nullptr || hattrie_iter_finished(other.iter_.get());
+
+ if (other.iter_.get() == nullptr)
+ return false;
+
+ return hattrie_iter_equal(iter_.get(), other.iter_.get());
+ }
+
+ const std::pair<Kmer, Seq> dereference() const {
+ size_t len;
+ Kmer k(k_, (const RawSeqData*)hattrie_iter_key(iter_.get(), &len));
+ Seq s(k_, (const RawSeqData*)(*hattrie_iter_val(iter_.get())));
+ return std::make_pair(k, s);
+ }
+
+ unsigned k_;
+ std::shared_ptr<hattrie_iter_t> iter_;
+ };
+
+ public:
+ KMerMap(unsigned k)
+ : k_(k), mapping_(hattrie_create()) {
+ rawcnt_ = (unsigned)Seq::GetDataSize(k_);
+ }
+
+ ~KMerMap() {
+ clear();
+ hattrie_free(mapping_);
+ }
+
+ void erase(const Kmer &key) {
+ value_t *vp = internal_tryget(key);
+ if (vp == nullptr)
+ return;
+
+ RawSeqData *value = reinterpret_cast<RawSeqData*>(*vp);
+ delete[] value;
+ int res = internal_erase(key);
+ VERIFY_MSG(res == 0, "Failed to delete from kmer mapper");
+ }
+
+ void set(const Kmer &key, const Seq &value) {
+ value_t *vp = internal_tryget(key);
+ RawSeqData *rawvalue = nullptr;
+ if (vp == nullptr) {
+ vp = internal_get(key);
+ rawvalue = new RawSeqData[rawcnt_];
+ *vp = reinterpret_cast<uintptr_t>(rawvalue);
+ } else {
+ rawvalue = reinterpret_cast<RawSeqData*>(*vp);
+ }
+
+ memcpy(rawvalue, value.data(), rawcnt_ * sizeof(RawSeqData));
+ }
+
+ bool count(const Kmer &key) const {
+ return internal_tryget(key) != nullptr;
+ }
+
+ const RawSeqData *find(const Kmer &key) const {
+ value_t *vp = internal_tryget(key);
+ if (vp == nullptr)
+ return nullptr;
+
+ return reinterpret_cast<const RawSeqData*>(*vp);
+ }
+
+ void clear() {
+ // Delete all the values
+ auto *iter = hattrie_iter_begin(mapping_, false);
+ while (!hattrie_iter_finished(iter)) {
+ RawSeqData *value = (RawSeqData*)(*hattrie_iter_val(iter));
+ delete[] value;
+ hattrie_iter_next(iter);
+ }
+ hattrie_iter_free(iter);
+ // Delete the mapping and all the keys
+ hattrie_clear(mapping_);
+ }
+
+ size_t size() const {
+ return hattrie_size(mapping_);
+ }
+
+ iterator begin() const {
+ return iterator(k_, hattrie_iter_begin(mapping_, false));
+ }
+
+ iterator end() const {
+ return iterator(k_);
+ }
+
+ private:
+ unsigned k_;
+ unsigned rawcnt_;
+ hattrie_t *mapping_;
+};
+
+}
+
+#endif // __KMER_MAP_HPP__
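
KMerMap stores one packed target sequence per k-mer inside a HAT-trie keyed by the raw
k-mer bytes. A minimal usage sketch; the k value and the two 21-nt sequences are made
up for illustration:

    #include "common/modules/alignment/kmer_map.hpp"
    #include "sequence/sequence.hpp"

    void KMerMapExample() {
        const unsigned k = 21;
        debruijn_graph::KMerMap kmer_map(k);

        Sequence from_seq("ACGTACGTACGTACGTACGTA");   // 21 nt, illustration only
        Sequence to_seq("CCGTACGTACGTACGTACGTA");
        RtSeq from = from_seq.start<RtSeq>(k);
        RtSeq to = to_seq.start<RtSeq>(k);

        kmer_map.set(from, to);                    // remember the substitution from -> to
        if (kmer_map.count(from)) {
            RtSeq stored(k, kmer_map.find(from));  // rebuild the stored k-mer from raw data
            // stored now equals 'to'
        }
        kmer_map.erase(from);
    }
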
diff --git a/src/common/modules/alignment/kmer_mapper.hpp b/src/common/modules/alignment/kmer_mapper.hpp
new file mode 100644
index 0000000..1f11d1f
--- /dev/null
+++ b/src/common/modules/alignment/kmer_mapper.hpp
@@ -0,0 +1,219 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "sequence/sequence_tools.hpp"
+#include "common/adt/kmer_vector.hpp"
+#include "edge_index.hpp"
+
+#include "kmer_map.hpp"
+
+#include <set>
+#include <cstdlib>
+
+namespace debruijn_graph {
+template<class Graph>
+class KmerMapper : public omnigraph::GraphActionHandler<Graph> {
+ typedef omnigraph::GraphActionHandler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef RtSeq Kmer;
+ typedef RtSeq Seq;
+ typedef typename Seq::DataType RawSeqData;
+
+ unsigned k_;
+ KMerMap mapping_;
+ bool verification_on_;
+ bool normalized_;
+
+ bool CheckAllDifferent(const Sequence &old_s, const Sequence &new_s) const {
+ std::set<Kmer> kmers;
+ Kmer kmer = old_s.start<Kmer>(k_) >> 0;
+ for (size_t i = k_ - 1; i < old_s.size(); ++i) {
+ kmer <<= old_s[i];
+ kmers.insert(kmer);
+ }
+ kmer = new_s.start<Kmer>(k_) >> 0;
+ for (size_t i = k_ - 1; i < new_s.size(); ++i) {
+ kmer <<= new_s[i];
+ kmers.insert(kmer);
+ }
+ return kmers.size() == old_s.size() - k_ + 1 + new_s.size() - k_ + 1;
+ }
+
+public:
+ KmerMapper(const Graph &g) :
+ base(g, "KmerMapper"),
+ k_(unsigned(g.k() + 1)),
+ mapping_(k_),
+ verification_on_(true),
+ normalized_(false) {
+ }
+
+ virtual ~KmerMapper() {}
+
+ auto begin() const -> decltype(mapping_.begin()) {
+ return mapping_.begin();
+ }
+
+ auto end() const -> decltype(mapping_.end()) {
+ return mapping_.end();
+ }
+
+ void Normalize() {
+ if (normalized_)
+ return;
+
+ KMerVector<Kmer> all(k_, size());
+ for (auto it = begin(); it != end(); ++it)
+ all.push_back(it->first);
+
+ for (auto it = all.begin(); it != all.end(); ++it) {
+ Seq val(k_, it.data());
+ Normalize(val);
+ }
+ normalized_ = true;
+ }
+
+ unsigned k() const {
+ return k_;
+ }
+
+// void Revert(const Kmer &kmer) {
+// Kmer old_value = Substitute(kmer);
+// if (old_value != kmer) {
+// mapping_.erase(kmer);
+// mapping_.set(old_value, kmer);
+// normalized_ = false;
+// }
+// }
+
+ void Normalize(const Kmer &kmer) {
+ mapping_.set(kmer, Substitute(kmer));
+ }
+
+ void RemapKmers(const Sequence &old_s, const Sequence &new_s) {
+ VERIFY(this->IsAttached());
+ size_t old_length = old_s.size() - k_ + 1;
+ size_t new_length = new_s.size() - k_ + 1;
+ UniformPositionAligner aligner(old_length, new_length);
+ Kmer old_kmer = old_s.start<Kmer>(k_) >> 'A';
+ typename Kmer::less2 kmer_less;
+ for (size_t i = k_ - 1; i < old_s.size(); ++i) {
+ old_kmer <<= old_s[i];
+
+ // Check whether we already have info for this k-mer
+ if (mapping_.count(old_kmer))
+ continue;
+
+ size_t old_kmer_offset = i - k_ + 1;
+ size_t new_kmer_offset = aligner.GetPosition(old_kmer_offset);
+ if (old_kmer_offset * 2 + 1 == old_length && new_length % 2 == 0) {
+ Kmer middle(k_-1, new_s, new_length / 2);
+ if (kmer_less(middle, !middle)) {
+ new_kmer_offset = new_length - 1 - new_kmer_offset;
+ }
+ }
+ Kmer new_kmer(k_, new_s, new_kmer_offset);
+ if (old_kmer == new_kmer)
+ continue;
+
+ if (mapping_.count(new_kmer)) {
+ // Special case of remapping back.
+ // Not sure that we actually need it
+ if (Substitute(new_kmer) == old_kmer)
+ mapping_.erase(new_kmer);
+ else
+ continue;
+ }
+
+ mapping_.set(old_kmer, new_kmer);
+ normalized_ = false;
+ }
+ }
+
+ void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) override {
+ VERIFY(this->g().EdgeNucls(new_edge) == this->g().EdgeNucls(edge2));
+ RemapKmers(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2));
+ }
+
+ Kmer Substitute(const Kmer &kmer) const {
+ VERIFY(this->IsAttached());
+ Kmer answer = kmer;
+ const auto *rawval = mapping_.find(answer);
+ while (rawval != nullptr) {
+ Seq val(k_, rawval);
+ if (verification_on_)
+ VERIFY(answer != val);
+
+ answer = val;
+ rawval = mapping_.find(answer);
+ }
+ return answer;
+ }
+
+ bool CanSubstitute(const Kmer &kmer) const {
+ const auto *rawval = mapping_.find(kmer);
+ return rawval != nullptr;
+ }
+
+ void BinWrite(std::ostream &file) const {
+ uint32_t sz = (uint32_t)size();
+ file.write((const char *) &sz, sizeof(uint32_t));
+
+ for (auto iter = begin(); iter != end(); ++iter) {
+ Kmer::BinWrite(file, iter->first);
+ Kmer::BinWrite(file, iter->second);
+ }
+ }
+
+ void BinRead(std::istream &file) {
+ clear();
+
+ uint32_t size;
+ file.read((char *) &size, sizeof(uint32_t));
+ for (uint32_t i = 0; i < size; ++i) {
+ Kmer key(k_);
+ Seq value(k_);
+ Kmer::BinRead(file, &key);
+ Seq::BinRead(file, &value);
+ mapping_.set(key, value);
+ }
+ normalized_ = false;
+ }
+
+ bool CompareTo(KmerMapper<Graph> const &m) {
+ if (size() != m.size()) {
+ INFO("Unequal sizes");
+ return false;
+ }
+
+ for (auto iter = begin(); iter != end(); ++iter) {
+ auto cmp = m.mapping_.find(iter.first());
+ if (cmp == m.mapping_.end() || cmp.second() != iter.second()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void clear() {
+ normalized_ = false;
+ return mapping_.clear();
+ }
+
+ size_t size() const {
+ return mapping_.size();
+ }
+
+ // turn_on == true means turning off all verification checks
+ void SetUnsafeMode(bool turn_on) {
+ verification_on_ = !turn_on;
+ }
+};
+
+}
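
KmerMapper records (k+1)-mer substitutions produced during graph simplification
(HandleGlue/RemapKmers) and resolves whole substitution chains in Substitute(). A
hedged lookup sketch; the helper name is illustrative:

    // mapper is an attached KmerMapper<Graph>; kmer has length mapper.k() == g.k() + 1.
    template<class Graph>
    RtSeq ResolveKmer(const debruijn_graph::KmerMapper<Graph> &mapper, const RtSeq &kmer) {
        if (!mapper.CanSubstitute(kmer))
            return kmer;                 // this k-mer was never remapped
        return mapper.Substitute(kmer);  // follows the whole substitution chain
    }
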
diff --git a/src/common/modules/alignment/kmer_mapper_logger.hpp b/src/common/modules/alignment/kmer_mapper_logger.hpp
new file mode 100644
index 0000000..3643030
--- /dev/null
+++ b/src/common/modules/alignment/kmer_mapper_logger.hpp
@@ -0,0 +1,45 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * kmer_mapper_logger.hpp
+ *
+ * Created on: Nov 27, 2012
+ * Author: alex
+ */
+
+#ifndef KMER_MAPPER_LOGGER_H_
+#define KMER_MAPPER_LOGGER_H_
+
+#include "sequence/sequence.hpp"
+#include "common/assembly_graph/core/action_handlers.hpp"
+#include "utils/standard_base.hpp"
+
+namespace debruijn {
+
+template<class Graph>
+class KmerMapperLogger : public omnigraph::GraphActionHandler<Graph> {
+public:
+ typedef pair<Sequence, Sequence> MappedSeq;
+ typedef typename Graph::EdgeId EdgeId;
+
+ KmerMapperLogger(Graph& graph) : GraphActionHandler<Graph>(graph, "KmerMapperLogger") {}
+ virtual ~KmerMapperLogger() {}
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+ log_.push_back(MappedSeq(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2)));
+ }
+
+ const vector<MappedSeq>& log() const {
+ return log_;
+ }
+
+ vector<MappedSeq> log_;
+};
+
+} /* namespace debruijn */
+#endif /* KMER_MAPPER_LOGGER_H_ */
diff --git a/src/common/modules/alignment/long_read_mapper.hpp b/src/common/modules/alignment/long_read_mapper.hpp
new file mode 100644
index 0000000..66dbf03
--- /dev/null
+++ b/src/common/modules/alignment/long_read_mapper.hpp
@@ -0,0 +1,172 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef LONG_READ_MAPPER_HPP_
+#define LONG_READ_MAPPER_HPP_
+
+#include "long_read_storage.hpp"
+#include "sequence_mapper_notifier.hpp"
+
+namespace debruijn_graph {
+
+class LongReadMapper: public SequenceMapperListener {
+public:
+ typedef vector<vector<EdgeId>> PathsT;
+ typedef MappingPath<EdgeId> MappingT;
+ typedef std::function<PathsT (const MappingT&)> PathExtractionF;
+
+ LongReadMapper(const Graph& g,
+ PathStorage<Graph>& storage,
+ PathExtractionF path_extractor)
+ : g_(g),
+ storage_(storage),
+ path_extractor_(path_extractor) {
+ }
+
+ void StartProcessLibrary(size_t threads_count) override {
+ for (size_t i = 0; i < threads_count; ++i)
+ buffer_storages_.emplace_back(g_);
+ }
+
+ void StopProcessLibrary() override {
+ buffer_storages_.clear();
+ }
+
+ void MergeBuffer(size_t thread_index) override {
+ DEBUG("Merge buffer " << thread_index << " with size " << buffer_storages_[thread_index].size());
+ storage_.AddStorage(buffer_storages_[thread_index]);
+ buffer_storages_[thread_index].Clear();
+ DEBUG("Now size " << storage_.size());
+ }
+
+ void ProcessSingleRead(size_t thread_index,
+ const io::SingleRead&,
+ const MappingPath<EdgeId>& read) override {
+ ProcessSingleRead(thread_index, read);
+ }
+
+ void ProcessSingleRead(size_t thread_index,
+ const io::SingleReadSeq&,
+ const MappingPath<EdgeId>& read) override {
+ ProcessSingleRead(thread_index, read);
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+private:
+ void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& mapping) {
+ DEBUG("Processing read");
+ for (const auto& path : path_extractor_(mapping)) {
+ buffer_storages_[thread_index].AddPath(path, 1, false);
+ }
+ DEBUG("Read processed");
+ }
+
+ const Graph& g_;
+ PathStorage<Graph>& storage_;
+ std::vector<PathStorage<Graph>> buffer_storages_;
+ PathExtractionF path_extractor_;
+ DECL_LOGGER("LongReadMapper");
+};
+
+class GappedPathExtractor {
+ const Graph& g_;
+ const MappingPathFixer<Graph> path_fixer_;
+ const double MIN_MAPPED_RATIO = 0.3;
+ const size_t MIN_MAPPED_LENGTH = 100;
+public:
+ GappedPathExtractor(const Graph& g): g_(g), path_fixer_(g) {
+ }
+
+ vector<vector<EdgeId>> operator() (const MappingPath<EdgeId>& mapping) const {
+ vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
+ mapping.simple_path());
+ corrected_path = FilterBadMappings(corrected_path, mapping);
+ return FindReadPathWithGaps(mapping, corrected_path);
+ }
+
+private:
+
+ size_t CountMappedEdgeSize(EdgeId edge, const MappingPath<EdgeId>& mapping_path, size_t& mapping_index) const {
+ while(mapping_path[mapping_index].first != edge) {
+ mapping_index++;
+ }
+ size_t start_idx = mapping_index;
+
+ while(mapping_path[mapping_index].first == edge) {
+ mapping_index++;
+ if(mapping_index >= mapping_path.size()) {
+ break;
+ }
+ }
+ size_t end_idx = mapping_index;
+ size_t total_len = 0;
+ for(size_t i = start_idx; i < end_idx; ++i) {
+ total_len += mapping_path[i].second.initial_range.size();
+ }
+
+ return total_len;
+ }
+
+ vector<EdgeId> FilterBadMappings(const vector<EdgeId>& corrected_path, const MappingPath<EdgeId>& mapping_path) const {
+ vector<EdgeId> new_corrected_path;
+ size_t mapping_index = 0;
+ for (auto edge : corrected_path) {
+ size_t mapping_size = CountMappedEdgeSize(edge, mapping_path, mapping_index);
+ size_t edge_len = g_.length(edge);
+ //VERIFY(edge_len >= mapping_size);
+ if (mapping_size > MIN_MAPPED_LENGTH ||
+ math::gr((double) mapping_size / (double) edge_len, MIN_MAPPED_RATIO)) {
+ new_corrected_path.push_back(edge);
+ }
+ }
+ return new_corrected_path;
+ }
+
+ vector<vector<EdgeId>> FindReadPathWithGaps(const MappingPath<EdgeId>& mapping_path, vector<EdgeId>& corrected_path) const {
+ if (mapping_path.size() == 0) {
+ TRACE("read unmapped");
+ return vector<vector<EdgeId>>();
+ }
+ vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
+ return SplitUnfixedPoints(fixed_path);
+ }
+
+ vector<vector<EdgeId>> SplitUnfixedPoints(vector<EdgeId>& path) const {
+ vector<vector<EdgeId>> result;
+ size_t prev_start = 0;
+ for (size_t i = 1; i < path.size(); ++i) {
+ if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i])) {
+ result.push_back(vector<EdgeId>(path.begin() + prev_start, path.begin() + i));
+ prev_start = i;
+ }
+ }
+ result.push_back(vector<EdgeId>(path.begin() + prev_start, path.end()));
+ return result;
+ }
+};
+
+typedef std::function<vector<vector<EdgeId>> (const MappingPath<EdgeId>&)> PathExtractionF;
+
+inline PathExtractionF ChooseProperReadPathExtractor(const Graph& g, io::LibraryType lib_type) {
+ if (lib_type == io::LibraryType::PathExtendContigs || lib_type == io::LibraryType::TSLReads
+ || lib_type == io::LibraryType::TrustedContigs || lib_type == io::LibraryType::UntrustedContigs) {
+ return [&] (const MappingPath<EdgeId>& mapping) {
+ return GappedPathExtractor(g)(mapping);
+ };
+ } else {
+ return [&] (const MappingPath<EdgeId>& mapping) {
+ return vector<vector<EdgeId>>{ReadPathFinder<Graph>(g).FindReadPath(mapping)};
+ };
+ }
+}
+
+}/*longreads*/
+
+#endif /* LONG_READ_MAPPER_HPP_ */
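
ChooseProperReadPathExtractor picks the gap-tolerant GappedPathExtractor for
contig-like libraries and the strict ReadPathFinder otherwise. A hedged wiring sketch
that drives a LongReadMapper by hand for precomputed mappings; single-threaded use of
buffer 0 and a default-constructible io::SingleReadSeq are assumptions:

    #include "common/modules/alignment/long_read_mapper.hpp"

    #include <vector>

    void CollectLongReadPaths(const debruijn_graph::Graph &g,
                              debruijn_graph::PathStorage<debruijn_graph::Graph> &storage,
                              io::LibraryType lib_type,
                              const std::vector<omnigraph::MappingPath<debruijn_graph::EdgeId>> &mappings) {
        debruijn_graph::LongReadMapper mapper(
            g, storage, debruijn_graph::ChooseProperReadPathExtractor(g, lib_type));
        mapper.StartProcessLibrary(1);                           // one buffer, one thread
        for (const auto &mp : mappings)
            mapper.ProcessSingleRead(0, io::SingleReadSeq(), mp);
        mapper.MergeBuffer(0);                                   // flush buffer 0 into 'storage'
        mapper.StopProcessLibrary();
    }
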
diff --git a/src/common/modules/alignment/long_read_storage.hpp b/src/common/modules/alignment/long_read_storage.hpp
new file mode 100644
index 0000000..2eeaee0
--- /dev/null
+++ b/src/common/modules/alignment/long_read_storage.hpp
@@ -0,0 +1,354 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * long_edge_storage.hpp
+ *
+ * Created on: Feb 7, 2013
+ * Author: lab42
+ */
+
+#pragma once
+
+#include <algorithm>
+
+namespace debruijn_graph {
+
+template<class Graph>
+class PathInfo {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ vector<EdgeId> path;
+
+private:
+ mutable size_t w;
+
+public:
+ const vector<EdgeId>& getPath() const {
+ return path;
+ }
+
+ size_t getWeight() const {
+ return w;
+ }
+
+ void increaseWeight(int addition = 1) const {
+ w += addition;
+ }
+
+ bool operator<(const PathInfo<Graph> &other) const {
+ return path < other.path;
+ }
+
+ PathInfo(const vector<EdgeId> &p, size_t weight = 0) :
+ path(p), w(weight) {
+ }
+
+ PathInfo(const PathInfo<Graph> &other) {
+ path = other.path;
+ w = other.w;
+ }
+
+ string str(const Graph &g_) const {
+ stringstream s;
+ for(auto iter = path.begin(); iter != path.end(); iter ++ ){
+ s << g_.int_id(*iter) << " ";
+ }
+ return s.str();
+ }
+
+};
+
+template<class Graph>
+class PathStorage {
+ friend class PathInfo<Graph> ;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef map<EdgeId, set<PathInfo<Graph> > > InnerIndex;
+
+ const Graph &g_;
+ InnerIndex inner_index_;
+ static const size_t kLongEdgeForStats = 500;
+
+ void HiddenAddPath(const vector<EdgeId> &p, int w){
+ if (p.size() == 0 ) return;
+ for (typename set<PathInfo<Graph> >::iterator iter = inner_index_[p[0]].begin(); iter != inner_index_[p[0]].end(); ++iter) {
+
+ if (iter->path == p) {
+ iter->increaseWeight(w);
+ return;
+ }
+ }
+ inner_index_[p[0]].insert(PathInfo<Graph>(p, w));
+ size_++;
+ }
+
+public:
+
+ PathStorage(const Graph &g)
+ : g_(g),
+ inner_index_(),
+ size_(0) {
+ }
+
+ PathStorage(const PathStorage & p)
+ : g_(p.g_),
+ inner_index_(),
+ size_(0) {
+ for (auto iter = p.inner_index_.begin(); iter != p.inner_index_.end();
+ iter++) {
+ for (auto j_iter = iter->second.begin();
+ j_iter != iter->second.end(); j_iter++) {
+ this->AddPath(j_iter->path, (int) j_iter->getWeight());
+ }
+ }
+ }
+
+ void ReplaceEdges(map<EdgeId, EdgeId> &old_to_new){
+ map<int, EdgeId> tmp_map;
+// for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter ){
+// tmp_map[g_.int_id(*iter)] = *iter;
+// }
+ InnerIndex new_index;
+ for (auto iter = inner_index_.begin(); iter != inner_index_.end(); iter++) {
+ auto tmp = iter->second;
+ EdgeId new_first;
+ if (old_to_new.find(iter->first) == old_to_new.end())
+ new_first = iter->first;
+ else {
+ DEBUG("new first edge: "<< g_.int_id(old_to_new[iter->first]) << " with " << tmp.size() << " edges ");
+ new_first = old_to_new[iter->first];
+ }
+ set<PathInfo<Graph> > new_tmp;
+ for (auto j_iter = tmp.begin(); j_iter != tmp.end(); j_iter++) {
+ PathInfo<Graph> pi = *(j_iter);
+ for (size_t k = 0; k < pi.path.size(); k++)
+ if (old_to_new.find(pi.path[k]) != old_to_new.end()) {
+// INFO(g_.int_id(old_to_new[pi.path[k]]));
+ pi.path[k] = old_to_new[pi.path[k]];
+ }
+ DEBUG(pi.str(g_));
+ new_tmp.insert(pi);
+
+ }
+ if (new_first != iter->first) {
+ TRACE("and new_tmp.size: " << new_tmp.size());
+ }
+ if (new_index.find(new_first) == new_index.end()) {
+ new_index[new_first] = new_tmp;
+ } else {
+ for (auto j_iter = new_tmp.begin(); j_iter != new_tmp.end(); j_iter++) {
+ new_index[new_first].insert(*j_iter);
+ }
+ }
+
+ }
+
+ inner_index_ = new_index;
+ }
+
+ void AddPath(const vector<EdgeId> &p, int w, bool add_rc = false) {
+ HiddenAddPath(p, w);
+ if (add_rc) {
+ vector<EdgeId> rc_p(p.size());
+ for (size_t i = 0; i < p.size(); i++)
+ rc_p[i] = g_.conjugate(p[p.size() - 1 - i]);
+ HiddenAddPath(rc_p, w);
+ }
+ }
+
+ void DumpToFile(const string& filename) const{
+ map<EdgeId, EdgeId> auxiliary;
+ DumpToFile(filename, auxiliary);
+ }
+
+ void DumpToFile(const string& filename, const map<EdgeId, EdgeId>& replacement,
+ size_t stats_weight_cutoff = 1, bool need_log = false) const {
+ ofstream filestr(filename);
+ set<EdgeId> continued_edges;
+
+ for(auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter){
+ filestr<< iter->second.size() << endl;
+ int non1 = 0;
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
+ filestr << " Weight: " << j_iter->getWeight();
+ if (j_iter->getWeight() > stats_weight_cutoff)
+ non1++;
+
+ filestr << " length: " << j_iter->path.size() << " ";
+ for (auto p_iter = j_iter->path.begin(); p_iter != j_iter->path.end(); ++p_iter) {
+ if (p_iter != j_iter->path.end() - 1 && j_iter->getWeight() > stats_weight_cutoff) {
+ continued_edges.insert(*p_iter);
+ }
+
+ filestr << g_.int_id(*p_iter) << "(" << g_.length(*p_iter) << ") ";
+ }
+ filestr << endl;
+ }
+ filestr << endl;
+ }
+
+ int noncontinued = 0;
+ int long_gapped = 0;
+ int continued = 0;
+ if (need_log) {
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ EdgeId e = *iter;
+ if (g_.length(e) > kLongEdgeForStats) {
+ if (!g_.IsDeadEnd(g_.EdgeEnd(e))) {
+ if (continued_edges.find(e) == continued_edges.end()) {
+ auto replacement_it = replacement.find(e);
+ if (replacement_it != replacement.end() &&
+ continued_edges.find(replacement_it->second) != continued_edges.end()) {
+ TRACE("found in replacement, edges " << g_.int_id(e) << " " <<
+ g_.int_id(replacement_it->second) << " skipping ");
+ continue;
+ }
+ TRACE("noncontinued end left " << g_.int_id(e));
+ noncontinued++;
+ } else
+ continued++;
+ } else {
+ TRACE("dead end left " << g_.int_id(e));
+ long_gapped++;
+ }
+ }
+ }
+ INFO("After PacBio (long reads) alignment, for edges longer than " << kLongEdgeForStats << ":");
+ INFO("No continuation found for " << noncontinued + long_gapped << " edges of " <<
+ noncontinued + continued + long_gapped);
+ }
+ }
+
+ void SaveAllPaths(vector<PathInfo<Graph>> &res) const {
+ for (auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter) {
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
+ res.push_back(*j_iter);
+ }
+ }
+ }
+
+ void LoadFromFile(const string s, bool force_exists = true) {
+ FILE* file = fopen(s.c_str(), "r");
+ if (force_exists) {
+ VERIFY(file != NULL);
+ } else if (file == NULL) {
+ INFO("Long reads not found, skipping");
+ return;
+ }
+ fclose(file);
+
+ INFO("Loading long reads alignment...");
+ ifstream filestr(s);
+ INFO("loading from " << s);
+ map<size_t, EdgeId> tmp_map;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ tmp_map[g_.int_id(*iter)] = *iter;
+ }
+ int fl;
+
+ file = fopen((s).c_str(), "r");
+ char ss[14];
+ while (!feof(file)) {
+ int n;
+
+ fl = fscanf(file, "%d\n", &n);
+ if (fl != 1)
+ break;
+ TRACE(n);
+ for (int i = 0; i < n; i++) {
+
+ int w = -1, l = -1;
+ fl = fscanf(file, "Weight: %d length: %d", &w, &l);
+ TRACE(w << " " << l);
+ VERIFY(fl == 2);
+ vector<EdgeId> p;
+ for (int j = 0; j < l; j++) {
+ size_t e;
+ int x;
+ fl = fscanf(file, "%zu(%d)", &e, &x);
+ VERIFY(fl == 2);
+ VERIFY(tmp_map.find(e) != tmp_map.end());
+ p.push_back(tmp_map[e]);
+ }
+ fl = fscanf(file, "%[^\n]\n", ss);
+ TRACE(ss[0]);
+ AddPath(p, w);
+ }
+ }
+ fclose(file);
+ INFO("Loading finished.");
+ }
+
+ void AddStorage(PathStorage<Graph> & to_add) {
+
+ for(auto iter = to_add.inner_index_.begin(); iter != to_add.inner_index_.end(); iter++) {
+ for(auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter ++) {
+ this->AddPath(j_iter->path, (int) j_iter->getWeight());
+ }
+ }
+ }
+
+ void Clear() {
+ inner_index_.clear();
+ size_ = 0;
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+// typename InnerIndex::iterator begin() const {
+// return inner_index.begin();
+// }
+//
+// typename InnerIndex::iterator end() const {
+// return inner_index.end();
+// }
+// typename InnerIndex::iterator operator*(){
+// return this->first;
+// }
+private:
+ size_t size_;
+};
+
+template<class Graph>
+class LongReadContainer {
+ Graph& g_;
+ vector<PathStorage<Graph>> data_;
+
+public:
+
+ LongReadContainer(Graph& g, size_t count = 0): g_(g) {
+ for (size_t i = 0; i < count; ++i) {
+ data_.emplace_back(g_);
+ }
+ }
+
+ PathStorage<Graph>& operator[](size_t index) {
+ return data_[index];
+ }
+
+ const PathStorage<Graph>& operator[](size_t index) const {
+ return data_[index];
+ }
+
+ size_t size() const {
+ return data_.size();
+ }
+
+ void Clear() {
+ for (auto& storage : data_) {
+ storage.Clear();
+ }
+ }
+
+};
+
+
+}
+
+
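
PathStorage keeps weighted edge paths keyed by their first edge; AddPath can also
insert the reverse-complement path, and DumpToFile writes a human-readable summary.
A small usage sketch (the output file name is illustrative):

    #include <string>
    #include <vector>

    void StoreObservedPath(debruijn_graph::PathStorage<debruijn_graph::Graph> &storage,
                           const std::vector<debruijn_graph::EdgeId> &path) {
        storage.AddPath(path, /*weight*/ 1, /*add_rc*/ true);   // also record the conjugate path
        storage.DumpToFile("long_read_paths.txt");
    }
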
diff --git a/src/common/modules/alignment/pacbio/pac_index.hpp b/src/common/modules/alignment/pacbio/pac_index.hpp
new file mode 100644
index 0000000..ff779ab
--- /dev/null
+++ b/src/common/modules/alignment/pacbio/pac_index.hpp
@@ -0,0 +1,916 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/indices/edge_multi_index.hpp"
+#include "common/modules/alignment/edge_index_refiller.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+// FIXME: Layering violation, get rid of this
+#include "pipeline/config_struct.hpp"
+#include "pacbio_read_structures.hpp"
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+
+#include <algorithm>
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
+
+namespace pacbio {
+enum {
+ UNDEF_COLOR = -1,
+ DELETED_COLOR = - 2
+};
+
+struct OneReadMapping {
+ vector<vector<debruijn_graph::EdgeId>> main_storage;
+ vector<GapDescription> gaps;
+ vector<size_t> real_length;
+ // Total number of used seeds, summed over all subreads.
+ size_t seed_num;
+ OneReadMapping(const vector<vector<debruijn_graph::EdgeId>>& main_storage_,
+ const vector<GapDescription>& gaps_,
+ const vector<size_t>& real_length_,
+ size_t seed_num_) :
+ main_storage(main_storage_), gaps(gaps_), real_length(real_length_), seed_num(seed_num_) {
+ }
+
+};
+
+template<class Graph>
+class PacBioMappingIndex {
+public:
+ typedef map<typename Graph::EdgeId, vector<MappingInstance> > MappingDescription;
+ typedef pair<typename Graph::EdgeId, vector<MappingInstance> > ClusterDescription;
+ typedef set<KmerCluster<Graph> > ClustersSet;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> Index;
+ typedef typename Index::KeyWithHash KeyWithHash;
+
+private:
+ DECL_LOGGER("PacIndex")
+
+ const Graph &g_;
+ size_t pacbio_k;
+ size_t debruijn_k;
+ const static int short_edge_cutoff = 0;
+ const static size_t min_cluster_size = 8;
+ const static int max_similarity_distance = 500;
+
+// Debug stats
+ int good_follow = 0;
+ int half_bad_follow = 0;
+ int bad_follow = 0;
+
+ set<Sequence> banned_kmers;
+ debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> tmp_index;
+ mutable map<pair<VertexId, VertexId>, size_t> distance_cashed;
+ size_t read_count;
+ bool ignore_map_to_middle;
+ debruijn_graph::config::debruijn_config::pacbio_processor pb_config_;
+public:
+ MappingDescription Locate(const Sequence &s) const;
+
+ PacBioMappingIndex(const Graph &g, size_t k, size_t debruijn_k_, bool ignore_map_to_middle, string out_dir, debruijn_graph::config::debruijn_config::pacbio_processor pb_config )
+ : g_(g),
+ pacbio_k(k),
+ debruijn_k(debruijn_k_),
+ tmp_index((unsigned) pacbio_k, out_dir), ignore_map_to_middle(ignore_map_to_middle), pb_config_(pb_config) {
+ DEBUG("PB Mapping Index construction started");
+ debruijn_graph::EdgeIndexRefiller().Refill(tmp_index, g_);
+ INFO("Index constructed");
+ FillBannedKmers();
+ read_count = 0;
+ }
+ ~PacBioMappingIndex(){
+ DEBUG("good/ugly/bad counts:" << good_follow << " "<<half_bad_follow << " " << bad_follow);
+ }
+
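+ // FillBannedKmers enumerates every k-mer that differs from a homopolymer run
+ // (AAA...A, CCC...C, ...) in at most one position and stores it in banned_kmers,
+ // so that such low-complexity seeds can be excluded later.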
+ void FillBannedKmers() {
+ for (int i = 0; i < 4; i++) {
+ auto base = nucl((unsigned char) i);
+ for (int j = 0; j < 4; j++) {
+ auto other = nucl((unsigned char) j);
+ for (size_t other_pos = 0; other_pos < pacbio_k; other_pos++) {
+ string s = "";
+ for (size_t k = 0; k < pacbio_k; k++) {
+ if (k != other_pos)
+ s += base;
+ else
+ s += other;
+ }
+ banned_kmers.insert(Sequence(s));
+ }
+ }
+ }
+ }
+
+ bool similar(const MappingInstance &a, const MappingInstance &b,
+ int shift = 0) const {
+ if (b.read_position + shift < a.read_position) {
+ return similar(b, a, -shift);
+ } else if (b.read_position == a.read_position) {
+ return (abs(int(b.edge_position) + shift - int(a.edge_position)) < 2);
+ } else {
+ return ((b.edge_position + shift - a.edge_position >= (b.read_position - a.read_position) * pb_config_.compression_cutoff) &&
+ ((b.edge_position + shift - a.edge_position) * pb_config_.compression_cutoff <= (b.read_position - a.read_position)));
+ }
+ }
+
+
+ bool similar_in_graph(const MappingInstance &a, const MappingInstance &b,
+ int shift = 0) const {
+ if (b.read_position + shift < a.read_position) {
+ return similar_in_graph(b, a, -shift);
+ } else if (b.read_position == a.read_position) {
+ return (abs(int(b.edge_position) + shift - int(a.edge_position)) < 2);
+ } else {
+ return ((b.edge_position + shift - a.edge_position) * pb_config_.compression_cutoff <= (b.read_position - a.read_position));
+ }
+ }
+
+
+ void dfs_cluster(vector<int> &used, vector<MappingInstance> &to_add,
+ const int cur_ind,
+ const typename MappingDescription::iterator iter) const {
+ size_t len = iter->second.size();
+ for (size_t k = 0; k < len; k++) {
+ if (!used[k] && similar(iter->second[cur_ind], iter->second[k])) {
+ to_add.push_back(iter->second[k]);
+ used[k] = 1;
+ dfs_cluster(used, to_add, (int) k, iter);
+ }
+ }
+ }
+
+ void dfs_cluster_norec(vector<int> &used, vector<MappingInstance> &to_add,
+ const size_t cur_ind,
+ const typename MappingDescription::iterator iter, vector<vector<size_t> > &similarity_list) const {
+ std::deque<size_t> stack;
+ stack.push_back(cur_ind);
+ used[cur_ind] = 1;
+ while (stack.size() > 0) {
+ size_t k = stack.back();
+ stack.pop_back();
+ to_add.push_back(iter->second[k]);
+
+ for (size_t i = 0; i < similarity_list[k].size(); i++) {
+ if (!used[similarity_list[k][i]]) {
+ stack.push_back(similarity_list[k][i]);
+ used[similarity_list[k][i]] = 1;
+ }
+ }
+ }
+ }
+
+ ClustersSet GetOrderClusters(const Sequence &s) const {
+ MappingDescription descr = Locate(s);
+ ClustersSet res;
+ TRACE(read_count << " read_count");
+
+ DEBUG(descr.size() <<" clusters");
+ for (auto iter = descr.begin(); iter != descr.end(); ++iter) {
+ size_t edge_id = g_.int_id(iter->first);
+ DEBUG(edge_id);
+ sort(iter->second.begin(), iter->second.end(), ReadPositionComparator());
+ set<vector<MappingInstance> > edge_cluster_set;
+ size_t len = iter->second.size();
+ vector<vector<size_t> > similarity_list(len);
+ int cnt = 0;
+ for (size_t i = 0; i < len; i++){
+ for (size_t j = i + 1; j < len; j++){
+ if (iter->second[i].read_position + max_similarity_distance < iter->second[j].read_position) {
+ break;
+ }
+ if (similar(iter->second[i], iter->second[j])) {
+ similarity_list[i].push_back(j);
+ cnt ++;
+ if (cnt % 10000 == 0) {
+ DEBUG(cnt);
+ }
+ }
+ }
+ }
+
+ DEBUG(len <<" kmers in cluster");
+ vector<int> used(len);
+ for (size_t i = 0; i < len; i++) {
+ if (!used[i]) {
+ vector<size_t> new_cluster(len);
+ vector<size_t> prev(len);
+ for(size_t j = i; j < len; j++) {
+ if (!used[j]) {
+ if (new_cluster[j] == 0) new_cluster[j] = 1, prev[j] = size_t(-1);
+ for(size_t k = 0; k < similarity_list[j].size(); k++) {
+ size_t next_ind = similarity_list[j][k];
+ if (!used[next_ind]) {
+ if (new_cluster[next_ind] < new_cluster[j] + 1){
+ new_cluster[next_ind] = new_cluster[j] + 1;
+ prev[next_ind] = j;
+ }
+ }
+ }
+ }
+ }
+ size_t maxx = 0;
+ size_t maxj = i;
+ for(size_t j = i; j < len; j++) {
+ if (new_cluster[j] > maxx) maxj = j, maxx = new_cluster[j];
+ }
+ vector<MappingInstance> to_add;
+ size_t real_maxj = maxj, first_j = maxj;
+ while (maxj != size_t(-1)) {
+ to_add.push_back(iter->second[maxj]);
+ first_j = maxj;
+ maxj = prev[maxj];
+ }
+ for (auto j = first_j; j < real_maxj; j++)
+ used[j] = 1;
+ reverse(to_add.begin(), to_add.end());
+ TRACE("adding cluster, edge " << edge_id << " len " << to_add.size());
+ res.insert(KmerCluster<Graph>(iter->first, to_add));
+ }
+ }
+ }
+ FilterClusters(res);
+ return res;
+ }
+ // Filter out clusters that are too small, fully located on a vertex, or dominated by some other cluster.
+ void FilterClusters(ClustersSet &clusters) const {
+ for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
+ size_t edge_id = g_.int_id(i_iter->edgeId);
+
+ int len = (int) g_.length(i_iter->edgeId);
+ auto sorted_by_edge = i_iter->sorted_positions;
+ sort(sorted_by_edge.begin(), sorted_by_edge.end());
+ double good = 0;
+ DEBUG("filtering cluster of size " << sorted_by_edge.size());
+ DEBUG(edge_id <<" : edgeId");
+ for (auto iter = sorted_by_edge.begin();
+ iter < sorted_by_edge.end(); iter++) {
+ if (iter->IsUnique())
+ good++;
+ //good += 1.0 / (iter->quality * iter->quality);
+ }
+ DEBUG("good " << good);
+
+ if (good < min_cluster_size || (len < short_edge_cutoff)) {
+ if (len < short_edge_cutoff) {
+ DEBUG("Life is too long, and edge is too short!");
+ }
+ auto tmp_iter = i_iter;
+ tmp_iter++;
+ clusters.erase(i_iter);
+ i_iter = tmp_iter;
+ } else {
+ if (sorted_by_edge[0].edge_position >= len
+ || sorted_by_edge[i_iter->size - 1].edge_position
+ <= int(debruijn_k) - int(pacbio_k)) {
+ DEBUG("All anchors in vertex");
+ auto tmp_iter = i_iter;
+ tmp_iter++;
+ clusters.erase(i_iter);
+ i_iter = tmp_iter;
+ } else {
+ i_iter++;
+ }
+ }
+ }
+ for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
+ size_t edge_id = g_.int_id(i_iter->edgeId);
+ auto sorted_by_edge = i_iter->sorted_positions;
+
+ DEBUG("filtering with cluster edge, stage 2 "<< edge_id << " len " << sorted_by_edge.size() << " clusters still alive: "<< clusters.size());
+ for (auto j_iter = clusters.begin(); j_iter != clusters.end();) {
+ if (i_iter != j_iter) {
+ if (dominates(*i_iter, *j_iter)) {
+ TRACE("cluster is dominated");
+ auto tmp_iter = j_iter;
+ tmp_iter++;
+ TRACE("cluster on edge " << g_.int_id(j_iter->edgeId));
+ TRACE("erased - dominated");
+ clusters.erase(j_iter);
+ j_iter = tmp_iter;
+ } else {
+ j_iter++;
+ }
+ } else {
+ j_iter++;
+ }
+ }
+ DEBUG("cluster size " << i_iter->sorted_positions.size() << " survived filtering");
+ i_iter++;
+ }
+ }
+
+ // is "non-strict domination" required here?
+ bool dominates(const KmerCluster<Graph> &a,
+ const KmerCluster<Graph> &b) const {
+ size_t a_size = a.size;
+ size_t b_size = b.size;
+ if ((double) a_size < (double) b_size * pb_config_.domination_cutoff
+ || a.sorted_positions[a.first_trustable_index].read_position
+ > b.sorted_positions[b.first_trustable_index].read_position
+ || a.sorted_positions[a.last_trustable_index].read_position
+ < b.sorted_positions[b.last_trustable_index].read_position) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ vector<vector<EdgeId>> FillGapsInCluster(const vector<pair<size_t, typename ClustersSet::iterator> > &cur_cluster,
+ const Sequence &s) const {
+ vector<EdgeId> cur_sorted;
+ vector<vector<EdgeId>> res;
+ EdgeId prev_edge = EdgeId(0);
+
+ for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
+ ++iter) {
+ EdgeId cur_edge = iter->second->edgeId;
+ if (prev_edge != EdgeId(0)) {
+//Need to find sequence of edges between clusters
+ VertexId start_v = g_.EdgeEnd(prev_edge);
+ VertexId end_v = g_.EdgeStart(cur_edge);
+ auto prev_iter = iter - 1;
+ MappingInstance cur_first_index =
+ iter->second->sorted_positions[iter->second
+ ->first_trustable_index];
+ MappingInstance prev_last_index = prev_iter->second
+ ->sorted_positions[prev_iter->second
+ ->last_trustable_index];
+
+ if (start_v != end_v ||
+ (start_v == end_v &&
+ (double) (cur_first_index.read_position - prev_last_index.read_position) >
+ (double) (cur_first_index.edge_position + (int) g_.length(prev_edge) - prev_last_index.edge_position) * 1.3)) {
+ DEBUG(" traversing tangled region between " << g_.int_id(prev_edge) << " " << g_.int_id(cur_edge));
+ DEBUG(" first pair" << cur_first_index.str() << " edge_len" << g_.length(cur_edge));
+ DEBUG(" last pair" << prev_last_index.str() << " edge_len" << g_.length(prev_edge));
+ string s_add = "";
+ string e_add = "";
+ int seq_end = cur_first_index.read_position;
+ int seq_start = prev_last_index.read_position;
+ string tmp = g_.EdgeNucls(prev_edge).str();
+ s_add = tmp.substr(prev_last_index.edge_position,
+ g_.length(prev_edge) - prev_last_index.edge_position);
+ tmp = g_.EdgeNucls(cur_edge).str();
+ e_add = tmp.substr(0, cur_first_index.edge_position);
+ pair<int, int> limits = GetPathLimits(*(prev_iter->second),
+ *(iter->second),
+ (int) s_add.length(),
+ (int) e_add.length());
+ if (limits.first == -1) {
+ res.push_back(cur_sorted);
+ cur_sorted.clear();
+ prev_edge = EdgeId(0);
+ continue;
+ }
+
+ vector<EdgeId> intermediate_path = BestScoredPath(s, start_v, end_v, limits.first, limits.second, seq_start, seq_end, s_add, e_add);
+ if (intermediate_path.size() == 0) {
+ DEBUG("Tangled region between edges " << g_.int_id(prev_edge) << " " << g_.int_id(cur_edge) << " is not closed, additions from edges: " << int(g_.length(prev_edge)) - int(prev_last_index.edge_position) << " " << int(cur_first_index.edge_position) - int(debruijn_k - pacbio_k) << " and seq " << -seq_start + seq_end);
+ if (pb_config_.additional_debug_info) {
+ DEBUG(" expected gap length: " << -int(g_.length(prev_edge)) + int(prev_last_index.edge_position) - int(cur_first_index.edge_position) + int(debruijn_k - pacbio_k) - seq_start + seq_end);
+ omnigraph::PathStorageCallback<Graph> callback(g_);
+ ProcessPaths(g_, 0, 4000,
+ start_v, end_v,
+ callback);
+ vector<vector<EdgeId> > paths = callback.paths();
+ stringstream s_buf;
+ for (auto p_iter = paths.begin();
+ p_iter != paths.end(); p_iter++) {
+ size_t tlen = 0;
+ for (auto path_iter = p_iter->begin();
+ path_iter != p_iter->end();
+ path_iter++) {
+ tlen += g_.length(*path_iter);
+ }
+ s_buf << tlen << " ";
+ }
+ DEBUG(s_buf.str());
+ }
+ res.push_back(cur_sorted);
+ cur_sorted.clear();
+ prev_edge = EdgeId(0);
+ continue;
+ }
+ for (auto j_iter = intermediate_path.begin(); j_iter != intermediate_path.end(); j_iter++) {
+ cur_sorted.push_back(*j_iter);
+ }
+ }
+ }
+ cur_sorted.push_back(cur_edge);
+ prev_edge = cur_edge;
+ }
+ if (cur_sorted.size() > 0)
+ res.push_back(cur_sorted);
+ return res;
+ }
+
+ bool TopologyGap(EdgeId first, EdgeId second, bool oriented) const {
+ omnigraph::TerminalVertexCondition<Graph> condition(g_);
+ bool res = condition.Check(g_.EdgeEnd(first)) && condition.Check(g_.EdgeStart(second));
+ if (!oriented)
+ res |= condition.Check(g_.EdgeStart(first)) && condition.Check(g_.EdgeEnd(second));
+ return res;
+ }
+
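+ // GetWeightedColors assigns a color (subread id) to every cluster: using the
+ // pairwise IsConsistent table it repeatedly finds, by dynamic programming, the
+ // chain of mutually consistent clusters with the largest total size, colors that
+ // chain, and marks the uncolored clusters inside its span as DELETED_COLOR.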
+ vector<int> GetWeightedColors(const ClustersSet &mapping_descr) const {
+ int len = (int) mapping_descr.size();
+ DEBUG("getting colors, table size "<< len);
+ vector<vector<int> > cons_table(len);
+
+ vector<int> colors(len);
+ vector<int> cluster_size(len);
+ vector<int> max_size(len);
+ vector<int> prev(len);
+
+ for (int i = 0; i < len; i++) {
+ cons_table[i].resize(len);
+ cons_table[i][i] = 0;
+ prev[i] = -1;
+ }
+ int i = 0;
+
+ for (int i = 0; i < len; i++) {
+// -1: not initialized, -2: removed as trash
+ colors[i] = UNDEF_COLOR;
+ }
+ for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
+ ++i_iter, ++i) {
+ cluster_size[i] = i_iter->size;
+ }
+ i = 0;
+ if (len > 1) {
+ TRACE(len << " clusters");
+ }
+
+ for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
+ ++i_iter, ++i) {
+ int j = i;
+ for (auto j_iter = i_iter;
+ j_iter != mapping_descr.end(); ++j_iter, ++j) {
+ if (i_iter == j_iter)
+ continue;
+ cons_table[i][j] = IsConsistent(*i_iter, *j_iter);
+ }
+ }
+ i = 0;
+ int cur_color = 0;
+
+ while (true) {
+ for (i = 0; i < len; i++) {
+ max_size[i] = 0;
+ prev[i] = -1;
+ }
+ i = 0;
+ for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
+ ++i_iter, ++i) {
+ if (colors[i] != UNDEF_COLOR) continue;
+ max_size[i] = cluster_size[i];
+ for (int j = 0; j < i; j ++) {
+ if (colors[j] != -1) continue;
+ if (cons_table[j][i] && max_size[i] < cluster_size[i] + max_size[j]) {
+ max_size[i] = max_size[j] + cluster_size[i];
+ prev[i] = j;
+ }
+ }
+ }
+ int maxx = 0;
+ int maxi = -1;
+ for (int j = 0; j < len; j++) {
+ if (max_size[j] > maxx) {
+ maxx = max_size[j];
+ maxi = j;
+ }
+ }
+ if (maxi == -1) {
+ break;
+ }
+ cur_color = maxi;
+ colors[maxi] = cur_color;
+ int real_maxi = maxi, min_i = maxi;
+
+ while (prev[maxi] != -1) {
+ min_i = maxi;
+ maxi = prev[maxi];
+ colors[maxi] = cur_color;
+ }
+ while (real_maxi >= min_i) {
+ if (colors[real_maxi] == UNDEF_COLOR) {
+ colors[real_maxi] = DELETED_COLOR;
+ }
+ real_maxi --;
+ }
+ }
+ return colors;
+ }
+
+
+ GapDescription CreateGapDescription(const KmerCluster<debruijn_graph::Graph>& a,
+ const KmerCluster<debruijn_graph::Graph>& b,
+ const Sequence& read) const {
+ size_t seq_start = a.sorted_positions[a.last_trustable_index].read_position + pacbio_k;
+ size_t seq_end = b.sorted_positions[b.first_trustable_index].read_position;
+ if (seq_start > seq_end) {
+ DEBUG("Overlapping flanks not supported yet");
+ return GapDescription();
+ }
+ return GapDescription(a.edgeId,
+ b.edgeId,
+ read.Subseq(seq_start, seq_end),
+ a.sorted_positions[a.last_trustable_index].edge_position + pacbio_k - debruijn_k,
+ b.sorted_positions[b.first_trustable_index].edge_position);
+ }
+
+
+ OneReadMapping GetReadAlignment(Sequence &s) const {
+ ClustersSet mapping_descr = GetOrderClusters(s);
+ DEBUG("clusters got");
+ int len = (int) mapping_descr.size();
+ vector<size_t> real_length;
+
+ vector<int> colors = GetWeightedColors(mapping_descr);
+ vector<vector<EdgeId> > sortedEdges;
+ vector<bool> block_gap_closer;
+ vector<typename ClustersSet::iterator> start_clusters, end_clusters;
+ vector<GapDescription> illumina_gaps;
+ vector<int> used(len);
+ size_t used_seed_count = 0;
+ auto iter = mapping_descr.begin();
+ for (int i = 0; i < len; i++, iter ++) {
+ used[i] = 0;
+ DEBUG(colors[i] <<" " << iter->str(g_));
+ }
+ for (int i = 0; i < len; i++) {
+ if (!used[i]) {
+ DEBUG("starting new subread");
+ size_t cur_seed_count = 0;
+ vector<pair<size_t, typename ClustersSet::iterator> > cur_cluster;
+ used[i] = 1;
+ int j = 0;
+ int cur_color = colors[i];
+ if (cur_color == DELETED_COLOR)
+ continue;
+ for (auto i_iter = mapping_descr.begin();
+ i_iter != mapping_descr.end(); ++i_iter, ++j) {
+ if (colors[j] == cur_color) {
+ cur_cluster.push_back(
+ make_pair(
+ i_iter->average_read_position,
+ i_iter));
+ used[j] = 1;
+ cur_seed_count += i_iter->sorted_positions.size();
+ }
+ }
+ sort(cur_cluster.begin(), cur_cluster.end(),
+ pair_iterator_less<typename ClustersSet::iterator>());
+ VERIFY(cur_cluster.size() > 0);
+ //if (cur_seed_count > used_seed_count)
+ used_seed_count += cur_seed_count;
+ auto cur_cluster_start = cur_cluster.begin();
+ for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
+ ++iter) {
+ auto next_iter = iter + 1;
+ if (next_iter == cur_cluster.end()
+ || !IsConsistent(*(iter->second),
+ *(next_iter->second))) {
+ if (next_iter != cur_cluster.end()) {
+ DEBUG("clusters splitted:");
+ DEBUG("on "<< iter->second->str(g_));
+ DEBUG("and " << next_iter->second->str(g_));
+ }
+ vector<pair<size_t, typename ClustersSet::iterator> > splitted_cluster(
+ cur_cluster_start, next_iter);
+ auto res = FillGapsInCluster(
+ splitted_cluster, s);
+ for (auto &cur_sorted:res) {
+ DEBUG("Adding " <<res.size() << " subreads, cur alignments " << cur_sorted.size());
+ if (cur_sorted.size() > 0) {
+ for(EdgeId eee: cur_sorted) {
+ DEBUG (g_.int_id(eee));
+ }
+ start_clusters.push_back(cur_cluster_start->second);
+ end_clusters.push_back(iter->second);
+ sortedEdges.push_back(cur_sorted);
+ //Blocking gap closing inside clusters;
+ block_gap_closer.push_back(true);
+ }
+ }
+ if (block_gap_closer.size() > 0)
+ block_gap_closer[block_gap_closer.size() - 1] = false;
+ cur_cluster_start = next_iter;
+ } else {
+ DEBUG("connected consecutive clusters:");
+ DEBUG("on "<< iter->second->str(g_));
+ DEBUG("and " << next_iter->second->str(g_));
+ }
+ }
+ }
+ }
+ DEBUG("adding gaps between subreads");
+
+ for (size_t i = 0; i + 1 < sortedEdges.size() ; i++) {
+ if (block_gap_closer[i])
+ continue;
+ size_t j = i + 1;
+ EdgeId before_gap = sortedEdges[i][sortedEdges[i].size() - 1];
+ EdgeId after_gap = sortedEdges[j][0];
+//do not add "gap" for rc-jumping
+ if (before_gap != after_gap
+ && before_gap != g_.conjugate(after_gap)) {
+ if (i != j && TopologyGap(before_gap, after_gap, true)) {
+ if (start_clusters[j]->CanFollow(*end_clusters[i])) {
+ auto gap = CreateGapDescription(*end_clusters[i],
+ *start_clusters[j],
+ s);
+ if (gap != GapDescription()) {
+ illumina_gaps.push_back(gap);
+ DEBUG("adding gap between alignments number " << i<< " and " << j);
+ }
+ }
+
+ }
+ }
+
+ }
+ return OneReadMapping(sortedEdges, illumina_gaps, real_length, used_seed_count);
+ }
+
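+ // Converts the read-space distance between two clusters into bounds on the allowed
+ // graph path length, using the path_limit_pressing/stretching factors from the pacbio
+ // config, and then subtracts the flanks already covered by the clusters themselves
+ // (s_add_len / e_add_len).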
+ std::pair<int, int> GetPathLimits(const KmerCluster<Graph> &a,
+ const KmerCluster<Graph> &b,
+ int s_add_len, int e_add_len) const {
+ int start_pos = a.sorted_positions[a.last_trustable_index].read_position;
+ int end_pos = b.sorted_positions[b.first_trustable_index].read_position;
+ int seq_len = -start_pos + end_pos;
+ //int new_seq_len =
+//TODO: something more reasonable
+ int path_min_len = max(int(floor((seq_len - int(debruijn_k)) * pb_config_.path_limit_pressing)), 0);
+ int path_max_len = (int) ((double) (seq_len + (int) debruijn_k) * pb_config_.path_limit_stretching);
+ if (seq_len < 0) {
+ DEBUG("suspicious negative seq_len " << start_pos << " " << end_pos << " " << path_min_len << " " << path_max_len);
+ return std::make_pair(-1, -1);
+ }
+ path_min_len = max(path_min_len - int(s_add_len + e_add_len), 0);
+ path_max_len = max(path_max_len - int(s_add_len + e_add_len), 0);
+ return std::make_pair(path_min_len, path_max_len);
+ }
+
+//0 - No, 1 - Yes
+ int IsConsistent(const KmerCluster<Graph> &a,
+ const KmerCluster<Graph> &b) const {
+ EdgeId a_edge = a.edgeId;
+ EdgeId b_edge = b.edgeId;
+ size_t a_id = g_.int_id(a_edge);
+ size_t b_id = g_.int_id(b_edge);
+ DEBUG("clusters on " << a_id << " and " << b_id );
+ if (a.sorted_positions[a.last_trustable_index].read_position + (int) pb_config_.max_path_in_dijkstra <
+ b.sorted_positions[b.first_trustable_index].read_position) {
+ DEBUG ("Clusters are too far in read");
+ return 0;
+ }
+ VertexId start_v = g_.EdgeEnd(a_edge);
+ size_t addition = g_.length(a_edge);
+ VertexId end_v = g_.EdgeStart(b_edge);
+ pair<VertexId, VertexId> vertex_pair = make_pair(start_v, end_v);
+
+ size_t result = size_t(-1);
+ bool not_found = true;
+ auto distance_it = distance_cashed.begin();
+#pragma omp critical(pac_index)
+ {
+ distance_it = distance_cashed.find(vertex_pair);
+ not_found = (distance_it == distance_cashed.end());
+ }
+ if (not_found) {
+//TODO: constants
+ omnigraph::DijkstraHelper<debruijn_graph::Graph>::BoundedDijkstra dijkstra(
+ omnigraph::DijkstraHelper<debruijn_graph::Graph>::CreateBoundedDijkstra(g_, pb_config_.max_path_in_dijkstra, pb_config_.max_vertex_in_dijkstra));
+ dijkstra.Run(start_v);
+ if (dijkstra.DistanceCounted(end_v)) {
+ result = dijkstra.GetDistance(end_v);
+ }
+#pragma omp critical(pac_index)
+ {
+ distance_it = distance_cashed.insert({vertex_pair, result}).first;
+ }
+ } else {
+ DEBUG("taking from cashed");
+ }
+
+
+ result = distance_it->second;
+ DEBUG (result);
+ if (result == size_t(-1)) {
+ return 0;
+ }
+ //TODO: Serious optimization possible
+
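+ // Positions are compared pairwise, but only up to ~500 entries from each end of either
+ // cluster, to keep this quadratic check bounded. Two clusters are declared consistent as
+ // soon as one pair of positions agrees with the graph distance (cached Dijkstra result
+ // plus the length of a's edge).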
+ for (auto a_iter = a.sorted_positions.begin();
+ a_iter != a.sorted_positions.end(); ++a_iter) {
+ if (a_iter - a.sorted_positions.begin() > 500 && a.sorted_positions.end() - a_iter >500) continue;
+ int cnt = 0;
+ for (auto b_iter = b.sorted_positions.begin();
+ b_iter != b.sorted_positions.end() && cnt <500; ++b_iter, cnt ++) {
+ if (similar_in_graph(*a_iter, *b_iter,
+ (int) (result + addition))) {
+ return 1;
+ }
+ }
+ cnt = 0;
+ if (b.sorted_positions.size() > 500) {
+ for (auto b_iter = b.sorted_positions.end() - 1;
+ b_iter != b.sorted_positions.begin() && cnt < 500; --b_iter, cnt ++) {
+ if (similar_in_graph(*a_iter, *b_iter,
+ (int) (result + addition))) {
+ return 1;
+ }
+ }
+ }
+ }
+
+ return 0;
+
+ }
+
+ string PathToString(const vector<EdgeId>& path) const {
+ string res = "";
+ for (auto iter = path.begin(); iter != path.end(); iter++) {
+ size_t len = g_.length(*iter);
+ string tmp = g_.EdgeNucls(*iter).First(len).str();
+ res = res + tmp;
+ }
+ return res;
+ }
+
+ vector<EdgeId> BestScoredPath(const Sequence &s, VertexId start_v, VertexId end_v,
+ int path_min_length, int path_max_length,
+ int start_pos, int end_pos, string &s_add,
+ string &e_add) const {
+ DEBUG(" Traversing tangled region. Start and end vertices resp: " << g_.int_id(start_v) <<" " << g_.int_id(end_v));
+ omnigraph::PathStorageCallback<Graph> callback(g_);
+ ProcessPaths(g_,
+ path_min_length, path_max_length,
+ start_v, end_v,
+ callback);
+ vector<vector<EdgeId> > paths = callback.paths();
+ DEBUG("taking subseq" << start_pos <<" "<< end_pos <<" " << s.size());
+ int s_len = int(s.size());
+ string seq_string = s.Subseq(start_pos, min(end_pos + 1, s_len)).str();
+ size_t best_path_ind = paths.size();
+ size_t best_score = 1000000000;
+ DEBUG("need to find best scored path between "<<paths.size()<<" , seq_len " << seq_string.length());
+ if (paths.size() == 0) {
+ DEBUG ("no paths");
+ return vector<EdgeId>(0);
+ }
+ if (seq_string.length() > pb_config_.max_contigs_gap_length) {
+ DEBUG("Gap is too large");
+ return vector<EdgeId>(0);
+ }
+ for (size_t i = 0; i < paths.size(); i++) {
+ string cur_string = s_add + PathToString(paths[i]) + e_add;
+ DEBUG("cur_string: " << cur_string <<"\n seq_string " << seq_string);
+ if (paths.size() > 1 && paths.size() < 10) {
+ TRACE("candidate path number "<< i << " , len " << cur_string.length());
+ TRACE("graph candidate: " << cur_string);
+ TRACE("in pacbio read: " << seq_string);
+ for (auto j_iter = paths[i].begin(); j_iter != paths[i].end();
+ ++j_iter) {
+ DEBUG(g_.int_id(*j_iter));
+ }
+ }
+ size_t cur_score = StringDistance(cur_string, seq_string);
+ if (paths.size() > 1 && paths.size() < 10) {
+ DEBUG("score: "<< cur_score);
+ }
+ if (cur_score < best_score) {
+ best_score = cur_score;
+ best_path_ind = i;
+ }
+ }
+ DEBUG(best_score);
+ if (best_score == 1000000000)
+ return vector<EdgeId>(0);
+ if (paths.size() > 1 && paths.size() < 10) {
+ DEBUG("best score found! Path " <<best_path_ind <<" score "<< best_score);
+ }
+ return paths[best_path_ind];
+ }
+
+ // Short read alignment
+ omnigraph::MappingPath<EdgeId> GetShortReadAlignment(const Sequence &s) const {
+ ClustersSet mapping_descr = GetOrderClusters(s);
+ map<EdgeId, KmerCluster<Graph> > largest_clusters;
+
+ //Selecting the biggest cluster for each edge
+ for (auto iter = mapping_descr.begin(); iter != mapping_descr.end(); ++iter) {
+
+ auto first_cluster = iter->sorted_positions[iter->first_trustable_index];
+ auto last_cluster = iter->sorted_positions[iter->last_trustable_index];
+ int read_range = last_cluster.read_position - first_cluster.read_position;
+ int edge_range = last_cluster.edge_position - first_cluster.edge_position;
+ int cluster_size = iter->last_trustable_index - iter->first_trustable_index;
+ if (cluster_size > 2 * read_range || edge_range < 0 || 2 * edge_range < read_range || edge_range > 2 * read_range) {
+ //skip clusters whose edge span disagrees too much with their read span
+ continue;
+ }
+
+ auto edge_cluster = largest_clusters.find(iter->edgeId);
+ if (edge_cluster != largest_clusters.end()) {
+ if (edge_cluster->second.last_trustable_index - edge_cluster->second.first_trustable_index
+ < iter->last_trustable_index - iter->first_trustable_index) {
+
+ edge_cluster->second = *iter;
+ }
+ } else {
+ largest_clusters.insert(make_pair(iter->edgeId, *iter));
+ }
+ }
+
+ omnigraph::MappingPath<EdgeId> result;
+ for (auto iter = largest_clusters.begin(); iter != largest_clusters.end(); ++iter) {
+ auto first_cluster = iter->second.sorted_positions[iter->second.first_trustable_index];
+ auto last_cluster = iter->second.sorted_positions[iter->second.last_trustable_index];
+ omnigraph::MappingRange range(omnigraph::Range(first_cluster.read_position, last_cluster.read_position),
+ omnigraph::Range(first_cluster.edge_position, last_cluster.edge_position));
+ result.join({iter->second.edgeId, range});
+ }
+
+ return result;
+ }
+
+ std::pair<EdgeId, size_t> GetUniqueKmerPos(const RtSeq& kmer) const {
+ KeyWithHash kwh = tmp_index.ConstructKWH(kmer);
+
+ if (tmp_index.valid(kwh.key())) {
+ auto keys = tmp_index.get(kwh);
+ if (keys.size() == 1) {
+ return make_pair(keys[0].edge_id, keys[0].offset);
+ }
+ }
+ return std::make_pair(EdgeId(0), -1u);
+ }
+
+
+};
+
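+// Slides a pacbio_k-mer window along the read, looks every k-mer up in the small-k index
+// and records (edge position, read position, multiplicity) hits per edge, skipping banned
+// k-mers and hits that fall inside the vertex overlap at the edge start.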
+template<class Graph>
+typename PacBioMappingIndex<Graph>::MappingDescription PacBioMappingIndex<Graph>::Locate(const Sequence &s) const {
+ MappingDescription res;
+ //WARNING: read_count was removed from here so that these methods can stay const
+ int local_read_count = 0;
+ ++local_read_count;
+ if (s.size() < pacbio_k)
+ return res;
+
+ //RtSeq kmer = s.start<RtSeq>(pacbio_k);
+ KeyWithHash kwh = tmp_index.ConstructKWH(s.start<RtSeq>(pacbio_k));
+
+ for (size_t j = pacbio_k; j < s.size(); ++j) {
+ kwh = kwh << s[j];
+ if (!tmp_index.valid(kwh.key())) {
+// INFO("not valid kmer");
+ continue;
+ }
+ auto keys = tmp_index.get(kwh);
+ TRACE("Valid key, size: "<< keys.size());
+
+ for (auto iter = keys.begin(); iter != keys.end(); ++iter) {
+
+ int quality = (int) keys.size();
+ TRACE("and quality:" << quality);
+ if (banned_kmers.find(Sequence(kwh.key())) != banned_kmers.end())
+ continue;
+ int offset = (int)iter->offset;
+ int s_stretched = int ((double)s.size() * 1.2 + 50);
+ int edge_len = int(g_.length(iter->edge_id));
+ //Require a position outside the vertex overlap; with ignore_map_to_middle, it must also lie within a stretched read length of an edge end
+ bool correct_alignment = offset > int(debruijn_k - pacbio_k) && offset < edge_len;
+ if (ignore_map_to_middle) {
+ correct_alignment &= (offset < int(debruijn_k - pacbio_k) + s_stretched || offset > edge_len - s_stretched);
+ }
+ if (correct_alignment) {
+ res[iter->edge_id].push_back(MappingInstance((int) iter->offset, (int) (j - pacbio_k + 1), quality));
+ }
+ }
+ }
+
+ for (auto iter = res.begin(); iter != res.end(); ++iter) {
+ sort(iter->second.begin(), iter->second.end());
+ DEBUG("read count "<< local_read_count);
+ DEBUG("edge: " << g_.int_id(iter->first) << "size: " << iter->second.size());
+ for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter++) {
+ DEBUG(j_iter->str());
+ }
+ }
+
+ return res;
+}
+
+}
diff --git a/src/common/modules/alignment/pacbio/pacbio_read_structures.hpp b/src/common/modules/alignment/pacbio/pacbio_read_structures.hpp
new file mode 100644
index 0000000..6ae4b7a
--- /dev/null
+++ b/src/common/modules/alignment/pacbio/pacbio_read_structures.hpp
@@ -0,0 +1,309 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/indices/perfect_hash_map.hpp"
+#include "common/modules/alignment/sequence_mapper.hpp"
+#include "common/assembly_graph/core/graph.hpp"
+#include <algorithm>
+#include <map>
+#include <set>
+
+namespace pacbio {
+typedef omnigraph::GapDescription<debruijn_graph::Graph> GapDescription;
+
+template<class T>
+struct pair_iterator_less {
+ bool operator ()(pair<size_t, T> const& a, pair<size_t, T> const& b) const {
+ return (a.first < b.first);
+ }
+};
+
+struct MappingInstance {
+ int edge_position;
+ int read_position;
+ //Quality currently equals k-mer multiplicity, so the best quality is 1
+ int quality;
+ MappingInstance(int edge_position, int read_position, int quality) :
+ edge_position(edge_position), read_position(read_position), quality(quality) {
+ }
+
+ inline bool IsUnique() const {
+ return (quality == 1);
+ }
+
+ string str() {
+ stringstream s;
+ s << "E: " << edge_position << " R: " << read_position << " Q: " << quality;
+ return s.str();
+ }
+
+//Less by EDGE position
+ bool operator <(MappingInstance const& b) const {
+ if (edge_position < b.edge_position || (edge_position == b.edge_position && read_position < b.read_position))
+ return true;
+ else
+ return false;
+ }
+private:
+ DECL_LOGGER("MappingInstance")
+ ;
+};
+
+//Less by READ position
+struct ReadPositionComparator {
+ bool operator ()(MappingInstance const& a, MappingInstance const& b) const {
+ return (a.read_position < b.read_position || (a.read_position == b.read_position && a.edge_position < b.edge_position));
+ }
+};
+
+template<class Graph>
+struct KmerCluster {
+ typedef typename Graph::EdgeId EdgeId;
+ int last_trustable_index;
+ int first_trustable_index;
+ size_t average_read_position;
+ size_t average_edge_position;
+ EdgeId edgeId;
+ vector<MappingInstance> sorted_positions;
+ int size;
+
+ KmerCluster(EdgeId e, const vector<MappingInstance>& v) {
+ last_trustable_index = 0;
+ first_trustable_index = 0;
+ average_read_position = 0;
+ edgeId = e;
+ size = (int) v.size();
+ sorted_positions = v;
+ FillTrustableIndices();
+ }
+
+ bool operator <(const KmerCluster & b) const {
+ return (average_read_position < b.average_read_position ||(average_read_position == b.average_read_position && edgeId < b.edgeId) ||
+ (average_read_position == b.average_read_position && edgeId == b.edgeId && sorted_positions < b.sorted_positions));
+ }
+
+ bool CanFollow(const KmerCluster &b) const {
+ return (b.sorted_positions[b.last_trustable_index].read_position < sorted_positions[first_trustable_index].read_position);
+ }
+
+ void FillTrustableIndices() {
+ //ignore non-unique kmers for distance determination
+ int first_unique_ind = 0;
+ while (first_unique_ind != size - 1 && !(sorted_positions[first_unique_ind].IsUnique())) {
+ first_unique_ind += 1;
+ }
+ int last_unique_ind = size - 1;
+ while (last_unique_ind != 0 && !(sorted_positions[last_unique_ind].IsUnique())) {
+ last_unique_ind -= 1;
+ }
+ last_trustable_index = last_unique_ind;
+ first_trustable_index = first_unique_ind;
+ double tmp_read_position = 0, tmp_edge_position = 0;
+ vector<int> diffs;
+ for (auto mp : sorted_positions) {
+ tmp_read_position += mp.read_position;
+ tmp_edge_position += mp.edge_position;
+ diffs.push_back(mp.read_position - mp.edge_position);
+ }
+ sort(diffs.begin(), diffs.end());
+ int median_diff = diffs[size/2];
+
+ tmp_read_position /= size;
+ tmp_edge_position /= size;
+ average_read_position = (size_t)trunc(tmp_read_position);
+ average_edge_position = (size_t)trunc(tmp_edge_position);
+
+ if (size > 10) {
+ int max_debug_size = 10;
+ vector<int> distances(max_debug_size);
+ for (int df: diffs) {
+ int ind = abs(df - median_diff)/ 50;
+ if (ind > max_debug_size - 1) ind = max_debug_size - 1;
+ distances [ind] ++;
+ }
+ if (size > 100 || distances[0] * 5 < size * 4) {
+ stringstream s;
+
+ for (int d: distances) {
+ s << d << " ";
+ }
+// INFO(s.str());
+
+ }
+ }
+ }
+
+ string str(const Graph &g) const{
+ stringstream s;
+ s << "Edge: " << g.int_id(edgeId) << " on edge: " << sorted_positions[first_trustable_index].edge_position<< " - " << sorted_positions[last_trustable_index].edge_position<< ";on read: " << sorted_positions[first_trustable_index].read_position<< " - " << sorted_positions[last_trustable_index].read_position<< ";size "<< size;
+ return s.str();
+ }
+private:
+ DECL_LOGGER("KmerCluster")
+ ;
+};
+
+//template<class Graph>
+//struct GapDescription {
+// typedef typename Graph::EdgeId EdgeId;
+// EdgeId start, end;
+// Sequence gap_seq;
+// int edge_gap_start_position, edge_gap_end_position;
+//
+//
+// GapDescription(EdgeId start_e, EdgeId end_e, const Sequence &gap, int gap_start, int gap_end) :
+// start(start_e), end(end_e), gap_seq(gap.str()), edge_gap_start_position(gap_start), edge_gap_end_position(gap_end) {
+// }
+//
+// GapDescription(const KmerCluster<Graph> &a, const KmerCluster<Graph> & b, Sequence read, int pacbio_k) {
+// edge_gap_start_position = a.sorted_positions[a.last_trustable_index].edge_position;
+// edge_gap_end_position = b.sorted_positions[b.first_trustable_index].edge_position + pacbio_k - 1;
+// start = a.edgeId;
+// end = b.edgeId;
+// DEBUG(read.str());
+// gap_seq = read.Subseq(a.sorted_positions[a.last_trustable_index].read_position,
+// b.sorted_positions[b.first_trustable_index].read_position + pacbio_k - 1);
+// DEBUG(gap_seq.str());
+// DEBUG("gap added");
+// }
+//
+// GapDescription<Graph> conjugate(Graph &g, int shift) const {
+// GapDescription<Graph> res(
+// g.conjugate(end), g.conjugate(start), (!gap_seq),
+// (int) g.length(end) + shift - edge_gap_end_position,
+// (int) g.length(start) + shift - edge_gap_start_position);
+// DEBUG("conjugate created" << res.str(g));
+// return res;
+// }
+//
+// string str(Graph &g) const {
+// stringstream s;
+// s << g.int_id(start) << " " << edge_gap_start_position <<endl << g.int_id(end) << " " << edge_gap_end_position << endl << gap_seq.str()<< endl;
+// return s.str();
+// }
+//
+// bool operator <(const GapDescription& b) const {
+// return (start < b.start || (start == b.start && end < b.end) ||
+// (start == b.start && end == b.end && edge_gap_start_position < b.edge_gap_start_position));
+// }
+//
+//private:
+// DECL_LOGGER("PacIndex")
+// ;
+//}
+
+struct StatsCounter{
+ map<size_t,size_t> path_len_in_edges;
+ vector<size_t> subreads_length;
+ size_t total_len ;
+ size_t reads_with_conjugate;
+ size_t subreads_count;
+ map<size_t, size_t> seeds_percentage;
+ StatsCounter() {
+ total_len = 0;
+ reads_with_conjugate = 0;
+ }
+
+ void AddStorage(StatsCounter &other) {
+ total_len += other.total_len;
+ reads_with_conjugate += other.reads_with_conjugate;
+ for (auto iter = other.subreads_length.begin(); iter != other.subreads_length.end(); ++iter) {
+ subreads_length.push_back(*iter);
+ }
+
+ for (auto iter = other.path_len_in_edges.begin(); iter != other.path_len_in_edges.end(); ++iter){
+ auto j_iter = iter;
+ if ((j_iter = path_len_in_edges.find(iter->first)) == path_len_in_edges.end()) {
+ path_len_in_edges.insert(make_pair(iter->first, iter->second));
+ } else {
+ path_len_in_edges[j_iter->first] += iter->second;
+ }
+ }
+ for (auto iter = other.seeds_percentage.begin(); iter != other.seeds_percentage.end(); ++iter){
+ auto j_iter = iter;
+ if ((j_iter = seeds_percentage.find(iter->first)) == seeds_percentage.end()) {
+ seeds_percentage.insert(make_pair(iter->first, iter->second));
+ } else {
+ seeds_percentage[j_iter->first] += iter->second;
+ }
+ }
+ }
+
+ void report() const {
+ size_t total = 0;
+ for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
+ total += iter->second;
+ }
+ size_t cur = 0;
+ size_t percentage = 0;
+ for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
+ cur += iter->second;
+ percentage = iter->first;
+ if (cur * 2 > total) break;
+ }
+ INFO("Median fraction of present seeds in maximal alignmnent among reads aligned to the graph: " << double(percentage) * 0.001);
+ }
+
+private:
+ DECL_LOGGER("StatsCounter");
+};
+
+inline int StringDistance(string &a, string &b) {
+ int a_len = (int) a.length();
+ int b_len = (int) b.length();
+ int d = min(a_len / 3, b_len / 3);
+ d = max(d, 10);
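+ // Banded edit-distance DP: only cells within a diagonal band of half-width d
+ // (at least 10, roughly a third of the shorter string) are filled, so the result
+ // is exact only while the optimal alignment stays inside the band.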
+ DEBUG(a_len << " " << b_len << " " << d);
+ vector<vector<int> > table(a_len);
+ //int d =
+ for (int i = 0; i < a_len; i++) {
+ table[i].resize(b_len);
+ int low = max(max(0, i - d - 1), i + b_len - a_len - d - 1);
+ int high = min(min(b_len, i + d + 1), i + a_len - b_len + d + 1);
+ TRACE(low << " " <<high);
+ for (int j = low; j < high; j++)
+ table[i][j] = 1000000;
+ }
+ table[a_len - 1][b_len - 1] = 1000000;
+ table[0][0] = 0;
+//free deletions on begin
+// for(int j = 0; j < b_len; j++)
+// table[0][j] = 0;
+
+ for (int i = 0; i < a_len; i++) {
+ int low = max(max(0, i - d), i + b_len - a_len - d);
+ int high = min(min(b_len, i + d), i + a_len - b_len + d);
+
+ TRACE(low << " " <<high);
+ for (int j = low; j < high; j++) {
+
+ if (i > 0)
+ table[i][j] = min(table[i][j], table[i - 1][j] + 1);
+ if (j > 0)
+ table[i][j] = min(table[i][j], table[i][j - 1] + 1);
+ if (i > 0 && j > 0) {
+ int add = 1;
+ if (a[i] == b[j])
+ add = 0;
+ table[i][j] = min(table[i][j], table[i - 1][j - 1] + add);
+ }
+ }
+ }
+ //return table[a_len - 1][b_len - 1];
+//free deletions on end
+ int res = table[a_len - 1][b_len - 1];
+ DEBUG(res);
+// for(int j = 0; j < b_len; j++){
+// res = min(table[a_len - 1][j], res);
+// }
+ return res;
+}
+
+
+}
diff --git a/src/common/modules/alignment/sequence_mapper.hpp b/src/common/modules/alignment/sequence_mapper.hpp
new file mode 100644
index 0000000..7572fb6
--- /dev/null
+++ b/src/common/modules/alignment/sequence_mapper.hpp
@@ -0,0 +1,405 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "sequence/sequence_tools.hpp"
+#include "common/assembly_graph/core/basic_graph_stats.hpp"
+
+#include "edge_index.hpp"
+#include "kmer_mapper.hpp"
+
+#include <cstdlib>
+#include "common/assembly_graph/core/basic_graph_stats.hpp"
+
+namespace debruijn_graph {
+using omnigraph::MappingPath;
+using omnigraph::Path;
+using omnigraph::MappingRange;
+using omnigraph::Range;
+
+template<class Graph>
+MappingPath<typename Graph::EdgeId> ConjugateMapping(const Graph& g,
+ const MappingPath<typename Graph::EdgeId>& mp,
+ size_t sequence_length) {
+ MappingPath<typename Graph::EdgeId> answer;
+ for (size_t i = mp.size(); i > 0; --i) {
+ auto p = mp[i-1];
+ auto e = p.first;
+ MappingRange mr = p.second;
+ answer.push_back(g.conjugate(e),
+ MappingRange(mr.initial_range.Invert(sequence_length - g.k()),
+ mr.mapped_range.Invert(g.length(e))));
+ }
+ return answer;
+}
+
+template<class Graph>
+class SequenceMapper {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef RtSeq Kmer;
+
+ virtual ~SequenceMapper() {}
+
+ virtual MappingPath<EdgeId> MapSequence(const Sequence &sequence) const = 0;
+
+ virtual MappingPath<EdgeId> MapRead(const io::SingleRead &read) const = 0;
+};
+
+template<class Graph>
+class AbstractSequenceMapper : public SequenceMapper<Graph> {
+protected:
+ const Graph& g_;
+
+// const Graph& g() const {
+// return g_;
+// }
+public:
+ AbstractSequenceMapper(const Graph& g) : g_(g) {
+ }
+
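+ // Splits the read at 'N' characters and maps every N-free fragment separately,
+ // joining the resulting mapping paths at their offsets inside the read.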
+ MappingPath<EdgeId> MapRead(const io::SingleRead &read) const override {
+// VERIFY(read.IsValid());
+ DEBUG(read.name() << " is mapping");
+ string s = read.GetSequenceString();
+ size_t l = 0, r = 0;
+ MappingPath<EdgeId> result;
+ for(size_t i = 0; i < s.size(); i++) {
+ if (read.GetSequenceString()[i] == 'N') {
+ if (r > l) {
+ result.join(this->MapSequence(Sequence(s.substr(l, r - l))), int(l));
+ }
+ r = i + 1;
+ l = i + 1;
+ } else {
+ r++;
+ }
+ }
+ if (r > l) {
+ result.join(this->MapSequence(Sequence(s.substr(l, r - l))), int(l));
+ }
+ DEBUG(read.name() << " is mapped");
+ DEBUG("Number of edges is " << result.size());
+
+ return result;
+ }
+};
+
+//potentially useful class
+//template<class Graph>
+//class DelegatingSequenceMapper : public SequenceMapper<Graph> {
+//public:
+// typedef std::function<MappingPath<EdgeId> (const MappingPath<EdgeId>&, size_t)> ProcessingF;
+//private:
+// shared_ptr<SequenceMapper<Graph>> inner_mapper_;
+// ProcessingF processing_f_;
+//
+//public:
+// DelegatingSequenceMapper(shared_ptr<SequenceMapper<Graph>> inner_mapper,
+// ProcessingF processing_f) :
+// inner_mapper_(inner_mapper), processing_f_(processing_f) {
+// }
+//
+// MappingPath<EdgeId> MapSequence(const Sequence& s) const override {
+// return processing_f_(inner_mapper_->MapSequence(s), s.size());
+// }
+//
+// MappingPath<EdgeId> MapRead(const io::SingleRead& r) const override {
+// return processing_f_(inner_mapper_->MapRead(r), r.size());
+// }
+//};
+
+template<class Graph>
+bool SpuriousMappingFilter(const Graph& /*g*/,
+ const MappingPath<EdgeId>& mapping_path,
+ size_t read_length,
+ size_t max_range,
+ size_t min_flank) {
+ if (mapping_path.size() == 1) {
+ Range read_range = mapping_path[0].second.initial_range;
+ if (read_range.size() <= max_range
+ && read_range.start_pos >= min_flank
+ && read_range.end_pos + min_flank <= read_length)
+ return false;
+ }
+ return true;
+}
+
+template<class Graph>
+class MappingPathFixer {
+public:
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ MappingPathFixer(const Graph& graph)
+ : g_(graph) {
+ }
+
+ bool CheckContiguous(const vector<typename Graph::EdgeId>& path) const {
+ for (size_t i = 1; i < path.size(); ++i) {
+ if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i]))
+ return false;
+ }
+ return true;
+ }
+
+ Path<EdgeId> TryFixPath(const Path<EdgeId>& path, size_t length_bound = 70) const {
+ return Path<EdgeId>(TryFixPath(path.sequence(), length_bound), path.start_pos(), path.end_pos());
+ }
+
+ vector<EdgeId> TryFixPath(const vector<EdgeId>& edges, size_t length_bound = 70) const {
+ vector<EdgeId> answer;
+ if (edges.empty()) {
+ // WARN("Mapping path was empty");
+ return vector<EdgeId>();
+ }
+ answer.push_back(edges[0]);
+ for (size_t i = 1; i < edges.size(); ++i) {
+ if (g_.EdgeEnd(edges[i - 1]) != g_.EdgeStart(edges[i])) {
+ vector<EdgeId> closure = TryCloseGap(g_.EdgeEnd(edges[i - 1]),
+ g_.EdgeStart(edges[i]),
+ length_bound);
+ answer.insert(answer.end(), closure.begin(), closure.end());
+ }
+ answer.push_back(edges[i]);
+ }
+ return answer;
+ }
+
+ vector<EdgeId> DeleteSameEdges(const vector<EdgeId>& path) const {
+ vector<EdgeId> result;
+ if (path.empty()) {
+ return result;
+ }
+ result.push_back(path[0]);
+ for (size_t i = 1; i < path.size(); ++i) {
+ if (path[i] != result[result.size() - 1]) {
+ result.push_back(path[i]);
+ }
+ }
+ return result;
+ }
+
+private:
+ vector<EdgeId> TryCloseGap(VertexId v1, VertexId v2, size_t length_bound) const {
+ if (v1 == v2)
+ return vector<EdgeId>();
+ TRACE("Trying to close gap between v1=" << g_.int_id(v1) << " and v2=" << g_.int_id(v2));
+ omnigraph::PathStorageCallback<Graph> path_store(g_);
+
+ TRACE("Path storage callback created");
+ //todo reduce value after investigation
+ omnigraph::ProcessPaths(g_, 0, length_bound, v1, v2, path_store);
+
+ TRACE("Paths processed");
+ if (path_store.size() == 0) {
+ TRACE("Failed to find closing path");
+ // TRACE("Failed to close gap between v1=" << graph_.int_id(v1)
+ // << " (conjugate "
+ // << graph_.int_id(g_.conjugate(v1))
+ // << ") and v2=" << g_.int_id(v2)
+ // << " (conjugate "
+ // << g_.int_id(g_.conjugate(v2)) << ")");
+ // return boost::none;
+ return vector<EdgeId>();
+ } else if (path_store.size() == 1) {
+ TRACE("Unique closing path found");
+ } else {
+ TRACE("Several closing paths found, first chosen");
+ }
+ TRACE("Taking answer ");
+ vector<EdgeId> answer = path_store.paths().front();
+ TRACE("Gap closed");
+ TRACE( "Cumulative closure length is " << CumulativeLength(g_, answer));
+ return answer;
+ }
+ const Graph& g_;
+};
+
+template<class Graph>
+class ReadPathFinder {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph& g_;
+ typedef MappingPathFixer<Graph> GraphMappingPathFixer;
+ const GraphMappingPathFixer path_fixer_;
+public:
+ ReadPathFinder (const Graph& g) :
+ g_(g), path_fixer_(g)
+ { }
+
+ vector<EdgeId> FindReadPath(const MappingPath<EdgeId>& mapping_path) const {
+ if (!IsMappingPathValid(mapping_path)) {
+ TRACE("read unmapped");
+ return vector<EdgeId>();
+ }
+ vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
+ mapping_path.simple_path());
+ PrintPathInfo(corrected_path);
+ if(corrected_path.size() != mapping_path.simple_path().size()) {
+ DEBUG("Some edges were deleted");
+ }
+ vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
+ if (!path_fixer_.CheckContiguous(fixed_path)) {
+ TRACE("read unmapped");
+ std::stringstream debug_stream;
+ for (size_t i = 0; i < fixed_path.size(); ++i) {
+ debug_stream << g_.int_id(fixed_path[i]) << " ";
+ }
+ TRACE(debug_stream.str());
+ return vector<EdgeId>();
+ } else {
+ DEBUG("Path fix works");
+ }
+ return fixed_path;
+ }
+
+
+private:
+
+ bool IsMappingPathValid(const MappingPath<EdgeId>& path) const {
+ return path.size() != 0;
+ }
+
+ void PrintPathInfo(vector<EdgeId>& corrected_path) const {
+ for(size_t i = 0; i < corrected_path.size(); ++i) {
+ DEBUG(i + 1 << "-th edge is " << corrected_path[i].int_id());
+ }
+ }
+};
+
+template<class Graph, class Index>
+class BasicSequenceMapper: public AbstractSequenceMapper<Graph> {
+ using AbstractSequenceMapper<Graph>::g_;
+
+ const Index& index_;
+
+ typedef std::vector<MappingRange> RangeMappings;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Index::KMer Kmer;
+ typedef KmerMapper<Graph> KmerSubs;
+ const KmerSubs& kmer_mapper_;
+ size_t k_;
+ bool optimization_on_;
+
+ bool FindKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
+ RangeMappings& range_mappings) const {
+ std::pair<EdgeId, size_t> position = index_.get(kmer);
+ if (position.second == -1u)
+ return false;
+
+ if (passed.empty() || passed.back() != position.first ||
+ kmer_pos != range_mappings.back().initial_range.end_pos ||
+ position.second + 1 < range_mappings.back().mapped_range.end_pos) {
+ passed.push_back(position.first);
+
+ range_mappings.push_back(MappingRange(Range(kmer_pos, kmer_pos + 1),
+ Range(position.second, position.second + 1)));
+ } else {
+ range_mappings.back().initial_range.end_pos = kmer_pos + 1;
+ range_mappings.back().mapped_range.end_pos = position.second + 1;
+ }
+
+ return true;
+ }
+
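+ // Tries to extend the current mapping by a single nucleotide: either along the last
+ // edge if the next graph nucleotide matches, or across its end vertex onto an outgoing
+ // edge whose first nucleotide past the vertex overlap matches (branching vertices are
+ // only crossed when optimization_on_ is set).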
+ bool TryThread(const Kmer& kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
+ RangeMappings& range_mappings) const {
+ EdgeId last_edge = passed.back();
+ size_t end_pos = range_mappings.back().mapped_range.end_pos;
+ if (end_pos < g_.length(last_edge)) {
+ if (g_.EdgeNucls(last_edge)[end_pos + k_ - 1] == kmer[k_ - 1]) {
+ range_mappings.back().initial_range.end_pos++;
+ range_mappings.back().mapped_range.end_pos++;
+ return true;
+ }
+ } else {
+ VertexId v = g_.EdgeEnd(last_edge);
+
+ if(!optimization_on_)
+ if(g_.OutgoingEdgeCount(v) > 1)
+ return false;
+
+ for (auto I = g_.out_begin(v), E = g_.out_end(v); I != E; ++I) {
+ EdgeId edge = *I;
+ if (g_.EdgeNucls(edge)[k_ - 1] == kmer[k_ - 1]) {
+ passed.push_back(edge);
+ range_mappings.push_back(
+ MappingRange(Range(kmer_pos, kmer_pos + 1),
+ Range(0, 1)));
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool ProcessKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed_edges,
+ RangeMappings& range_mapping, bool try_thread) const {
+ if (try_thread) {
+ if (!TryThread(kmer, kmer_pos, passed_edges, range_mapping)) {
+ FindKmer(kmer_mapper_.Substitute(kmer), kmer_pos, passed_edges, range_mapping);
+ return false;
+ }
+
+ return true;
+ }
+
+ if (kmer_mapper_.CanSubstitute(kmer)) {
+ FindKmer(kmer_mapper_.Substitute(kmer), kmer_pos, passed_edges, range_mapping);
+ return false;
+ }
+
+ return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
+ }
+
+ public:
+ BasicSequenceMapper(const Graph& g,
+ const Index& index,
+ const KmerSubs& kmer_mapper,
+ bool optimization_on = true) :
+ AbstractSequenceMapper<Graph>(g), index_(index),
+ kmer_mapper_(kmer_mapper), k_(g.k()+1),
+ optimization_on_(optimization_on) { }
+
+ MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
+ std::vector<EdgeId> passed_edges;
+ RangeMappings range_mapping;
+
+ if (sequence.size() < k_) {
+ return MappingPath<EdgeId>();
+ }
+
+ Kmer kmer = sequence.start<Kmer>(k_);
+ bool try_thread = false;
+ try_thread = ProcessKmer(kmer, 0, passed_edges,
+ range_mapping, try_thread);
+ for (size_t i = k_; i < sequence.size(); ++i) {
+ kmer <<= sequence[i];
+ try_thread = ProcessKmer(kmer, i - k_ + 1, passed_edges,
+ range_mapping, try_thread);
+ }
+
+ return MappingPath<EdgeId>(passed_edges, range_mapping);
+ }
+
+ DECL_LOGGER("BasicSequenceMapper");
+};
+
+
+template<class gp_t>
+std::shared_ptr<BasicSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t>> MapperInstance(const gp_t& gp) {
+ return std::make_shared<BasicSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t>>(gp.g, gp.index, gp.kmer_mapper);
+}
+
+}
diff --git a/src/common/modules/alignment/sequence_mapper_notifier.hpp b/src/common/modules/alignment/sequence_mapper_notifier.hpp
new file mode 100644
index 0000000..35120e2
--- /dev/null
+++ b/src/common/modules/alignment/sequence_mapper_notifier.hpp
@@ -0,0 +1,184 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SEQUENCE_MAPPER_NOTIFIER_HPP_
+#define SEQUENCE_MAPPER_NOTIFIER_HPP_
+
+#include "utils/memory_limit.hpp"
+#include "sequence_mapper.hpp"
+#include "short_read_mapper.hpp"
+#include "io/reads/paired_read.hpp"
+#include "io/reads/read_stream_vector.hpp"
+#include "pipeline/graph_pack.hpp"
+
+#include <vector>
+#include <cstdlib>
+
+namespace debruijn_graph {
+//todo think if we still need all this
+class SequenceMapperListener {
+public:
+ virtual void StartProcessLibrary(size_t /* threads_count */) {}
+ virtual void StopProcessLibrary() {}
+
+ //TODO: think about read hierarchy
+ virtual void ProcessPairedRead(size_t /* thread_index */, const io::PairedRead& /* pr */,
+ const MappingPath<EdgeId>& /* read1 */, const MappingPath<EdgeId>& /* read2 */) {}
+ virtual void ProcessPairedRead(size_t /* thread_index */, const io::PairedReadSeq& /* pr */,
+ const MappingPath<EdgeId>& /* read1 */, const MappingPath<EdgeId>& /* read2 */) {}
+ virtual void ProcessSingleRead(size_t /* thread_index */, const io::SingleRead& /* r */, const MappingPath<EdgeId>& /* read */) {}
+ virtual void ProcessSingleRead(size_t /* thread_index */, const io::SingleReadSeq& /* r */, const MappingPath<EdgeId>& /* read */) {}
+
+ virtual void MergeBuffer(size_t /* thread_index */) {}
+
+ virtual ~SequenceMapperListener() {}
+};
+
+class SequenceMapperNotifier {
+ static constexpr size_t BUFFER_SIZE = 200000;
+public:
+ typedef SequenceMapper<conj_graph_pack::graph_t> SequenceMapperT;
+
+ SequenceMapperNotifier(const conj_graph_pack& gp)
+ : gp_(gp) { }
+
+ void Subscribe(size_t lib_index, SequenceMapperListener* listener) {
+ while ((int)lib_index >= (int)listeners_.size() - 1) {
+ std::vector<SequenceMapperListener*> vect;
+ listeners_.push_back(vect);
+ }
+ listeners_[lib_index].push_back(listener);
+ }
+
+ template<class ReadType>
+ void ProcessLibrary(io::ReadStreamList<ReadType>& streams,
+ size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) {
+ if (threads_count == 0)
+ threads_count = streams.size();
+
+ streams.reset();
+ NotifyStartProcessLibrary(lib_index, threads_count);
+ size_t counter = 0, n = 15;
+ size_t fmem = get_free_memory();
+
+ #pragma omp parallel for num_threads(threads_count) shared(counter)
+ for (size_t i = 0; i < streams.size(); ++i) {
+ size_t size = 0;
+ ReadType r;
+ auto& stream = streams[i];
+ while (!stream.eof()) {
+ if (size == BUFFER_SIZE ||
+ // Stop filling the buffer if available memory has dropped below
+ // 40% of the amount that was free when processing started.
+ (10 * get_free_memory() / 4 < fmem && size > 10000)) {
+ #pragma omp critical
+ {
+ counter += size;
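+ // counter >> n is non-zero once counter reaches 2^n,
+ // so progress is reported at exponentially growing thresholds.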
+ if (counter >> n) {
+ INFO("Processed " << counter << " reads");
+ n += 1;
+ }
+ size = 0;
+ NotifyMergeBuffer(lib_index, i);
+ }
+ }
+ stream >> r;
+ ++size;
+ NotifyProcessRead(r, mapper, lib_index, i);
+ }
+ #pragma omp atomic
+ counter += size;
+ }
+
+ for (size_t i = 0; i < threads_count; ++i)
+ NotifyMergeBuffer(lib_index, i);
+
+ INFO("Total " << counter << " reads processed");
+ NotifyStopProcessLibrary(lib_index);
+ }
+
+private:
+ template<class ReadType>
+ void NotifyProcessRead(const ReadType& r, const SequenceMapperT& mapper, size_t ilib, size_t ithread) const;
+
+ void NotifyStartProcessLibrary(size_t ilib, size_t thread_count) const {
+ for (const auto& listener : listeners_[ilib])
+ listener->StartProcessLibrary(thread_count);
+ }
+
+ void NotifyStopProcessLibrary(size_t ilib) const {
+ for (const auto& listener : listeners_[ilib])
+ listener->StopProcessLibrary();
+ }
+
+ void NotifyMergeBuffer(size_t ilib, size_t ithread) const {
+ for (const auto& listener : listeners_[ilib])
+ listener->MergeBuffer(ithread);
+ }
+ const conj_graph_pack& gp_;
+
+ std::vector<std::vector<SequenceMapperListener*> > listeners_; //first vector's size = count libs
+};
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedReadSeq& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+
+ const Sequence& read1 = r.first().sequence();
+ const Sequence& read2 = r.second().sequence();
+ MappingPath<EdgeId> path1 = mapper.MapSequence(read1);
+ MappingPath<EdgeId> path2 = mapper.MapSequence(read2);
+ for (const auto& listener : listeners_[ilib]) {
+ TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
+ listener->ProcessPairedRead(ithread, r, path1, path2);
+ listener->ProcessSingleRead(ithread, r.first(), path1);
+ listener->ProcessSingleRead(ithread, r.second(), path2);
+ }
+}
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedRead& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+ MappingPath<EdgeId> path1 = mapper.MapRead(r.first());
+ MappingPath<EdgeId> path2 = mapper.MapRead(r.second());
+ for (const auto& listener : listeners_[ilib]) {
+ TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
+ listener->ProcessPairedRead(ithread, r, path1, path2);
+ listener->ProcessSingleRead(ithread, r.first(), path1);
+ listener->ProcessSingleRead(ithread, r.second(), path2);
+ }
+}
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleReadSeq& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+ const Sequence& read = r.sequence();
+ MappingPath<EdgeId> path = mapper.MapSequence(read);
+ for (const auto& listener : listeners_[ilib])
+ listener->ProcessSingleRead(ithread, r, path);
+}
+
+template<>
+inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleRead& r,
+ const SequenceMapperT& mapper,
+ size_t ilib,
+ size_t ithread) const {
+ MappingPath<EdgeId> path = mapper.MapRead(r);
+ for (const auto& listener : listeners_[ilib])
+ listener->ProcessSingleRead(ithread, r, path);
+}
+
+} /*debruijn_graph*/
+
+
+#endif /* SEQUENCE_MAPPER_NOTIFIER_HPP_ */
diff --git a/src/common/modules/alignment/short_read_mapper.hpp b/src/common/modules/alignment/short_read_mapper.hpp
new file mode 100644
index 0000000..db9e564
--- /dev/null
+++ b/src/common/modules/alignment/short_read_mapper.hpp
@@ -0,0 +1,93 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+
+#include "sequence_mapper.hpp"
+#include "common/modules/alignment/pacbio/pac_index.hpp"
+#include "modules/alignment/bwa_sequence_mapper.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+class SensitiveReadMapper: public AbstractSequenceMapper<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ using AbstractSequenceMapper<Graph>::g_;
+private:
+
+ size_t small_k_;
+
+ //FIXME awful!
+ static map<size_t, pacbio::PacBioMappingIndex<Graph>* > indices_;
+ static size_t active_mappers_;
+
+ pacbio::PacBioMappingIndex<Graph>* index_;
+
+public:
+
+ SensitiveReadMapper(const Graph& g, size_t k, size_t graph_k) :
+ AbstractSequenceMapper<Graph>(g), small_k_(k)
+ {
+ if (indices_.find(small_k_) == indices_.end()) {
+ indices_.insert(make_pair(small_k_,
+ new pacbio::PacBioMappingIndex<Graph>(g, small_k_, graph_k, false, cfg::get().output_dir, cfg::get().pb)));
+ }
+ index_ = indices_[small_k_];
+ ++active_mappers_;
+ }
+
+ MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
+ return index_->GetShortReadAlignment(sequence);
+ }
+
+ ~SensitiveReadMapper() {
+ --active_mappers_;
+ }
+
+ static void EraseIndices() {
+ if (active_mappers_ > 0) {
+ WARN("There are still active mappers");
+ }
+ for (auto iter = indices_.begin(); iter != indices_.end(); ++iter) {
+ delete iter->second;
+ }
+ indices_.clear();
+ }
+
+};
+
+template<class Graph>
+map<size_t, pacbio::PacBioMappingIndex<Graph>* > SensitiveReadMapper<Graph>::indices_;
+
+template<class Graph>
+size_t SensitiveReadMapper<Graph>::active_mappers_ = 0;
+
+template<class graph_pack, class SequencingLib>
+std::shared_ptr<SequenceMapper<typename graph_pack::graph_t>> ChooseProperMapper(const graph_pack& gp, const SequencingLib& library, bool use_bwa = false) {
+ typedef typename graph_pack::graph_t Graph;
+ if (library.type() == io::LibraryType::MatePairs) {
+ if (use_bwa) {
+ INFO("Mapping mate-pairs using BWA lib mapper");
+ return std::make_shared<alignment::BWAReadMapper<Graph>>(gp.g);
+ } else {
+ INFO("Mapping mate-pair library, selecting sensitive read mapper with k=" << cfg::get().sensitive_map.k);
+ return std::make_shared<SensitiveReadMapper<Graph>>(gp.g, cfg::get().sensitive_map.k, gp.k_value);
+ }
+ }
+ size_t read_length = library.data().read_length;
+ if (read_length < gp.k_value && library.type() == io::LibraryType::PairedEnd) {
+ INFO("Read length = " << read_length << ", selecting short read mapper");
+ return std::make_shared<SensitiveReadMapper<Graph>>(gp.g, read_length/ 3, gp.k_value);
+ }
+
+ INFO("Selecting usual mapper");
+ return MapperInstance(gp);
+}
+
+}
+
diff --git a/src/common/modules/genome_consistance_checker.cpp b/src/common/modules/genome_consistance_checker.cpp
new file mode 100644
index 0000000..ac40130
--- /dev/null
+++ b/src/common/modules/genome_consistance_checker.cpp
@@ -0,0 +1,276 @@
+#include "modules/genome_consistance_checker.hpp"
+#include "assembly_graph/core/graph.hpp"
+#include <algorithm>
+#include <limits>
+namespace debruijn_graph {
+using omnigraph::MappingRange;
+using namespace std;
+
+//gap or overlap size. WITHOUT SIGN!
+static size_t gap(const Range &a, const Range &b) {
+ return max(a.end_pos, b.start_pos) - min (a.end_pos, b.start_pos);
+}
+bool GenomeConsistenceChecker::consequent(const Range &mr1, const Range &mr2) const{
+ if (mr1.end_pos > mr2.start_pos + absolute_max_gap_)
+ return false;
+ if (mr1.end_pos + absolute_max_gap_ < mr2.start_pos)
+ return false;
+ return true;
+
+}
+bool GenomeConsistenceChecker::consequent(const MappingRange &mr1, const MappingRange &mr2) const {
+ //do not want to think about handling gaps near 0 position.
+ if (!consequent(mr1.initial_range, mr2.initial_range) || !consequent(mr1.mapped_range, mr2.mapped_range))
+ return false;
+ size_t initial_gap = gap(mr1.initial_range, mr2.initial_range);
+ size_t mapped_gap = gap(mr1.mapped_range, mr2.mapped_range);
+ size_t max_gap = max(initial_gap, mapped_gap);
+ if ( max_gap > relative_max_gap_* double (max (min(mr1.initial_range.size(), mr1.mapped_range.size()), min(mr2.initial_range.size(), mr2.mapped_range.size()))))
+ return false;
+ return true;
+}
+
+PathScore GenomeConsistenceChecker::CountMisassemblies(const BidirectionalPath &path) const {
+ PathScore straight = CountMisassembliesWithStrand(path, "0");
+ PathScore reverse = CountMisassembliesWithStrand(path, "1");
+ size_t total_length = path.LengthAt(0);
+//TODO: constant;
+ if (total_length > std::max(straight.mapped_length, reverse.mapped_length) * 2) {
+ if (total_length > 10000) {
+ INFO ("For path length " << total_length <<" mapped less than half of the path, skipping");
+ }
+ return PathScore(0,0,0);
+ } else {
+ if (straight.mapped_length > reverse.mapped_length) {
+ return straight;
+ } else {
+ return reverse;
+ }
+ }
+}
+
+vector<pair<EdgeId, MappingRange> > GenomeConsistenceChecker::ConstructEdgeOrder() const {
+ vector<pair<EdgeId, MappingRange> > to_sort;
+ for(auto e: storage_) {
+ if (excluded_unique_.find(e) == excluded_unique_.end() ) {
+ set<MappingRange> mappings = gp_.edge_pos.GetEdgePositions(e, "fxd0");
+ if (mappings.size() > 1) {
+ INFO("edge " << e << "smth strange");
+ } else if (mappings.size() == 0) {
+ continue;
+ } else {
+ to_sort.push_back(make_pair(e, *mappings.begin()));
+ }
+ }
+ }
+ sort(to_sort.begin(), to_sort.end(), [](const pair<EdgeId, MappingRange> & a, const pair<EdgeId, MappingRange> & b) -> bool
+ {
+ return a.second.initial_range.start_pos < b.second.initial_range.start_pos;
+ }
+ );
+ return to_sort;
+}
+
+
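+// Walks the unique edges in genome order, remembers each edge's rank in genome_spelled_,
+// and reports a rough N50/L50 estimate for the genome segments bounded by gaps longer
+// than the minimal unique-edge length.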
+void GenomeConsistenceChecker::SpellGenome() {
+ size_t count = 0;
+ auto to_sort = ConstructEdgeOrder();
+ vector<size_t> starts;
+ vector<size_t> ends;
+ for(size_t i = 0; i <to_sort.size(); i++) {
+ if (i > 0 && to_sort[i].second.initial_range.start_pos - to_sort[i-1].second.initial_range.end_pos > storage_.GetMinLength() ) {
+ INFO ("Large gap " << to_sort[i].second.initial_range.start_pos - to_sort[i-1].second.initial_range.end_pos );
+ starts.push_back(to_sort[i].second.initial_range.start_pos);
+ ends.push_back(to_sort[i-1].second.initial_range.end_pos);
+ }
+ if (i == 0) {
+ starts.push_back(to_sort[i].second.initial_range.start_pos);
+ }
+ if (i == to_sort.size() - 1){
+ ends.push_back(to_sort[i].second.initial_range.end_pos);
+ }
+ INFO("edge " << gp_.g.int_id(to_sort[i].first) << " length "<< gp_.g.length(to_sort[i].first) <<
+ " coverage " << gp_.g.coverage(to_sort[i].first) << " mapped to " << to_sort[i].second.mapped_range.start_pos
+ << " - " << to_sort[i].second.mapped_range.end_pos << " init_range " << to_sort[i].second.initial_range.start_pos << " - " << to_sort[i].second.initial_range.end_pos );
+ genome_spelled_[to_sort[i].first] = count;
+ count++;
+ }
+ vector<size_t> lengths;
+ size_t total_len = 0;
+ for (size_t i = 0; i < starts.size(); i++) {
+ lengths.push_back(ends[i] - starts[i]);
+ total_len += lengths[i];
+ }
+ sort(lengths.begin(), lengths.end());
+ reverse(lengths.begin(), lengths.end());
+ size_t cur = 0;
+ size_t i = 0;
+ while (cur < total_len / 2 && i < lengths.size()) {
+ cur += lengths[i];
+ i++;
+ }
+ INFO("Assuming gaps of length > " << storage_.GetMinLength() << " unresolvable..");
+ if (lengths.size() > 0)
+ INFO("Rough estimates on N50/L50:" << lengths[i - 1] << " / " << i - 1 << " with len " << total_len);
+}
+
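+// Walks the path and, for each pair of consecutive unique edges, compares their order and
+// spacing in the genome with their spacing in the path: out-of-order edges count as
+// misassemblies (unless the circular edge is involved), while order-preserving pairs with
+// inconsistent distances count as wrong gap sizes.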
+PathScore GenomeConsistenceChecker::CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const {
+ if (strand == "1") {
+ return (CountMisassembliesWithStrand(*path.GetConjPath(), "0"));
+ }
+ PathScore res(0, 0, 0);
+ EdgeId prev;
+ size_t prev_in_genome = std::numeric_limits<std::size_t>::max();
+ size_t prev_in_path = std::numeric_limits<std::size_t>::max();
+ MappingRange prev_range;
+ for (int i = 0; i < (int) path.Size(); i++) {
+ if (genome_spelled_.find(path.At(i)) != genome_spelled_.end()) {
+ size_t cur_in_genome = genome_spelled_[path.At(i)];
+ MappingRange cur_range = *gp_.edge_pos.GetEdgePositions(path.At(i), "fxd0").begin();
+ if (prev_in_genome != std::numeric_limits<std::size_t>::max()) {
+ if (cur_in_genome == prev_in_genome + 1) {
+ int dist_in_genome = (int) cur_range.initial_range.start_pos - (int) prev_range.initial_range.end_pos;
+ int dist_in_path = (int) path.LengthAt(prev_in_path) - (int) path.LengthAt(i) + (int) cur_range.mapped_range.start_pos - (int) prev_range.mapped_range.end_pos;
+ DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
+ DEBUG("Gap in genome / gap in path: " << dist_in_genome << " / " << dist_in_path);
+ if (size_t(abs(dist_in_genome - dist_in_path)) > absolute_max_gap_ && (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) {
+
+ res.wrong_gap_size ++;
+ }
+ } else {
+ if (path.At(i) != circular_edge_ && path.At(prev_in_path) != circular_edge_)
+ res.misassemblies++;
+ else
+ INFO("Skipping fake(circular) misassembly");
+ }
+ }
+ res.mapped_length += cur_range.mapped_range.size();
+ prev = path.At(i);
+ prev_in_genome = cur_in_genome;
+ prev_range = cur_range;
+ prev_in_path = i;
+ }
+ }
+ if (prev_in_path != std::numeric_limits<std::size_t>::max())
+ DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
+ return res;
+}
+void GenomeConsistenceChecker::RefillPos() {
+ RefillPos("0");
+ RefillPos("1");
+}
+
+
+void GenomeConsistenceChecker::RefillPos(const string &strand) {
+ for (auto e: storage_) {
+ RefillPos(strand, e);
+ }
+}
+
+void GenomeConsistenceChecker::FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const {
+ vector<MappingRange> to_process (old_mappings.begin(), old_mappings.end());
+ sort(to_process.begin(), to_process.end(), [](const MappingRange & a, const MappingRange & b) -> bool
+ {
+ return a.mapped_range.start_pos < b.mapped_range.start_pos;
+ } );
+ size_t sz = to_process.size();
+//maximum-weight chain in the DAG of consecutive mappings
+ TRACE("constructing mapping graph with " << sz << " vertices");
+ vector<vector<size_t>> consecutive_mappings(sz);
+ for(size_t i = 0; i < sz; i++) {
+ for (size_t j = i + 1; j < sz; j++) {
+ if (consequent(to_process[i], to_process[j])) {
+ consecutive_mappings[i].push_back(j);
+ } else {
+ if (to_process[j].mapped_range.start_pos > to_process[i].mapped_range.end_pos + absolute_max_gap_) {
+ break;
+ }
+ }
+ }
+ }
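+ // scores[i] = best total initial-range length over chains ending at mapping i;
+ // prev[] records the predecessor so the best chain can be reconstructed below.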
+ vector<size_t> scores(sz), prev(sz);
+ for(size_t i = 0; i < sz; i++) {
+ scores[i] = to_process[i].initial_range.size();
+ prev[i] = std::numeric_limits<std::size_t>::max();
+ }
+ for(size_t i = 0; i < sz; i++) {
+ for (size_t j = 0; j < consecutive_mappings[i].size(); j++) {
+ TRACE(consecutive_mappings[i][j]);
+ if (scores[consecutive_mappings[i][j]] < scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size()) {
+ scores[consecutive_mappings[i][j]] = scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size();
+ prev[consecutive_mappings[i][j]] = i;
+ }
+ }
+ }
+ size_t cur_max = 0;
+ size_t cur_i = 0;
+ for(size_t i = 0; i < sz; i++) {
+ if (scores[i] > cur_max) {
+ cur_max = scores[i];
+ cur_i = i;
+ }
+ }
+ used_mappings.clear();
+ while (cur_i != std::numeric_limits<std::size_t>::max()) {
+ used_mappings.push_back(to_process[cur_i]);
+ cur_i = prev[cur_i];
+ }
+ reverse(used_mappings.begin(), used_mappings.end());
+};
+
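+// For a single unique edge: drops edges whose total mapped length looks non-unique or too
+// small, picks the best chain of consecutive mappings on this strand, merges it into one
+// MappingRange stored under "fxd"+strand, and reports edges where a large share of the
+// mappings had to be discarded as potentially misassembled.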
+void GenomeConsistenceChecker::RefillPos(const string &strand, const EdgeId &e) {
+ set<MappingRange> old_mappings = gp_.edge_pos.GetEdgePositions(e, strand);
+ TRACE("old mappings sz " << old_mappings.size() );
+ size_t total_mapped = 0;
+ for (auto mp:old_mappings) {
+ total_mapped += mp.initial_range.size();
+ }
+ if (total_mapped > (double) gp_.g.length(e) * 1.5) {
+ INFO ("Edge " << gp_.g.int_id(e) << "is not unique, excluding");
+ excluded_unique_.insert(e);
+ return;
+ }
+//TODO: support non-unique edges;
+ if (total_mapped < (double) gp_.g.length(e) * 0.5) {
+ DEBUG ("Edge " << gp_.g.int_id(e) << "is not mapped on strand "<< strand <<", not used");
+ return;
+ }
+ TRACE(total_mapped << " " << gp_.g.length(e));
+ string new_strand = "fxd" + strand;
+ vector<MappingRange> used_mappings;
+ FindBestRangeSequence(old_mappings, used_mappings);
+
+ size_t cur_i = 0;
+ MappingRange new_mapping;
+ new_mapping = used_mappings[cur_i];
+ size_t used_mapped = new_mapping.initial_range.size();
+ TRACE ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e));
+ TRACE ("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
+ << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
+ while (cur_i < used_mappings.size() - 1) {
+ cur_i ++;
+ used_mapped += used_mappings[cur_i].initial_range.size();
+ new_mapping = new_mapping.Merge(used_mappings[cur_i]);
+ TRACE("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
+ << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
+ }
+//mappings left out of the merged chain cover at least 10% of the edge length
+ if (total_mapped * 10 >= used_mapped * 10 + gp_.g.length(e)) {
+ INFO("Edge " << gp_.g.int_id(e) << " length " << gp_.g.length(e) << " is potentially misassembled! mappings: ");
+ for (auto mp:old_mappings) {
+ INFO("mp_range "<< mp.mapped_range.start_pos << " - " << mp.mapped_range.end_pos << " init_range " << mp.initial_range.start_pos << " - " << mp.initial_range.end_pos );
+ if (mp.initial_range.start_pos < absolute_max_gap_) {
+ INFO ("Fake(linear order) misassembly on edge "<< e.int_id());
+ if (strand == "0") {
+ circular_edge_ = e;
+ }
+ }
+ }
+
+ }
+ gp_.edge_pos.AddEdgePosition(e, new_strand, new_mapping);
+}
+
+
+
+}
diff --git a/src/common/modules/genome_consistance_checker.hpp b/src/common/modules/genome_consistance_checker.hpp
new file mode 100644
index 0000000..0fcf115
--- /dev/null
+++ b/src/common/modules/genome_consistance_checker.hpp
@@ -0,0 +1,79 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#pragma once
+#include "visualization/graph_labeler.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "sequence/sequence.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "visualization/position_filler.hpp"
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+
+namespace debruijn_graph {
+
+
+using path_extend::BidirectionalPath;
+using path_extend::ScaffoldingUniqueEdgeStorage;
+
+struct PathScore{
+ size_t misassemblies;
+ size_t wrong_gap_size;
+ size_t mapped_length;
+ PathScore(size_t m, size_t w, size_t ml): misassemblies(m), wrong_gap_size(w), mapped_length(ml) {}
+};
+class GenomeConsistenceChecker {
+
+private:
+ const conj_graph_pack &gp_;
+ //EdgesPositionHandler<Graph> &position_handler_;
+ Sequence genome_;
+ const ScaffoldingUniqueEdgeStorage &storage_;
+ size_t absolute_max_gap_;
+ double relative_max_gap_;
+ set<EdgeId> excluded_unique_;
+ EdgeId circular_edge_;
+//map from unique edges to their order in genome spelling;
+ mutable map<EdgeId, size_t> genome_spelled_;
+ bool consequent(const Range &mr1, const Range &mr2) const;
+ bool consequent(const MappingRange &mr1, const MappingRange &mr2) const ;
+
+ PathScore CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const;
+//constructs the longest sequence of consecutive ranges and stores the result in used_mappings
+ void FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const;
+//Refills genomic positions, merging alignments separated by small gaps
+ void RefillPos();
+ void RefillPos(const string &strand);
+ void RefillPos(const string &strand, const EdgeId &e);
+DECL_LOGGER("GenomeConsistenceChecker");
+
+
+public:
+ GenomeConsistenceChecker(const conj_graph_pack &gp, const ScaffoldingUniqueEdgeStorage &storage, size_t max_gap, double relative_max_gap /*= 0.2*/) : gp_(gp),
+ genome_(gp.genome.GetSequence()), storage_(storage),
+ absolute_max_gap_(max_gap), relative_max_gap_(relative_max_gap), excluded_unique_(), circular_edge_() {
+ if (!gp.edge_pos.IsAttached()) {
+ gp.edge_pos.Attach();
+ }
+ gp.edge_pos.clear();
+ visualization::position_filler::FillPos(gp_, gp_.genome.GetSequence(), "0");
+ visualization::position_filler::FillPos(gp_, !gp_.genome.GetSequence(), "1");
+ RefillPos();
+ }
+ PathScore CountMisassemblies(const BidirectionalPath &path) const;
+ vector<pair<EdgeId, MappingRange> > ConstructEdgeOrder() const;
+
+//spells the genome in the language of the long unique edges from storage
+ void SpellGenome();
+
+
+};
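+//A minimal usage sketch (variable names and parameter values are illustrative,
+//not taken from the actual pipeline):
+//  GenomeConsistenceChecker checker(gp, unique_storage, /*max_gap*/ 1000, /*relative_max_gap*/ 0.2);
+//  checker.SpellGenome();
+//  PathScore score = checker.CountMisassemblies(path);
+//where gp is a filled conj_graph_pack, unique_storage a ScaffoldingUniqueEdgeStorage
+//and path a BidirectionalPath; score then holds the misassembly and gap-size counts
+//and the mapped length for that path.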
+
+
+}
diff --git a/src/common/modules/graph_construction.hpp b/src/common/modules/graph_construction.hpp
new file mode 100644
index 0000000..c862956
--- /dev/null
+++ b/src/common/modules/graph_construction.hpp
@@ -0,0 +1,180 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * graph_construction.hpp
+ *
+ * Created on: Aug 12, 2011
+ * Author: sergey
+ */
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+
+#include "io/reads/io_helper.hpp"
+#include "assembly_graph/core/graph.hpp"
+
+#include "utils/debruijn_graph/debruijn_graph_constructor.hpp"
+#include "utils/debruijn_graph/early_simplification.hpp"
+
+#include "utils/perfcounter.hpp"
+#include "io/dataset_support/read_converter.hpp"
+
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/graph_support/coverage_filling.hpp"
+#include "utils/indices/storing_traits.hpp"
+#include "utils/indices/edge_index_builders.hpp"
+#include "utils/openmp_wrapper.h"
+
+namespace debruijn_graph {
+
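+//Dispatches coverage accumulation according to the k-mer storing policy:
+//with SimpleStoring the count is added to the edge alone, with InvertableStoring
+//it is added to both the edge and its conjugate (a single stored entry
+//apparently represents both strands).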
+template<class StoringType>
+struct CoverageCollector {
+};
+
+template<>
+struct CoverageCollector<SimpleStoring> {
+ template<class Info>
+ static void CollectCoverage(Info edge_info) {
+ edge_info.edge_id->IncCoverage(edge_info.count);
+ }
+};
+
+template<>
+struct CoverageCollector<InvertableStoring> {
+ template<class Info>
+ static void CollectCoverage(Info edge_info) {
+ edge_info.edge_id->IncCoverage(edge_info.count);
+ edge_info.edge_id->conjugate()->IncCoverage(edge_info.count);
+ }
+};
+
+
+template<class Index>
+void FillCoverageFromIndex(const Index &index) {
+ for (auto I = index.value_cbegin(), E = index.value_cend();
+ I != E; ++I) {
+ const auto& edge_info = *I;
+ VERIFY(edge_info.offset != -1u);
+// VERIFY(edge_info.edge_id.get() != NULL);
+ if(edge_info.offset != -1u) {
+ CoverageCollector<typename Index::storing_type>::CollectCoverage(edge_info);
+ }
+ }
+ DEBUG("Coverage counted");
+}
+
+template<class Graph, class Readers, class Index>
+size_t ConstructGraphUsingOldIndex(Readers& streams, Graph& g,
+ Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ INFO("Constructing DeBruijn graph");
+
+ TRACE("Filling indices");
+ size_t rl = 0;
+ VERIFY_MSG(streams.size(), "No input streams specified");
+
+ TRACE("... in parallel");
+ typedef typename Index::InnerIndexT InnerIndex;
+ typedef typename EdgeIndexHelper<InnerIndex>::CoverageFillingEdgeIndexBuilderT IndexBuilder;
+ InnerIndex& debruijn = index.inner_index();
+ //fixme hack
+ rl = IndexBuilder().BuildIndexFromStream(debruijn, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream));
+
+ VERIFY(g.k() + 1 == debruijn.k());
+ // FIXME: output_dir here is damn ugly!
+
+ TRACE("Filled indices");
+
+ INFO("Condensing graph");
+ DeBruijnGraphConstructor<Graph, InnerIndex> g_c(g, debruijn);
+ TRACE("Constructor ok");
+ VERIFY(!index.IsAttached());
+ index.Attach();
+ g_c.ConstructGraph(100, 10000, 1.2); // TODO: move magic constants to config
+ INFO("Graph condensed");
+
+ return rl;
+}
+
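+//If early tip clipping is enabled, clips tips shorter than (read length - k),
+//or shorter than the explicitly configured length bound when one is set.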
+template<class ExtensionIndex>
+void EarlyClipTips(size_t k, const config::debruijn_config::construction& params, size_t rl, ExtensionIndex& ext) {
+ if (params.early_tc.enable) {
+ size_t length_bound = rl - k;
+ if (params.early_tc.length_bound)
+ length_bound = params.early_tc.length_bound.get();
+ AlternativeEarlyTipClipper(ext, length_bound).ClipTips();
+ }
+}
+
+#include "utils/indices/kmer_extension_index_builder.hpp"
+
+template<class Graph, class Read, class Index>
+ReadStatistics ConstructGraphUsingExtentionIndex(const config::debruijn_config::construction params,
+ io::ReadStreamList<Read>& streams, Graph& g,
+ Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ size_t k = g.k();
+ INFO("Constructing DeBruijn graph for k=" << k);
+
+ TRACE("Filling indices");
+ VERIFY_MSG(streams.size(), "No input streams specified");
+
+ TRACE("... in parallel");
+ // FIXME: output_dir here is damn ugly!
+ typedef DeBruijnExtensionIndex<> ExtensionIndex;
+ typedef typename ExtensionIndexHelper<ExtensionIndex>::DeBruijnExtensionIndexBuilderT ExtensionIndexBuilder;
+ ExtensionIndex ext((unsigned) k, index.inner_index().workdir());
+
+ //fixme hack
+ ReadStatistics stats = ExtensionIndexBuilder().BuildExtensionIndexFromStream(ext, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream), params.read_buffer_size);
+
+ EarlyClipTips(k, params, stats.max_read_length_, ext);
+
+ INFO("Condensing graph");
+ VERIFY(!index.IsAttached());
+ DeBruijnGraphExtentionConstructor<Graph> g_c(g, ext);
+ g_c.ConstructGraph(100, 10000, 1.2, params.keep_perfect_loops);//TODO move these parameters to config
+
+ INFO("Building index with from graph")
+ //todo pass buffer size
+ index.Refill();
+ index.Attach();
+
+ return stats;
+}
+
+template<class Graph, class Index, class Streams>
+ReadStatistics ConstructGraph(const config::debruijn_config::construction &params,
+ Streams& streams, Graph& g,
+ Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ if (params.con_mode == config::construction_mode::extention) {
+ return ConstructGraphUsingExtentionIndex(params, streams, g, index, contigs_stream);
+// } else if(params.con_mode == construction_mode::con_old){
+// return ConstructGraphUsingOldIndex(k, streams, g, index, contigs_stream);
+ } else {
+ INFO("Invalid construction mode")
+ VERIFY(false);
+ return {0,0,0};
+ }
+}
+
+template<class Graph, class Index, class Streams>
+ReadStatistics ConstructGraphWithCoverage(const config::debruijn_config::construction &params,
+ Streams& streams, Graph& g,
+ Index& index, FlankingCoverage<Graph>& flanking_cov,
+ io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ ReadStatistics rs = ConstructGraph(params, streams, g, index, contigs_stream);
+
+ typedef typename Index::InnerIndex InnerIndex;
+ typedef typename EdgeIndexHelper<InnerIndex>::CoverageAndGraphPositionFillingIndexBuilderT IndexBuilder;
+ INFO("Filling coverage index")
+ IndexBuilder().ParallelFillCoverage(index.inner_index(), streams);
+ INFO("Filling coverage and flanking coverage from index");
+ FillCoverageAndFlanking(index.inner_index(), g, flanking_cov);
+ return rs;
+}
+
+}
diff --git a/src/common/modules/graph_read_correction.hpp b/src/common/modules/graph_read_correction.hpp
new file mode 100644
index 0000000..892cfb8
--- /dev/null
+++ b/src/common/modules/graph_read_correction.hpp
@@ -0,0 +1,187 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/paths/path_utils.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "assembly_graph/paths/path_finders.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "io/reads/modifying_reader_wrapper.hpp"
+#include "assembly_graph/core/order_and_law.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
+
+namespace debruijn_graph {
+
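+//Projects dead-end tips onto their unique alternative paths: when the tip has
+//exactly one alternative edge with a unique extension, the overlapping part of
+//the tip sequence is remapped onto the alternative sequence via the k-mer mapper.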
+template<class gp_t>
+class TipsProjector {
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+
+ gp_t& gp_;
+
+ const omnigraph::UniquePathFinder<Graph> unique_path_finder_;
+
+ optional<EdgeId> UniqueAlternativeEdge(EdgeId tip, bool outgoing_tip) {
+ vector<EdgeId> edges;
+ if (outgoing_tip) {
+ push_back_all(edges, gp_.g.OutgoingEdges(gp_.g.EdgeStart(tip)));
+ } else {
+ push_back_all(edges, gp_.g.IncomingEdges(gp_.g.EdgeEnd(tip)));
+ }
+ restricted::set<EdgeId> edges_set(edges.begin(), edges.end());
+ edges_set.erase(tip);
+ if (edges_set.size() == 1)
+ return optional < EdgeId > (*edges_set.begin());
+ else
+ return boost::none;
+ }
+
+ vector<EdgeId> UniqueAlternativePath(EdgeId tip, bool outgoing_tip) {
+ optional<EdgeId> alt_edge = UniqueAlternativeEdge(tip, outgoing_tip);
+ if (alt_edge) {
+ if (outgoing_tip) {
+ return unique_path_finder_.UniquePathForward(*alt_edge);
+ } else {
+ return unique_path_finder_.UniquePathBackward(*alt_edge);
+ }
+ }
+ return vector<EdgeId>();
+ }
+
+ void AlignAndProject(const Sequence& tip_seq, const Sequence& alt_seq,
+ bool outgoing_tip) {
+ //todo refactor
+ Sequence aligned_tip = tip_seq;
+ Sequence aligned_alt = alt_seq;
+ if (outgoing_tip) {
+ if (tip_seq.size() >= alt_seq.size()) {
+ aligned_tip = tip_seq.Subseq(0, alt_seq.size());
+ } else {
+ aligned_alt = alt_seq.Subseq(0, tip_seq.size());
+ }
+ } else {
+ if (tip_seq.size() >= alt_seq.size()) {
+ aligned_tip = tip_seq.Subseq(tip_seq.size() - alt_seq.size());
+ } else {
+ aligned_alt = alt_seq.Subseq(alt_seq.size() - tip_seq.size());
+ }
+ }
+
+ INFO(
+ "Remapping " << aligned_tip.size()
+ << " kmers of aligned_tip to aligned_alt");
+ gp_.kmer_mapper.RemapKmers(aligned_tip, aligned_alt);
+ }
+
+public:
+ TipsProjector(gp_t& gp) :
+ gp_(gp), unique_path_finder_(gp.g) {
+
+ }
+
+ void ProjectTip(EdgeId tip) {
+ TRACE("Trying to project tip " << gp_.g.str(tip));
+ bool outgoing_tip = gp_.g.IsDeadEnd(gp_.g.EdgeEnd(tip));
+ Sequence tip_seq = gp_.g.EdgeNucls(tip);
+ vector<EdgeId> alt_path = UniqueAlternativePath(tip, outgoing_tip);
+ if (alt_path.empty()) {
+ TRACE(
+ "Failed to find unique alt path for tip " << gp_.g.str(tip)
+ << ". Wasn't projected!!!");
+ } else {
+ Sequence alt_seq = MergeSequences(gp_.g, alt_path);
+ if (tip_seq.size() > alt_seq.size()) {
+ TRACE(
+ "Can't fully project tip " << gp_.g.str(tip)
+ << " with seq length " << tip_seq.size()
+ << " because alt path length is "
+ << alt_seq.size()
+ << ". Trying to project partially");
+ }
+ AlignAndProject(tip_seq, alt_seq, outgoing_tip);
+ AlignAndProject(!tip_seq, !alt_seq, !outgoing_tip);
+ TRACE("Tip projected");
+ }
+ }
+private:
+ DECL_LOGGER("TipsProjector")
+ ;
+};
+
+//todo improve logging
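+//Corrects a read by mapping it onto the graph: if the whole read is mapped and
+//the mapping path can be fixed into a contiguous path, the read is replaced by
+//that path's nucleotide sequence; otherwise the read is returned unchanged.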
+template<class Graph, class Mapper>
+class GraphReadCorrector: public io::SequenceModifier {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& graph_;
+ const Mapper mapper_;
+ const MappingPathFixer<Graph> path_fixer_;
+
+public:
+ /*virtual*/
+ Sequence Modify(const Sequence& s) {
+// if(s < !s)
+// return !Refine(!s);
+ omnigraph::MappingPath<EdgeId> mapping_path = mapper_.MapSequence(s);
+
+ if (mapping_path.size() == 0 || s.size() < graph_.k() + 1
+ || mapping_path.front().second.initial_range.start_pos != 0
+ || mapping_path.back().second.initial_range.end_pos
+ != s.size() - graph_.k()) {
+ //todo reduce concat unmapped beginning and end in future???
+ TRACE(
+ "Won't fix because wasn't mapped or start/end fell on unprojected tip/erroneous connection");
+// TRACE(
+// "For sequence of length " << s.size()
+// << " returning empty sequence");
+ return s;
+// return Sequence();
+ }
+
+ Path<EdgeId> path = path_fixer_.TryFixPath(mapping_path.path());
+// TRACE("Mapped sequence to path " << graph_.str(path.sequence()));
+
+ if (!path_fixer_.CheckContiguous(path.sequence())) {
+ TRACE("Even fixed path wasn't contiguous");
+ return s;
+ } else {
+ TRACE("Fixed path is contiguous");
+ Sequence answer = PathSequence(graph_, path);
+// if (answer != s) {
+// if (answer.size() < 1000) {
+// TRACE(
+// "Initial sequence modified, edit distance= "
+// << EditDistance(answer, s));
+// } else {
+// TRACE("Sequence too large, won't count edit distance");
+// }
+// }
+ return answer;
+ }
+
+// else {
+// TRACE("Initial sequence unmodified!");
+// }
+ }
+
+ GraphReadCorrector(const Graph& graph, const Mapper& mapper) :
+ graph_(graph), mapper_(mapper), path_fixer_(graph) {
+ }
+
+private:
+ DECL_LOGGER("ContigRefiner");
+};
+
+template<class Graph, class Mapper>
+shared_ptr<GraphReadCorrector<Graph, Mapper>> GraphReadCorrectorInstance(
+ const Graph& graph, const Mapper& mapper) {
+ return std::make_shared<GraphReadCorrector<Graph, Mapper>>(graph, mapper);
+}
+
+}
diff --git a/src/common/modules/mismatch_shall_not_pass.hpp b/src/common/modules/mismatch_shall_not_pass.hpp
new file mode 100644
index 0000000..085e412
--- /dev/null
+++ b/src/common/modules/mismatch_shall_not_pass.hpp
@@ -0,0 +1,333 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "modules/simplification/compressor.hpp"
+#include "assembly_graph/handlers/id_track_handler.hpp"
+#include "utils/logger/logger.hpp"
+
+#include "io/reads/read_stream_vector.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
+
+#include "pipeline/config_struct.hpp"
+
+namespace debruijn_graph {
+
+namespace mismatches {
+struct NuclCount {
+ size_t counts_[4];
+
+ NuclCount() {
+ memset(counts_, 0, sizeof(counts_));
+ }
+
+ size_t &operator[](size_t nucl) {
+ return counts_[nucl];
+ }
+
+ NuclCount &operator+=(const NuclCount &other) {
+ counts_[0] += other.counts_[0];
+ counts_[1] += other.counts_[1];
+ counts_[2] += other.counts_[2];
+ counts_[3] += other.counts_[3];
+ return *this;
+ }
+};
+
+struct MismatchEdgeInfo {
+ NuclCount operator[](size_t i) const {
+ auto it = info_.find(i);
+ if (it == info_.end())
+ return NuclCount();
+ else
+ return it->second;
+ }
+
+ void operator+=(const MismatchEdgeInfo &other) {
+ for (auto it = other.info_.begin(); it != other.info_.end(); ++it) {
+ info_[it->first] += it->second;
+ }
+ }
+
+ void IncIfContains(size_t position, size_t nucl) {
+ auto it = info_.find(position);
+ if (it != info_.end()) {
+ it->second[nucl]++;
+ }
+ }
+
+ void AddPosition(size_t position) {
+ info_[position]; //creates an entry with a default value if the map does not contain this key yet
+ }
+
+public:
+ map<size_t, NuclCount> info_;
+};
+
+template<typename EdgeId>
+class MismatchStatistics {
+private:
+ typedef typename map<EdgeId, MismatchEdgeInfo>::const_iterator const_iterator;
+ map<EdgeId, MismatchEdgeInfo> statistics_;
+
+ template<class graph_pack>
+ void CollectPotentialMismatches(const graph_pack &gp) {
+ auto &kmer_mapper = gp.kmer_mapper;
+ for (auto it = kmer_mapper.begin(); it != kmer_mapper.end(); ++it) {
+ // Kmer mapper iterator dereferences to pair (KMer, KMer), not to the reference!
+ const auto mentry = *it;
+ const RtSeq &from = mentry.first;
+ const RtSeq &to = mentry.second;
+ size_t cnt = 0;
+ size_t cnt_arr[4];
+ for (size_t i = 0; i < 4; i++)
+ cnt_arr[i] = 0;
+ for (size_t i = 0; i < from.size(); i++) {
+ if (from[i] != to[i]) {
+ cnt++;
+ cnt_arr[(i * 4) / from.size()]++;
+ }
+ }
+ //last two conditions - to avoid excessive indels
+ //if two thirds of the nucleotides in the first/last quarter are mismatches, the mapping is considered erroneous
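+ //e.g. for a sequence of length 60 this allows at most 60/3 = 20 mismatches overall
+ //and at most 60/6 = 10 of them within the first or the last quarter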
+
+ if (cnt >= 1 && cnt <= from.size() / 3 && cnt_arr[0] <= from.size() / 6 &&
+ cnt_arr[3] <= from.size() / 6) {
+ for (size_t i = 0; i < from.size(); i++) {
+ if (from[i] != to[i] && gp.index.contains(to)) {
+ pair<EdgeId, size_t> position = gp.index.get(to);
+ statistics_[position.first].AddPosition(position.second + i);
+ }
+ }
+ }
+ }
+ }
+
+ void operator+=(const MismatchStatistics<EdgeId> &other) {
+ for (auto it = other.statistics_.begin(); it != other.statistics_.end(); ++it) {
+ statistics_[it->first] += it->second;
+ }
+ }
+
+public:
+ template<class graph_pack>
+ MismatchStatistics(const graph_pack &gp) {
+ CollectPotentialMismatches(gp);
+ }
+
+ const_iterator begin() const {
+ return statistics_.begin();
+ }
+
+ const_iterator end() const {
+ return statistics_.end();
+ }
+
+ const_iterator find(const EdgeId &edge) const {
+ return statistics_.find(edge);
+ }
+
+ template<class graph_pack, class read_type>
+ void Count(io::ReadStream<read_type> &stream, const graph_pack &gp) {
+ stream.reset();
+ DEBUG("count started");
+ auto sm = MapperInstance(gp);
+ DEBUG("seq mapper created");
+ while (!stream.eof()) {
+ read_type read;
+ stream >> read;
+ const Sequence &s_read = read.sequence();
+ omnigraph::MappingPath<EdgeId> path = sm->MapSequence(s_read);
+ TRACE("read mapped");
+ if (path.size() == 1 && path[0].second.initial_range.size() == path[0].second.mapped_range.size()) {
+ Range initial_range = path[0].second.initial_range;
+ Range mapped_range = path[0].second.mapped_range;
+ const Sequence &s_edge = gp.g.EdgeNucls(path[0].first);
+ size_t len = initial_range.size() + gp.g.k();
+ size_t cnt = 0;
+ for (size_t i = 0; i < len; i++) {
+ if (s_read[initial_range.start_pos + i] != s_edge[mapped_range.start_pos + i]) {
+ cnt++;
+ }
+ }
+ if (cnt <= gp.g.k() / 3) {
+ TRACE("statistics changing");
+ auto it = statistics_.find(path[0].first);
+ if (it == statistics_.end()) {
+ // if (gp.g.length(path[0].first) < 4000)
+ // WARN ("id "<< gp.g.length(path[0].first)<<" " << len);
+ continue;
+ }
+ for (size_t i = 0; i < len; i++) {
+ size_t nucl_code = s_read[initial_range.start_pos + i];
+ it->second.IncIfContains(mapped_range.start_pos + i, nucl_code);
+ }
+ }
+ }
+ }
+ }
+
+ template<class graph_pack, class read_type>
+ void ParallelCount(io::ReadStreamList<read_type> &streams, const graph_pack &gp) {
+ size_t nthreads = streams.size();
+ std::vector<MismatchStatistics<EdgeId> *> statistics(nthreads);
+#pragma omp parallel for num_threads(nthreads) shared(streams, statistics)
+ for (size_t i = 0; i < nthreads; ++i) {
+ statistics[i] = new MismatchStatistics<EdgeId>(*this);
+ DEBUG("statistics created thread " << i);
+ statistics[i]->Count(streams[i], gp);
+ DEBUG("count finished thread " << i);
+ }
+
+ INFO("Finished collecting potential mismatches positions");
+ for (size_t i = 0; i < statistics.size(); i++) {
+ *this += *statistics[i];
+ delete statistics[i];
+ }
+ }
+};
+}
+
+template<class graph_pack, class read_type>
+class MismatchShallNotPass {
+private:
+ typedef typename graph_pack::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ graph_pack &gp_;
+ double relative_threshold_;
+
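+ //Corrects a single mismatching nucleotide: the edge is split so that the
+ //mismatch ends up inside a short fragment, a parallel edge carrying the
+ //corrected sequence is added and glued over the erroneous fragment;
+ //returns the edge from which correction should continue.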
+ EdgeId CorrectNucl(EdgeId edge, size_t position, char nucl) {
+ VERIFY(position >= gp_.g.k());
+ if (position + 1 < gp_.g.length(edge)) {
+ edge = gp_.g.SplitEdge(edge, position + 1).first;
+ }
+ EdgeId mismatch = edge;
+ if (position > gp_.g.k()) {
+ auto tmp = gp_.g.SplitEdge(edge, position - gp_.g.k());
+ edge = tmp.first;
+ mismatch = tmp.second;
+ }
+ Sequence s_mm = gp_.g.EdgeNucls(mismatch);
+ Sequence correct = s_mm.Subseq(0, gp_.g.k()) + Sequence(string(1, nucl)) +
+ s_mm.Subseq(gp_.g.k() + 1, gp_.g.k() * 2 + 1);
+
+ VERIFY(nucl != s_mm[gp_.g.k()]);
+ EdgeId correct_edge = gp_.g.AddEdge(gp_.g.EdgeStart(mismatch), gp_.g.EdgeEnd(mismatch), correct);
+ EdgeId glued = gp_.g.GlueEdges(mismatch, correct_edge);
+ return position > gp_.g.k() ? edge : glued;
+ }
+
+ EdgeId CorrectNucls(EdgeId edge, const std::vector<pair<size_t, char>> &mismatches) {
+ for (auto it = mismatches.rbegin(); it != mismatches.rend(); ++it) {
+ edge = CorrectNucl(edge, it->first, it->second);
+ }
+ EdgeId tmp = Compressor<Graph>(gp_.g).CompressVertexEdgeId(gp_.g.EdgeEnd(edge));
+ if (tmp == EdgeId(0))
+ return edge;
+ else
+ return tmp;
+ }
+
+ vector<pair<size_t, char>> FindMismatches(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
+ vector<pair<size_t, char>> to_correct;
+ const Sequence &s_edge = gp_.g.EdgeNucls(edge);
+ for (size_t i = gp_.g.k(); i < gp_.g.length(edge); i++) {
+ size_t cur_best = 0;
+ mismatches::NuclCount nc = statistics[i];
+ for (size_t j = 1; j < 4; j++) {
+ if (nc[j] > nc[cur_best]) {
+ cur_best = j;
+ }
+ }
+ size_t nucl_code = s_edge[i];
+ if ((double) nc[cur_best] > relative_threshold_ * (double) nc[nucl_code] + 1.) {
+ to_correct.push_back(make_pair(i, cur_best));
+ i += gp_.g.k();
+ }
+
+ }
+ return to_correct;
+ }
+
+ size_t CorrectEdge(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
+ vector<pair<size_t, char>> to_correct = FindMismatches(edge, statistics);
+ EdgeId new_edge = CorrectNucls(edge, to_correct);
+ if (new_edge == EdgeId(0))
+ new_edge = edge;
+
+ return to_correct.size();
+ }
+
+ size_t CorrectAllEdges(const mismatches::MismatchStatistics<typename Graph::EdgeId> &statistics) {
+ size_t res = 0;
+ set<EdgeId> conjugate_fix;
+ for (auto it = gp_.g.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (conjugate_fix.find(gp_.g.conjugate(*it)) == conjugate_fix.end()) {
+ conjugate_fix.insert(*it);
+ }
+ }
+ for (auto it = conjugate_fix.begin(); it != conjugate_fix.end(); ++it) {
+ DEBUG("processing edge" << gp_.g.int_id(*it));
+
+ if (statistics.find(*it) != statistics.end()) {
+ if (!gp_.g.RelatedVertices(gp_.g.EdgeStart(*it), gp_.g.EdgeEnd(*it)))
+ res += CorrectEdge(*it, statistics.find(*it)->second);
+ }
+ }
+ INFO("All edges processed");
+ return res;
+ }
+
+ size_t StopMismatchIteration(io::ReadStream<read_type> &stream) {
+ mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
+ statistics.Count(stream, gp_);
+ return CorrectAllEdges(statistics);
+ }
+
+ size_t ParallelStopMismatchIteration(io::ReadStreamList<read_type> &streams) {
+ mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
+ statistics.ParallelCount(streams, gp_);
+ return CorrectAllEdges(statistics);
+ }
+
+public:
+ MismatchShallNotPass(graph_pack &gp, double relative_threshold = 1.5) :
+ gp_(gp),
+ relative_threshold_(relative_threshold) {
+ VERIFY(relative_threshold >= 1);
+ }
+
+
+ size_t StopAllMismatches(io::ReadStream<read_type> &stream, size_t max_iterations = 1) {
+ size_t res = 0;
+ while (max_iterations > 0) {
+ size_t last = StopMismatchIteration(stream);
+ res += last;
+ if (last == 0)
+ break;
+ max_iterations--;
+ }
+ return res;
+ }
+
+ size_t ParallelStopAllMismatches(io::ReadStreamList<read_type> &streams, size_t max_iterations = 1) {
+ size_t res = 0;
+ while (max_iterations > 0) {
+ size_t last = ParallelStopMismatchIteration(streams);
+ res += last;
+ if (last == 0)
+ break;
+ max_iterations--;
+ }
+ return res;
+ }
+};
+
+}
diff --git a/src/common/modules/path_extend/CMakeLists.txt b/src/common/modules/path_extend/CMakeLists.txt
new file mode 100644
index 0000000..62d21fc
--- /dev/null
+++ b/src/common/modules/path_extend/CMakeLists.txt
@@ -0,0 +1,23 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(path_extend CXX)
+
+add_library(path_extend STATIC pe_config_struct.cpp
+ pipeline/launch_support.cpp
+ pipeline/launcher.cpp
+ pipeline/extenders_logic.cpp
+ scaffolder2015/extension_chooser2015.cpp
+ scaffolder2015/scaffold_graph.cpp
+ scaffolder2015/scaffold_graph_constructor.cpp
+ scaffolder2015/scaffold_graph_visualizer.cpp
+ scaffolder2015/connection_condition2015.cpp
+ scaffolder2015/path_polisher.cpp)
+
+target_link_libraries(path_extend assembly_graph ssw)
+
+
diff --git a/src/common/modules/path_extend/extension_chooser.hpp b/src/common/modules/path_extend/extension_chooser.hpp
new file mode 100644
index 0000000..cfd1e98
--- /dev/null
+++ b/src/common/modules/path_extend/extension_chooser.hpp
@@ -0,0 +1,1162 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * extension.hpp
+ *
+ * Created on: Mar 5, 2012
+ * Author: andrey
+ */
+
+#ifndef EXTENSION_HPP_
+#define EXTENSION_HPP_
+
+#include <cfloat>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include "weight_counter.hpp"
+#include "pe_utils.hpp"
+
+//#include "scaff_supplementary.hpp"
+
+namespace path_extend {
+
+typedef std::multimap<double, EdgeWithDistance> AlternativeContainer;
+
+class PathAnalyzer {
+protected:
+ const Graph& g_;
+
+public:
+ PathAnalyzer(const Graph& g): g_(g) {
+ }
+
+ void RemoveTrivial(const BidirectionalPath& path, std::set<size_t>& to_exclude, bool exclude_bulges = true) const {
+ if (exclude_bulges) {
+ ExcludeTrivialWithBulges(path, to_exclude);
+ } else {
+ ExcludeTrivial(path, to_exclude);
+ }
+ }
+
+protected:
+ virtual int ExcludeTrivial(const BidirectionalPath& path, std::set<size_t>& edges, int from = -1) const {
+ int edgeIndex = (from == -1) ? (int) path.Size() - 1 : from;
+ if ((int) path.Size() <= from) {
+ return edgeIndex;
+ }
+ VertexId currentVertex = g_.EdgeEnd(path[edgeIndex]);
+ while (edgeIndex >= 0 && g_.CheckUniqueIncomingEdge(currentVertex)) {
+ EdgeId e = g_.GetUniqueIncomingEdge(currentVertex);
+ currentVertex = g_.EdgeStart(e);
+
+ edges.insert((size_t) edgeIndex);
+ --edgeIndex;
+ }
+ return edgeIndex;
+ }
+
+ virtual int ExcludeTrivialWithBulges(const BidirectionalPath& path, std::set<size_t>& edges) const {
+
+ if (path.Empty()) {
+ return 0;
+ }
+
+ int lastEdge = (int) path.Size() - 1;
+ do {
+ lastEdge = ExcludeTrivial(path, edges, lastEdge);
+ bool bulge = true;
+
+ if (lastEdge >= 0) {
+ VertexId v = g_.EdgeEnd(path[lastEdge]);
+ VertexId u = g_.EdgeStart(path[lastEdge]);
+ auto bulgeCandidates = g_.IncomingEdges(v);
+
+ for (const auto& candidate: bulgeCandidates) {
+ if (g_.EdgeStart(candidate) != u) {
+ bulge = false;
+ break;
+ }
+ }
+
+ if (!bulge) {
+ break;
+ }
+ --lastEdge;
+ }
+ } while (lastEdge >= 0);
+
+ return lastEdge;
+ }
+
+protected:
+ DECL_LOGGER("PathAnalyzer")
+};
+
+
+class PreserveSimplePathsAnalyzer: public PathAnalyzer {
+
+public:
+ PreserveSimplePathsAnalyzer(const Graph &g) : PathAnalyzer(g) {
+ }
+
+ int ExcludeTrivial(const BidirectionalPath& path, std::set<size_t>& edges, int from = -1) const override {
+ int edgeIndex = PathAnalyzer::ExcludeTrivial(path, edges, from);
+
+ //Preserving simple path
+ if (edgeIndex == -1) {
+ edges.clear();
+ return (from == -1) ? (int) path.Size() - 1 : from;
+ }
+ return edgeIndex;
+ }
+
+ int ExcludeTrivialWithBulges(const BidirectionalPath& path, std::set<size_t>& edges) const override {
+
+ if (path.Empty()) {
+ return 0;
+ }
+
+ int lastEdge = (int) path.Size() - 1;
+ bool has_bulge = false;
+ do {
+ lastEdge = PathAnalyzer::ExcludeTrivial(path, edges, lastEdge);
+
+ if (lastEdge >= 0) {
+ VertexId v = g_.EdgeEnd(path[lastEdge]);
+ VertexId u = g_.EdgeStart(path[lastEdge]);
+ auto bulgeCandidates = g_.IncomingEdges(v);
+ has_bulge = true;
+
+ for (auto iter = bulgeCandidates.begin(); iter != bulgeCandidates.end(); ++iter) {
+ if (g_.EdgeStart(*iter) != u) {
+ has_bulge = false;
+ break;
+ }
+ }
+
+ --lastEdge;
+ }
+ } while (lastEdge >= 0);
+
+ //Preserving simple path
+ if (!has_bulge && lastEdge == -1) {
+ edges.clear();
+ lastEdge = (int) path.Size() - 1;
+ }
+
+ return lastEdge;
+ }
+
+protected:
+ DECL_LOGGER("PathAnalyzer")
+
+};
+
+
+class ExtensionChooserListener {
+
+public:
+
+ virtual void ExtensionChosen(double weight) = 0;
+
+ virtual void ExtensionChosen(const AlternativeContainer& alts) = 0;
+
+ virtual ~ExtensionChooserListener() {
+
+ }
+};
+
+
+class ExtensionChooser {
+
+public:
+ typedef std::vector<EdgeWithDistance> EdgeContainer;
+
+protected:
+ const Graph& g_;
+ shared_ptr<WeightCounter> wc_;
+ //FIXME memory leak?!
+ std::vector<ExtensionChooserListener *> listeners_;
+
+ double weight_threshold_;
+
+public:
+ ExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc = nullptr, double weight_threshold = -1.):
+ g_(g), wc_(wc),
+ weight_threshold_(weight_threshold) {
+ }
+
+ virtual ~ExtensionChooser() {
+
+ }
+
+ virtual EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const = 0;
+
+ bool CheckThreshold(double weight) const {
+ return math::ge(weight, weight_threshold_);
+ }
+
+ void Subscribe(ExtensionChooserListener * listener) {
+ listeners_.push_back(listener);
+ }
+
+ void NotifyAll(double weight) const {
+ for (auto listener_ptr : listeners_) {
+ listener_ptr->ExtensionChosen(weight);
+ }
+ }
+
+ void NotifyAll(const AlternativeContainer& alts) const {
+ for (auto listener_ptr : listeners_) {
+ listener_ptr->ExtensionChosen(alts);
+ }
+ }
+
+ bool WeightCounterBased() const {
+ return wc_ != nullptr;
+ }
+
+ const WeightCounter& wc() const {
+ VERIFY(wc_);
+ return *wc_;
+ }
+
+protected:
+ bool HasIdealInfo(EdgeId e1, EdgeId e2, size_t dist) const {
+ return math::gr(wc_->lib().IdealPairedInfo(e1, e2, (int) dist), 0.);
+ }
+
+ bool HasIdealInfo(const BidirectionalPath& p, EdgeId e, size_t gap) const {
+ for (int i = (int) p.Size() - 1; i >= 0; --i)
+ if (HasIdealInfo(p[i], e, gap + p.LengthAt(i)))
+ return true;
+ return false;
+ }
+
+private:
+ DECL_LOGGER("ExtensionChooser");
+};
+
+
+class JointExtensionChooser: public ExtensionChooser {
+ shared_ptr<ExtensionChooser> first_;
+ shared_ptr<ExtensionChooser> second_;
+
+public:
+ JointExtensionChooser(const Graph& g,
+ shared_ptr<ExtensionChooser> first,
+ shared_ptr<ExtensionChooser> second): ExtensionChooser(g),
+ first_(first), second_(second) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
+ EdgeContainer answer;
+ auto r1 = first_->Filter(path, edges);
+ auto r2 = second_->Filter(path, edges);
+ for (auto ewd1 : r1) {
+ for (auto ewd2 : r2) {
+ if (ewd1.e_ == ewd2.e_) {
+ VERIFY(ewd1.d_ == ewd2.d_);
+ answer.push_back(ewd1);
+ }
+ }
+ }
+ return answer;
+ }
+};
+
+
+class TrivialExtensionChooser: public ExtensionChooser {
+
+public:
+ TrivialExtensionChooser(Graph& g): ExtensionChooser(g) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& /*path*/, const EdgeContainer& edges) const override {
+ if (edges.size() == 1) {
+ return edges;
+ }
+ return EdgeContainer();
+ }
+};
+
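+//Scores every candidate extension with the paired-info weight counter while
+//ignoring the path positions selected by ExcludeEdges(); candidates whose weight
+//is within a factor of prior_coeff_ of the best one are kept, and only if the
+//best weight passes weight_threshold_.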
+class ExcludingExtensionChooser: public ExtensionChooser {
+ PathAnalyzer analyzer_;
+ double prior_coeff_;
+
+ AlternativeContainer FindWeights(const BidirectionalPath& path, const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
+ AlternativeContainer weights;
+ for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
+ double weight = wc_->CountWeight(path, iter->e_, to_exclude);
+ weights.insert(std::make_pair(weight, *iter));
+ DEBUG("Candidate " << g_.int_id(iter->e_) << " weight " << weight << " length " << g_.length(iter->e_));
+ }
+ NotifyAll(weights);
+ return weights;
+ }
+
+ EdgeContainer FindPossibleEdges(const AlternativeContainer& weights,
+ double max_weight) const {
+ EdgeContainer top;
+ auto possible_edge = weights.lower_bound(max_weight / prior_coeff_);
+ for (auto iter = possible_edge; iter != weights.end(); ++iter) {
+ top.push_back(iter->second);
+ }
+ return top;
+ }
+
+ EdgeContainer FindFilteredEdges(const BidirectionalPath& path,
+ const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
+ AlternativeContainer weights = FindWeights(path, edges, to_exclude);
+ VERIFY(!weights.empty());
+ auto max_weight = (--weights.end())->first;
+ EdgeContainer top = FindPossibleEdges(weights, max_weight);
+ EdgeContainer result;
+ if (CheckThreshold(max_weight)) {
+ result = top;
+ }
+ return result;
+ }
+
+protected:
+
+ virtual void ExcludeEdges(const BidirectionalPath& path,
+ const EdgeContainer& /*edges*/,
+ std::set<size_t>& to_exclude) const {
+ analyzer_.RemoveTrivial(path, to_exclude);
+ }
+
+
+public:
+ ExcludingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, PathAnalyzer analyzer, double weight_threshold, double priority) :
+ ExtensionChooser(g, wc, weight_threshold), analyzer_(analyzer), prior_coeff_(priority) {
+
+ }
+
+ virtual EdgeContainer Filter(const BidirectionalPath& path,
+ const EdgeContainer& edges) const {
+ DEBUG("Paired-end extension chooser");
+ if (edges.empty()) {
+ return edges;
+ }
+ std::set<size_t> to_exclude;
+ path.Print();
+ EdgeContainer result = edges;
+ ExcludeEdges(path, result, to_exclude);
+ result = FindFilteredEdges(path, result, to_exclude);
+ if (result.size() == 1) {
+ DEBUG("Paired-end extension chooser helped");
+ }
+ return result;
+ }
+
+private:
+ DECL_LOGGER("ExcludingExtensionChooser");
+
+};
+
+class SimpleExtensionChooser: public ExcludingExtensionChooser {
+protected:
+ void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const override {
+ ExcludingExtensionChooser::ExcludeEdges(path, edges, to_exclude);
+
+ if (edges.size() < 2) {
+ return;
+ }
+ //excluding based on absence of ideal info
+ int index = (int) path.Size() - 1;
+ while (index >= 0) {
+ if (to_exclude.count(index)) {
+ index--;
+ continue;
+ }
+ EdgeId path_edge = path[index];
+
+ for (size_t i = 0; i < edges.size(); ++i) {
+ if (!HasIdealInfo(path_edge,
+ edges.at(i).e_,
+ path.LengthAt(index))) {
+ to_exclude.insert((size_t) index);
+ }
+ }
+
+ index--;
+ }
+
+ //excluding based on presence of ambiguous paired info
+ map<size_t, unsigned> edge_2_extension_cnt;
+ for (size_t i = 0; i < edges.size(); ++i) {
+ for (size_t e : wc_->PairInfoExist(path, edges.at(i).e_)) {
+ edge_2_extension_cnt[e] += 1;
+ }
+ }
+
+ for (auto e_w_ec : edge_2_extension_cnt) {
+ if (e_w_ec.second == edges.size()) {
+ to_exclude.insert(e_w_ec.first);
+ }
+ }
+ }
+
+public:
+
+ SimpleExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
+ ExcludingExtensionChooser(g, wc, PathAnalyzer(g), weight_threshold, priority) {
+ }
+
+private:
+ DECL_LOGGER("SimpleExtensionChooser");
+};
+
+//TODO this class should become unnecessary once the excluding conditions are properly configurable
+class IdealBasedExtensionChooser : public ExcludingExtensionChooser {
+protected:
+ void ExcludeEdges(const BidirectionalPath &path, const EdgeContainer &edges,
+ std::set<size_t> &to_exclude) const override {
+ //commented for a reason
+ //ExcludingExtensionChooser::ExcludeEdges(path, edges, to_exclude);
+ //if (edges.size() < 2) {
+ // return;
+ //}
+ VERIFY(to_exclude.empty());
+ //excluding based on absence of ideal info
+ for (int index = (int) path.Size() - 1; index >= 0; index--) {
+ EdgeId path_edge = path[index];
+
+ for (size_t i = 0; i < edges.size(); ++i) {
+ if (!HasIdealInfo(path_edge,
+ edges.at(i).e_,
+ path.LengthAt(index))) {
+ to_exclude.insert(size_t(index));
+ }
+ }
+ }
+ }
+
+public:
+
+ IdealBasedExtensionChooser(const Graph &g,
+ shared_ptr<WeightCounter> wc,
+ double weight_threshold,
+ double priority) :
+ ExcludingExtensionChooser(g, wc, PathAnalyzer(g), weight_threshold, priority) {
+ }
+
+private:
+ DECL_LOGGER("IdealBasedExtensionChooser");
+};
+
+class RNAExtensionChooser: public ExcludingExtensionChooser {
+protected:
+ void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const override {
+ ExcludingExtensionChooser::ExcludeEdges(path, edges, to_exclude);
+ if (edges.size() < 2) {
+ return;
+ }
+ size_t i = path.Size() - 1;
+ PathAnalyzer analyzer(g_);
+ while (i > 0) {
+ if (g_.IncomingEdgeCount(g_.EdgeStart(path[i])) > 1)
+ break;
+ to_exclude.insert(i);
+ --i;
+ }
+
+ if (i == 0)
+ to_exclude.clear();
+ }
+
+public:
+
+ RNAExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
+ ExcludingExtensionChooser(g, wc, PreserveSimplePathsAnalyzer(g), weight_threshold, priority) {
+ }
+
+private:
+ DECL_LOGGER("SimpleExtensionChooser");
+};
+
+class LongEdgeExtensionChooser: public ExcludingExtensionChooser {
+protected:
+ virtual void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const {
+ ExcludingExtensionChooser::ExcludeEdges(path, edges, to_exclude);
+ if (edges.size() < 2) {
+ return;
+ }
+ int index = (int) path.Size() - 1;
+ while (index >= 0) {
+ if (to_exclude.count(index)) {
+ index--;
+ continue;
+ }
+ EdgeId path_edge = path[index];
+ //FIXME configure!
+ if (path.graph().length(path_edge) < 200)
+ to_exclude.insert((size_t) index);
+ index--;
+ }
+ }
+public:
+ LongEdgeExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
+ ExcludingExtensionChooser(g, wc, PathAnalyzer(g), weight_threshold, priority) {
+ }
+};
+
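+//Scaffolding chooser: collects tip edges reachable by read pairs jumping over
+//the path end, estimates the gap to each candidate as the weighted mean of the
+//observed distances minus the distance to the path end, and keeps candidates
+//whose clustered weight reaches cl_weight_threshold_ and whose estimated gap is
+//supported by ideal pair info.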
+class ScaffoldingExtensionChooser : public ExtensionChooser {
+
+protected:
+ typedef ExtensionChooser base;
+ double raw_weight_threshold_;
+ double cl_weight_threshold_;
+ const double is_scatter_coeff_ = 3.0;
+
+ void AddInfoFromEdge(const std::vector<int>& distances, const std::vector<double>& weights,
+ std::vector<pair<int, double>>& histogram, size_t len_to_path_end) const {
+ for (size_t l = 0; l < distances.size(); ++l) {
+ //todo commented out condition seems unnecessary and should be library dependent! do we need "max(0" there?
+ if (/*distances[l] > max(0, (int) len_to_path_end - int(1000)) && */math::ge(weights[l], raw_weight_threshold_)) {
+ histogram.push_back(make_pair(distances[l] - (int) len_to_path_end, weights[l]));
+ }
+ }
+ }
+
+ int CountMean(const vector<pair<int, double> >& histogram) const {
+ double dist = 0.0;
+ double sum = 0.0;
+ for (size_t i = 0; i < histogram.size(); ++i) {
+ dist += histogram[i].first * histogram[i].second;
+ sum += histogram[i].second;
+ }
+ dist /= sum;
+ return (int) round(dist);
+ }
+
+ void GetDistances(EdgeId e1, EdgeId e2, std::vector<int>& dist,
+ std::vector<double>& w) const {
+ wc_->lib().CountDistances(e1, e2, dist, w);
+ }
+
+ void CountAvrgDists(const BidirectionalPath& path, EdgeId e, std::vector<pair<int, double>> & histogram) const {
+ for (size_t j = 0; j < path.Size(); ++j) {
+ std::vector<int> distances;
+ std::vector<double> weights;
+ GetDistances(path.At(j), e, distances, weights);
+ if (distances.size() > 0) {
+ AddInfoFromEdge(distances, weights, histogram, path.LengthAt(j));
+ }
+ }
+ }
+
+ void FindBestFittedEdgesForClustered(const BidirectionalPath& path, const set<EdgeId>& edges, EdgeContainer& result) const {
+ for (EdgeId e : edges) {
+ std::vector<pair<int, double>> histogram;
+ CountAvrgDists(path, e, histogram);
+ double sum = 0.0;
+ for (size_t j = 0; j < histogram.size(); ++j) {
+ sum += histogram[j].second;
+ }
+ DEBUG("Weight for scaffolding = " << sum << ", threshold = " << cl_weight_threshold_)
+ if (math::ls(sum, cl_weight_threshold_)) {
+ continue;
+ }
+
+ int gap = CountMean(histogram);
+ if (HasIdealInfo(path, e, gap)) {
+ DEBUG("scaffolding " << g_.int_id(e) << " gap " << gap);
+ result.push_back(EdgeWithDistance(e, gap));
+ }
+ }
+ }
+
+ bool IsTip(EdgeId e) const {
+ return g_.IncomingEdgeCount(g_.EdgeStart(e)) == 0;
+ }
+
+ set<EdgeId> FindCandidates(const BidirectionalPath& path) const {
+ set<EdgeId> jumping_edges;
+ const auto& lib = wc_->lib();
+ //todo lib (and FindJumpEdges) knows its var so it can be counted there
+ int is_scatter = int(math::round(lib.GetIsVar() * is_scatter_coeff_));
+ for (int i = (int) path.Size() - 1; i >= 0 && path.LengthAt(i) - g_.length(path.At(i)) <= lib.GetISMax(); --i) {
+ set<EdgeId> jump_edges_i;
+ lib.FindJumpEdges(path.At(i), jump_edges_i,
+ std::max(0, (int)path.LengthAt(i) - is_scatter),
+ //FIXME do we need is_scatter here?
+ int((path.LengthAt(i) + lib.GetISMax() + is_scatter)),
+ 0);
+ for (EdgeId e : jump_edges_i) {
+ if (IsTip(e)) {
+ jumping_edges.insert(e);
+ }
+ }
+ }
+ return jumping_edges;
+ }
+
+public:
+
+
+ ScaffoldingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc,
+ double cl_weight_threshold,
+ double is_scatter_coeff) :
+ ExtensionChooser(g, wc), raw_weight_threshold_(0.0),
+ cl_weight_threshold_(cl_weight_threshold),
+ is_scatter_coeff_(is_scatter_coeff) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
+ if (edges.empty()) {
+ return edges;
+ }
+ set<EdgeId> candidates = FindCandidates(path);
+ EdgeContainer result;
+ FindBestFittedEdgesForClustered(path, candidates, result);
+ return result;
+ }
+
+private:
+ DECL_LOGGER("ScaffoldingExtensionChooser");
+};
+
+inline bool EdgeWithWeightCompareReverse(const pair<EdgeId, double>& p1,
+ const pair<EdgeId, double>& p2) {
+ return p1.second > p2.second;
+}
+
+class LongReadsUniqueEdgeAnalyzer {
+private:
+ DECL_LOGGER("LongReadsUniqueEdgeAnalyzer")
+public:
+ LongReadsUniqueEdgeAnalyzer(const Graph& g, const GraphCoverageMap& cov_map,
+ double filter_threshold, double prior_threshold,
+ size_t max_repeat_length, bool uneven_depth)
+ : g_(g),
+ cov_map_(cov_map),
+ filter_threshold_(filter_threshold),
+ prior_threshold_(prior_threshold),
+ max_repeat_length_(max_repeat_length),
+ uneven_depth_(uneven_depth) {
+
+ FindAllUniqueEdges();
+ }
+
+ bool IsUnique(EdgeId e) const {
+ return unique_edges_.count(e) > 0;
+ }
+
+private:
+ bool UniqueEdge(EdgeId e) const {
+ if (g_.length(e) > max_repeat_length_)
+ return true;
+ DEBUG("Analyze unique edge " << g_.int_id(e));
+ if (cov_map_.size() == 0) {
+ return false;
+ }
+ auto cov_paths = cov_map_.GetCoveringPaths(e);
+ for (auto it1 = cov_paths.begin(); it1 != cov_paths.end(); ++it1) {
+ auto pos1 = (*it1)->FindAll(e);
+ if (pos1.size() > 1) {
+ DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
+ return false;
+ }
+ for (auto it2 = it1; it2 != cov_paths.end(); it2++) {
+ auto pos2 = (*it2)->FindAll(e);
+ if (pos2.size() > 1) {
+ DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
+ return false;
+ }
+ if (!ConsistentPath(**it1, pos1[0], **it2, pos2[0])) {
+ DEBUG("Checking inconsistency");
+ if (CheckInconsistence(**it1, pos1[0], **it2, pos2[0],
+ cov_paths)) {
+ DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
+ return false;
+ }
+ }
+ }
+ }
+ DEBUG("***edge " << g_.int_id(e) << " is unique.***");
+ return true;
+ }
+
+ bool ConsistentPath(const BidirectionalPath& path1, size_t pos1,
+ const BidirectionalPath& path2, size_t pos2) const {
+ return EqualBegins(path1, pos1, path2, pos2, false)
+ && EqualEnds(path1, pos1, path2, pos2, false);
+ }
+ bool SignificantlyDiffWeights(double w1, double w2) const {
+ if (w1 > filter_threshold_ and w2 > filter_threshold_) {
+ if (w1 > w2 * prior_threshold_ or w2 > w1 * prior_threshold_) {
+ return true;
+ }
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckInconsistence(
+ const BidirectionalPath& path1, size_t pos1,
+ const BidirectionalPath& path2, size_t pos2,
+ const BidirectionalPathSet& cov_paths) const {
+ size_t first_diff_pos1 = FirstNotEqualPosition(path1, pos1, path2, pos2, false);
+ size_t first_diff_pos2 = FirstNotEqualPosition(path2, pos2, path1, pos1, false);
+ if (first_diff_pos1 != -1UL && first_diff_pos2 != -1UL) {
+ const BidirectionalPath cand1 = path1.SubPath(first_diff_pos1,
+ pos1 + 1);
+ const BidirectionalPath cand2 = path2.SubPath(first_diff_pos2,
+ pos2 + 1);
+ std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
+ cov_paths);
+ DEBUG("Not equal begin " << g_.int_id(path1.At(first_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(first_diff_pos2)) << " weight " << weights.second);
+ if (!SignificantlyDiffWeights(weights.first, weights.second)) {
+ DEBUG("not significantly different");
+ return true;
+ }
+ }
+ size_t last_diff_pos1 = LastNotEqualPosition(path1, pos1, path2, pos2, false);
+ size_t last_diff_pos2 = LastNotEqualPosition(path2, pos2, path1, pos1, false);
+ if (last_diff_pos1 != -1UL) {
+ const BidirectionalPath cand1 = path1.SubPath(pos1,
+ last_diff_pos1 + 1);
+ const BidirectionalPath cand2 = path2.SubPath(pos2,
+ last_diff_pos2 + 1);
+ std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
+ cov_paths);
+ DEBUG("Not equal end " << g_.int_id(path1.At(last_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(last_diff_pos2)) << " weight " << weights.second);
+ if (!SignificantlyDiffWeights(weights.first, weights.second)) {
+ DEBUG("not significantly different");
+ return true;
+ }
+ }
+ return false;
+ }
+
+ std::pair<double, double> GetSubPathsWeights(
+ const BidirectionalPath& cand1, const BidirectionalPath& cand2,
+ const BidirectionalPathSet& cov_paths) const {
+ double weight1 = 0.0;
+ double weight2 = 0.0;
+ for (auto iter = cov_paths.begin(); iter != cov_paths.end(); ++iter) {
+ BidirectionalPath* path = *iter;
+ if (ContainSubPath(*path, cand1)) {
+ weight1 += path->GetWeight();
+ } else if (ContainSubPath(*path, cand2)) {
+ weight2 += path->GetWeight();
+ }
+ }
+ return std::make_pair(weight1, weight2);
+ }
+
+ bool ContainSubPath(const BidirectionalPath& path,
+ const BidirectionalPath& subpath) const {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (path.CompareFrom(i, subpath))
+ return true;
+ }
+ return false;
+ }
+
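+ //Coverage-based uniqueness: if edges of at least max_repeat_length_ make up a
+ //quarter or more of the total graph length, their length-weighted mean coverage
+ //is computed and every edge longer than 500 bp with coverage below 1.2x of that
+ //mean is additionally marked as unique.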
+ void FindAllUniqueCoverageEdges() {
+ VERIFY(!uneven_depth_);
+ double sum_cov = 0;
+ size_t sum_len = 0;
+ size_t total_len = 0;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ total_len += g_.length(*iter);
+ if (g_.length(*iter) >= max_repeat_length_) {
+ sum_cov += g_.coverage(*iter) * (double)g_.length(*iter);
+ sum_len += g_.length(*iter);
+ }
+ }
+ if (sum_len * 4 < total_len) return;
+ sum_cov /= (double)sum_len;
+ DEBUG("average coverage of long edges: " << sum_cov) ;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.length(*iter) > 500 && (double)g_.coverage(*iter) < 1.2 * sum_cov) {
+ if (unique_edges_.find(*iter) == unique_edges_.end()) {
+ unique_edges_.insert(*iter);
+ unique_edges_.insert(g_.conjugate(*iter));
+ DEBUG("Added coverage based unique edge " << g_.int_id(*iter) << " len "<< g_.length(*iter) << " " << g_.coverage(*iter));
+ }
+ }
+ }
+ }
+
+
+ void FindAllUniqueEdges() {
+ DEBUG("Looking for unique edges");
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (UniqueEdge(*iter)) {
+ unique_edges_.insert(*iter);
+ unique_edges_.insert(g_.conjugate(*iter));
+ }
+ }
+ DEBUG("coverage based uniqueness started");
+ if (!uneven_depth_)
+ FindAllUniqueCoverageEdges();
+ DEBUG("Unique edges are found");
+ }
+
+ const Graph& g_;
+ const GraphCoverageMap& cov_map_;
+ double filter_threshold_;
+ double prior_threshold_;
+ std::set<EdgeId> unique_edges_;
+ size_t max_repeat_length_;
+ bool uneven_depth_;
+};
+
+class SimpleScaffolding {
+public:
+ SimpleScaffolding(const Graph& g) : g_(g) {}
+
+ BidirectionalPath FindMaxCommonPath(const vector<BidirectionalPath*>& paths,
+ size_t max_diff_len) const {
+ BidirectionalPath max_end(g_);
+ for (auto it1 = paths.begin(); it1 != paths.end(); ++it1) {
+ BidirectionalPath* p1 = *it1;
+ for (size_t i = 0; i < p1->Size(); ++i) {
+ if (p1->Length() - p1->LengthAt(i) > max_diff_len) {
+ break;
+ }
+ bool contain_all = true;
+ for (size_t i1 = i + 1; i1 <= p1->Size() && contain_all; ++i1) {
+ BidirectionalPath subpath = p1->SubPath(i, i1);
+ for (auto it2 = paths.begin(); it2 != paths.end() && contain_all; ++it2) {
+ BidirectionalPath* p2 = *it2;
+ vector<size_t> positions2 = p2->FindAll(subpath.At(0));
+ bool contain = false;
+ for (size_t ipos2 = 0; ipos2 < positions2.size(); ++ipos2) {
+ size_t pos2 = positions2[ipos2];
+ if (p2->Length() - p2->LengthAt(pos2) <= max_diff_len
+ && EqualEnds(subpath, 0, *p2, pos2, false)) {
+ contain = true;
+ break;
+ }
+ }
+ if (!contain) {
+ contain_all = false;
+ }
+ }
+ if (contain_all && (i1 - i) >= max_end.Size()) {
+ max_end.Clear();
+ max_end.PushBack(subpath);
+ }
+ }
+ }
+ }
+ return max_end;
+ }
+
+private:
+ const Graph& g_;
+};
+
+class LongReadsExtensionChooser : public ExtensionChooser {
+public:
+ LongReadsExtensionChooser(const Graph& g,
+ const GraphCoverageMap& read_paths_cov_map,
+ double filtering_threshold,
+ double weight_priority_threshold,
+ double unique_edge_priority_threshold,
+ size_t min_significant_overlap,
+ size_t max_repeat_length,
+ bool uneven_depth)
+ : ExtensionChooser(g),
+ filtering_threshold_(filtering_threshold),
+ weight_priority_threshold_(weight_priority_threshold),
+ min_significant_overlap_(min_significant_overlap),
+ cov_map_(read_paths_cov_map),
+ unique_edge_analyzer_(g, cov_map_, filtering_threshold,
+ unique_edge_priority_threshold,
+ max_repeat_length, uneven_depth),
+ simple_scaffolding_(g)
+ {
+ }
+
+ /* Chooses an extension as correct only if there are reads that traverse a unique edge of the path and this extension.
+ * An edge is unique if all reads mapped to it are consistent.
+ * Two reads are consistent if they can form a single path in the graph.
+ */
+ EdgeContainer Filter(const BidirectionalPath& path,
+ const EdgeContainer& edges) const override {
+ if (edges.empty()) {
+ return edges;
+ }DEBUG("We in Filter of LongReadsExtensionChooser");
+ path.Print();
+ map<EdgeId, double> weights_cands;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ weights_cands.insert(make_pair(it->e_, 0.0));
+ }
+ set<EdgeId> filtered_cands;
+ map<EdgeId, BidirectionalPathSet > support_paths_ends;
+ auto support_paths = cov_map_.GetCoveringPaths(path.Back());
+ DEBUG("Found " << support_paths.size() << " covering paths!!!");
+ for (auto it = support_paths.begin(); it != support_paths.end(); ++it) {
+ auto positions = (*it)->FindAll(path.Back());
+ for (size_t i = 0; i < positions.size(); ++i) {
+ if ((int) positions[i] < (int) (*it)->Size() - 1
+ && EqualBegins(path, (int) path.Size() - 1, **it,
+ positions[i], false)) {
+ DEBUG("Checking unique path_back for " << (*it)->GetId());
+
+ if (UniqueBackPath(**it, positions[i])) {
+ DEBUG("Success");
+
+ EdgeId next = (*it)->At(positions[i] + 1);
+ weights_cands[next] += (*it)->GetWeight();
+ filtered_cands.insert(next);
+ if (support_paths_ends.count(next) == 0){
+ support_paths_ends[next] = BidirectionalPathSet();
+ }
+ support_paths_ends[next].insert(new BidirectionalPath((*it)->SubPath(positions[i] + 1)));
+ }
+ }
+ }
+ }
+ DEBUG("Candidates");
+ for (auto iter = weights_cands.begin(); iter != weights_cands.end(); ++iter) {
+ DEBUG("Candidate " << g_.int_id(iter->first) << " weight " << iter->second);
+ }
+ vector<pair<EdgeId, double> > sort_res = MapToSortVector(weights_cands);
+ DEBUG("sort res " << sort_res.size() << " tr " << weight_priority_threshold_);
+ if (sort_res.size() < 1 || sort_res[0].second < filtering_threshold_) {
+ filtered_cands.clear();
+ } else if (sort_res.size() > 1
+ && sort_res[0].second > weight_priority_threshold_ * sort_res[1].second) {
+ filtered_cands.clear();
+ filtered_cands.insert(sort_res[0].first);
+ } else if (sort_res.size() > 1) {
+ for (size_t i = 0; i < sort_res.size(); ++i) {
+ if (sort_res[i].second * weight_priority_threshold_ < sort_res[0].second) {
+ filtered_cands.erase(sort_res[i].first);
+ }
+ }
+ }
+ EdgeContainer result;
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if (filtered_cands.find(it->e_) != filtered_cands.end()) {
+ result.push_back(*it);
+ }
+ }
+ if (result.size() != 1) {
+ DEBUG("Long reads doesn't help =(");
+ }
+ return result;
+ }
+
+private:
+
+ bool UniqueBackPath(const BidirectionalPath& path, size_t pos) const {
+ int int_pos = (int) pos;
+ while (int_pos >= 0) {
+ if (unique_edge_analyzer_.IsUnique(path.At(int_pos)) && g_.length(path.At(int_pos)) >= min_significant_overlap_)
+ return true;
+ int_pos--;
+ }
+ return false;
+ }
+
+ vector<pair<EdgeId, double> > MapToSortVector(const map<EdgeId, double>& map) const {
+ vector<pair<EdgeId, double> > result(map.begin(), map.end());
+ std::sort(result.begin(), result.end(), EdgeWithWeightCompareReverse);
+ return result;
+ }
+
+ double filtering_threshold_;
+ double weight_priority_threshold_;
+ size_t min_significant_overlap_;
+ const GraphCoverageMap& cov_map_;
+ LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_;
+ SimpleScaffolding simple_scaffolding_;
+
+ DECL_LOGGER("LongReadsExtensionChooser");
+};
+
+
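+//Coverage-coordinated chooser for short repeats: for each candidate extension it
+//explores the component of short, coverage-compatible edges behind it and accepts
+//the extension only if it is the single good candidate and the predicted coverage
+//ahead does not exceed path_coverage / delta_.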
+class CoordinatedCoverageExtensionChooser: public ExtensionChooser {
+public:
+ CoordinatedCoverageExtensionChooser(const Graph& g,
+ CoverageAwareIdealInfoProvider& coverage_provider,
+ size_t max_edge_length_in_repeat, double delta, size_t min_path_len) :
+ ExtensionChooser(g), provider_(coverage_provider),
+ max_edge_length_in_repeat_(max_edge_length_in_repeat), delta_(delta), min_path_len_(min_path_len) {
+ }
+
+ EdgeContainer Filter(const BidirectionalPath& path,
+ const EdgeContainer& edges) const override {
+
+ if (edges.size() < 2) {
+ DEBUG("If unique candidate has not been accepted by previous choosers better not to touch it");
+ return EdgeContainer();
+ }
+
+ if (path.Length() < min_path_len_) {
+ DEBUG("Path is too short");
+ return EdgeContainer();
+ }
+
+ double path_coverage = provider_.EstimatePathCoverage(path);
+ if (math::eq(path_coverage, -1.0) || math::le(path_coverage, 10.0)) {
+ DEBUG("Path coverage can't be calculated of too low");
+ return EdgeContainer();
+ }
+ DEBUG("Path coverage is " << path_coverage);
+
+ for (auto e_d : edges) {
+ if (path.Contains(g_.EdgeEnd(e_d.e_))) {
+ DEBUG("Avoid to create loops");
+ return EdgeContainer();
+ }
+ }
+ return FindExtensionThroughRepeat(edges, path_coverage);
+ }
+
+private:
+
+ void UpdateCanBeProcessed(VertexId v,
+ std::queue<VertexId>& can_be_processed, double path_coverage) const {
+ DEBUG("Updating can be processed");
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VertexId neighbour_v = g_.EdgeEnd(e);
+ if (g_.length(e) <= max_edge_length_in_repeat_ && CompatibleEdge(e, path_coverage)) {
+ DEBUG("Adding vertex " << neighbour_v.int_id()
+ << "through edge " << g_.str(e));
+ can_be_processed.push(neighbour_v);
+ }
+ }
+ }
+
+ GraphComponent<Graph> GetRepeatComponent(const VertexId start, double path_coverage) const {
+ set<VertexId> vertices_of_component;
+ vertices_of_component.insert(start);
+ std::queue<VertexId> can_be_processed;
+ UpdateCanBeProcessed(start, can_be_processed, path_coverage);
+ while (!can_be_processed.empty()) {
+ VertexId v = can_be_processed.front();
+ can_be_processed.pop();
+ if (vertices_of_component.count(v) != 0) {
+ DEBUG("Component is too complex");
+ return GraphComponent<Graph>::Empty(g_);
+ }
+ DEBUG("Adding vertex " << g_.str(v) << " to component set");
+ vertices_of_component.insert(v);
+ UpdateCanBeProcessed(v, can_be_processed, path_coverage);
+ }
+
+ return GraphComponent<Graph>::FromVertices(g_, vertices_of_component);
+ }
+
+ EdgeContainer FinalFilter(const EdgeContainer& edges,
+ EdgeId edge_to_extend) const {
+ EdgeContainer result;
+ for (auto e_with_d : edges) {
+ if (e_with_d.e_ == edge_to_extend) {
+ result.push_back(e_with_d);
+ }
+ }
+ return result;
+ }
+
+ bool CompatibleEdge(EdgeId e, double path_coverage) const {
+ return math::ge(g_.coverage(e), path_coverage * delta_);
+ }
+
+ //Returns the lowest coverage among long compatible edges ahead of the extension edge.
+ //Returns std::numeric_limits<double>::max() if no such edges were detected,
+ //and a negative value if the extension should be aborted at once.
+ double AnalyzeExtension(EdgeId ext, double path_coverage) const {
+ double answer = std::numeric_limits<double>::max();
+
+ if (!CompatibleEdge(ext, path_coverage)) {
+ DEBUG("Extension coverage is too low");
+ return answer;
+ }
+
+ if (g_.length(ext) > max_edge_length_in_repeat_) {
+ DEBUG("Long extension");
+ return g_.coverage(ext);
+ }
+
+ DEBUG("Short extension, launching repeat component analysis");
+ GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(ext), path_coverage);
+ if (gc.v_size() == 0) {
+ DEBUG("Component search failed");
+ return -1.;
+ }
+
+ for (auto e : gc.edges()) {
+ if (g_.length(e) > max_edge_length_in_repeat_) {
+ DEBUG("Repeat component contains long edges");
+ return -1.;
+ }
+ }
+
+ DEBUG("Checking long sinks");
+ for (auto v : gc.exits()) {
+ for (auto e : g_.OutgoingEdges(v)) {
+ if (g_.length(e) > max_edge_length_in_repeat_ &&
+ CompatibleEdge(e, path_coverage) &&
+ math::ls(g_.coverage(e), answer)) {
+ DEBUG("Updating answer to coverage of edge " << g_.str(e));
+ answer = g_.coverage(e);
+ }
+ }
+ }
+
+ return answer;
+ }
+
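+ //Analyzes every candidate: aborts if any candidate looks troublesome, ignores candidates with no informative coverage ahead,
+ //and extends only if exactly one good candidate remains and its ahead coverage does not exceed path_coverage / delta_.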
+ EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
+ static EdgeContainer EMPTY_CONTAINER;
+
+ map<EdgeId, double> good_extension_to_ahead_cov;
+
+ for (auto edge : edges) {
+ DEBUG("Processing candidate extension " << g_.str(edge.e_));
+ double analysis_res = AnalyzeExtension(edge.e_, path_coverage);
+
+ if (analysis_res == std::numeric_limits<double>::max()) {
+ DEBUG("Ignoring extension");
+ } else if (math::ls(analysis_res, 0.)) {
+ DEBUG("Troubles detected, abort mission");
+ return EMPTY_CONTAINER;
+ } else {
+ good_extension_to_ahead_cov[edge.e_] = analysis_res;
+ DEBUG("Extension mapped to ahead coverage of " << analysis_res);
+ }
+ }
+
+ DEBUG("Number of good extensions is " << good_extension_to_ahead_cov.size());
+
+ if (good_extension_to_ahead_cov.size() == 1) {
+ auto extension_info = *good_extension_to_ahead_cov.begin();
+ DEBUG("Single extension candidate " << g_.str(extension_info.first));
+ if (math::le(extension_info.second, path_coverage / delta_)) {
+ DEBUG("Extending");
+ return FinalFilter(edges, extension_info.first);
+ } else {
+ DEBUG("Predicted ahead coverage is too high");
+ }
+ } else {
+ DEBUG("Multiple extension candidates");
+ }
+
+ return EMPTY_CONTAINER;
+ }
+
+ CoverageAwareIdealInfoProvider provider_;
+ const size_t max_edge_length_in_repeat_;
+ const double delta_;
+ const size_t min_path_len_;
+ DECL_LOGGER("CoordCoverageExtensionChooser");
+};
+
+}
+#endif /* EXTENSION_HPP_ */
diff --git a/src/modules/algorithms/path_extend/ideal_pair_info.hpp b/src/common/modules/path_extend/ideal_pair_info.hpp
similarity index 100%
rename from src/modules/algorithms/path_extend/ideal_pair_info.hpp
rename to src/common/modules/path_extend/ideal_pair_info.hpp
diff --git a/src/common/modules/path_extend/loop_traverser.hpp b/src/common/modules/path_extend/loop_traverser.hpp
new file mode 100644
index 0000000..40e451c
--- /dev/null
+++ b/src/common/modules/path_extend/loop_traverser.hpp
@@ -0,0 +1,228 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * loop_traverser.hpp
+ *
+ * Created on: Jan 28, 2013
+ * Author: ira
+ */
+
+#ifndef LOOP_TRAVERSER_H_
+#define LOOP_TRAVERSER_H_
+
+#include "path_extender.hpp"
+#include "pe_resolver.hpp"
+#include "path_visualizer.hpp"
+
+namespace path_extend {
+
+class LoopTraverser {
+
+ const Graph& g_;
+ GraphCoverageMap& covMap_;
+ size_t long_edge_limit_;
+ size_t component_size_limit_;
+ size_t shortest_path_limit_;
+ static const size_t DIJKSTRA_LIMIT = 3000;
+private:
+ bool AnyTipsInComponent(const GraphComponent<Graph>& component) const{
+ for(auto e : component.edges()) {
+ if (g_.IncomingEdgeCount(g_.EdgeStart(e)) == 0 || g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0)
+ return true;
+ }
+ return false;
+ }
+
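+ //Returns the single edge entering the component from outside; returns EdgeId() if there is none or more than one.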
+ EdgeId FindStart(const set<VertexId>& component_set) const{
+ EdgeId result;
+ for (auto it = component_set.begin(); it != component_set.end(); ++it) {
+ for (auto eit = g_.in_begin(*it); eit != g_.in_end(*it); ++eit) {
+ if (component_set.count(g_.EdgeStart(*eit)) == 0) {
+ if (result != EdgeId()) {
+ return EdgeId();
+ }
+ result = *eit;
+ }
+ }
+ }
+ return result;
+ }
+
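+ //Symmetric to FindStart: the single edge leaving the component, or EdgeId() otherwise.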
+ EdgeId FindFinish(const set<VertexId>& component_set) {
+ EdgeId result;
+ for (auto it = component_set.begin(); it != component_set.end(); ++it) {
+ for (auto I = g_.out_begin(*it), E = g_.out_end(*it);
+ I != E; ++I) {
+ if (component_set.count(g_.EdgeEnd(*I)) == 0) {
+ if (result != EdgeId()) {
+ return EdgeId();
+ }
+ result = *I;
+ }
+ }
+ }
+ return result;
+ }
+
+
+ bool IsEndInsideComponent(const BidirectionalPath &path,
+ const set <VertexId> &component_set) {
+ if (component_set.count(g_.EdgeStart(path.Front())) == 0) {
+ return false;
+ }
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (component_set.count(g_.EdgeEnd(path.At(i))) == 0)
+ return false;
+ }
+ return true;
+ }
+
+
+ bool IsEndInsideComponent(const BidirectionalPath &path, EdgeId component_entrance,
+ const set <VertexId> &component_set,
+ bool conjugate = false) {
+ int i = path.FindLast(component_entrance);
+ VERIFY_MSG(i != -1, "Component edge is not found in the path")
+
+ if ((size_t) i == path.Size() - 1) {
+ if (conjugate)
+ return component_set.count(g_.conjugate(g_.EdgeEnd(path.Back()))) > 0;
+ else
+ return component_set.count(g_.EdgeEnd(path.Back())) > 0;
+ }
+
+ if (conjugate)
+ return IsEndInsideComponent(path.SubPath((size_t) i + 1).Conjugate(), component_set);
+ else
+ return IsEndInsideComponent(path.SubPath((size_t) i + 1), component_set);
+ }
+
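+ //Tries to close the component by joining the single path covering 'start' with the single path covering 'end';
+ //the gap between them is estimated with a bounded Dijkstra search whose closing path must stay inside the component.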
+ bool TraverseLoop(EdgeId start, EdgeId end, const set<VertexId>& component_set) {
+ DEBUG("start " << g_.int_id(start) << " end " << g_.int_id(end));
+ BidirectionalPathSet coveredStartPaths =
+ covMap_.GetCoveringPaths(start);
+ BidirectionalPathSet coveredEndPaths =
+ covMap_.GetCoveringPaths(end);
+
+ for (auto it_path = coveredStartPaths.begin();
+ it_path != coveredStartPaths.end(); ++it_path) {
+ if ((*it_path)->FindAll(end).size() > 0) {
+ return false;
+ }
+ }
+ if (coveredStartPaths.size() < 1 or coveredEndPaths.size() < 1) {
+ DEBUG("TraverseLoop STRANGE SITUATION: start " << coveredStartPaths.size() << " end " << coveredEndPaths.size());
+ return false;
+ }
+
+ if (coveredStartPaths.size() > 1 or coveredEndPaths.size() > 1) {
+ DEBUG("Ambiguous situation in path joining, quitting");
+ return false;
+ }
+
+ BidirectionalPath* startPath = *coveredStartPaths.begin();
+ BidirectionalPath* endPath = *coveredEndPaths.begin();
+ if ((*startPath) == endPath->Conjugate()){
+ return false;
+ }
+
+ //Checking that paths ends are within component
+ if (!IsEndInsideComponent(*startPath, start, component_set) ||
+ !IsEndInsideComponent(*endPath->GetConjPath(), g_.conjugate(end), component_set, true)) {
+ DEBUG("Some path goes outside of the component")
+ return false;
+ }
+
+ size_t commonSize = startPath->CommonEndSize(*endPath);
+ size_t nLen = 0;
+ DEBUG("Str " << startPath->Size() << ", end" << endPath->Size());
+ if (commonSize == 0 && !startPath->Empty() && !endPath->Empty()) {
+ DEBUG("Estimating gap size");
+ VertexId lastVertex = g_.EdgeEnd(startPath->Back());
+ VertexId firstVertex = g_.EdgeStart(endPath->Front());
+
+ if (firstVertex == lastVertex) {
+ nLen = 0;
+ } else {
+ DijkstraHelper<Graph>::BoundedDijkstra dijkstra(DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, shortest_path_limit_,
+ DIJKSTRA_LIMIT));
+ dijkstra.Run(lastVertex);
+ vector<EdgeId> shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart(endPath->Front()));
+
+ if (shortest_path.empty()) {
+ DEBUG("Failed to find closing path");
+ return false;
+ } else if (!IsEndInsideComponent(BidirectionalPath(g_, shortest_path), component_set)) {
+ DEBUG("Closing path is outside the component");
+ return false;
+ } else {
+ nLen = CumulativeLength(g_, shortest_path);
+ }
+ }
+ }
+ if (commonSize < endPath->Size()){
+ startPath->PushBack(endPath->At(commonSize), (int) nLen);
+ }
+ for (size_t i = commonSize + 1; i < endPath->Size(); ++i) {
+ startPath->PushBack(endPath->At(i), endPath->GapAt(i), endPath->TrashPreviousAt(i), endPath->TrashCurrentAt(i));
+ }
+ DEBUG("travers");
+ startPath->Print();
+ endPath->Print();
+ DEBUG("conj");
+ endPath->GetConjPath()->Print();
+ endPath->Clear();
+ return true;
+ }
+
+ bool ContainsLongEdges(const GraphComponent<Graph>& component) const {
+ for(auto e : component.edges()) {
+ if(g_.length(e) > long_edge_limit_) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+public:
+ LoopTraverser(const Graph& g, GraphCoverageMap& coverageMap, size_t long_edge_limit, size_t component_size_limit, size_t shortest_path_limit) :
+ g_(g), covMap_(coverageMap), long_edge_limit_(long_edge_limit), component_size_limit_(component_size_limit), shortest_path_limit_(shortest_path_limit) {
+ }
+
+ size_t TraverseAllLoops() {
+ DEBUG("TraverseAllLoops");
+ size_t traversed = 0;
+ shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, long_edge_limit_);
+ while (splitter->HasNext()) {
+ GraphComponent<Graph> component = splitter->Next();
+ if (component.v_size() > component_size_limit_)
+ continue;
+ if (ContainsLongEdges(component))
+ continue;
+ if (AnyTipsInComponent(component))
+ continue;
+
+ set<VertexId> component_set(component.v_begin(), component.v_end());
+ EdgeId start = FindStart(component_set);
+ EdgeId finish = FindFinish(component_set);
+ if (start == EdgeId() || finish == EdgeId()) {
+ continue;
+ }
+ if (TraverseLoop(start, finish, component_set))
+ ++traversed;
+ }
+ return traversed;
+ }
+
+protected:
+ DECL_LOGGER("LoopTraverser");
+};
+
+}
+
+#endif /* LOOP_TRAVERSER_H_ */
diff --git a/src/common/modules/path_extend/overlap_analysis.hpp b/src/common/modules/path_extend/overlap_analysis.hpp
new file mode 100644
index 0000000..3c3178f
--- /dev/null
+++ b/src/common/modules/path_extend/overlap_analysis.hpp
@@ -0,0 +1,123 @@
+#pragma once
+
+#include "utils/logger/logger.hpp"
+#include "utils/range.hpp"
+#include "ssw/ssw_cpp.h"
+
+namespace debruijn_graph {
+using omnigraph::Range;
+
+struct OverlapInfo {
+ Range r1;
+ Range r2;
+ size_t match_cnt;
+
+ OverlapInfo(const Range& r1_, const Range& r2_, size_t match_cnt_)
+ : r1(r1_),
+ r2(r2_),
+ match_cnt(match_cnt_) {
+ VERIFY(match_cnt <= std::min(r1.size(), r2.size()));
+ }
+
+ OverlapInfo()
+ : match_cnt(0) {
+ }
+
+ double identity() const {
+ if (match_cnt == 0)
+ return 0.;
+ return (double)match_cnt / (double)size();
+ }
+
+ size_t size() const {
+ return std::max(r1.size(), r2.size());
+ }
+
+ bool operator==(const OverlapInfo &that) const {
+ return r1 == that.r1 && r2 == that.r2 && match_cnt == that.match_cnt;
+ }
+
+ bool operator!=(const OverlapInfo &that) const {
+ return !(*this == that);
+ }
+};
+
+inline std::ostream& operator<<(std::ostream& os, const OverlapInfo& info) {
+ return os << "R1: [" << info.r1.start_pos << ", " << info.r1.end_pos
+ << "]; R2: [" << info.r2.start_pos << ", " << info.r2.end_pos << "]"
+ << "; match_cnt: " << info.match_cnt;
+}
+
+class SWOverlapAnalyzer {
+ static const uint32_t CIGAR_FLAG_MASK = (1 << 4) - 1;
+ static const uint32_t CIGAR_MATCH_FLAG = 7;
+ typedef typename Graph::EdgeId EdgeId;
+ size_t flank_length_;
+
+ const StripedSmithWaterman::Aligner aligner_;
+ const StripedSmithWaterman::Filter filter_;
+
+ size_t CountMatches(std::vector<uint32_t> cigar) const {
+ size_t match_cnt = 0;
+ for (uint32_t entry : cigar) {
+ if ((entry & CIGAR_FLAG_MASK) == CIGAR_MATCH_FLAG) {
+ match_cnt += (entry >> 4);
+ }
+ }
+ return match_cnt;
+ }
+
+ OverlapInfo InnerAnalyze(const Sequence& s1, const Sequence& s2) const {
+ if (s1.size() == 0 || s2.size() == 0) {
+ return OverlapInfo();
+ }
+ StripedSmithWaterman::Alignment alignment;
+ if (aligner_.Align(s1.str().c_str(), s2.str().c_str(), int(s2.size()), filter_, &alignment)) {
+ if (alignment.sw_score > 0) {
+ return OverlapInfo(Range(alignment.query_begin, alignment.query_end + 1),
+ Range(alignment.ref_begin, alignment.ref_end + 1),
+ CountMatches(alignment.cigar));
+ }
+ }
+ return OverlapInfo();
+ }
+
+public:
+ SWOverlapAnalyzer(size_t flank_length)
+ : flank_length_(flank_length),
+ aligner_(/*match_score*/1,
+ /*mismatch_penalty*/3,
+ /*gap_opening_penalty*/4,
+ /*gap_extending_penalty*/3) {
+ }
+
+
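+ //Aligns the last flank_length_ symbols of s1 against the first flank_length_ symbols of s2
+ //and shifts the reported ranges back to the coordinates of the full sequences.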
+ OverlapInfo AnalyzeOverlap(const Sequence& s1, const Sequence& s2) const {
+ DEBUG("Analysis started");
+ size_t start1 = flank_length_ > s1.size() ? 0 : s1.size() - flank_length_;
+ size_t end2 = flank_length_ > s2.size() ? s2.size() : flank_length_;
+
+ DEBUG("s1 " << s1.Subseq(start1, s1.size()));
+ DEBUG("s2 " << s2.Subseq(0, end2));
+ OverlapInfo result = InnerAnalyze(s1.Subseq(start1, s1.size()), s2.Subseq(0, end2));
+ if (result == OverlapInfo()) {
+ DEBUG("Empty overlap")
+ return result;
+ }
+
+ result.r1.shift(int(start1));
+ DEBUG("Result " << result)
+ return result;
+ }
+
+ template<class Graph>
+ OverlapInfo AnalyzeOverlap(const Graph& g, EdgeId e1, EdgeId e2) const {
+ DEBUG("Analyzing edges " << g.str(e1) << " and " << g.str(e2));
+ return AnalyzeOverlap(g.EdgeNucls(e1), g.EdgeNucls(e2));
+ }
+
+private:
+ DECL_LOGGER("SWOverlapAnalyzer");
+};
+
+}
diff --git a/src/common/modules/path_extend/paired_library.hpp b/src/common/modules/path_extend/paired_library.hpp
new file mode 100644
index 0000000..2b22da0
--- /dev/null
+++ b/src/common/modules/path_extend/paired_library.hpp
@@ -0,0 +1,186 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * paired_library.hpp
+ *
+ * Created on: Feb 19, 2012
+ * Author: andrey
+ */
+
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+#include "paired_info/paired_info.hpp"
+#include "ideal_pair_info.hpp"
+
+#include "math/xmath.h"
+
+namespace path_extend {
+
+using debruijn_graph::Graph;
+using debruijn_graph::EdgeId;
+
+using omnigraph::de::PairedInfoIndexT;
+using omnigraph::de::Point;
+
+class PairedInfoLibrary {
+public:
+ PairedInfoLibrary(size_t k, const Graph& g, size_t read_size, size_t is,
+ size_t is_min, size_t is_max, double is_var,
+ bool is_mp,
+ const std::map<int, size_t>& is_distribution)
+ : g_(g),
+ k_(k),
+ read_size_(read_size),
+ is_(is),
+ is_min_(is_min),
+ is_max_(is_max),
+ is_var_(is_var),
+ is_mp_(is_mp),
+ ideal_pi_counter_(g, (int) is_min, (int) is_max,
+ read_size, is_distribution) {
+ }
+
+ virtual ~PairedInfoLibrary() {}
+
+ virtual size_t FindJumpEdges(EdgeId e, set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const = 0;
+ virtual void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const = 0;
+ virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int distance, bool from_interval = false) const = 0;
+ virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const = 0;
+
+ double IdealPairedInfo(EdgeId e1, EdgeId e2, int distance, bool additive = false) const {
+ return ideal_pi_counter_.IdealPairedInfo(e1, e2, distance, additive);
+ }
+
+ size_t GetIS() const { return is_; }
+ size_t GetISMin() const { return is_min_; }
+ size_t GetISMax() const { return is_max_; }
+ double GetIsVar() const { return is_var_; }
+ bool IsMp() const { return is_mp_; }
+
+protected:
+ const Graph& g_;
+ size_t k_;
+ size_t read_size_;
+ size_t is_;
+ size_t is_min_;
+ size_t is_max_;
+ double is_var_;
+ bool is_mp_;
+ IdealPairInfoCounter ideal_pi_counter_;
+ DECL_LOGGER("PathExtendPI");
+};
+
+template<class Index>
+class PairedInfoLibraryWithIndex : public PairedInfoLibrary {
+ const Index& index_;
+
+public:
+ PairedInfoLibraryWithIndex(size_t k, const Graph& g, size_t readS, size_t is, size_t is_min, size_t is_max, double is_div,
+ const Index& index, bool is_mp,
+ const std::map<int, size_t>& is_distribution)
+ : PairedInfoLibrary(k, g, readS, is, is_min, is_max, is_div, is_mp, is_distribution),
+ index_(index) {}
+
+ size_t FindJumpEdges(EdgeId e, std::set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const override {
+ VERIFY(index_.size() > 0);
+ result.clear();
+
+ auto infos = index_.Get(e);
+ // We do not care about iteration order here - all the edges collected
+ // will be inside std::set<EdgeId>
+ for (auto it : infos) {
+ EdgeId e2 = it.first;
+ if (e2 == e)
+ continue;
+ if (g_.length(e2) < min_len)
+ continue;
+ for (auto point : it.second) {
+ omnigraph::de::DEDistance dist = point.d;
+ if (math::le(dist, (omnigraph::de::DEDistance) max_dist) &&
+ math::ge(dist, (omnigraph::de::DEDistance) min_dist)) {
+ result.insert(e2);
+ }
+ }
+ }
+ return result.size();
+ }
+
+
+ void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const override {
+ VERIFY(index_.size() > 0);
+ if (e1 == e2)
+ return;
+
+ for (auto point : index_.Get(e1, e2)) {
+ int pairedDistance = de::rounded_d(point);
+ dist.push_back(pairedDistance);
+ w.push_back(point.weight);
+ }
+ }
+
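+ //Sums the weights of paired points between e1 and e2 whose distance lies within the point variance of the requested distance
+ //(optionally widened by the insert size quantile interval when from_interval is set).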
+ double CountPairedInfo(EdgeId e1, EdgeId e2, int distance,
+ bool from_interval = false) const override {
+ VERIFY(index_.size() != 0);
+ double weight = 0.0;
+
+ for (auto point : index_.Get(e1, e2)) {
+ int pairedDistance = de::rounded_d(point);
+ int distanceDev = (int) point.variance(); //max((int) pointIter->var, (int) is_variation_);
+ //Can be modified according to distance comparison
+ int d_min = distance - distanceDev;
+ int d_max = distance + distanceDev;
+
+ if (from_interval) {
+ d_min -= (int) (is_ - is_min_);
+ d_max += (int) (is_max_ - is_);
+ }
+ if (pairedDistance >= d_min && pairedDistance <= d_max) {
+ weight += point.weight;
+ }
+ }
+ return weight;
+ }
+
+ double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const override {
+ VERIFY(index_.size() != 0);
+ double weight = 0.0;
+ for (const auto &point : index_.Get(e1, e2)) {
+ int dist = de::rounded_d(point);
+ if (dist >= dist_min && dist <= dist_max)
+ weight += point.weight;
+ }
+ return weight;
+ }
+
+};
+
+template<class Index>
+shared_ptr<PairedInfoLibrary> MakeNewLib(const Graph& g,
+ const debruijn_graph::config::dataset::Library &lib,
+ const Index &paired_index) {
+ //why all those local variables? :)
+ size_t read_length = lib.data().read_length;
+ size_t is = (size_t) lib.data().mean_insert_size;
+ int is_min = (int) lib.data().insert_size_left_quantile;
+ int is_max = (int) lib.data().insert_size_right_quantile;
+ double var = lib.data().insert_size_deviation;
+ bool is_mp = lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs;
+ return make_shared<PairedInfoLibraryWithIndex<decltype(paired_index)>>(g.k(),
+ g,
+ read_length,
+ is,
+ is_min > 0 ? size_t(is_min) : 0,
+ is_max > 0 ? size_t(is_max) : 0,
+ var,
+ paired_index,
+ is_mp,
+ lib.data().insert_size_distribution);
+}
+
+} // path extend
diff --git a/src/common/modules/path_extend/path_extender.hpp b/src/common/modules/path_extend/path_extender.hpp
new file mode 100644
index 0000000..df1c5b2
--- /dev/null
+++ b/src/common/modules/path_extend/path_extender.hpp
@@ -0,0 +1,1576 @@
+//***************************************************************************
+//* Copyright (c) 2011-2014 Saint-Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//****************************************************************************
+
+/*
+ * path_extender.hpp
+ *
+ * Created on: Mar 5, 2012
+ * Author: andrey
+ */
+
+#pragma once
+
+#include "extension_chooser.hpp"
+#include "path_filter.hpp"
+#include "overlap_analysis.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+#include <cmath>
+
+namespace path_extend {
+
+class ShortLoopResolver {
+public:
+ ShortLoopResolver(const Graph& g)
+ : g_(g) { }
+
+ virtual ~ShortLoopResolver() { }
+
+ virtual void ResolveShortLoop(BidirectionalPath& path) const = 0;
+
+protected:
+ DECL_LOGGER("PathExtender")
+ const Graph& g_;
+
+ void UndoCycles(BidirectionalPath& p, EdgeId next_edge) const {
+ if (p.Size() <= 2) {
+ return;
+ }
+ EdgeId first_edge = p.Back();
+ EdgeId second_edge = next_edge;
+ while (p.Size() > 2) {
+ if (p.At(p.Size() - 1) == first_edge && p.At(p.Size() - 2) == second_edge) {
+ p.PopBack(2);
+ } else {
+ return;
+ }
+ }
+ }
+
+ void MakeCycleStep(BidirectionalPath& path, EdgeId e) const {
+ if (path.Size() == 0) {
+ return;
+ }
+ EdgeId pathEnd = path.Back();
+ path.PushBack(e);
+ path.PushBack(pathEnd);
+ }
+};
+
+class CovShortLoopResolver : public ShortLoopResolver {
+public:
+ CovShortLoopResolver(const conj_graph_pack& gp)
+ : ShortLoopResolver(gp.g), gp_(gp) {
+
+ }
+
+ void ResolveShortLoop(BidirectionalPath& path) const override {
+ DEBUG("resolve short loop by coverage");
+ path.Print();
+
+ pair<EdgeId, EdgeId> edges;
+ if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
+ DEBUG("Coverage Short Loop Resolver");
+ UndoCycles(path, edges.first);
+ EdgeId e1 = path.Back();
+ EdgeId e2 = edges.first;
+ EdgeId e_out = edges.second;
+ auto prob_e_in = g_.IncomingEdges(g_.EdgeEnd(e2));
+ EdgeId e_in = *prob_e_in.begin();
+ size_t count = 0;
+ for (auto edge = prob_e_in.begin(); edge != prob_e_in.end(); ++edge) {
+ if (*edge != e2)
+ e_in = *edge;
+ count++;
+ }
+ if (count != 2) {
+ return;
+ }
+ double in_cov = gp_.flanking_cov.GetOutCov(e_in); //g_.coverage(e_in);
+ double out_cov = gp_.flanking_cov.GetInCov(e_out); //g_.coverage(e_out);
+ double cov = (in_cov + out_cov) / 2.0;
+ //what are time variables???
+ double time1 = math::round(gp_.g.coverage(e1) / cov);
+ double time2 = math::round(gp_.g.coverage(e2) / cov);
+ size_t time = (size_t) std::max(0.0, std::min(time1 - 1.0, time2));
+ for (size_t i = 0; i < time; ++i) {
+ MakeCycleStep(path, edges.first);
+ }
+ path.PushBack(edges.second);
+ DEBUG("loop with start " << g_.int_id(e_in)
+ <<" e1 " << g_.int_id(e1)
+ << " e2 " << g_.int_id(e2)
+ << " out " <<g_.int_id(e_out)
+ << " cov in = " << in_cov
+ << " cov out " << out_cov
+ << " cov " << cov
+ << " cov e1 = " << gp_.g.coverage(e1)
+ << " cov e2 = " << gp_.g.coverage(e2)
+ << " time1 = " << time1
+ << " time2 = " << time2
+ << " time = " << time);
+ }
+ }
+private:
+ const conj_graph_pack& gp_;
+};
+
+class SimpleLoopResolver : public ShortLoopResolver {
+
+public:
+ SimpleLoopResolver(Graph& g) : ShortLoopResolver(g) { }
+
+ void ResolveShortLoop(BidirectionalPath& path) const override {
+ pair<EdgeId, EdgeId> edges;
+ if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
+ DEBUG("Resolving short loop...");
+ EdgeId e = path.Back();
+ path.PushBack(edges.first);
+ path.PushBack(e);
+ path.PushBack(edges.second);
+ DEBUG("Resolving short loop done");
+ }
+ }
+
+protected:
+ DECL_LOGGER("PathExtender")
+};
+
+class LoopResolver : public ShortLoopResolver {
+ static const size_t ITER_COUNT = 10;
+ const WeightCounter& wc_;
+
+private:
+ bool CheckLoopPlausible(EdgeId forward_loop_edge, EdgeId backward_loop_edge) const {
+ size_t single_loop_length = 2 * g_.length(forward_loop_edge) + g_.length(backward_loop_edge);
+ return single_loop_length <= wc_.get_libptr()->GetISMax();
+ }
+
+public:
+ LoopResolver(const Graph& g, const WeightCounter& wc)
+ : ShortLoopResolver(g),
+ wc_(wc) { }
+ //This code works only if loop wasn't fairly resolved
+ //
+ //Weird interface; need comments
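+ //Tries up to ITER_COUNT extra loop traversals on a copy of the path and remembers the count that maximizes
+ //the paired-info weight of the exit edge; that many traversals are then applied to the path.
+ //If the loop is too long to be covered by the insert size (and at least one traversal was chosen),
+ //a single traversal is made instead and the exit edge is attached over an artificial gap.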
+ void MakeBestChoice(BidirectionalPath& path, pair<EdgeId, EdgeId>& edges) const {
+ UndoCycles(path, edges.first);
+ BidirectionalPath experiment(path);
+ double max_weight = wc_.CountWeight(experiment, edges.second);
+ double diff = max_weight - wc_.CountWeight(experiment, edges.first);
+ size_t maxIter = 0;
+ for (size_t i = 1; i <= ITER_COUNT; ++i) {
+ double weight = wc_.CountWeight(experiment, edges.first);
+ if (weight > 0) {
+ MakeCycleStep(experiment, edges.first);
+ weight = wc_.CountWeight(experiment, edges.second);
+ double weight2 = wc_.CountWeight(experiment, edges.first);
+ if (weight > max_weight || (weight == max_weight && weight - weight2 > diff)
+ || (weight == max_weight && weight - weight2 == diff && i == 1)) {
+ max_weight = weight;
+ maxIter = i;
+ diff = weight - weight2;
+ }
+ }
+ }
+
+ if (!CheckLoopPlausible(path.Back(), edges.first) && maxIter > 0) {
+ MakeCycleStep(path, edges.first);
+ path.PushBack(edges.second, int(g_.k() + 100));
+ }
+ else {
+ for (size_t i = 0; i < maxIter; ++i) {
+ MakeCycleStep(path, edges.first);
+ }
+ path.PushBack(edges.second);
+ }
+
+ }
+
+ void ResolveShortLoop(BidirectionalPath& path) const override {
+ pair<EdgeId, EdgeId> edges;
+ if (path.Size() >=1 && GetLoopAndExit(g_, path.Back(), edges)) {
+ DEBUG("Resolving short loop...");
+ MakeBestChoice(path, edges);
+ DEBUG("Resolving short loop done");
+ }
+ }
+
+};
+
+class GapJoiner {
+
+public:
+ static const int INVALID_GAP = -1000000;
+ GapJoiner(const Graph& g)
+ : g_(g) { }
+
+ virtual Gap FixGap( EdgeId source, EdgeId sink, int initial_gap) const = 0;
+
+ virtual ~GapJoiner() { }
+protected:
+ const Graph& g_;
+};
+
+class SimpleGapJoiner : public GapJoiner {
+
+public:
+ SimpleGapJoiner(const Graph& g) : GapJoiner(g) { }
+
+ Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
+ if (initial_gap > 2 * (int) g_.k()) {
+ return Gap(initial_gap);
+ }
+ for (int l = (int) g_.k(); l > 0; --l) {
+ if (g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l) == g_.EdgeNucls(sink).Subseq(0, l)) {
+ DEBUG("Found correct gap length");
+ DEBUG("Inintial: " << initial_gap << ", new gap: " << g_.k() - l);
+ return Gap((int) g_.k() - l);
+ }
+ }
+ DEBUG("Perfect overlap is not found, inintial: " << initial_gap);
+ return Gap(initial_gap);
+ }
+};
+
+class HammingGapJoiner: public GapJoiner {
+ const double min_gap_score_;
+ const size_t short_overlap_threshold_;
+ const size_t basic_overlap_length_;
+
+ vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ vector < size_t > answer;
+ for (size_t i = 0; i < s1.size(); ++i)
+ if (s1[i] != s2[i])
+ answer.push_back(i);
+ return answer;
+ }
+
+ size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ size_t dist = 0;
+ for (size_t i = 0; i < s1.size(); ++i) {
+ if (s1[i] != s2[i]) {
+ dist++;
+ }
+ }
+ return dist;
+ }
+
+// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
+// VERIFY(s1.size() == s2.size());
+// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
+// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
+// }
+
+
+ double ScoreGap(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size();
+ }
+
+public:
+
+ //todo review parameters in usages
+ HammingGapJoiner(const Graph& g,
+ double min_gap_score,
+ size_t short_overlap_threshold,
+ size_t basic_overlap_length):
+ GapJoiner(g),
+ min_gap_score_(min_gap_score),
+ short_overlap_threshold_(short_overlap_threshold),
+ basic_overlap_length_(basic_overlap_length)
+ {
+ DEBUG("HammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
+ "\n short_overlap_threshold " << short_overlap_threshold_ <<
+ "\n basic_overlap_length " << basic_overlap_length_);
+ }
+
+ //estimated_gap is in k-mers
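+ //Scans overlap lengths from corrected_start_overlap down to min_overlap, scoring each by the Hamming identity
+ //of the corresponding flanks; returns Gap(k - best_overlap), or Gap(INVALID_GAP) if no score beats min_gap_score_.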
+ Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
+
+ size_t corrected_start_overlap = basic_overlap_length_;
+ if (estimated_gap < 0) {
+ corrected_start_overlap -= estimated_gap;
+ }
+
+ corrected_start_overlap = min(corrected_start_overlap,
+ g_.k() + min(g_.length(source), g_.length(sink)));
+
+ DEBUG("Corrected max overlap " << corrected_start_overlap);
+
+ double best_score = min_gap_score_;
+ int fixed_gap = INVALID_GAP;
+
+ double overlap_coeff = 0.3;
+ size_t min_overlap = 1ul;
+ if (estimated_gap < 0) {
+ size_t estimated_overlap = g_.k() - estimated_gap;
+ min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
+ }
+ //todo better usage of estimated overlap
+ DEBUG("Min overlap " << min_overlap);
+
+ for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
+ //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
+ //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
+ double score = 0;
+ score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
+ g_.EdgeNucls(sink).Subseq(0, l));
+ if (math::gr(score, best_score)) {
+ TRACE("Curr overlap " << l);
+ TRACE("Score: " << score);
+ best_score = score;
+ fixed_gap = int(g_.k() - l);
+ }
+
+ if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
+ //look at "short" overlaps only if long overlaps couldn't be found
+ DEBUG("Not looking at short overlaps");
+ break;
+ }
+ }
+
+ if (fixed_gap != INVALID_GAP) {
+ DEBUG("Found candidate gap length with score " << best_score);
+ DEBUG("Estimated gap: " << estimated_gap <<
+ ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")");
+ }
+ return Gap(fixed_gap);
+ }
+
+private:
+ DECL_LOGGER("HammingGapJoiner");
+};
+
+//deprecated!
+//fixme reduce code duplication with HammingGapJoiner
+class LikelihoodHammingGapJoiner: public GapJoiner {
+ static const size_t DEFAULT_PADDING_LENGTH = 10;
+ const double min_gap_score_;
+ const size_t short_overlap_threshold_;
+ const size_t basic_overlap_length_;
+
+ vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ vector < size_t > answer;
+ for (size_t i = 0; i < s1.size(); ++i)
+ if (s1[i] != s2[i])
+ answer.push_back(i);
+ return answer;
+ }
+
+ size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
+ VERIFY(s1.size() == s2.size());
+ size_t dist = 0;
+ for (size_t i = 0; i < s1.size(); ++i) {
+ if (s1[i] != s2[i]) {
+ dist++;
+ }
+ }
+ return dist;
+ }
+
+// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
+// VERIFY(s1.size() == s2.size());
+// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
+// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
+// }
+
+ //FIXME use GC content, change match prob and use partition of tip sequence into bad and good part
+ double ScoreGap(const Sequence& s1, const Sequence& s2) const {
+ static double match_prob = 0.9;
+ static double log_match_prob = log2(match_prob);
+ static double log_mismatch_prob = log2(1. - match_prob);
+ VERIFY(s1.size() == s2.size());
+ size_t n = s1.size();
+ size_t mismatches = HammingDistance(s1, s2);
+ VERIFY(mismatches <= n);
+ return 2.*double(n) + double(n - mismatches) * log_match_prob + double(mismatches) * log_mismatch_prob;
+ }
+
+public:
+
+ //todo review parameters in usages
+ LikelihoodHammingGapJoiner(const Graph& g,
+ double min_gap_score,
+ size_t short_overlap_threshold,
+ size_t basic_overlap_length):
+ GapJoiner(g),
+ min_gap_score_(min_gap_score),
+ short_overlap_threshold_(short_overlap_threshold),
+ basic_overlap_length_(basic_overlap_length)
+ {
+ DEBUG("LikelihoodHammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
+ "\n short_overlap_threshold " << short_overlap_threshold_ <<
+ "\n basic_overlap_length " << basic_overlap_length_);
+ }
+
+ //estimated_gap is in k-mers
+ Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
+
+ size_t corrected_start_overlap = basic_overlap_length_;
+ if (estimated_gap < 0) {
+ corrected_start_overlap -= estimated_gap;
+ }
+
+ corrected_start_overlap = min(corrected_start_overlap,
+ g_.k() + min(g_.length(source), g_.length(sink)));
+
+ DEBUG("Corrected max overlap " << corrected_start_overlap);
+
+ double best_score = min_gap_score_;
+ int fixed_gap = INVALID_GAP;
+
+ double overlap_coeff = 0.3;
+ size_t min_overlap = 1ul;
+ if (estimated_gap < 0) {
+ size_t estimated_overlap = g_.k() - estimated_gap;
+ min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
+ }
+ //todo better usage of estimated overlap
+ DEBUG("Min overlap " << min_overlap);
+
+ for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
+ //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
+ //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
+ double score = 0;
+ score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
+ g_.EdgeNucls(sink).Subseq(0, l));
+ if (math::gr(score, best_score)) {
+ TRACE("Curr overlap " << l);
+ TRACE("Score: " << score);
+ best_score = score;
+ fixed_gap = int(g_.k() - l);
+ }
+
+ if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
+ //look at "short" overlaps only if long overlaps couldn't be found
+ DEBUG("Not looking at short overlaps");
+ break;
+ }
+ }
+
+ if (fixed_gap != INVALID_GAP) {
+ DEBUG("Found candidate gap length with score " << best_score);
+ DEBUG("Estimated gap: " << estimated_gap <<
+ ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap<< ")");
+ }
+ return Gap(fixed_gap);
+ }
+
+private:
+ DECL_LOGGER("LikelihoodHammingGapJoiner");
+};
+
+//if I was in LA
+class LAGapJoiner: public GapJoiner {
+public:
+ LAGapJoiner(const Graph& g, size_t min_la_length,
+ double flank_multiplication_coefficient,
+ double flank_addition_coefficient) :
+ GapJoiner(g), min_la_length_(min_la_length), flank_addition_coefficient_(
+ flank_addition_coefficient), flank_multiplication_coefficient_(
+ flank_multiplication_coefficient) {
+ DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_);
+ DEBUG("flank_addition_coefficient_ - " << flank_addition_coefficient_ );
+ }
+
+ Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
+
+ DEBUG("Overlap doesn't exceed " << size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
+ SWOverlapAnalyzer overlap_analyzer(
+ size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
+
+ auto overlap_info = overlap_analyzer.AnalyzeOverlap(g_, source,
+ sink);
+
+ DEBUG(overlap_info);
+
+ if (overlap_info.size() < min_la_length_) {
+ DEBUG("Low alignment size");
+ return Gap(INVALID_GAP);
+ }
+
+ size_t max_flank_length = max(overlap_info.r2.start_pos,
+ g_.length(source) + g_.k() - overlap_info.r1.end_pos);
+ DEBUG("Max flank length - " << max_flank_length);
+
+ if ((double) max_flank_length * flank_multiplication_coefficient_
+ + flank_addition_coefficient_ > (double) overlap_info.size()) {
+ DEBUG("Too long flanks for such alignment");
+ return Gap(INVALID_GAP);
+ }
+
+ if (math::ls(overlap_info.identity(), IDENTITY_RATIO)) {
+ DEBUG("Low identity score");
+ return Gap(INVALID_GAP);
+ }
+
+ if ((g_.length(source) + g_.k()) - overlap_info.r1.end_pos > g_.length(source)) {
+ DEBUG("Save kmers. Don't want to have edges shorter than k");
+ return Gap(INVALID_GAP);
+ }
+
+ if (overlap_info.r2.start_pos > g_.length(sink)) {
+ DEBUG("Save kmers. Don't want to have edges shorter than k");
+ return Gap(INVALID_GAP);
+ }
+
+ return Gap(
+ (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos
+ + g_.k()),
+ (uint32_t) (g_.length(source) + g_.k()
+ - overlap_info.r1.end_pos),
+ (uint32_t) overlap_info.r2.start_pos);
+ }
+
+private:
+ DECL_LOGGER("LAGapJoiner");
+ const size_t min_la_length_;
+ const double flank_addition_coefficient_;
+ const double flank_multiplication_coefficient_;
+ constexpr static double IDENTITY_RATIO = 0.9;
+ constexpr static double ESTIMATED_GAP_MULTIPLIER = 2.0;
+ const size_t GAP_ADDITIONAL_COEFFICIENT = 30;
+};
+
+
+class CompositeGapJoiner: public GapJoiner {
+public:
+
+ CompositeGapJoiner(const Graph& g,
+ const vector<shared_ptr<GapJoiner>>& joiners,
+ size_t may_overlap_threshold,
+ int must_overlap_threshold,
+ size_t artificial_gap) :
+ GapJoiner(g),
+ joiners_(joiners),
+ may_overlap_threshold_(may_overlap_threshold),
+ must_overlap_threshold_(must_overlap_threshold),
+ artificial_gap_(artificial_gap)
+ { }
+
+ Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
+ DEBUG("Trying to fix estimated gap " << estimated_gap <<
+ " between " << g_.str(source) << " and " << g_.str(sink));
+
+ if (estimated_gap > int(g_.k() + may_overlap_threshold_)) {
+ DEBUG("Edges are supposed to be too far to check overlaps");
+ return Gap(estimated_gap);
+ }
+
+ for (auto joiner : joiners_) {
+ Gap gap = joiner->FixGap(source, sink, estimated_gap);
+ if (gap.gap_ != GapJoiner::INVALID_GAP) {
+ return gap;
+ }
+ }
+
+ //couldn't find decent overlap
+ if (estimated_gap < must_overlap_threshold_) {
+ DEBUG("Estimated gap looks unreliable");
+ return Gap(INVALID_GAP);
+ } else {
+ DEBUG("Overlap was not found");
+ return Gap(max(estimated_gap, int(g_.k() + artificial_gap_)));
+ }
+ }
+
+private:
+ vector<shared_ptr<GapJoiner>> joiners_;
+ const size_t may_overlap_threshold_;
+ const int must_overlap_threshold_;
+ const size_t artificial_gap_;
+
+ DECL_LOGGER("CompositeGapJoiner");
+};
+
+//FIXME move to tests
+//Just for test. Look at overlap_analysis_tests
+inline Gap MimicLAGapJoiner(Sequence& s1, Sequence& s2) {
+ const int INVALID_GAP = -1000000;
+ constexpr static double IDENTITY_RATIO = 0.9;
+
+ SWOverlapAnalyzer overlap_analyzer_(10000);
+ auto overlap_info = overlap_analyzer_.AnalyzeOverlap(s1, s2);
+ size_t min_la_length_ = 4;
+ if (overlap_info.size() < min_la_length_) {
+ DEBUG("Low alignment size");
+ return Gap(INVALID_GAP);
+ }
+ if (overlap_info.identity() < IDENTITY_RATIO) {
+ DEBUG("Low identity score");
+ return Gap(INVALID_GAP);
+ }
+ std::cout << overlap_info;
+
+ return Gap(
+ (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos),
+ (uint32_t) (s1.size() - overlap_info.r1.end_pos),
+ (uint32_t) overlap_info.r2.start_pos);
+}
+
+
+//Detects a cycle: the minimal suffix longer than IS also occurs earlier in the path. Overlap is allowed.
+class InsertSizeLoopDetector {
+protected:
+ GraphCoverageMap visited_cycles_coverage_map_;
+ PathContainer path_storage_;
+ size_t min_cycle_len_;
+
+public:
+ InsertSizeLoopDetector(const Graph& g, size_t is):
+ visited_cycles_coverage_map_(g),
+ path_storage_(),
+ min_cycle_len_(is) {
+ }
+
+ ~InsertSizeLoopDetector() {
+ path_storage_.DeleteAllPaths();
+ }
+
+ bool CheckCycledNonIS(const BidirectionalPath& path) const {
+ if (path.Size() <= 2) {
+ return false;
+ }
+ BidirectionalPath last = path.SubPath(path.Size() - 2);
+ int pos = path.FindFirst(last);
+ VERIFY(pos >= 0);
+ return size_t(pos) != path.Size() - 2;
+ }
+
+ bool CheckCycled(const BidirectionalPath& path) const {
+ return FindCycleStart(path) != -1;
+ }
+//first suffix longer than min_cycle_len
+ int FindPosIS(const BidirectionalPath& path) const {
+ int i = (int) path.Size() - 1;
+ while (i >= 0 && path.LengthAt(i) < min_cycle_len_) {
+ --i;
+ }
+ return i;
+ }
+ int FindCycleStart(const BidirectionalPath& path) const {
+ TRACE("Looking for IS cycle " << min_cycle_len_);
+ int i = FindPosIS(path);
+ TRACE("last is pos " << i);
+ if (i < 0) return -1;
+//Tail
+ BidirectionalPath last = path.SubPath(i);
+ //last.Print();
+
+ int pos = path.FindFirst(last);
+// not cycle
+ if (pos == i) pos = -1;
+ TRACE("looking for 1sr IS cycle " << pos);
+ return pos;
+ }
+
+//After a cycle is detected, removes the minimal suffix longer than IS.
+//Returns the beginning of the cycle.
+ int RemoveCycle(BidirectionalPath& path) const {
+ int pos = FindCycleStart(path);
+ DEBUG("Found IS cycle " << pos);
+ if (pos == -1) {
+ return -1;
+ }
+
+ int last_edge_pos = FindPosIS(path);
+ VERIFY(last_edge_pos > -1);
+ DEBUG("last edge pos " << last_edge_pos);
+ VERIFY(last_edge_pos > pos);
+ for (int i = (int) path.Size() - 1; i >= last_edge_pos; --i) {
+ path.PopBack();
+ }
+ VERIFY((int) path.Size() == last_edge_pos);
+ VERIFY(pos < (int) path.Size());
+ DEBUG("result pos " <<pos);
+ return pos;
+ }
+
+ //seems that it is out of date
+ bool InExistingLoop(const BidirectionalPath& path) {
+ DEBUG("Checking existing loops");
+ auto visited_cycles = visited_cycles_coverage_map_.GetEdgePaths(path.Back());
+ for (auto cycle : *visited_cycles) {
+ DEBUG("checking cycle ");
+ int pos = path.FindLast(*cycle);
+ if (pos == -1)
+ continue;
+
+ int start_cycle_pos = pos + (int) cycle->Size();
+ bool only_cycles_in_tail = true;
+ int last_cycle_pos = start_cycle_pos;
+ DEBUG("start_cycle pos "<< last_cycle_pos);
+ for (int i = start_cycle_pos; i < (int) path.Size() - (int) cycle->Size(); i += (int) cycle->Size()) {
+ if (!path.CompareFrom(i, *cycle)) {
+ only_cycles_in_tail = false;
+ break;
+ } else {
+ last_cycle_pos = i + (int) cycle->Size();
+ DEBUG("last cycle pos changed " << last_cycle_pos);
+ }
+ }
+ DEBUG("last_cycle_pos " << last_cycle_pos);
+ only_cycles_in_tail = only_cycles_in_tail && cycle->CompareFrom(0, path.SubPath(last_cycle_pos));
+ if (only_cycles_in_tail) {
+// seems that most of this is useless, checking
+ VERIFY (last_cycle_pos == start_cycle_pos);
+ DEBUG("find cycle " << last_cycle_pos);
+ DEBUG("path");
+ path.Print();
+ DEBUG("last subpath");
+ path.SubPath(last_cycle_pos).Print();
+ DEBUG("cycle");
+ cycle->Print();
+ DEBUG("last_cycle_pos " << last_cycle_pos << " path size " << path.Size());
+ VERIFY(last_cycle_pos <= (int)path.Size());
+ DEBUG("last cycle pos + cycle " << last_cycle_pos + (int)cycle->Size());
+ VERIFY(last_cycle_pos + (int)cycle->Size() >= (int)path.Size());
+
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void AddCycledEdges(const BidirectionalPath& path, size_t pos) {
+ if (pos >= path.Size()) {
+ DEBUG("Wrong position in IS cycle");
+ return;
+ }
+ BidirectionalPath * p = new BidirectionalPath(path.SubPath(pos));
+ BidirectionalPath * cp = new BidirectionalPath(p->Conjugate());
+ visited_cycles_coverage_map_.Subscribe(p);
+ visited_cycles_coverage_map_.Subscribe(cp);
+ DEBUG("add cycle");
+ p->Print();
+ }
+};
+
+class RepeatDetector {
+public:
+ RepeatDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t max_repeat_len)
+ : g_(g),
+ cov_map_(cov_map),
+ used_paths_(),
+ repeat_len_(max_repeat_len){
+ empty_ = new BidirectionalPath(g_);
+ }
+ ~RepeatDetector() {
+ delete empty_;
+ }
+
+ BidirectionalPath* RepeatPath(const BidirectionalPath& p) {
+ if (p.Size() == 0) {
+ return empty_;
+ }
+ EdgeId last_e = p.Back();
+ BidirectionalPathSet cov_paths = cov_map_.GetCoveringPaths(last_e);
+ DEBUG("cov paths for e " << g_.int_id(last_e) << " size " << cov_paths.size());
+ size_t max_common_size = 0;
+ BidirectionalPath* result_p = empty_;
+ for (BidirectionalPath* cov_p : cov_paths) {
+ if (used_paths_.find(cov_p) == used_paths_.end() || cov_p == &p || cov_p == p.GetConjPath()) {
+ continue;
+ }
+ size_t common_size = MaxCommonSize(p, *cov_p);
+ DEBUG("max comon size with path " << cov_p->GetId() << " is " << common_size);
+ if (common_size == 0) {
+ continue;
+ }
+ VERIFY(common_size <= p.Size());
+ if (p.LengthAt(p.Size() - common_size) > repeat_len_) {
+ DEBUG("repeat from " << (p.Size() - common_size) << " length " << p.LengthAt(p.Size() - common_size) << " repeat length " << repeat_len_);
+ max_common_size = max(common_size, max_common_size);
+ result_p = cov_p;
+ }
+ }
+ used_paths_.insert(&p);
+ DEBUG("max common size " << max_common_size);
+ return result_p;
+ }
+ size_t MaxCommonSize(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
+ DEBUG("max coomon size ")
+ EdgeId last_e = p1.Back();
+ vector<size_t> positions2 = p2.FindAll(last_e);
+ DEBUG("pos size " << positions2.size())
+ size_t max_common_size = 0;
+ for (size_t pos2 : positions2) {
+ size_t common_size = MaxCommonSize(p1, p1.Size() - 1, p2, pos2);
+ DEBUG("max common size from " << pos2 << " is " << common_size);
+ max_common_size = max(max_common_size, common_size);
+ }
+ return max_common_size;
+ }
+private:
+ size_t MaxCommonSize(const BidirectionalPath& p1, size_t pos1, const BidirectionalPath& p2, size_t pos2) const {
+ int i1 = (int) pos1;
+ int i2 = (int) pos2;
+ while (i1 >= 0 && i2 >= 0 &&
+ p1.At((size_t) i1) == p2.At((size_t) i2) &&
+ p1.GapAt((size_t) i1) == p2.GapAt((size_t) i2)) {
+ i1--;
+ i2--;
+ }
+ if (i1 >=0 && i2>=0 && p1.At((size_t) i1) == p2.At((size_t) i2)) {
+ i1--;
+ i2--;
+ }
+
+ VERIFY(i1 <= (int)pos1);
+ return std::max(size_t((int) pos1 - i1), (size_t)1);
+ }
+ const Graph& g_;
+ const GraphCoverageMap& cov_map_;
+ set<const BidirectionalPath*> used_paths_;
+ size_t repeat_len_;
+ BidirectionalPath* empty_;
+};
+
+class ContigsMaker {
+public:
+ ContigsMaker(const Graph & g)
+ : g_(g) { }
+
+ virtual ~ContigsMaker() { }
+
+ virtual void GrowPath(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ virtual void GrowPathSimple(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ virtual void GrowAll(PathContainer & paths, PathContainer& paths_storage) = 0;
+
+protected:
+ const Graph& g_;
+ DECL_LOGGER("PathExtender")
+};
+
+struct UsedUniqueStorage {
+ set<EdgeId> used_;
+
+ const ScaffoldingUniqueEdgeStorage& unique_;
+
+ UsedUniqueStorage(const ScaffoldingUniqueEdgeStorage& unique ):used_(), unique_(unique) {}
+
+ void insert(EdgeId e) {
+ if (unique_.IsUnique(e)) {
+ used_.insert(e);
+ used_.insert(e->conjugate());
+ }
+ }
+
+ bool IsUsedAndUnique(EdgeId e) const {
+ return (unique_.IsUnique(e) && used_.find(e) != used_.end());
+ }
+
+ bool UniqueCheckEnabled() const {
+ return unique_.size() > 0;
+ }
+
+
+};
+
+class PathExtender {
+public:
+ PathExtender(const Graph & g):
+ g_(g){ }
+
+ virtual ~PathExtender() { }
+
+ virtual bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ void AddUniqueEdgeStorage(shared_ptr<UsedUniqueStorage> used_storage) {
+ used_storage_ = used_storage;
+ }
+protected:
+ const Graph& g_;
+ shared_ptr<UsedUniqueStorage> used_storage_;
+ DECL_LOGGER("PathExtender")
+};
+
+class CompositeExtender : public ContigsMaker {
+public:
+ CompositeExtender(const Graph &g, GraphCoverageMap& cov_map,
+ size_t max_diff_len,
+ size_t max_repeat_length,
+ bool detect_repeats_online)
+ : ContigsMaker(g),
+ cover_map_(cov_map),
+ repeat_detector_(g, cover_map_, 2 * max_repeat_length),
+ extenders_(),
+ max_diff_len_(max_diff_len),
+ max_repeat_len_(max_repeat_length),
+ detect_repeats_online_(detect_repeats_online) {
+ }
+
+ CompositeExtender(const Graph & g, GraphCoverageMap& cov_map,
+ vector<shared_ptr<PathExtender> > pes,
+ const ScaffoldingUniqueEdgeStorage& unique,
+ size_t max_diff_len,
+ size_t max_repeat_length,
+ bool detect_repeats_online)
+ : ContigsMaker(g),
+ cover_map_(cov_map),
+ repeat_detector_(g, cover_map_, 2 * max_repeat_length),
+ extenders_(),
+ max_diff_len_(max_diff_len),
+ max_repeat_len_(max_repeat_length),
+ detect_repeats_online_(detect_repeats_online) {
+ extenders_ = pes;
+ used_storage_ = make_shared<UsedUniqueStorage>(unique);
+ for (auto ex: extenders_) {
+ ex->AddUniqueEdgeStorage(used_storage_);
+ }
+ }
+
+ void AddExtender(shared_ptr<PathExtender> pe) {
+ extenders_.push_back(pe);
+ pe->AddUniqueEdgeStorage(used_storage_);
+ }
+
+ void GrowAll(PathContainer& paths, PathContainer& result) override {
+ result.clear();
+ GrowAllPaths(paths, result);
+ LengthPathFilter filter(g_, 0);
+ filter.filter(result);
+ }
+
+ void GrowPath(BidirectionalPath& path, PathContainer* paths_storage) override {
+ while (MakeGrowStep(path, paths_storage)) { }
+ }
+
+ void GrowPathSimple(BidirectionalPath& path, PathContainer* paths_storage) override {
+ while (MakeGrowStep(path, paths_storage, false)) { }
+ }
+
+
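+ //With online repeat detection, first checks whether the path ends with a repeat shared with another path;
+ //if so, the shared suffix is reattached to one of the two paths and this growth step stops.
+ //Otherwise the registered extenders are tried in order until one of them makes a step.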
+ bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage,
+ bool detect_repeats_online_local = true) {
+ DEBUG("make grow step composite extender");
+ if (detect_repeats_online_ && detect_repeats_online_local) {
+ BidirectionalPath *repeat_path = repeat_detector_.RepeatPath(path);
+ size_t repeat_size = repeat_detector_.MaxCommonSize(path, *repeat_path);
+
+ if (repeat_size > 0) {
+ DEBUG("repeat with length " << repeat_size);
+ path.Print();
+ repeat_path->Print();
+ BidirectionalPath repeat = path.SubPath(path.Size() - repeat_size);
+ int begin_repeat = repeat_path->FindLast(repeat);
+ VERIFY(begin_repeat > -1);
+ size_t end_repeat = (size_t) begin_repeat + repeat_size;
+ DEBUG("not consistent subpaths ");
+ BidirectionalPath begin1 = path.SubPath(0, path.Size() - repeat_size);
+ begin1.Print();
+ BidirectionalPath begin2 = repeat_path->SubPath(0, begin_repeat);
+ begin2.Print();
+ int gap_in_repeat_path = repeat_path->GapAt(begin_repeat);
+ BidirectionalPath end2 = repeat_path->SubPath(end_repeat);
+ BidirectionalPath begin1_conj = path.SubPath(0, path.Size() - repeat_size + 1).Conjugate();
+ BidirectionalPath begin2_conj = repeat_path->SubPath(0, begin_repeat + 1).Conjugate();
+ pair<size_t, size_t> last = ComparePaths(0, 0, begin1_conj, begin2_conj, max_diff_len_);
+ DEBUG("last " << last.first << " last2 " << last.second);
+ path.Clear();
+ repeat_path->Clear();
+ int gap_len = repeat.GapAt(0);
+
+ if (begin2.Size() == 0 || last.second != 0) { //TODO: incorrect: common edges, but then different ends
+ path.PushBack(begin1);
+ repeat_path->PushBack(begin2);
+ } else {
+ gap_len = gap_in_repeat_path;
+ path.PushBack(begin2);
+ repeat_path->PushBack(begin1);
+ }
+
+ path.PushBack(repeat.At(0), gap_len);
+ path.PushBack(repeat.SubPath(1));
+ path.PushBack(end2);
+ DEBUG("new path");
+ path.Print();
+ return false;
+ }
+ }
+
+ size_t current = 0;
+ while (current < extenders_.size()) {
+ DEBUG("step " << current << " of total " << extenders_.size());
+ if (extenders_[current]->MakeGrowStep(path, paths_storage)) {
+ return true;
+ }
+ ++current;
+ }
+ return false;
+ }
+
+private:
+ GraphCoverageMap& cover_map_;
+ RepeatDetector repeat_detector_;
+ vector<shared_ptr<PathExtender> > extenders_;
+ size_t max_diff_len_;
+ size_t max_repeat_len_;
+ bool detect_repeats_online_;
+ shared_ptr<UsedUniqueStorage> used_storage_;
+
+ void SubscribeCoverageMap(BidirectionalPath * path) {
+ path->Subscribe(&cover_map_);
+ for (size_t i = 0; i < path->Size(); ++i) {
+ cover_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
+ }
+ }
+
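+ //Grows every seed path that is not yet covered by existing paths, together with its conjugate,
+ //alternating the two directions until the length stops changing (at most 10 rounds per seed).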
+ void GrowAllPaths(PathContainer& paths, PathContainer& result) {
+ for (size_t i = 0; i < paths.size(); ++i) {
+ VERBOSE_POWER_T2(i, 100, "Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
+ if (paths.size() > 10 && i % (paths.size() / 10 + 1) == 0) {
+ INFO("Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
+ }
+ //In 2015 modes do not use a seed already used in paths.
+ if (used_storage_->UniqueCheckEnabled()) {
+ bool was_used = false;
+ for (size_t ind =0; ind < paths.Get(i)->Size(); ind++) {
+ EdgeId eid = paths.Get(i)->At(ind);
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ DEBUG("Used edge " << g_.int_id(eid));
+ was_used = true;
+ break;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+ if (was_used) {
+ DEBUG("skipping already used seed");
+ continue;
+ }
+ }
+
+ if (!cover_map_.IsCovered(*paths.Get(i))) {
+ BidirectionalPath * path = new BidirectionalPath(*paths.Get(i));
+ BidirectionalPath * conjugatePath = new BidirectionalPath(*paths.GetConjugate(i));
+ result.AddPair(path, conjugatePath);
+ SubscribeCoverageMap(path);
+ SubscribeCoverageMap(conjugatePath);
+ size_t count_trying = 0;
+ size_t current_path_len = 0;
+ do {
+ current_path_len = path->Length();
+ count_trying++;
+ GrowPath(*path, &result);
+ GrowPath(*conjugatePath, &result);
+ } while (count_trying < 10 && (path->Length() != current_path_len));
+ path->CheckConjugateEnd(max_repeat_len_);
+ DEBUG("result path " << path->GetId());
+ path->Print();
+ }
+ }
+ }
+
+};
+
+//All Path-Extenders inherit this one
+class LoopDetectingPathExtender : public PathExtender {
+
+protected:
+ bool investigate_short_loops_;
+ bool use_short_loop_cov_resolver_;
+ CovShortLoopResolver cov_loop_resolver_;
+
+ InsertSizeLoopDetector is_detector_;
+ const GraphCoverageMap& cov_map_;
+
+public:
+ LoopDetectingPathExtender(const conj_graph_pack &gp,
+ const GraphCoverageMap &cov_map,
+ bool investigate_short_loops,
+ bool use_short_loop_cov_resolver,
+ size_t is)
+ : PathExtender(gp.g),
+ investigate_short_loops_(investigate_short_loops),
+ use_short_loop_cov_resolver_(use_short_loop_cov_resolver),
+ cov_loop_resolver_(gp),
+ is_detector_(gp.g, is),
+ cov_map_(cov_map) {
+
+ }
+
+ bool isInvestigateShortLoops() const {
+ return investigate_short_loops_;
+ }
+
+ void setInvestigateShortLoops(bool investigateShortLoops) {
+ this->investigate_short_loops_ = investigateShortLoops;
+ }
+
+ bool DetectCycle(BidirectionalPath& path) {
+ DEBUG("detect cycle");
+ if (is_detector_.CheckCycled(path)) {
+ DEBUG("Checking IS cycle");
+ int loop_pos = is_detector_.RemoveCycle(path);
+ DEBUG("Removed IS cycle");
+ if (loop_pos != -1) {
+ is_detector_.AddCycledEdges(path, loop_pos);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool DetectCycleScaffolding(BidirectionalPath& path) {
+ return is_detector_.CheckCycledNonIS(path);
+ }
+
+ virtual bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
+
+ virtual bool ResolveShortLoopByCov(BidirectionalPath& path) = 0;
+
+ virtual bool ResolveShortLoopByPI(BidirectionalPath& path) = 0;
+
+ virtual bool CanInvestigateShortLoop() const {
+ return false;
+ }
+
+ bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override {
+ if (is_detector_.InExistingLoop(path)) {
+ DEBUG("in existing loop");
+ return false;
+ }
+ bool result;
+ LoopDetector loop_detector(&path, cov_map_);
+ if (DetectCycle(path)) {
+ result = false;
+ } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back()) && use_short_loop_cov_resolver_) {
+ DEBUG("edge in short loop");
+ result = ResolveShortLoop(path);
+ } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop() && use_short_loop_cov_resolver_) {
+ DEBUG("Prev edge in short loop");
+ path.PopBack();
+ result = ResolveShortLoop(path);
+ } else {
+ DEBUG("Making step");
+ result = MakeSimpleGrowStep(path, paths_storage);
+ DEBUG("Made step");
+ if (DetectCycle(path)) {
+ result = false;
+ } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back())) {
+ DEBUG("Edge in short loop");
+ result = ResolveShortLoop(path);
+ } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop()) {
+ DEBUG("Prev edge in short loop");
+ path.PopBack();
+ result = ResolveShortLoop(path);
+ }
+ }
+ return result;
+ }
+
+private:
+ bool ResolveShortLoop(BidirectionalPath& p) {
+ if (use_short_loop_cov_resolver_) {
+ return ResolveShortLoopByCov(p);
+ } else {
+ return ResolveShortLoopByPI(p);
+ }
+ }
+
+ bool InvestigateShortLoop() {
+ return investigate_short_loops_ && (use_short_loop_cov_resolver_ || CanInvestigateShortLoop());
+ }
+protected:
+ DECL_LOGGER("LoopDetectingPathExtender")
+};
+
+class SimpleExtender: public LoopDetectingPathExtender {
+
+protected:
+
+ shared_ptr<ExtensionChooser> extensionChooser_;
+
+ void FindFollowingEdges(BidirectionalPath& path, ExtensionChooser::EdgeContainer * result) {
+ DEBUG("Looking for the following edges")
+ result->clear();
+ vector<EdgeId> edges;
+ DEBUG("Pushing back")
+ push_back_all(edges, g_.OutgoingEdges(g_.EdgeEnd(path.Back())));
+ result->reserve(edges.size());
+ for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
+ DEBUG("Adding edge w distance " << g_.int_id(*iter));
+ result->push_back(EdgeWithDistance(*iter, 0));
+ }
+ DEBUG("Following edges found");
+ }
+
+
+public:
+
+ SimpleExtender(const conj_graph_pack &gp,
+ const GraphCoverageMap &cov_map,
+ shared_ptr<ExtensionChooser> ec,
+ size_t is,
+ bool investigate_short_loops,
+ bool use_short_loop_cov_resolver) :
+ LoopDetectingPathExtender(gp, cov_map, investigate_short_loops, use_short_loop_cov_resolver, is),
+ extensionChooser_(ec) {
+ }
+
+ std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
+ return extensionChooser_;
+ }
+
+ bool CanInvestigateShortLoop() const override {
+ return extensionChooser_->WeightCounterBased();
+ }
+
+ bool ResolveShortLoopByCov(BidirectionalPath& path) override {
+ LoopDetector loop_detector(&path, cov_map_);
+ size_t init_len = path.Length();
+ bool result = false;
+ while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
+ cov_loop_resolver_.ResolveShortLoop(path);
+ if (init_len == path.Length()) {
+ return result;
+ } else {
+ result = true;
+ }
+ init_len = path.Length();
+ }
+ return true;
+ }
+
+ bool ResolveShortLoopByPI(BidirectionalPath& path) override {
+ if (extensionChooser_->WeightCounterBased()) {
+ LoopResolver loop_resolver(g_, extensionChooser_->wc());
+ LoopDetector loop_detector(&path, cov_map_);
+ size_t init_len = path.Length();
+ bool result = false;
+ while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
+ loop_resolver.ResolveShortLoop(path);
+ if (init_len == path.Length()) {
+ return result;
+ } else {
+ result = true;
+ }
+ init_len = path.Length();
+ }
+ return true;
+ }
+ return false;
+ }
+
+ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override {
+ ExtensionChooser::EdgeContainer candidates;
+ return FilterCandidates(path, candidates) and AddCandidates(path, paths_storage, candidates);
+ }
+
+protected:
+ virtual bool FilterCandidates(BidirectionalPath& path, ExtensionChooser::EdgeContainer& candidates) {
+ if (path.Size() == 0) {
+ return false;
+ }
+ DEBUG("Simple grow step");
+ path.Print();
+ FindFollowingEdges(path, &candidates);
+ DEBUG("found candidates");
+ DEBUG(candidates.size())
+ if (candidates.size() == 1) {
+ LoopDetector loop_detector(&path, cov_map_);
+ if (!investigate_short_loops_ && (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
+ && extensionChooser_->WeightCounterBased()) {
+ return false;
+ }
+ }
+ DEBUG("more filtering");
+ candidates = extensionChooser_->Filter(path, candidates);
+ DEBUG("filtered candidates");
+ DEBUG(candidates.size())
+ return true;
+ }
+
+ virtual bool AddCandidates(BidirectionalPath& path, PathContainer* /*paths_storage*/, ExtensionChooser::EdgeContainer& candidates) {
+ if (candidates.size() != 1)
+ return false;
+
+ LoopDetector loop_detector(&path, cov_map_);
+ DEBUG("loop detecor");
+ if (!investigate_short_loops_ &&
+ (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
+ && extensionChooser_->WeightCounterBased()) {
+ return false;
+ }
+ DEBUG("push");
+ EdgeId eid = candidates.back().e_;
+//In 2015 modes, when an already used unique edge is encountered, it is not added and path growth stops.
+//That allows us to avoid the overlap removal hacks used earlier.
+ if (used_storage_->UniqueCheckEnabled()) {
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ return false;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+ path.PushBack(eid, candidates.back().d_);
+ DEBUG("push done");
+ return true;
+ }
+
+protected:
+ DECL_LOGGER("SimpleExtender")
+
+};
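+
+// Usage sketch (illustrative only, not part of the upstream sources): growing a seed path
+// with a SimpleExtender. Here `gp`, `path` and `chooser` are placeholders for an existing
+// conj_graph_pack, a seed BidirectionalPath and a concrete ExtensionChooser implementation.
+//
+//   GraphCoverageMap cov_map(gp.g);
+//   SimpleExtender extender(gp, cov_map, chooser,
+//                           /*is*/ 5000,
+//                           /*investigate_short_loops*/ true,
+//                           /*use_short_loop_cov_resolver*/ false);
+//   PathContainer new_paths;
+//   while (extender.MakeGrowStep(path, &new_paths)) { /* keep extending */ }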
+
+
+class MultiExtender: public SimpleExtender {
+
+protected:
+ size_t max_candidates_;
+
+public:
+
+ MultiExtender(const conj_graph_pack &gp,
+ const GraphCoverageMap &cov_map,
+ shared_ptr<ExtensionChooser> ec,
+ size_t is,
+ bool investigate_short_loops,
+ bool use_short_loop_cov_resolver,
+ size_t max_candidates = 0) :
+ SimpleExtender(gp, cov_map, ec, is, investigate_short_loops, use_short_loop_cov_resolver),
+ max_candidates_(max_candidates) {
+ }
+
+protected:
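+ // Candidate handling: a single candidate is simply appended; exactly two candidates are
+ // accepted only if they form a simple bulge (same start and end vertex), in which case the
+ // current path is extended with the first candidate and a copy of the path extended with the
+ // second one is added to paths_storage; any other number of candidates stops the growth.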
+ virtual bool AddCandidates(BidirectionalPath& path, PathContainer* paths_storage, ExtensionChooser::EdgeContainer& candidates) override {
+ if (candidates.size() == 0)
+ return false;
+
+ bool res = false;
+ LoopDetector loop_detector(&path, cov_map_);
+ DEBUG("loop detecor");
+ if (!investigate_short_loops_ &&
+ (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
+ && extensionChooser_->WeightCounterBased()) {
+ DEBUG("loop deteced");
+ return false;
+ }
+ if (candidates.size() == 1) {
+ DEBUG("push");
+ EdgeId eid = candidates.back().e_;
+ path.PushBack(eid, candidates.back().d_);
+ DEBUG("push done");
+ return true;
+ }
+ else if (candidates.size() == 2) {
+ //Check for bulge
+ auto v = g_.EdgeStart(candidates.front().e_);
+ auto u = g_.EdgeEnd(candidates.front().e_);
+ for (auto edge : candidates) {
+ if (v != g_.EdgeStart(edge.e_) || u != g_.EdgeEnd(edge.e_))
+ return false;
+ }
+
+ //Create a new path for each candidate other than the first one.
+ for (size_t i = 1; i < candidates.size(); ++i) {
+ DEBUG("push other candidates " << i);
+ BidirectionalPath *p = new BidirectionalPath(path);
+ p->PushBack(candidates[i].e_, candidates[i].d_);
+ BidirectionalPath *cp = new BidirectionalPath(p->Conjugate());
+ paths_storage->AddPair(p, cp);
+ }
+
+ DEBUG("push");
+ path.PushBack(candidates.front().e_, candidates.front().d_);
+ DEBUG("push done");
+ res = true;
+
+ if (candidates.size() > 1) {
+ DEBUG("Found " << candidates.size() << " candidates");
+ }
+ }
+
+ return res;
+ }
+
+protected:
+ DECL_LOGGER("MultiExtender")
+
+};
+
+
+class ScaffoldingPathExtender: public LoopDetectingPathExtender {
+private:
+ std::shared_ptr<ExtensionChooser> extension_chooser_;
+ ExtensionChooser::EdgeContainer sources_;
+ std::shared_ptr<GapJoiner> gap_joiner_;
+ bool avoid_rc_connections_;
+
+//When check_sink_ is set to false we can scaffold not only tip edges
+ bool check_sink_;
+
+ void InitSources() {
+ sources_.clear();
+
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.IncomingEdgeCount(g_.EdgeStart(*iter)) == 0) {
+ sources_.push_back(EdgeWithDistance(*iter, 0));
+ }
+ }
+ }
+
+ bool IsSink(EdgeId e) const {
+ return g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0;
+ }
+
+protected:
+ virtual bool GapSatisfies(int /*gap*/) const {
+ return true;
+ }
+
+ bool MakeSimpleGrowStepForChooser(BidirectionalPath& path, std::shared_ptr<ExtensionChooser> ec, bool must_overlap = false) {
+ if (path.Size() < 1 || (check_sink_ && !IsSink(path.Back()))) {
+ return false;
+ }
+ DEBUG("scaffolding:");
+ DEBUG("Simple grow step, growing path");
+ path.Print();
+ ExtensionChooser::EdgeContainer candidates = ec->Filter(path, sources_);
+ DEBUG("scaffolding candidates " << candidates.size() << " from sources " << sources_.size());
+
+ //DEBUG("Extension chooser threshold = " << ec->GetThreshold())
+ DEBUG("Candidate size = " << candidates.size())
+ if (candidates.size() == 1) {
+ if (candidates[0].e_ == path.Back()
+ || (avoid_rc_connections_ && candidates[0].e_ == g_.conjugate(path.Back()))) {
+ return false;
+ }
+ BidirectionalPath temp_path(path);
+ temp_path.PushBack(candidates[0].e_);
+ if (this->DetectCycleScaffolding(temp_path)) {
+ return false;
+ }
+
+ EdgeId eid = candidates.back().e_;
+ if (check_sink_) {
+ Gap gap = gap_joiner_->FixGap(path.Back(), candidates.back().e_, candidates.back().d_);
+ DEBUG("Gap after fixing " << gap.gap_ << " (was " << candidates.back().d_ << ")");
+ if (gap.gap_ != GapJoiner::INVALID_GAP) {
+ DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() <<
+ ", fixed gap length: " << gap.gap_ << ", trash length: " << gap.trash_previous_ << "-" <<
+ gap.trash_current_);
+
+ if (used_storage_->UniqueCheckEnabled()) {
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ return false;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+
+ if (must_overlap && GapSatisfies(gap.gap_)) {
+ DEBUG("Overlap is not large enogh")
+ return false;
+ }
+ DEBUG("Overlap is good, success")
+ path.PushBack(eid, gap);
+ return true;
+ }
+ else {
+ DEBUG("Looks like wrong scaffolding. PathId: " << path.GetId() << " path length: " <<
+ path.Length() << ", fixed gap length: " << candidates.back().d_ << ", fixed = " << gap.gap_);
+ return false;
+ }
+ }
+ else {
+ DEBUG("Gap joiners off");
+ DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length()
+ << ", fixed gap length: " << candidates.back().d_);
+
+ if (used_storage_->UniqueCheckEnabled()) {
+ if (used_storage_->IsUsedAndUnique(eid)) {
+ return false;
+ } else {
+ used_storage_->insert(eid);
+ }
+ }
+ path.PushBack(candidates.back().e_, candidates.back().d_);
+ return true;
+ }
+ }
+ DEBUG("scaffolding end");
+ return false;
+ }
+
+public:
+
+ ScaffoldingPathExtender(const conj_graph_pack &gp,
+ const GraphCoverageMap &cov_map,
+ std::shared_ptr<ExtensionChooser> extension_chooser,
+ std::shared_ptr<GapJoiner> gap_joiner,
+ size_t is,
+ bool investigate_short_loops,
+ bool avoid_rc_connections,
+ bool check_sink = true):
+ LoopDetectingPathExtender(gp, cov_map, investigate_short_loops, false, is),
+ extension_chooser_(extension_chooser),
+ gap_joiner_(gap_joiner),
+ avoid_rc_connections_(avoid_rc_connections),
+ check_sink_(check_sink)
+ {
+ InitSources();
+ }
+
+ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
+ return MakeSimpleGrowStepForChooser(path, extension_chooser_);
+ }
+
+ bool ResolveShortLoopByCov(BidirectionalPath&) override {
+ return false;
+ }
+
+ bool ResolveShortLoopByPI(BidirectionalPath&) override {
+ return false;
+ }
+
+ std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
+ return extension_chooser_;
+ }
+
+protected:
+ DECL_LOGGER("ScaffoldingPathExtender");
+};
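+
+// Usage sketch (illustrative only): one scaffolding step over estimated gaps. `gp`, `cov_map`,
+// `path`, `chooser` and `joiner` are placeholders for an existing graph pack, coverage map,
+// seed path and concrete ExtensionChooser / GapJoiner instances.
+//
+//   ScaffoldingPathExtender scaffolder(gp, cov_map, chooser, joiner,
+//                                      /*is*/ 5000,
+//                                      /*investigate_short_loops*/ false,
+//                                      /*avoid_rc_connections*/ true);
+//   scaffolder.MakeSimpleGrowStep(path, nullptr);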
+
+
+class RNAScaffoldingPathExtender: public ScaffoldingPathExtender {
+ std::shared_ptr<ExtensionChooser> strict_extension_chooser_;
+
+ int min_overlap_;
+
+protected:
+ //A gap larger than k - min_overlap_ corresponds to an implied overlap shorter than min_overlap_,
+ //so returning true here means the overlap requirement is violated (see its use with must_overlap
+ //in MakeSimpleGrowStepForChooser).
+ bool GapSatisfies(int gap) const override {
+ return gap > (int) g_.k() - min_overlap_;
+ }
+
+public:
+
+ RNAScaffoldingPathExtender(const conj_graph_pack &gp,
+ const GraphCoverageMap &cov_map,
+ std::shared_ptr<ExtensionChooser> extension_chooser,
+ std::shared_ptr<ExtensionChooser> strict_extension_chooser,
+ std::shared_ptr<GapJoiner> gap_joiner,
+ size_t is,
+ bool investigate_short_loops,
+ int min_overlap = 0):
+ ScaffoldingPathExtender(gp, cov_map, extension_chooser, gap_joiner, is, investigate_short_loops, true),
+ strict_extension_chooser_(strict_extension_chooser), min_overlap_(min_overlap) {}
+
+
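+ // First try the regular chooser while requiring a sufficiently large overlap
+ // (must_overlap = true); if that fails, fall back to the strict chooser without
+ // the overlap requirement.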
+ bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
+ return MakeSimpleGrowStepForChooser(path, GetExtensionChooser(), true) ||
+ MakeSimpleGrowStepForChooser(path, strict_extension_chooser_);
+ }
+
+};
+
+}
diff --git a/src/common/modules/path_extend/path_filter.hpp b/src/common/modules/path_extend/path_filter.hpp
new file mode 100644
index 0000000..b012dd3
--- /dev/null
+++ b/src/common/modules/path_extend/path_filter.hpp
@@ -0,0 +1,186 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * path_filter.hpp
+ *
+ * Created on: Mar 14, 2012
+ * Author: andrey
+ */
+
+#ifndef PATH_FILTER_HPP_
+#define PATH_FILTER_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+
+namespace path_extend {
+
+class CopyOnWritePathFilter {
+
+protected:
+ const Graph& g;
+
+public:
+ CopyOnWritePathFilter(const Graph& g_): g(g_) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) = 0;
+
+ PathContainer filter(PathContainer& paths) {
+ PathContainer result;
+
+ for (size_t i = 0; i < paths.size(); ++i) {
+ if (predicate(*paths.Get(i)) || predicate(*paths.GetConjugate(i))) {
+ result.AddPair(paths.Get(i), paths.GetConjugate(i));
+ }
+ }
+
+ return result;
+ }
+
+};
+
+
+class IdFilter: public CopyOnWritePathFilter {
+
+protected:
+ std::set<size_t> ids;
+
+public:
+
+ IdFilter(const Graph& g_, std::set<size_t> ids_): CopyOnWritePathFilter(g_), ids(ids_) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ return ids.count(path.GetId()) > 0;
+ }
+};
+
+
+class DuplicateFilter {
+
+protected:
+ const Graph& g;
+
+public:
+ DuplicateFilter(const Graph& g_): g(g_) {
+ }
+
+ PathContainer filter(PathContainer& paths) {
+ PathContainer result;
+
+ for (size_t i = 0; i < paths.size(); ++i) {
+ bool duplicate = false;
+ for (size_t j = 0; j < result.size(); ++j) {
+ if (result[j] == paths[j])
+ duplicate = true;
+ }
+ if (!duplicate) {
+ result.AddPair(paths.Get(i), paths.GetConjugate(i));
+ }
+ }
+
+ return result;
+ }
+
+};
+
+class ErasingPathFilter {
+
+protected:
+ const Graph& g;
+
+public:
+ ErasingPathFilter(const Graph& g_): g(g_) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) = 0;
+
+ void filter(PathContainer& paths) {
+ for (PathContainer::Iterator iter = paths.begin(); iter != paths.end(); ) {
+ if (predicate(*iter.get()) || predicate(*iter.getConjugate())) {
+ iter = paths.erase(iter);
+ }
+ else {
+ ++iter;
+ }
+ }
+ }
+
+};
+
+
+class CoveragePathFilter: public ErasingPathFilter {
+
+protected:
+ double minCoverage;
+
+public:
+ CoveragePathFilter(Graph& g_, double cov): ErasingPathFilter(g_), minCoverage(cov) {
+
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (math::ls(g.coverage(path[i]), minCoverage)) {
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+
+class LengthPathFilter: public ErasingPathFilter {
+
+protected:
+ size_t minLength;
+
+public:
+ LengthPathFilter(const Graph& g_, size_t len): ErasingPathFilter(g_), minLength(len) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ return path.Length() <= minLength;
+ }
+};
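+
+// Usage sketch (illustrative only): erasing paths of length at most 500 bp from a container.
+//
+//   LengthPathFilter length_filter(g, 500);
+//   length_filter.filter(paths);   // removes the short paths in place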
+
+
+class IsolatedPathFilter: public ErasingPathFilter {
+
+protected:
+ size_t min_length_;
+
+ double min_cov_;
+
+public:
+ IsolatedPathFilter(const Graph& g_, size_t min_length, double min_cov = 10000000.0):
+ ErasingPathFilter(g_),
+ min_length_(min_length),
+ min_cov_(min_cov) {
+ }
+
+ virtual bool predicate(BidirectionalPath& path) {
+ if (path.Empty())
+ return true;
+
+ if (path.Size() <= 2) {
+ auto v1 = g.EdgeStart(path.Front());
+ auto v2 = g.EdgeEnd(path.Back());
+
+ return g.IncomingEdgeCount(v1) == 0 &&
+ g.OutgoingEdgeCount(v2) == 0 &&
+ path.Length() < min_length_ &&
+ math::ls(path.Coverage(), min_cov_);
+ }
+ return false;
+ }
+};
+
+}
+
+#endif /* PATH_FILTER_HPP_ */
diff --git a/src/common/modules/path_extend/path_visualizer.hpp b/src/common/modules/path_extend/path_visualizer.hpp
new file mode 100644
index 0000000..b11d4c2
--- /dev/null
+++ b/src/common/modules/path_extend/path_visualizer.hpp
@@ -0,0 +1,172 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * path_visualizer.hpp
+ *
+ * Created on: Mar 22, 2012
+ * Author: andrey
+ */
+
+#ifndef PATH_VISUALIZER_HPP_
+#define PATH_VISUALIZER_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+template<class Graph>
+class PathGraphLabeler : public visualization::graph_labeler::AbstractGraphLabeler<Graph> {
+ typedef visualization::graph_labeler::AbstractGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ std::map<EdgeId, std::string> labels_;
+
+public:
+ PathGraphLabeler(const Graph& g, const PathContainer& paths) : base(g) {
+ for(size_t i = 0; i < paths.size(); ++i) {
+ BidirectionalPath * path = paths.Get(i);
+ for (size_t j = 0; j < path->Size(); ++j) {
+ if (labels_.count(path->At(j)) > 0) {
+ labels_[path->At(j)] += ", ";
+ }
+ labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
+ }
+
+ path = paths.GetConjugate(i);
+ for (size_t j = 0; j < path->Size(); ++j) {
+ if (labels_.count(path->At(j)) > 0) {
+ labels_[path->At(j)] += ", ";
+ }
+ labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
+ }
+ }
+ }
+
+ virtual std::string label(VertexId /*vertexId*/) const {
+ return "";
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ auto label = labels_.find(edgeId);
+ return label == labels_.end() ? "" : label->second;
+ }
+};
+
+
+class PathVisualizer {
+
+protected:
+ bool writeLength;
+ bool writePos;
+
+public:
+
+ PathVisualizer(): writeLength(true), writePos(true) {
+
+ }
+
+ void writeGraphWithPathsSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name, const PathContainer& paths) const {
+ INFO("Visualizing graph " << graph_name << " to file " << file_name);
+ std::fstream filestr;
+ filestr.open(file_name.c_str(), std::fstream::out);
+
+ visualization::graph_labeler::StrGraphLabeler<Graph> str_labeler(gp.g);
+ PathGraphLabeler<Graph> path_labeler(gp.g, paths);
+ visualization::graph_labeler::CoverageGraphLabeler<Graph> cov_labler(gp.g);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+
+ visualization::graph_labeler::CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, path_labeler, pos_labeler);
+ shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> colorer;
+ if (gp.index.IsAttached()) {
+ colorer = stats::DefaultColorer(gp);
+ } else {
+ colorer = visualization::graph_colorer::DefaultColorer(gp.g);
+ }
+
+ visualization::visualizers::ComponentVisualizer<Graph> visualizer(gp.g, false);
+ visualization::vertex_linker::EmptyGraphLinker<Graph> linker;
+ visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
+ filestr.close();
+ INFO("Visualizing graph done");
+ }
+
+ void writeGraphSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name) const{
+ INFO("Visualizing graph " << graph_name << " to file " << file_name);
+ std::fstream filestr;
+ filestr.open(file_name.c_str(), std::fstream::out);
+
+ visualization::graph_labeler::StrGraphLabeler<Graph> str_labeler(gp.g);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+ visualization::graph_labeler::CoverageGraphLabeler<Graph> cov_labler(gp.g);
+ visualization::graph_labeler::CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, pos_labeler);
+
+ shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> colorer;
+
+ if (gp.index.IsAttached()) {
+ colorer = stats::DefaultColorer(gp);
+ } else {
+ Path<EdgeId> empty;
+ colorer = visualization::graph_colorer::DefaultColorer(gp.g, empty, empty);
+ }
+
+ visualization::visualizers::ComponentVisualizer<Graph> visualizer(gp.g, false);
+ visualization::vertex_linker::EmptyGraphLinker<Graph> linker;
+ visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
+ filestr.close();
+ INFO("Visualizing graph done");
+ }
+
+ void writeGraphSimple(const Graph& g, const string& file_name, const string& graph_name) const{
+ INFO("Visualizing graph " << graph_name << " to file " << file_name);
+ std::fstream filestr;
+ filestr.open(file_name.c_str(), std::fstream::out);
+
+ visualization::graph_labeler::StrGraphLabeler<Graph> str_labeler(g);
+ visualization::graph_labeler::CoverageGraphLabeler<Graph> cov_labler(g);
+ visualization::graph_labeler::CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler);
+
+ shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> colorer;
+
+ Path<EdgeId> empty;
+ colorer = visualization::graph_colorer::DefaultColorer(g, empty, empty);
+
+ visualization::visualizers::ComponentVisualizer<Graph> visualizer(g, false);
+ visualization::vertex_linker::EmptyGraphLinker<Graph> linker;
+ visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
+ filestr.close();
+ INFO("Visualizing graph done");
+ }
+
+ bool isWriteLength() const {
+ return writeLength;
+ }
+
+ bool isWritePos() const {
+ return writePos;
+ }
+
+ void setWriteLength(bool writeLength) {
+ this->writeLength = writeLength;
+ }
+
+ void setWritePos(bool writePos) {
+ this->writePos = writePos;
+ }
+};
+
+}
+
+#endif /* PATH_VISUALIZER_HPP_ */
diff --git a/src/common/modules/path_extend/pe_config_struct.cpp b/src/common/modules/path_extend/pe_config_struct.cpp
new file mode 100644
index 0000000..cccb95e
--- /dev/null
+++ b/src/common/modules/path_extend/pe_config_struct.cpp
@@ -0,0 +1,211 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pe_config_struct.hpp"
+#include "pipeline/config_common.hpp"
+
+namespace path_extend {
+
+
+void load(scaffolding_mode &sm, boost::property_tree::ptree const& pt, std::string const& key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ std::string ep = pt.get<std::string>(key);
+ sm = pe_config::scaffolding_mode_id(ep);
+ }
+}
+
+void load(pe_config::ParamSetT::ScaffoldGraphParamsT& sg, boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(sg.construct, pt, "construct" );
+ load(sg.output, pt, "output" );
+ load(sg.always_add, pt, "always_add" );
+ load(sg.never_add, pt, "never_add" );
+ load(sg.relative_threshold, pt, "relative_threshold" );
+ load(sg.use_graph_connectivity, pt, "use_graph_connectivity");
+ load(sg.max_path_length, pt, "max_path_length" );
+}
+
+void load(pe_config::OutputParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(o.write_overlaped_paths, pt, "write_overlaped_paths" , complete);
+ load(o.write_paths, pt, "write_paths" , complete);
+}
+
+void load(pe_config::VisualizeParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(o.print_overlaped_paths, pt, "print_overlaped_paths" , complete);
+ load(o.print_paths, pt, "print_paths" , complete);
+}
+
+void load(pe_config::ParamSetT::ExtensionOptionsT& es,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(es.use_default_single_threshold, pt, "use_default_single_threshold", complete);
+ load(es.priority_coeff, pt, "priority_coeff", complete);
+ load(es.weight_threshold, pt, "weight_threshold", complete);
+ load(es.single_threshold, pt, "single_threshold", complete);
+ load(es.max_repeat_length, pt, "max_repeat_length", complete);
+}
+
+
+void load(pe_config::ParamSetT::CoordinatedCoverageT& coord_cov,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(coord_cov.max_edge_length_in_repeat, pt, "max_edge_length_repeat", complete);
+ load(coord_cov.delta, pt, "delta", complete);
+ load(coord_cov.min_path_len, pt, "min_path_len", complete);
+}
+
+void load(pe_config::ParamSetT::ScaffolderOptionsT& so,
+ boost::property_tree::ptree const& pt, bool complete)
+{
+ using config_common::load;
+ load(so.enabled, pt, "enabled" , complete);
+ load(so.cutoff , pt, "cutoff", complete);
+ load(so.hard_cutoff , pt, "hard_cutoff", complete);
+ load(so.rel_cutoff , pt, "rel_cutoff", complete);
+ load(so.sum_threshold , pt, "sum_threshold", complete);
+
+ load(so.cluster_info , pt, "cluster_info", complete);
+ load(so.cl_threshold , pt, "cl_threshold", complete);
+
+ load(so.use_la_gap_joiner , pt, "use_la_gap_joiner", complete);
+ load(so.min_gap_score , pt, "min_gap_score", complete);
+ load(so.max_must_overlap , pt, "max_must_overlap", complete);
+ load(so.max_can_overlap , pt, "max_can_overlap", complete);
+ load(so.short_overlap , pt, "short_overlap", complete);
+ load(so.artificial_gap , pt, "artificial_gap", complete);
+ load(so.use_old_score , pt, "use_old_score", complete);
+ load(so.min_overlap_length, pt, "min_overlap_length", complete);
+ load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete);
+ load(so.flank_multiplication_coefficient, pt, "flank_multiplication_coefficient", complete);
+
+ load(so.var_coeff , pt, "var_coeff", complete);
+ load(so.basic_overlap_coeff, pt, "basic_overlap_coeff", complete);
+
+ if (pt.count("min_overlap_for_rna_scaffolding")) {
+ VERIFY_MSG(!so.min_overlap_for_rna_scaffolding, "Option can be loaded only once");
+ so.min_overlap_for_rna_scaffolding.reset(0);
+ load(*so.min_overlap_for_rna_scaffolding, pt, "min_overlap_for_rna_scaffolding");
+ }
+}
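+
+// Illustrative "scaffolder" section as it could appear in an .info config file; the keys
+// mirror the load() calls above, the values are placeholders only:
+//
+//   scaffolder {
+//       enabled        true
+//       cutoff         2
+//       min_gap_score  0.7
+//       min_overlap_for_rna_scaffolding 10
+//   }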
+
+
+void load(pe_config::ParamSetT::PathFiltrationT& pf,
+ boost::property_tree::ptree const& pt, bool complete)
+{
+ using config_common::load;
+ load(pf.enabled , pt, "enabled" , complete);
+ if (pf.enabled) {
+ load(pf.min_length , pt, "min_length" , complete);
+ load(pf.isolated_min_length , pt, "isolated_min_length" , complete);
+ load(pf.min_length_for_low_covered , pt, "min_length_for_low_covered" , complete);
+ load(pf.min_coverage , pt, "min_coverage" , complete);
+ }
+}
+
+void load(pe_config::ParamSetT::GenomeConsistencyCheckerParamsT& gcc,
+ boost::property_tree::ptree const& pt, bool complete)
+{
+ using config_common::load;
+ load(gcc.max_gap , pt, "max_gap" , complete);
+ load(gcc.relative_max_gap , pt, "relative_max_gap" , complete);
+}
+
+void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(p.sm, pt, "scaffolding_mode", complete);
+ load(p.normalize_weight, pt, "normalize_weight", complete);
+ load(p.cut_all_overlaps, pt, "cut_all_overlaps", complete);
+ load(p.remove_overlaps, pt, "remove_overlaps", complete);
+ load(p.multi_path_extend, pt, "multi_path_extend", complete);
+ load(p.split_edge_length, pt, "split_edge_length", complete);
+ load(p.extension_options, pt, "extension_options", complete);
+ load(p.mate_pair_options, pt, "mate_pair_options", complete);
+ load(p.scaffolder_options, pt, "scaffolder", complete);
+ load(p.coordinated_coverage, pt, "coordinated_coverage", complete);
+ load(p.use_coordinated_coverage, pt, "use_coordinated_coverage", complete);
+ load(p.scaffolding2015, pt, "scaffolding2015", complete);
+ load(p.scaffold_graph_params, pt, "scaffold_graph", complete);
+ load(p.path_filtration, pt, "path_cleaning", complete);
+ load(p.genome_consistency_checker, pt, "genome_consistency_checker", complete);
+ load(p.uniqueness_analyser, pt, "uniqueness_analyser", complete);
+ load(p.loop_traversal, pt, "loop_traversal", complete);
+}
+
+void load(pe_config::LongReads& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.filtering, pt, "filtering", complete);
+ load(p.weight_priority, pt, "weight_priority", complete);
+ load(p.unique_edge_priority, pt, "unique_edge_priority", complete);
+ load(p.min_significant_overlap, pt, "min_significant_overlap", complete);
+
+}
+
+void load(pe_config::ParamSetT::LoopTraversalParamsT& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.min_edge_length, pt, "min_edge_length", complete);
+ load(p.max_component_size, pt, "max_component_size", complete);
+ load(p.max_path_length, pt, "max_path_length", complete);
+}
+
+void load(pe_config::ParamSetT::UniquenessAnalyserParamsT& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.enabled, pt, "enabled", complete);
+ load(p.nonuniform_coverage_variation, pt, "nonuniform_coverage_variation", complete);
+ load(p.uniformity_fraction_threshold, pt, "uniformity_fraction_threshold", complete);
+ load(p.unique_coverage_variation, pt, "unique_coverage_variation", complete);
+}
+
+void load(pe_config::ParamSetT::Scaffolding2015& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.unique_length_lower_bound, pt, "unique_length_lower_bound", complete);
+ load(p.unique_length_upper_bound, pt, "unique_length_upper_bound", complete);
+ load(p.unique_length_step, pt, "unique_length_step", complete);
+ load(p.graph_connectivity_max_edges, pt, "graph_connectivity_max_edges", complete);
+ load(p.relative_weight_cutoff, pt, "relative_weight_cutoff", complete);
+}
+
+void load(pe_config::AllLongReads& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.pacbio_reads, pt, "pacbio_reads", complete);
+ load(p.single_reads, pt, "single_reads", complete);
+ load(p.contigs, pt, "contigs", complete);
+ load(p.meta_contigs, pt, "meta_untrusted_contigs", complete);
+}
+
+void load(pe_config::MainPEParamsT& p, boost::property_tree::ptree const& pt,
+ bool complete) {
+ using config_common::load;
+ load(p.debug_output, pt, "debug_output", complete);
+ load(p.output, pt, "output", complete);
+ load(p.viz, pt, "visualize", complete);
+ load(p.param_set, pt, "params", complete);
+ load(p.long_reads, pt, "long_reads", complete);
+ if (!p.debug_output) {
+ p.output.DisableAll();
+ p.viz.DisableAll();
+ }
+ p.etc_dir = "path_extend";
+}
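+
+// Illustrative top-level layout of the corresponding config section (keys mirror the load()
+// calls above; nested blocks are abbreviated):
+//
+//   debug_output false
+//   output       { ... }
+//   visualize    { ... }
+//   params       { ... }
+//   long_reads   { ... }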
+
+//// main long contigs config load function
+//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete) {
+// using config_common::load;
+//
+// load(pe_cfg.dataset_name , pt, "dataset", complete);
+// load(pe_cfg.params , pt, "pe_params", complete);
+//}
+
+};
+
diff --git a/src/common/modules/path_extend/pe_config_struct.hpp b/src/common/modules/path_extend/pe_config_struct.hpp
new file mode 100644
index 0000000..a5b161f
--- /dev/null
+++ b/src/common/modules/path_extend/pe_config_struct.hpp
@@ -0,0 +1,246 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * lc_config_struct.hpp
+ *
+ * Created on: Aug 16, 2011
+ * Author: Alexey.Gurevich
+ */
+
+#ifndef LC_CONFIG_STRUCT_HPP_
+#define LC_CONFIG_STRUCT_HPP_
+
+#include "pipeline/config_singl.hpp"
+#include "utils/cpp_utils.hpp"
+
+#include <boost/optional.hpp>
+#include <boost/property_tree/ptree_fwd.hpp>
+#include <boost/bimap.hpp>
+
+#include <string>
+#include <vector>
+
+namespace path_extend {
+
+enum scaffolding_mode {
+ sm_old,
+ sm_2015,
+ sm_combined,
+ sm_old_pe_2015
+};
+
+//Both of these functions always return true, right?
+//still necessary?
+inline bool IsScaffolder2015Enabled(const scaffolding_mode mode) {
+ return (mode == sm_old_pe_2015 || mode == sm_2015 || mode == sm_combined);
+}
+
+inline bool IsOldPEEnabled(const scaffolding_mode mode) {
+ return (mode == sm_old_pe_2015 || mode == sm_old || mode == sm_combined);
+}
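+
+//Note: with the four modes defined above the answer is no; e.g. IsScaffolder2015Enabled(sm_old)
+//and IsOldPEEnabled(sm_2015) are both false, although every mode enables at least one of the two.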
+
+// struct for path extend subproject's configuration file
+struct pe_config {
+ typedef boost::bimap<std::string, scaffolding_mode> scaffolding_mode_id_mapping;
+
+ static const scaffolding_mode_id_mapping FillSMInfo() {
+ scaffolding_mode_id_mapping::value_type info[] = {
+ scaffolding_mode_id_mapping::value_type("old", sm_old),
+ scaffolding_mode_id_mapping::value_type("2015", sm_2015),
+ scaffolding_mode_id_mapping::value_type("combined", sm_combined),
+ scaffolding_mode_id_mapping::value_type("old_pe_2015", sm_old_pe_2015)
+ };
+
+ return scaffolding_mode_id_mapping(info, utils::array_end(info));
+ }
+
+ static const scaffolding_mode_id_mapping &scaffolding_mode_info() {
+ static scaffolding_mode_id_mapping scaffolding_mode_info = FillSMInfo();
+ return scaffolding_mode_info;
+ }
+
+ static const std::string &scaffolding_mode_name(scaffolding_mode sm) {
+ auto it = scaffolding_mode_info().right.find(sm);
+ VERIFY_MSG(it != scaffolding_mode_info().right.end(),
+ "No name for scaffolding mode id = " << sm);
+
+ return it->second;
+ }
+
+ static scaffolding_mode scaffolding_mode_id(std::string name) {
+ auto it = scaffolding_mode_info().left.find(name);
+ VERIFY_MSG(it != scaffolding_mode_info().left.end(),
+ "There is no scaffolding mode with name = " << name);
+
+ return it->second;
+ }
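+
+ // Usage sketch (illustrative only): round-tripping between mode names and ids.
+ //   scaffolding_mode sm = pe_config::scaffolding_mode_id("old_pe_2015"); // sm_old_pe_2015
+ //   const std::string &name = pe_config::scaffolding_mode_name(sm);      // "old_pe_2015"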
+
+ struct OutputParamsT {
+ bool write_overlaped_paths;
+ bool write_paths;
+
+ void DisableAll() {
+ write_overlaped_paths = false;
+ write_paths = false;
+ }
+ };
+
+
+ struct VisualizeParamsT {
+ bool print_overlaped_paths;
+ bool print_paths;
+
+ void DisableAll() {
+ print_overlaped_paths = false;
+ print_paths = false;
+ }
+ };
+
+ struct ParamSetT {
+ scaffolding_mode sm;
+
+ bool normalize_weight;
+ size_t split_edge_length;
+
+ bool multi_path_extend;
+ bool remove_overlaps;
+ bool cut_all_overlaps;
+
+ struct ExtensionOptionsT {
+ bool use_default_single_threshold;
+ double single_threshold;
+ double weight_threshold;
+ double priority_coeff;
+ size_t max_repeat_length;
+ } extension_options;
+
+ ExtensionOptionsT mate_pair_options;
+
+
+ struct ScaffolderOptionsT {
+ bool enabled;
+ int cutoff;
+ int hard_cutoff;
+ double rel_cutoff;
+ double sum_threshold;
+
+ bool cluster_info;
+ double cl_threshold;
+
+ bool use_la_gap_joiner;
+ double min_gap_score;
+ double max_must_overlap;
+ double max_can_overlap;
+ int short_overlap;
+ size_t artificial_gap;
+
+ bool use_old_score;
+
+ double var_coeff;
+ double basic_overlap_coeff;
+
+ size_t min_overlap_length;
+ double flank_addition_coefficient;
+ double flank_multiplication_coefficient;
+
+ boost::optional<int> min_overlap_for_rna_scaffolding;
+ } scaffolder_options;
+
+ struct PathFiltrationT {
+ bool enabled;
+ size_t min_length;
+ size_t isolated_min_length;
+ size_t min_length_for_low_covered;
+ double min_coverage;
+ } path_filtration;
+
+
+ bool use_coordinated_coverage;
+
+ struct CoordinatedCoverageT {
+ size_t max_edge_length_in_repeat;
+ double delta;
+ size_t min_path_len;
+ } coordinated_coverage;
+
+ struct Scaffolding2015 {
+ double relative_weight_cutoff;
+
+ size_t unique_length_upper_bound;
+ size_t unique_length_lower_bound;
+ size_t unique_length_step;
+
+ size_t graph_connectivity_max_edges;
+ } scaffolding2015;
+
+ struct ScaffoldGraphParamsT {
+ bool construct;
+ bool output;
+ size_t always_add;
+ size_t never_add;
+ double relative_threshold;
+ bool use_graph_connectivity;
+ size_t max_path_length;
+ } scaffold_graph_params;
+
+ struct GenomeConsistencyCheckerParamsT {
+ size_t max_gap;
+ double relative_max_gap;
+ } genome_consistency_checker;
+
+ struct LoopTraversalParamsT {
+ size_t min_edge_length;
+ size_t max_component_size;
+ size_t max_path_length;
+ } loop_traversal;
+
+ struct UniquenessAnalyserParamsT {
+ bool enabled;
+ double unique_coverage_variation;
+
+ double nonuniform_coverage_variation;
+ double uniformity_fraction_threshold;
+ } uniqueness_analyser;
+
+ };
+
+ struct LongReads {
+ double filtering;
+ double weight_priority;
+ double unique_edge_priority;
+ size_t min_significant_overlap;
+ };
+
+ struct AllLongReads {
+ LongReads single_reads;
+ LongReads pacbio_reads;
+ LongReads contigs;
+ LongReads meta_contigs;
+ };
+
+
+ struct MainPEParamsT {
+ bool debug_output;
+ std::string etc_dir;
+
+ OutputParamsT output;
+ VisualizeParamsT viz;
+ ParamSetT param_set;
+ AllLongReads long_reads;
+ }; // params;
+
+};
+
+void load(pe_config::ParamSetT &p, boost::property_tree::ptree const &pt, bool complete = true);
+void load(pe_config::MainPEParamsT &p, boost::property_tree::ptree const &pt, bool complete = true);
+
+}
+
+//typedef config_common::config<path_extend::pe_config> pe_cfg;
+
+#endif /* LC_CONFIG_STRUCT_HPP_ */
diff --git a/src/common/modules/path_extend/pe_resolver.hpp b/src/common/modules/path_extend/pe_resolver.hpp
new file mode 100644
index 0000000..dfbd4f3
--- /dev/null
+++ b/src/common/modules/path_extend/pe_resolver.hpp
@@ -0,0 +1,577 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * pe_resolver.hpp
+ *
+ * Created on: Mar 12, 2012
+ * Author: andrey
+ */
+
+#ifndef PE_RESOLVER_HPP_
+#define PE_RESOLVER_HPP_
+
+#include "path_extender.hpp"
+
+namespace path_extend {
+
+
+class SimpleOverlapRemover {
+
+public:
+ SimpleOverlapRemover(const Graph& g, GraphCoverageMap& cm)
+ : g_(g), coverage_map_(cm) {
+ }
+
+ void RemoveOverlaps(PathContainer& paths) const {
+ for (size_t i = 0; i < paths.size(); i++) {
+ FindAndRemovePathOverlap(paths, paths.Get(i));
+ FindAndRemovePathOverlap(paths, paths.GetConjugate(i));
+ }
+ }
+
+ size_t NonUniqueCommon(BidirectionalPath * path, int pos1, int pos2) {
+ size_t answer = 0;
+ while (pos1 >= 0) {
+ if (path->At(pos1) == path->At(pos2)) {
+ pos1--;
+ pos2--;
+ answer++;
+ } else {
+ break;
+ }
+ }
+ return answer;
+ }
+
+ size_t MaximumNonUniqueSuffix(BidirectionalPath * path) {
+ if (path->Size() == 0) {
+ return 0;
+ }
+
+ size_t answer = 0;
+ EdgeId back = path->Back();
+ vector<size_t> all_pos = path->FindAll(back);
+ for (size_t i = 0; i < all_pos.size() - 1; ++i) {
+ answer = std::max(answer, NonUniqueCommon(path, (int) all_pos[i], (int) path->Size() - 1));
+ }
+ return answer;
+ }
+
+ void CutNonUniqueSuffix(PathContainer& paths) {
+ vector<pair<BidirectionalPath *, BidirectionalPath *>> tmp_paths(paths.begin(), paths.end());
+ for (auto it = tmp_paths.begin(); it != tmp_paths.end(); ++it) {
+ BidirectionalPath * path1 = it->first;
+ BidirectionalPath * path2 = it->second;
+ size_t longest_suffix1 = MaximumNonUniqueSuffix(path1);
+ path1->PopBack(longest_suffix1);
+ size_t longest_suffix2 = MaximumNonUniqueSuffix(path2);
+ path2->PopBack(longest_suffix2);
+ }
+ }
+
+ void CutPseudoSelfConjugatePaths(PathContainer& paths) {
+ vector<pair<BidirectionalPath *, BidirectionalPath *>> tmp_paths(paths.begin(), paths.end());
+ for (auto it = tmp_paths.begin(); it != tmp_paths.end(); ++it) {
+ BidirectionalPath * path1 = it->first;
+ BidirectionalPath * path2 = it->second;
+ bool ups = false;
+ if(path1 != path2) {
+ size_t last = 0;
+ while(last < path1->Size() && path1->operator [](last) == path2->operator [](last)) {
+ last++;
+ }
+ if(last > 0) {
+ AddOverlap(paths, path1, 0, last - 1);
+ path1->PopBack(last);
+ path2->PopBack(last);
+ }
+ }
+ if(ups) path1->Print();
+ }
+ }
+
+ void RemoveSimilarPaths(PathContainer& paths, size_t min_edge_len, size_t max_path_diff, bool del_only_equal, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
+ DEBUG("== Removing similar paths ==");
+ DEBUG("Min edge len " << min_edge_len << ", max path diff " << max_path_diff)
+ DEBUG("Only equal " << del_only_equal << ", subpaths " << del_subpaths << ", starts " << del_begins << ", all " << del_all << ", add starts " << add_overlap_begins);
+ std::vector<EdgeId> edges = GetSortedEdges();
+ for (size_t edgeIndex = 0; edgeIndex < edges.size(); ++edgeIndex) {
+ EdgeId edge = edges.at(edgeIndex);
+ BidirectionalPathSet cov_paths = coverage_map_.GetCoveringPaths(edge);
+ std::vector<BidirectionalPath*> cov_vect(cov_paths.begin(), cov_paths.end());
+ std::sort(cov_vect.begin(), cov_vect.end(), PathIdCompare);
+ for (size_t vect_i = 0; vect_i < cov_vect.size(); ++vect_i) {
+ BidirectionalPath* path1 = cov_vect.at(vect_i);
+ if (cov_paths.find(path1) == cov_paths.end()) {
+ continue;
+ }
+ for (size_t vect_i1 = vect_i + 1; vect_i1 < cov_vect.size(); ++vect_i1) {
+ BidirectionalPath* path2 = cov_vect.at(vect_i1);
+ if (path1 == path2 || path1 == path2->GetConjPath()) {
+ continue;
+ }
+ if (cov_paths.find(path2) == cov_paths.end())
+ continue;
+ if ((*path1) == (*path2)) {
+ if (path2->IsOverlap()) {
+ path1->SetOverlap(true);
+ }
+ DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->Clear();
+ cov_paths = coverage_map_.GetCoveringPaths(edge);
+ continue;
+ }
+ if (g_.length(edge) <= min_edge_len || path1->IsOverlap() || path2->IsOverlap() || del_only_equal) {
+ continue;
+ }
+ CompareAndCut(paths, edge, path1, path2, max_path_diff,
+ del_subpaths, del_begins, del_all, add_overlap_begins);
+ cov_paths = coverage_map_.GetCoveringPaths(edge);
+ }
+ }
+ }
+ DEBUG("== Emd removing similar paths ==");
+ }
+
+private:
+
+ void SubscribeCoverageMap(BidirectionalPath* path) const {
+ path->Subscribe(&coverage_map_);
+ for (size_t i = 0; i < path->Size(); ++i) {
+ coverage_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
+ }
+ }
+
+ void CompareAndCut(PathContainer& paths, EdgeId edge, BidirectionalPath* path1, BidirectionalPath* path2,
+ size_t max_path_diff,
+ bool del_subpaths, bool del_begins,
+ bool del_all, bool add_overlap_begins) const {
+ vector<size_t> positions1 = path1->FindAll(edge);
+ vector<size_t> positions2 = path2->FindAll(edge);
+ size_t i1 = 0;
+ size_t i2 = 0;
+ bool renewed = false;
+ while (i1 < positions1.size()) {
+ while (i2 < positions2.size()) {
+ DEBUG("CompareAndCutFromPos paths " << g_.int_id(edge));
+ CompareAndCutFromPos(paths, path1, (int) positions1[i1], path2,
+ (int) positions2[i2], max_path_diff,
+ del_subpaths, del_begins, del_all, add_overlap_begins);
+
+ if (positions1[i1] >= path1->Size() || path1->At(positions1[i1]) != edge || positions2[i2] >= path2->Size() || path2->At(positions2[i2]) != edge) {
+ vector<size_t> new_positions1 = path1->FindAll(edge);
+ vector<size_t> new_positions2 = path2->FindAll(edge);
+
+ if (new_positions1.size() == positions1.size() && new_positions2.size() == positions2.size()) {
+ return;
+ }
+ else {
+ positions1 = new_positions1;
+ positions2 = new_positions2;
+ i1 = 0;
+ i2 = 0;
+ renewed = true;
+ break;
+ }
+ ++i2;
+ }
+ ++i2;
+ }
+
+ if (renewed) {
+ renewed = false;
+ continue;
+ }
+ ++i1;
+ }
+ }
+
+ void CompareAndCutFromPos(PathContainer& paths, BidirectionalPath* path1, int pos1,
+ BidirectionalPath* path2, int pos2,
+ size_t max_path_diff,
+ bool delete_subpaths, bool delete_begins,
+ bool delete_all, bool add_overlap_begins) const {
+ int last2 = pos2;
+ int last1 = pos1;
+ if (last1 >= (int) path1->Size() || last2 >= (int) path2->Size()) {
+ return;
+ }
+ vector<int> other_path_end;
+ pair<int, int> posRes = ComparePaths(last1, last2, *path1, *path2, max_path_diff);
+ last1 = posRes.first;
+ last2 = posRes.second;
+ BidirectionalPath* conj1 = path1->GetConjPath();
+ BidirectionalPath* conj2 = path2->GetConjPath();
+ size_t first1 = conj1->Size() - pos1 - 1;
+ size_t first2 = conj2->Size() - pos2 - 1;
+ posRes = ComparePaths(first1, first2, *conj1, *conj2, max_path_diff);
+ first2 = conj2->Size() - posRes.second - 1;
+ first1 = conj1->Size() - posRes.first - 1;
+ if ((int)path2->LengthAt(last2) - (int)g_.length(path2->At(last2)) < (int) max_path_diff) {
+ last2 = (int)path2->Size() - 1;
+ }
+ if ((int)path2->Length() - (int)path2->LengthAt(first2) < (int) max_path_diff) {
+ first2 = 0;
+ }
+ if ((int)path1->LengthAt(last1) - (int)g_.length(path1->At(last1)) < (int) max_path_diff) {
+ last1 = (int)path1->Size() - 1;
+ }
+ if ((int)path1->Length() - (int)path1->LengthAt(first1) < (int) max_path_diff) {
+ first1 = 0;
+ }
+
+ CutOverlaps(paths, path1, first1, last1, path1->Size(), path2,
+ first2, last2, path2->Size(), delete_subpaths,
+ delete_begins, delete_all, add_overlap_begins);
+ }
+
+ void AddOverlap(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1) const {
+ BidirectionalPath* overlap = new BidirectionalPath(path1->SubPath(first1, last1 + 1));
+ BidirectionalPath* conj_overlap = new BidirectionalPath(overlap->Conjugate());
+ SubscribeCoverageMap(overlap);
+ SubscribeCoverageMap(conj_overlap);
+ paths.AddPair(overlap, conj_overlap);
+ }
+
+ bool CutOverlaps(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1, size_t size1, BidirectionalPath* path2, size_t first2,
+ size_t last2, size_t size2, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
+ if (first1 == 0 && last1 == size1 - 1 && del_subpaths) {
+ DEBUG("Removing path " << path1->GetId() << " because of path " << path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->Clear();
+ } else if (first2 == 0 && last2 == size2 - 1 && del_subpaths) {
+ DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->Clear();
+ } else if (first2 == 0 && first1 == 0 && del_begins) {
+ DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar starts");
+ DEBUG("Path 1: " << last1 << " edges of length " << path1->Length() - path1->LengthAt(min(last1 + 1, path1->Size() - 1)));
+ DEBUG("Path 2: " << last2 << " edges of length " << path2->Length() - path2->LengthAt(min(last2 + 1, path2->Size() - 1)));
+ DEBUG("Path 1 has overlap start " << path1->HasOverlapedBegin() << ", path 2 has overlap start " << path2->HasOverlapedBegin());
+
+ if (add_overlap_begins) {
+ AddOverlap(paths, path1, first1, last1);
+ DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path1->GetConjPath()->PopBack(last1 + 1);
+ path2->GetConjPath()->PopBack(last2 + 1);
+ } else if (path1->Length() < path2->Length()) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->GetConjPath()->PopBack(last1 + 1);
+ } else {
+ DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->GetConjPath()->PopBack(last2 + 1);
+ }
+ } else if ((last1 == size1 - 1 && last2 == size2 - 1) && del_begins) {
+ DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar ends");
+ DEBUG("Path 1: " << path1->Size() - first1 << " edges of length " << path1->LengthAt(first1));
+ DEBUG("Path 2: " << path2->Size() - first2 << " edges of length " << path2->LengthAt(first2));
+ DEBUG("Path 1 has overlap end " << path1->HasOverlapedEnd() << ", path 2 has overlap end " << path2->HasOverlapedEnd());
+
+ if (add_overlap_begins){
+ AddOverlap(paths, path1, first1, last1);
+ DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
+ path2->Print();
+ path1->Print();
+ path1->PopBack(last1 + 1 - first1);
+ path2->PopBack(last2 + 1 - first2);
+ }
+ if (path1->Length() < path2->Length()) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->PopBack(last1 + 1 - first1);
+ } else {
+ DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
+ path2->Print();
+ path1->Print();
+ path2->PopBack(last2 + 1 - first2);
+ }
+ } else if (first2 == 0 && del_all) {
+ DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
+ DEBUG("Does it have overlap in the beginning: " << path2->HasOverlapedBegin());
+ path2->Print();
+ DEBUG(" >>>> ")
+ path1->Print();
+ DEBUG(" ==== ");
+ path2->GetConjPath()->PopBack(last2 + 1);
+ } else if (last2 == size2 - 1 && del_all) {
+ DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
+ DEBUG("Does it have overlap in the end: " << path2->HasOverlapedEnd());
+ path2->Print();
+ DEBUG(" >>>> ")
+ path1->Print();
+ DEBUG(" ==== ");
+ path2->PopBack(last1 + 1 - first1);
+ } else if (first1 == 0 && del_all) {
+ DEBUG("Detaching overlap from " << path1->GetConjPath()->GetId() << " because of "<< path2->GetId());
+ DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
+ path1->Print();
+ DEBUG(" >>>> ")
+ path2->Print();
+ DEBUG(" ==== ");
+ path1->GetConjPath()->PopBack(last1 + 1);
+ } else if (last1 == size1 - 1 && del_all) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
+ path1->Print();
+ DEBUG(" >>>> ")
+ path2->Print();
+ DEBUG(" ==== ");
+ path1->PopBack(last1 + 1 - first1);
+ } else {
+ return false;
+ }
+ return true;
+ }
+
+ std::vector<EdgeId> GetSortedEdges() const {
+ std::set<EdgeId> edges_set;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ edges_set.insert(*iter);
+ edges_set.insert(g_.conjugate(*iter));
+ }
+ std::vector<EdgeId> edges(edges_set.begin(), edges_set.end());
+ std::sort(edges.begin(), edges.end(), EdgeLengthAndIdComparator(g_));
+ return edges;
+ }
+
+ bool HasAlreadyOverlapedEnd(BidirectionalPath * path) const {
+ return !path->IsOverlap() and path->HasOverlapedEnd();
+ }
+
+ bool HasAlreadyOverlapedBegin(BidirectionalPath * path) const {
+ return !path->IsOverlap() and path->HasOverlapedBegin();
+ }
+
+ bool IsSamePath(BidirectionalPath * path1,
+ BidirectionalPath * path2) const {
+ return *path2 == *path1 or *path2 == *path1->GetConjPath();
+ }
+
+ void RemoveOverlap(PathContainer& paths, BidirectionalPath* path1,
+ BidirectionalPath* path2, size_t overlap_size) const {
+ BidirectionalPath* conj2 = path2->GetConjPath();
+ if (path1->IsOverlap() && overlap_size == path1->Size()) {
+ DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
+ path2->Print();
+ path1->Print();
+ conj2->PopBack(overlap_size);
+ path2->SetOverlapedBeginTo(path1);
+ } else if (path2->IsOverlap() && path2->Size() == overlap_size) {
+ DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
+ path1->Print();
+ path2->Print();
+ path1->PopBack(overlap_size);
+ path1->SetOverlapedEndTo(path2);
+ } else if (overlap_size < path2->Size()
+ && overlap_size < path1->Size()) {
+ BidirectionalPath *overlap = new BidirectionalPath(g_, path1->Back());
+ BidirectionalPath *conj_overlap = new BidirectionalPath(g_, g_.conjugate(path1->Back()));
+ SubscribeCoverageMap(overlap);
+ SubscribeCoverageMap(conj_overlap);
+ paths.AddPair(overlap, conj_overlap);
+ DEBUG("Detaching overlap " << path1->GetId() << " and " << conj2->GetId());
+ path1->Print();
+ conj2->Print();
+ path1->PopBack();
+ conj2->PopBack();
+
+ for (size_t i = 1; i < overlap_size; ++i) {
+ conj_overlap->PushBack(g_.conjugate(path1->Back()));
+ path1->PopBack();
+ conj2->PopBack();
+ }
+ overlap->SetOverlap(true);
+ path1->SetOverlapedEndTo(overlap);
+ path2->SetOverlapedBeginTo(overlap);
+ }
+ }
+
+ void FindAndRemovePathOverlap(PathContainer& all_paths,
+ BidirectionalPath* path1) const {
+ int last = (int) path1->Size() - 1;
+ if (last <= 0 or coverage_map_.GetCoverage(path1->At(last)) <= 1) {
+ return;
+ }
+ BidirectionalPathSet paths =
+ coverage_map_.GetCoveringPaths(path1->At(last));
+ BidirectionalPath* overlap_path = NULL;
+ size_t overlap_size = 0;
+ for (auto path_iter = paths.begin(); path_iter != paths.end();
+ ++path_iter) {
+ if (IsSamePath(*path_iter, path1)) {
+ continue;
+ }
+ size_t over_size = path1->OverlapEndSize(*path_iter);
+ if (over_size > overlap_size) {
+ overlap_size = over_size;
+ overlap_path = *path_iter;
+ } else if (over_size == overlap_size &&
+ (overlap_path == NULL || (*path_iter)->GetId() < overlap_path->GetId())) {
+ overlap_path = *path_iter;
+ }
+ }
+ if (overlap_path == NULL) {
+ return;
+ }
+ if (overlap_size > 0) {
+ RemoveOverlap(all_paths, path1, overlap_path, overlap_size);
+ }
+ }
+
+ class EdgeLengthAndIdComparator {
+ public:
+ EdgeLengthAndIdComparator(const Graph& g)
+ : g_(g) {
+ }
+ bool operator()(const EdgeId& e1, const EdgeId& e2) const {
+ if (g_.length(e1) > g_.length(e2)) {
+ return true;
+ }
+ if (g_.length(e2) > g_.length(e1)) {
+ return false;
+ }
+ return e1.int_id() < e2.int_id();
+ }
+ private:
+ const Graph& g_;
+ };
+
+ const Graph& g_;
+ GraphCoverageMap& coverage_map_;
+protected:
+ DECL_LOGGER("PEResolver")
+};
+
+class PathExtendResolver {
+
+protected:
+ const Graph& g_;
+ size_t k_;
+
+public:
+ PathExtendResolver(const Graph& g): g_(g), k_(g.k()) {
+ }
+
+ PathContainer MakeSimpleSeeds() const {
+ std::set<EdgeId> included;
+ PathContainer edges;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (g_.int_id(*iter) <= 0 or InTwoEdgeCycle(*iter, g_))
+ continue;
+ if (included.count(*iter) == 0) {
+ BidirectionalPath * first = new BidirectionalPath(g_, *iter);
+ BidirectionalPath * second = new BidirectionalPath(g_, g_.conjugate(*iter));
+ edges.AddPair(first,second);
+ included.insert(*iter);
+ included.insert(g_.conjugate(*iter));
+ }
+ }
+ return edges;
+ }
+
+ PathContainer ExtendSeeds(PathContainer &seeds, ContigsMaker &pathExtender) const {
+ PathContainer paths;
+ pathExtender.GrowAll(seeds, paths);
+ return paths;
+ }
+
+ void RemoveEqualPaths(PathContainer &paths, GraphCoverageMap &coverage_map,
+ size_t min_edge_len) const {
+
+ SimpleOverlapRemover remover(g_, coverage_map);
+ remover.RemoveSimilarPaths(paths, min_edge_len, min_edge_len, true, false, false, false, false);
+ }
+
+ void RemoveRNAOverlaps(PathContainer& paths, GraphCoverageMap& coverage_map,
+ size_t min_edge_len, size_t max_path_diff) const {
+
+ SimpleOverlapRemover remover(g_, coverage_map);
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, false);
+
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, false, false, false);
+
+ remover.RemoveOverlaps(paths);
+
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, false);
+ }
+
+ void RemoveOverlaps(PathContainer &paths, GraphCoverageMap &coverage_map,
+ size_t min_edge_len, size_t max_path_diff,
+ bool add_overlaps_begin,
+ bool cut_pseudo_self_conjugate) const {
+ SimpleOverlapRemover remover(g_, coverage_map);
+ if (cut_pseudo_self_conjugate)
+ remover.CutPseudoSelfConjugatePaths(paths);
+
+ remover.CutNonUniqueSuffix(paths);
+ //writer.WritePathsToFASTA(paths, output_dir + "/before.fasta");
+ //DEBUG("Removing subpaths");
+ //delete not only equal paths, but also subpaths
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, false, false, add_overlaps_begin);
+ //writer.WritePathsToFASTA(paths, output_dir + "/remove_similar.fasta");
+ //DEBUG("Remove overlaps")
+ remover.RemoveOverlaps(paths);
+ //writer.WritePathsToFASTA(paths, output_dir + "/after_remove_overlaps.fasta");
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, add_overlaps_begin);
+ //writer.WritePathsToFASTA(paths, output_dir + "/remove_equal.fasta");
+ //DEBUG("remove similar path. Max difference " << max_overlap);
+ remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, true, true, add_overlaps_begin);
+ DEBUG("end removing");
+ }
+
+ void RemoveMatePairEnds(PathContainer& paths, size_t min_edge_len) const {
+ DEBUG("remove mp ends");
+ for (size_t i = 0; i < paths.size(); ++i) {
+ RemoveMatePairEnd(*paths.Get(i), min_edge_len);
+ RemoveMatePairEnd(*paths.GetConjugate(i), min_edge_len);
+ }
+ }
+
+ void AddUncoveredEdges(PathContainer &paths, GraphCoverageMap &coverageMap) const {
+ std::set<EdgeId> included;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (included.count(*iter) == 0 && !coverageMap.IsCovered(*iter)) {
+ BidirectionalPath* path = new BidirectionalPath(g_, *iter);
+ BidirectionalPath* conj = new BidirectionalPath(g_, g_.conjugate(*iter));
+ path->Subscribe(&coverageMap);
+ conj->Subscribe(&coverageMap);
+ coverageMap.BackEdgeAdded(path->At(0), path, path->GapAt(0));
+ coverageMap.BackEdgeAdded(conj->At(0), conj, conj->GapAt(0));
+ paths.AddPair(path, conj);
+ included.insert(*iter);
+ included.insert(g_.conjugate(*iter));
+ }
+ }
+ }
+
+private:
+ void RemoveMatePairEnd(BidirectionalPath& path, size_t min_edge_len) const {
+ int pos = int(path.Size()) - 1;
+ while (pos > 0 and g_.length(path.At(pos)) < min_edge_len) {
+ path.PopBack();
+ pos--;
+ }
+ }
+protected:
+ DECL_LOGGER("PEResolver")
+};
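+
+// Usage sketch (illustrative only): a typical resolve pipeline. `gp` and `extender` stand for
+// an existing conj_graph_pack and some ContigsMaker built from the extenders above; parameter
+// values are placeholders.
+//
+//   PathExtendResolver resolver(gp.g);
+//   PathContainer seeds = resolver.MakeSimpleSeeds();
+//   GraphCoverageMap cov_map(gp.g, seeds, /*subscribe*/ true);
+//   PathContainer paths = resolver.ExtendSeeds(seeds, extender);
+//   resolver.RemoveOverlaps(paths, cov_map, /*min_edge_len*/ 100, /*max_path_diff*/ 1000,
+//                           /*add_overlaps_begin*/ false, /*cut_pseudo_self_conjugate*/ true);
+//   resolver.AddUncoveredEdges(paths, cov_map);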
+
+} // namespace path_extend
+
+#endif /* PE_RESOLVER_HPP_ */
diff --git a/src/common/modules/path_extend/pe_utils.hpp b/src/common/modules/path_extend/pe_utils.hpp
new file mode 100644
index 0000000..8df0968
--- /dev/null
+++ b/src/common/modules/path_extend/pe_utils.hpp
@@ -0,0 +1,397 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * pe_utils.hpp
+ *
+ * Created on: Nov 27, 2012
+ * Author: andrey
+ */
+
+#ifndef PE_UTILS_HPP_
+#define PE_UTILS_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+//Checks whether we are in a cycle of length 2, used only for seed selection.
+inline bool InTwoEdgeCycle(EdgeId e, const Graph &g) {
+ auto v = g.EdgeEnd(e);
+ if (g.OutgoingEdgeCount(v) >= 1) {
+ auto edges = g.OutgoingEdges(v);
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if (g.EdgeStart(e) == g.EdgeEnd(*it)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
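+// Returns true if edge e lies in a bubble, i.e. some other outgoing edge of EdgeStart(e) ends at the same vertex as e.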
+inline bool InBuble(EdgeId e, const Graph& g) {
+ auto edges = g.OutgoingEdges(g.EdgeStart(e));
+ auto endVertex = g.EdgeEnd(e);
+ for (auto it = edges.begin(); it != edges.end(); ++it) {
+ if ((g.EdgeEnd(*it) == endVertex) and (*it != e)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+// Handles all paths in a PathContainer.
+// For each edge, stores all paths that traverse this edge. If a path contains the edge multiple times, each occurrence is counted. The position of the edge within the path is not recorded.
+class GraphCoverageMap: public PathListener {
+
+public:
+ typedef BidirectionalPathMultiset MapDataT;
+
+
+private:
+ const Graph& g_;
+
+ std::unordered_map <EdgeId, MapDataT * > edge_coverage_;
+
+ MapDataT * empty_;
+
+ virtual void EdgeAdded(EdgeId e, BidirectionalPath * path, Gap /*gap*/) {
+ auto iter = edge_coverage_.find(e);
+ if (iter == edge_coverage_.end()) {
+ edge_coverage_.insert(std::make_pair(e, new MapDataT()));
+ }
+ edge_coverage_[e]->insert(path);
+ }
+
+ virtual void EdgeRemoved(EdgeId e, BidirectionalPath * path) {
+ auto iter = edge_coverage_.find(e);
+ if (iter != edge_coverage_.end()) {
+ if (iter->second->count(path) == 0) {
+ DEBUG("Error erasing path from coverage map");
+ } else {
+ auto entry = iter->second->find(path);
+ iter->second->erase(entry);
+ }
+ }
+ }
+
+ size_t EdgeCount() const {
+ size_t result = 0;
+ for (auto e = g_.ConstEdgeBegin(); !e.IsEnd(); ++e) {
+ ++result;
+ }
+ return result;
+ }
+
+public:
+ GraphCoverageMap(const Graph& g) : g_(g), edge_coverage_() {
+ empty_ = new MapDataT();
+ edge_coverage_.reserve(EdgeCount());
+ }
+
+ GraphCoverageMap(const Graph& g, const PathContainer& paths, bool subscribe = false) : g_(g), edge_coverage_() {
+ empty_ = new MapDataT();
+ edge_coverage_.reserve(EdgeCount());
+ AddPaths(paths, subscribe);
+ }
+
+ virtual ~GraphCoverageMap() {
+ delete empty_;
+ for (auto iter = edge_coverage_.begin(); iter != edge_coverage_.end(); ++iter) {
+ delete iter->second;
+ }
+ }
+
+ void AddPaths(const PathContainer& paths, bool subscribe = false) {
+ for (size_t i = 0; i < paths.size(); ++i) {
+ if (subscribe)
+ paths.Get(i)->Subscribe(this);
+ for (size_t j = 0; j < paths.Get(i)->Size(); ++j) {
+ EdgeAdded(paths.Get(i)->At(j), paths.Get(i), paths.Get(i)->GapAt(j));
+ }
+ if (subscribe)
+ paths.GetConjugate(i)->Subscribe(this);
+ for (size_t j = 0; j < paths.GetConjugate(i)->Size(); ++j) {
+ EdgeAdded(paths.GetConjugate(i)->At(j), paths.GetConjugate(i), paths.GetConjugate(i)->GapAt(j));
+ }
+ }
+ }
+
+ void Subscribe(BidirectionalPath * path) {
+ path->Subscribe(this);
+ for (size_t i = 0; i < path->Size(); ++i) {
+ BackEdgeAdded(path->At(i), path, path->GapAt(i));
+ }
+ }
+
+ //Inherited from PathListener
+ void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) override {
+ EdgeAdded(e, path, gap);
+ }
+
+ //Inherited from PathListener
+ void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) override {
+ EdgeAdded(e, path, gap);
+ }
+
+ //Inherited from PathListener
+ void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) override {
+ EdgeRemoved(e, path);
+ }
+
+ //Inherited from PathListener
+ void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) override {
+ EdgeRemoved(e, path);
+ }
+
+ MapDataT * GetEdgePaths(EdgeId e) const {
+ auto iter = edge_coverage_.find(e);
+ if (iter != edge_coverage_.end()) {
+ return iter->second;
+ }
+ return empty_;
+ }
+
+ int GetCoverage(EdgeId e) const {
+ return (int) GetEdgePaths(e)->size();
+ }
+
+ bool IsCovered(EdgeId e) const {
+ return GetCoverage(e) > 0;
+ }
+
+ bool IsCovered(const BidirectionalPath& path) const {
+ for (size_t i = 0; i < path.Size(); ++i) {
+ if (!IsCovered(path[i])) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ BidirectionalPathSet GetCoveringPaths(EdgeId e) const {
+ auto mapData = GetEdgePaths(e);
+ return BidirectionalPathSet(mapData->begin(), mapData->end());
+ }
+
+ std::unordered_map <EdgeId, MapDataT * >::const_iterator begin() const {
+ return edge_coverage_.begin();
+ }
+
+ std::unordered_map <EdgeId, MapDataT * >::const_iterator end() const {
+ return edge_coverage_.end();
+ }
+
+ // DEBUG output
+ void PrintUncovered() const {
+ DEBUG("Uncovered edges");
+ int s = 0;
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ if (!IsCovered(*iter)) {
+ DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << ") ~ " << g_.int_id(g_.conjugate(*iter)) << " (" << g_.length(g_.conjugate(*iter)) << ")");
+ s += 1;
+ }
+ }
+ DEBUG("Uncovered edges " << s / 2);
+ }
+
+ void PrintMulticovered() const {
+ DEBUG("Multicovered edges");
+ for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+ auto paths = GetCoveringPaths(*iter);
+ if (paths.size() > 1 && g_.length(*iter) > 1000) {
+ DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << "). " << " Covered: " << paths.size());
+ for (auto path = paths.begin(); path != paths.end(); ++path) {
+ (*path)->Print();
+ }
+ DEBUG("=====");
+ }
+ }
+ }
+
+ size_t size() const {
+ return edge_coverage_.size();
+ }
+
+ const Graph& graph() const {
+ return g_;
+ }
+
+private:
+ GraphCoverageMap(const GraphCoverageMap& t) : g_(t.g_), empty_(t.empty_) {}
+};
+
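+// Checks whether edge e enters a simple short loop: EdgeEnd(e) must have an edge going back to EdgeStart(e) (the loop) and a single edge leaving it (the exit); on success the pair (loop, exit) is written to result.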
+inline bool GetLoopAndExit(const Graph& g, EdgeId e, pair<EdgeId, EdgeId>& result) {
+ VertexId v = g.EdgeEnd(e);
+ VertexId start = g.EdgeStart(e);
+ if (g.OutgoingEdgeCount(v) != 2 || g.IncomingEdgeCount(v) != 1 || g.OutgoingEdgeCount(start) != 1 || g.IncomingEdgeCount(start) != 2) {
+ return false;
+ }
+ EdgeId loop;
+ EdgeId exit;
+ bool loop_found = false;
+ bool exit_found = false;
+ auto edges = g.OutgoingEdges(v);
+ for (auto edge = edges.begin(); edge != edges.end(); ++edge) {
+ if (g.EdgeEnd(*edge) == g.EdgeStart(e) && *edge != e) {
+ loop = *edge;
+ loop_found = true;
+ } else if (*edge != e) {
+ exit = *edge;
+ exit_found = true;
+ }
+ }
+ result = make_pair(loop, exit);
+ return exit_found && loop_found;
+}
+
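+// Detects cyclic repetitions of edges at the end of a path, using the coverage map to count occurrences of the last edge, and provides helpers to trim the detected loop.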
+class LoopDetector {
+public:
+ LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map);
+ size_t LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const;
+ size_t LoopLength(size_t skip_identical_edges, size_t min_cycle_appearences) const;
+ bool PathIsLoop(size_t edges) const;
+ size_t LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const;
+ size_t LastLoopCount(size_t edges) const;
+ bool IsCycled(size_t loopLimit, size_t& skip_identical_edges) const;
+ size_t EdgesToRemove(size_t skip_identical_edges, bool fullRemoval = false) const;
+ void RemoveLoop(size_t skip_identical_edges, bool fullRemoval = true);
+ bool EdgeInShortLoop(EdgeId e) const;
+ bool PrevEdgeInShortLoop() const;
+private:
+ BidirectionalPath* path_;
+ const GraphCoverageMap& cov_map_;
+ DECL_LOGGER("BidirectionalPath");
+};
+
+inline LoopDetector::LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map)
+ : path_(p),
+ cov_map_(cov_map) {
+}
+
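+// Returns the length (in edges) of the repeated fragment ending with the last edge of the path, or 0 if the last edge does not occur often enough.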
+inline size_t LoopDetector::LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const {
+ if (path_->Size() == 0) {
+ return 0;
+ }
+ EdgeId e = path_->Back();
+ size_t count = cov_map_.GetEdgePaths(e)->count(path_);
+ if (count <= 1 || count < min_cycle_appearences * (skip_identical_edges + 1)) {
+ return 0;
+ }
+ vector<size_t> edge_positions = path_->FindAll(e);
+ VERIFY(edge_positions.size() == count);
+ VERIFY(edge_positions.size() >= skip_identical_edges);
+ size_t loopSize = edge_positions.back() - edge_positions[edge_positions.size() - 1 - (skip_identical_edges + 1)];
+ return loopSize;
+}
+
+inline bool LoopDetector::PathIsLoop(size_t edges) const {
+ if (edges == 0 || path_->Size() <= 1)
+ return false;
+
+ for (size_t i = 0; i < edges; ++i) {
+ EdgeId e = path_->At(i);
+ for (int j = (int) path_->Size() - ((int) edges - (int) i); j >= 0; j -= (int) edges) {
+ if (path_->operator [](j) != e) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+inline size_t LoopDetector::LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const {
+ size_t edges = LoopEdges(skip_identical_edges, min_cycle_appearences);
+ return LastLoopCount(edges);
+}
+
+inline size_t LoopDetector::LastLoopCount(size_t edges) const {
+ if (edges == 0) {
+ return 0;
+ }
+
+ BidirectionalPath loop = path_->SubPath(path_->Size() - edges);
+ size_t count = 0;
+ int i = (int) path_->Size() - (int) edges;
+ int delta = -(int) edges;
+
+ while (i >= 0) {
+ if (!path_->CompareFrom(i, loop)) {
+ break;
+ }
+ ++count;
+ i += delta;
+ }
+
+ return count;
+}
+
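+// A path is considered cycled if, for some number of skipped identical edges, the trailing loop repeats at least loopLimit times.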
+inline bool LoopDetector::IsCycled(size_t loopLimit, size_t& skip_identical_edges) const {
+ if (path_->Size() == 0 or cov_map_.GetEdgePaths(path_->Back())->count(path_) < loopLimit) {
+ return false;
+ }
+ skip_identical_edges = 0;
+ size_t loop_count = LastLoopCount(skip_identical_edges, loopLimit);
+ while (loop_count > 0) {
+ if (loop_count >= loopLimit) {
+ return true;
+ }
+ loop_count = LastLoopCount(++skip_identical_edges, loopLimit);
+ }
+ return false;
+}
+
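+// Number of trailing edges to pop: if the whole path is the loop, a single iteration is kept; otherwise all repetitions are removed (fullRemoval) or all but one.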
+inline size_t LoopDetector::EdgesToRemove(size_t skip_identical_edges, bool fullRemoval) const {
+ size_t edges = LoopEdges(skip_identical_edges, 1);
+ size_t count = LastLoopCount(edges);
+ bool onlyCycle = PathIsLoop(edges);
+ int result;
+
+ if (onlyCycle || path_->Size() <= count * edges) {
+ result = (int) path_->Size() - (int) edges;
+ } else if (fullRemoval) {
+ result = (int) count * (int) edges;
+ } else {
+ result = (int) (count - 1) * (int) edges;
+ }
+
+ return result < 0 ? 0 : result;
+}
+
+inline void LoopDetector::RemoveLoop(size_t skip_identical_edges, bool fullRemoval) {
+ size_t toRemove = EdgesToRemove(skip_identical_edges, fullRemoval);
+ for (size_t i = 0; i < toRemove; ++i) {
+ path_->PopBack();
+ }
+}
+
+inline bool LoopDetector::EdgeInShortLoop(EdgeId e) const {
+ pair<EdgeId, EdgeId> temp;
+ return GetLoopAndExit(path_->graph(), e, temp);
+}
+
+inline bool LoopDetector::PrevEdgeInShortLoop() const {
+ if (path_->Size() <= 2) {
+ return false;
+ }
+ const Graph& g = path_->graph();
+ EdgeId e2 = path_->At(path_->Size() - 1);
+ EdgeId e1 = path_->At(path_->Size() - 2);
+ VertexId v2 = g.EdgeEnd(e1);
+ if (g.OutgoingEdgeCount(v2) == 2 && g.EdgeEnd(e2) == g.EdgeStart(e1) && g.EdgeEnd(e1) == g.EdgeStart(e2)) {
+ return EdgeInShortLoop(e1);
+ }
+ return false;
+}
+
+
+}
+
+#endif /* PE_UTILS_HPP_ */
diff --git a/src/common/modules/path_extend/pipeline/extenders_logic.cpp b/src/common/modules/path_extend/pipeline/extenders_logic.cpp
new file mode 100644
index 0000000..7b26fed
--- /dev/null
+++ b/src/common/modules/path_extend/pipeline/extenders_logic.cpp
@@ -0,0 +1,423 @@
+//
+// Created by andrey on 14.11.16.
+//
+
+#include "extenders_logic.hpp"
+#include "modules/path_extend/scaffolder2015/extension_chooser2015.hpp"
+
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+shared_ptr<ExtensionChooser> ExtendersGenerator::MakeLongReadsExtensionChooser(size_t lib_index,
+ const GraphCoverageMap &read_paths_cov_map) const {
+ auto long_reads_config = support_.GetLongReadsConfig(dataset_info_.reads[lib_index].type());
+ return make_shared<LongReadsExtensionChooser>(gp_.g, read_paths_cov_map,
+ long_reads_config.filtering,
+ long_reads_config.weight_priority,
+ long_reads_config.unique_edge_priority,
+ long_reads_config.min_significant_overlap,
+ params_.pset.extension_options.max_repeat_length,
+ params_.uneven_depth);
+}
+
+shared_ptr<SimpleExtender> ExtendersGenerator::MakeLongReadsExtender(size_t lib_index,
+ const GraphCoverageMap &read_paths_cov_map) const {
+ const auto &lib = dataset_info_.reads[lib_index];
+ //TODO params
+ size_t resolvable_repeat_length_bound = 10000ul;
+ if (!dataset_info_.reads[lib_index].is_contig_lib()) {
+ resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
+ }
+ INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
+
+
+ auto long_read_ec = MakeLongReadsExtensionChooser(lib_index, read_paths_cov_map);
+ return make_shared<SimpleExtender>(gp_, cover_map_,
+ long_read_ec,
+ resolvable_repeat_length_bound,
+ true, /* investigate short loops */
+ support_.UseCoverageResolverForSingleReads(lib.type()));
+}
+
+shared_ptr<SimpleExtender> ExtendersGenerator::MakeLongEdgePEExtender(size_t lib_index,
+ bool investigate_loops) const {
+ const auto &lib = dataset_info_.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]);
+ //INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
+
+ shared_ptr<WeightCounter> wc =
+ make_shared<PathCoverWeightCounter>(gp_.g, paired_lib,
+ params_.pset.normalize_weight,
+ support_.SingleThresholdForLib(params_.pset, lib.data().pi_threshold));
+ auto opts = support_.GetExtensionOpts(paired_lib, params_.pset);
+ shared_ptr<ExtensionChooser> extension =
+ make_shared<LongEdgeExtensionChooser>(gp_.g, wc,
+ opts.weight_threshold,
+ opts.priority_coeff);
+
+ return make_shared<SimpleExtender>(gp_, cover_map_,
+ extension,
+ paired_lib->GetISMax(),
+ investigate_loops,
+ false /*use short loop coverage resolver*/);
+}
+
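+// Builds a composite gap joiner from an (optional) local-alignment-based LAGapJoiner and a Hamming-distance-based HammingGapJoiner.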
+shared_ptr<GapJoiner> ExtendersGenerator::MakeGapJoiners(double is_variation) const {
+ const auto &pset = params_.pset;
+
+ vector<shared_ptr<GapJoiner>> joiners;
+ if (params_.pset.scaffolder_options.use_la_gap_joiner)
+ joiners.push_back(std::make_shared<LAGapJoiner>(gp_.g, pset.scaffolder_options.min_overlap_length,
+ pset.scaffolder_options.flank_multiplication_coefficient,
+ pset.scaffolder_options.flank_addition_coefficient));
+
+
+ joiners.push_back(std::make_shared<HammingGapJoiner>(gp_.g,
+ pset.scaffolder_options.min_gap_score,
+ pset.scaffolder_options.short_overlap,
+ (int) pset.scaffolder_options.basic_overlap_coeff
+ * dataset_info_.RL()));
+
+ return std::make_shared<CompositeGapJoiner>(gp_.g,
+ joiners,
+ size_t(pset.scaffolder_options.max_can_overlap
+ * (double) gp_.g.k()), /* may overlap threshold */
+ int(math::round(double(gp_.g.k())
+ - pset.scaffolder_options.var_coeff
+ * is_variation)), /* must overlap threshold */
+ pset.scaffolder_options.artificial_gap);
+
+}
+
+shared_ptr<PathExtender> ExtendersGenerator::MakeScaffoldingExtender(size_t lib_index) const {
+
+ const auto &lib = dataset_info_.reads[lib_index];
+ const auto &pset = params_.pset;
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(gp_.g, lib, gp_.scaffolding_indices[lib_index]);
+
+ shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp_.g, paired_lib);
+
+ auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp_.g, counter,
+ pset.scaffolder_options.cl_threshold,
+ pset.scaffolder_options.var_coeff);
+
+ return make_shared<ScaffoldingPathExtender>(gp_, cover_map_, scaff_chooser,
+ MakeGapJoiners(paired_lib->GetIsVar()),
+ paired_lib->GetISMax(),
+ false, /* investigate short loops */
+ params_.avoid_rc_connections);
+}
+
+shared_ptr<PathExtender> ExtendersGenerator::MakeRNAScaffoldingExtender(size_t lib_index) const {
+
+ const auto &lib = dataset_info_.reads[lib_index];
+ const auto &pset = params_.pset;
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(gp_.g, lib, gp_.paired_indices[lib_index]);
+
+ shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp_.g, paired_lib);
+
+ auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp_.g,
+ counter,
+ pset.scaffolder_options.cutoff,
+ pset.scaffolder_options.var_coeff);
+ auto scaff_chooser2 = std::make_shared<ScaffoldingExtensionChooser>(gp_.g,
+ counter,
+ pset.scaffolder_options.hard_cutoff,
+ pset.scaffolder_options.var_coeff);
+
+
+ VERIFY(pset.scaffolder_options.min_overlap_for_rna_scaffolding.is_initialized());
+ return make_shared<RNAScaffoldingPathExtender>(gp_, cover_map_,
+ scaff_chooser,
+ scaff_chooser2,
+ MakeGapJoiners(paired_lib->GetIsVar()),
+ paired_lib->GetISMax(),
+ false /* investigate short loops */,
+ *pset.scaffolder_options.min_overlap_for_rna_scaffolding);
+}
+
+shared_ptr<PathExtender> ExtendersGenerator::MakeMatePairScaffoldingExtender(
+ size_t lib_index,
+ const ScaffoldingUniqueEdgeStorage &storage) const {
+
+ const auto &lib = dataset_info_.reads[lib_index];
+ const auto &pset = params_.pset;
+ shared_ptr<PairedInfoLibrary> paired_lib;
+ INFO("Creating Scaffolding 2015 extender for lib #" << lib_index);
+
+ //FIXME: DimaA
+ if (gp_.paired_indices[lib_index].size() > gp_.clustered_indices[lib_index].size()) {
+ INFO("Paired unclustered indices not empty, using them");
+ paired_lib = MakeNewLib(gp_.g, lib, gp_.paired_indices[lib_index]);
+ } else if (gp_.clustered_indices[lib_index].size() != 0) {
+ INFO("clustered indices not empty, using them");
+ paired_lib = MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]);
+ } else {
+ ERROR("All paired indices are empty!");
+ }
+
+ //TODO::was copypasted from MakeScaffoldingExtender, refactor 2015 extension chooser
+ DEBUG("creating extchooser");
+ shared_ptr<ConnectionCondition>
+ condition = make_shared<PairedLibConnectionCondition>(gp_.g, paired_lib, lib_index, 0);
+ auto scaff_chooser = std::make_shared<ExtensionChooser2015>(gp_.g,
+ nullptr,
+ condition,
+ storage,
+ pset.scaffolder_options.cl_threshold,
+ pset.scaffolder_options.var_coeff,
+ pset.scaffolding2015.relative_weight_cutoff,
+ gp_.g.size()
+ <= params_.pset.scaffolding2015.graph_connectivity_max_edges);
+
+ return make_shared<ScaffoldingPathExtender>(gp_, cover_map_,
+ scaff_chooser,
+ MakeGapJoiners(paired_lib->GetIsVar()),
+ paired_lib->GetISMax(),
+ false, /* investigate short loops */
+ params_.avoid_rc_connections,
+ false /* jump only from tips */);
+}
+
+shared_ptr<SimpleExtender> ExtendersGenerator::MakeCoordCoverageExtender(size_t lib_index) const {
+ const auto& lib = dataset_info_.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]);
+
+ auto provider = make_shared<CoverageAwareIdealInfoProvider>(gp_.g, paired_lib, dataset_info_.RL());
+
+ auto meta_wc = make_shared<PathCoverWeightCounter>(gp_.g, paired_lib,
+ params_.pset.normalize_weight,
+ support_.SingleThresholdForLib(params_.pset, lib.data().pi_threshold),
+ provider);
+
+ auto permissive_pi_chooser = make_shared<IdealBasedExtensionChooser>(gp_.g,
+ meta_wc,
+ params_.pset.extension_options.weight_threshold,
+ params_.pset.extension_options.priority_coeff);
+
+ auto coord_cov_chooser = make_shared<CoordinatedCoverageExtensionChooser>(gp_.g, *provider,
+ params_.pset.coordinated_coverage.max_edge_length_in_repeat,
+ params_.pset.coordinated_coverage.delta,
+ params_.pset.coordinated_coverage.min_path_len);
+
+ auto chooser = make_shared<JointExtensionChooser>(gp_.g, permissive_pi_chooser, coord_cov_chooser);
+
+ return make_shared<SimpleExtender>(gp_, cover_map_, chooser,
+ -1ul /* insert size is needed only for loop detection, which is not needed in this case */,
+ false, /* investigate short loops */
+ false /*use short loop coverage resolver*/);
+}
+
+shared_ptr<SimpleExtender> ExtendersGenerator::MakeRNAExtender(size_t lib_index, bool investigate_loops) const {
+
+ const auto &lib = dataset_info_.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]);
+// INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
+
+ auto cip = make_shared<CoverageAwareIdealInfoProvider>(gp_.g, paired_lib, dataset_info_.RL());
+ shared_ptr<WeightCounter> wc =
+ make_shared<PathCoverWeightCounter>(gp_.g, paired_lib, params_.pset.normalize_weight,
+ support_.SingleThresholdForLib(params_.pset, lib.data().pi_threshold),
+ cip);
+
+ auto opts = support_.GetExtensionOpts(paired_lib, params_.pset);
+ shared_ptr<RNAExtensionChooser> extension =
+ make_shared<RNAExtensionChooser>(gp_.g, wc,
+ opts.weight_threshold,
+ opts.priority_coeff);
+
+ return make_shared<MultiExtender>(gp_, cover_map_,
+ extension,
+ paired_lib->GetISMax(),
+ investigate_loops,
+ false /*use short loop coverage resolver*/);
+}
+
+shared_ptr<SimpleExtender> ExtendersGenerator::MakePEExtender(size_t lib_index, bool investigate_loops) const {
+ const auto &lib = dataset_info_.reads[lib_index];
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]);
+ VERIFY_MSG(!paired_lib->IsMp(), "Tried to create PE extender for MP library");
+ auto opts = params_.pset.extension_options;
+// INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
+
+ shared_ptr<CoverageAwareIdealInfoProvider> iip = nullptr;
+ if (opts.use_default_single_threshold) {
+ if (params_.uneven_depth) {
+ iip = make_shared<CoverageAwareIdealInfoProvider>(gp_.g, paired_lib, dataset_info_.RL());
+ } else {
+ double lib_cov = support_.EstimateLibCoverage(lib_index);
+ INFO("Estimated coverage of library #" << lib_index << " is " << lib_cov);
+ iip = make_shared<GlobalCoverageAwareIdealInfoProvider>(gp_.g, paired_lib, dataset_info_.RL(), lib_cov);
+ }
+ }
+ auto wc = make_shared<PathCoverWeightCounter>(gp_.g, paired_lib, params_.pset.normalize_weight,
+ support_.SingleThresholdForLib(params_.pset, lib.data().pi_threshold),
+ iip);
+
+ auto extension_chooser = make_shared<SimpleExtensionChooser>(gp_.g, wc,
+ opts.weight_threshold,
+ opts.priority_coeff);
+
+ return make_shared<SimpleExtender>(gp_, cover_map_,
+ extension_chooser,
+ paired_lib->GetISMax(),
+ investigate_loops,
+ false /*use short loop coverage resolver*/);
+}
+
+
+void ExtendersGenerator::PrintExtenders(const Extenders &extenders) const {
+ DEBUG("Extenders in vector:");
+ for (const auto& extender : extenders) {
+ //TODO: use polymorphism instead of RTTI
+ auto ext_ptr = extender.get();
+ DEBUG("Extender #i" << typeid(*ext_ptr).name());
+ if (instanceof<SimpleExtender>(ext_ptr)) {
+ auto ec = ((SimpleExtender *) ext_ptr)->GetExtensionChooser();
+ auto ec_ptr = ec.get();
+ DEBUG(" Extender #i" << typeid(*ec_ptr).name());
+ }
+ else if (instanceof<ScaffoldingPathExtender>(ext_ptr)) {
+ auto ec = ((ScaffoldingPathExtender *) ext_ptr)->GetExtensionChooser();
+ auto ec_ptr = ec.get();
+ DEBUG(" Extender #i" << typeid(*ec_ptr).name());
+ }
+ }
+}
+
+Extenders ExtendersGenerator::MakeMPExtenders(const ScaffoldingUniqueEdgeStorage &storage) const {
+ ExtenderTriplets result;
+
+ for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); ++lib_index) {
+ const auto &lib = dataset_info_.reads[lib_index];
+
+ if (lib.is_mate_pair()) {
+ result.emplace_back(lib.type(), lib_index, MakeMatePairScaffoldingExtender(lib_index, storage));
+ }
+ }
+ std::stable_sort(result.begin(), result.end());
+
+ return ExtractExtenders(result);
+}
+
+Extenders ExtendersGenerator::MakePBScaffoldingExtenders(const ScaffoldingUniqueEdgeStorage &unique_storage_pb,
+ const vector<shared_ptr<GraphCoverageMap>> &long_reads_cov_map) const {
+ const auto &pset = params_.pset;
+ ExtenderTriplets result;
+
+ for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); lib_index++) {
+ if (support_.IsForSingleReadScaffolder(dataset_info_.reads[lib_index])) {
+ INFO("Creating scaffolding extender for lib " << lib_index);
+ shared_ptr<ConnectionCondition> condition = make_shared<LongReadsLibConnectionCondition>(gp_.g,
+ lib_index, 2,
+ *long_reads_cov_map[lib_index]);
+ auto scaff_chooser = std::make_shared<ExtensionChooser2015>(gp_.g,
+ nullptr,
+ condition,
+ unique_storage_pb,
+ pset.scaffolder_options.cl_threshold,
+ pset.scaffolder_options.var_coeff,
+ pset.scaffolding2015.relative_weight_cutoff);
+
+ result.emplace_back(dataset_info_.reads[lib_index].type(),
+ lib_index,
+ make_shared<ScaffoldingPathExtender>(gp_, cover_map_,
+ scaff_chooser,
+ MakeGapJoiners(1000), /* "IS variation" */
+ 10000, /* insert size */
+ false, /* investigate short loops */
+ params_.avoid_rc_connections,
+ false /* jump only from tips */));
+
+ }
+ }
+ INFO("Using " << result.size() << " long reads scaffolding " << support_.LibStr(result.size()));
+ std::stable_sort(result.begin(), result.end());
+
+ return ExtractExtenders(result);
+}
+
+
+Extenders ExtendersGenerator::MakeCoverageExtenders() const {
+ Extenders result;
+
+ INFO("Using additional coordinated coverage extender");
+ result.push_back(MakeCoordCoverageExtender(0 /* lib index */));
+
+ return result;
+}
+
+Extenders ExtendersGenerator::MakeBasicExtenders(const ScaffoldingUniqueEdgeStorage &storage,
+ const vector<shared_ptr<GraphCoverageMap>> &long_reads_cov_map) const {
+ ExtenderTriplets basic_extenders;
+ ExtenderTriplets loop_resolving_extenders;
+ ExtenderTriplets scaffolding_extenders;
+
+ size_t single_read_libs = 0;
+ size_t pe_libs = 0;
+ size_t scf_pe_libs = 0;
+
+ const auto &pset = params_.pset;
+
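+ // For every library, create the appropriate basic, loop-resolving and scaffolding extenders depending on library type, resolving mode and pipeline type.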
+ for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); ++lib_index) {
+ const auto &lib = dataset_info_.reads[lib_index];
+
+ //TODO: scaff2015 does not need any single read libs?
+ if (support_.IsForSingleReadExtender(lib)) {
+ basic_extenders.emplace_back(lib.type(), lib_index, MakeLongReadsExtender(lib_index, *long_reads_cov_map[lib_index]));
+ ++single_read_libs;
+ }
+ if (support_.IsForPEExtender(lib)) {
+ ++pe_libs;
+ if (IsOldPEEnabled(pset.sm)) {
+ if (params_.mode == config::pipeline_type::moleculo) {
+ basic_extenders.emplace_back(lib.type(), lib_index, MakeLongEdgePEExtender(lib_index, false));
+ } else if (pset.multi_path_extend) {
+ basic_extenders.emplace_back(lib.type(), lib_index, MakePEExtender(lib_index, false));
+ basic_extenders.emplace_back(lib.type(), lib_index, MakeRNAExtender(lib_index, false));
+ } else {
+ basic_extenders.emplace_back(lib.type(), lib_index, MakePEExtender(lib_index, false));
+ }
+ } else if (pset.sm == sm_2015) {
+ basic_extenders.emplace_back(lib.type(), lib_index, MakeMatePairScaffoldingExtender(lib_index, storage));
+ }
+ }
+ //TODO logic is very cryptic!
+ if (support_.IsForShortLoopExtender(lib) && IsOldPEEnabled(pset.sm)) {
+ loop_resolving_extenders.emplace_back(lib.type(), lib_index, MakePEExtender(lib_index, true));
+ //TODO what about moleculo and rna here?
+ }
+ if (support_.IsForScaffoldingExtender(lib) && params_.use_scaffolder
+ && pset.scaffolder_options.enabled) {
+ ++scf_pe_libs;
+ if (params_.mode == config::pipeline_type::rna) {
+ scaffolding_extenders.emplace_back(lib.type(), lib_index, MakeRNAScaffoldingExtender(lib_index));
+ } else {
+ scaffolding_extenders.emplace_back(lib.type(), lib_index, MakeScaffoldingExtender(lib_index));
+ if (pset.sm == sm_combined) {
+ scaffolding_extenders.emplace_back(lib.type(), lib_index, MakeMatePairScaffoldingExtender(lib_index, storage));
+ }
+ }
+ }
+ }
+
+ std::stable_sort(basic_extenders.begin(), basic_extenders.end());
+ std::stable_sort(scaffolding_extenders.begin(), scaffolding_extenders.end());
+ std::stable_sort(loop_resolving_extenders.begin(), loop_resolving_extenders.end());
+
+ Extenders result;
+ push_back_all(result, ExtractExtenders(basic_extenders));
+ push_back_all(result, ExtractExtenders(scaffolding_extenders));
+ push_back_all(result, ExtractExtenders(loop_resolving_extenders));
+
+ INFO("Using " << pe_libs << " paired-end " << support_.LibStr(pe_libs));
+ INFO("Using " << scf_pe_libs << " paired-end scaffolding " << support_.LibStr(scf_pe_libs));
+ INFO("Using " << single_read_libs << " single read " << support_.LibStr(single_read_libs));
+
+ PrintExtenders(result);
+ return result;
+}
+
+}
diff --git a/src/common/modules/path_extend/pipeline/extenders_logic.hpp b/src/common/modules/path_extend/pipeline/extenders_logic.hpp
new file mode 100644
index 0000000..2f6c190
--- /dev/null
+++ b/src/common/modules/path_extend/pipeline/extenders_logic.hpp
@@ -0,0 +1,118 @@
+//
+// Created by andrey on 14.11.16.
+//
+
+#pragma once
+
+#include "modules/path_extend/path_extender.hpp"
+#include "launch_support.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+struct ExtenderTriplet {
+ io::LibraryType lib_type_;
+ size_t lib_index_;
+ shared_ptr<PathExtender> extender_;
+
+ ExtenderTriplet(io::LibraryType lib_type, size_t lib_index, shared_ptr<PathExtender> extender):
+ lib_type_(lib_type), lib_index_(lib_index), extender_(extender) {
+
+ }
+
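+ // Extenders are ordered by library type first and by library index second; together with stable_sort this keeps the extender order deterministic.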
+ bool operator<(const ExtenderTriplet& that) const {
+ if (this->lib_type_ == that.lib_type_)
+ return this->lib_index_ < that.lib_index_;
+ return this->lib_type_ < that.lib_type_;
+ }
+};
+
+typedef vector<ExtenderTriplet> ExtenderTriplets;
+
+typedef vector<shared_ptr<PathExtender>> Extenders;
+
+inline Extenders ExtractExtenders(const ExtenderTriplets& triplets) {
+ Extenders result;
+ for (const auto& triplet : triplets)
+ result.push_back(triplet.extender_);
+
+ return result;
+}
+
+class ExtendersGenerator {
+ const config::dataset &dataset_info_;
+ const PathExtendParamsContainer ¶ms_;
+ const conj_graph_pack &gp_;
+
+ const GraphCoverageMap &cover_map_;
+
+ const PELaunchSupport &support_;
+
+public:
+ ExtendersGenerator(const config::dataset &dataset_info,
+ const PathExtendParamsContainer ¶ms,
+ const conj_graph_pack &gp,
+ const GraphCoverageMap &cover_map,
+ const PELaunchSupport& support) :
+ dataset_info_(dataset_info),
+ params_(params),
+ gp_(gp),
+ cover_map_(cover_map),
+ support_(support) { }
+
+ Extenders MakePBScaffoldingExtenders(const ScaffoldingUniqueEdgeStorage &unique_storage_pb,
+ const vector<shared_ptr<GraphCoverageMap>> &long_reads_cov_map) const;
+
+ Extenders MakeBasicExtenders(const ScaffoldingUniqueEdgeStorage &storage,
+ const vector<shared_ptr<GraphCoverageMap>> &long_reads_cov_map) const;
+
+ Extenders MakeMPExtenders(const ScaffoldingUniqueEdgeStorage &storage) const;
+
+ Extenders MakeCoverageExtenders() const;
+
+private:
+
+ shared_ptr<ExtensionChooser> MakeLongReadsExtensionChooser(size_t lib_index, const GraphCoverageMap& read_paths_cov_map) const;
+
+ shared_ptr<SimpleExtender> MakeLongReadsExtender(size_t lib_index, const GraphCoverageMap& read_paths_cov_map) const;
+
+ shared_ptr<SimpleExtender> MakeLongEdgePEExtender(size_t lib_index,
+ bool investigate_loops) const;
+
+ shared_ptr<WeightCounter> MakeMetaWeightCounter(shared_ptr<PairedInfoLibrary> lib,
+ size_t read_length) const;
+
+ shared_ptr<SimpleExtensionChooser> MakeMetaExtensionChooser(shared_ptr<PairedInfoLibrary> lib,
+ size_t read_length) const;
+
+ shared_ptr<SimpleExtender> MakeMetaExtender(size_t lib_index, bool investigate_loops) const;
+
+
+ shared_ptr<SimpleExtender> MakePEExtender(size_t lib_index, bool investigate_loops) const;
+
+
+ shared_ptr<GapJoiner> MakeGapJoiners(double is_variation) const;
+
+
+ shared_ptr<PathExtender> MakeScaffoldingExtender(size_t lib_index) const;
+
+
+ shared_ptr<PathExtender> MakeRNAScaffoldingExtender(size_t lib_index) const;
+
+
+ shared_ptr<PathExtender> MakeMatePairScaffoldingExtender
+ (size_t lib_index, const ScaffoldingUniqueEdgeStorage &storage) const;
+
+
+ shared_ptr<SimpleExtender> MakeCoordCoverageExtender(size_t lib_index) const;
+
+
+ shared_ptr<SimpleExtender> MakeRNAExtender(size_t lib_index, bool investigate_loops) const;
+
+
+ void PrintExtenders(const vector<shared_ptr<PathExtender>> &extenders) const;
+
+};
+
+}
diff --git a/src/common/modules/path_extend/pipeline/launch_support.cpp b/src/common/modules/path_extend/pipeline/launch_support.cpp
new file mode 100644
index 0000000..3be9ce5
--- /dev/null
+++ b/src/common/modules/path_extend/pipeline/launch_support.cpp
@@ -0,0 +1,128 @@
+//
+// Created by andrey on 10.10.16.
+//
+
+#include "launch_support.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+bool PELaunchSupport::HasOnlyMPLibs() const {
+ for (const auto &lib : dataset_info_.reads) {
+ if (!(lib.is_mate_pair() && lib.data().mean_insert_size > 0.0)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+pe_config::ParamSetT::ExtensionOptionsT PELaunchSupport::GetExtensionOpts(shared_ptr<PairedInfoLibrary> lib,
+ const pe_config::ParamSetT &pset) const {
+ return lib->IsMp() ? pset.mate_pair_options : pset.extension_options;
+}
+
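+// Falls back to the configured default single threshold when its use is requested or when the library-specific threshold is not positive.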
+double PELaunchSupport::SingleThresholdForLib(const pe_config::ParamSetT &pset,
+ double threshold) const {
+ return pset.extension_options.use_default_single_threshold || math::le(threshold, 0.0) ?
+ pset.extension_options.single_threshold : threshold;
+}
+
+bool PELaunchSupport::IsForSingleReadExtender(const io::SequencingLibrary<config::DataSetData> &lib) const {
+ return (lib.data().single_reads_mapped || lib.is_long_read_lib() || lib.is_contig_lib());
+}
+bool PELaunchSupport::IsForSingleReadScaffolder(const io::SequencingLibrary<config::DataSetData> &lib) const {
+ return (lib.is_long_read_lib() || (lib.is_contig_lib() && lib.type() != io::LibraryType::PathExtendContigs));
+}
+
+bool PELaunchSupport::IsForPEExtender(const io::SequencingLibrary<config::DataSetData> &lib) const {
+ return (lib.type() == io::LibraryType::PairedEnd && lib.data().mean_insert_size > 0.0);
+}
+
+bool PELaunchSupport::IsForShortLoopExtender(const io::SequencingLibrary<config::DataSetData> &lib) const {
+ return (lib.type() == io::LibraryType::PairedEnd && lib.data().mean_insert_size > 0.0);
+}
+
+bool PELaunchSupport::IsForScaffoldingExtender(const io::SequencingLibrary<config::DataSetData> &lib) const {
+ return (lib.type() == io::LibraryType::PairedEnd && lib.data().mean_insert_size > 0.0);
+}
+
+//TODO: review usage
+bool PELaunchSupport::UseCoverageResolverForSingleReads(const io::LibraryType &type) const {
+ return HasOnlyMPLibs() && (type == io::LibraryType::HQMatePairs);
+}
+
+std::string PELaunchSupport::LibStr(size_t count) const {
+ return count == 1 ? "library" : "libraries";
+}
+
+pe_config::LongReads PELaunchSupport::GetLongReadsConfig(const io::LibraryType &type) const {
+ if (io::SequencingLibraryBase::is_long_read_lib(type)) {
+ return params_.pe_cfg.long_reads.pacbio_reads;
+ } else if (type == io::LibraryType::PathExtendContigs) {
+ return params_.pe_cfg.long_reads.meta_contigs;
+ } else if (io::SequencingLibraryBase::is_contig_lib(type)) {
+ return params_.pe_cfg.long_reads.contigs;
+ }
+ return params_.pe_cfg.long_reads.single_reads;
+}
+
+size_t PELaunchSupport::FindMaxMPIS() const {
+ size_t max_is = 0;
+ for (size_t i = 0; i < dataset_info_.reads.lib_count(); ++i) {
+ if (dataset_info_.reads[i].is_mate_pair()) {
+ max_is = max(max_is, (size_t) dataset_info_.reads[i].data().mean_insert_size);
+ }
+ }
+ return max_is;
+}
+
+bool PELaunchSupport::HasLongReads() const {
+ return path_extend::HasLongReads(dataset_info_);
+}
+
+bool PELaunchSupport::HasLongReadsScaffolding() const {
+ for (const auto &lib : dataset_info_.reads) {
+ if (IsForSingleReadScaffolder(lib))
+ return true;
+ }
+ return false;
+}
+
+bool PELaunchSupport::HasMPReads() const {
+ for (const auto &lib : dataset_info_.reads) {
+ if (lib.is_mate_pair()) {
+ return true;
+ }
+ }
+ return false;
+}
+bool PELaunchSupport::SingleReadsMapped() const {
+ for (const auto &lib : dataset_info_.reads) {
+ if (lib.data().single_reads_mapped) {
+ return true;
+ }
+ }
+ return false;
+}
+
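+// Library coverage is estimated as the library's fraction of all graph-constructing nucleotides times the average graph coverage.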
+double PELaunchSupport::EstimateLibCoverage(size_t lib_index) const {
+ double cov_fraction = double(dataset_info_.reads[lib_index].data().total_nucls) / double(TotalNuclsInGraph());
+ return cov_fraction * dataset_info_.avg_coverage();
+}
+
+size_t PELaunchSupport::TotalNuclsInGraph() const {
+ size_t total_nc_count = 0;
+ for (const auto &lib: dataset_info_.reads) {
+ if (lib.is_graph_contructable())
+ total_nc_count += lib.data().total_nucls;
+ }
+ return total_nc_count;
+}
+
+
+bool PELaunchSupport::NeedsUniqueEdgeStorage() const {
+ return !(params_.pset.sm == sm_old ||
+ (params_.pset.sm == sm_old_pe_2015 && !HasLongReadsScaffolding() && !HasMPReads()));
+}
+}
diff --git a/src/common/modules/path_extend/pipeline/launch_support.hpp b/src/common/modules/path_extend/pipeline/launch_support.hpp
new file mode 100644
index 0000000..53870af
--- /dev/null
+++ b/src/common/modules/path_extend/pipeline/launch_support.hpp
@@ -0,0 +1,145 @@
+//
+// Created by andrey on 10.10.16.
+//
+
+#pragma once
+
+
+#include "modules/path_extend/paired_library.hpp"
+#include "pipeline/config_struct.hpp"
+#include "modules/path_extend/pe_config_struct.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
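+// Largest insert-size right quantile over all paired libraries (mate pairs can be excluded); used, e.g., as a bound for the path polisher gap.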
+inline size_t FindMaxISRightQuantile(const config::dataset& dataset_info, bool include_mate_pairs = true) {
+ size_t res = 0;
+ for (const auto& lib : dataset_info.reads) {
+ if (lib.is_paired()) {
+ if (lib.is_mate_pair() && !include_mate_pairs)
+ continue;
+ res = max(res, (size_t) lib.data().insert_size_right_quantile);
+ }
+ }
+ return res;
+}
+
+inline bool HasLongReads(const config::dataset& dataset_info) {
+ for (const auto& lib : dataset_info.reads) {
+ if (lib.is_long_read_lib() || lib.is_contig_lib()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+struct PathExtendParamsContainer {
+
+ PathExtendParamsContainer(const config::dataset& dataset_info,
+ const pe_config::MainPEParamsT& pe_cfg_,
+ const std::string& output_dir_,
+ config::pipeline_type mode_,
+ bool uneven_depth_,
+ bool avoid_rc_connections_,
+ bool use_scaffolder_):
+ pe_cfg(pe_cfg_),
+ pset(pe_cfg_.param_set),
+ output_dir(output_dir_),
+ etc_dir(output_dir + pe_cfg_.etc_dir + "/"),
+ mode(mode_),
+ uneven_depth(uneven_depth_),
+ avoid_rc_connections(avoid_rc_connections_),
+ use_scaffolder(use_scaffolder_),
+ traverse_loops(true),
+ detect_repeats_online(mode_ != config::pipeline_type::meta && mode_ != config::pipeline_type::rna)
+ {
+ if (!(use_scaffolder && pset.scaffolder_options.enabled)) {
+ traverse_loops = false;
+ }
+ if (mode_ == config::pipeline_type::rna)
+ traverse_loops = false;
+
+ //Parameters are subject to change
+ max_polisher_gap = FindMaxISRightQuantile(dataset_info);
+ //TODO: params
+ if (HasLongReads(dataset_info))
+ max_polisher_gap = max(max_polisher_gap, size_t(10000));
+
+ min_edge_len = 100;
+ max_path_diff = mode == config::pipeline_type::rna ? 1 : FindMaxISRightQuantile(dataset_info);
+ }
+
+ const pe_config::MainPEParamsT& pe_cfg;
+ const pe_config::ParamSetT& pset;
+
+ std::string output_dir;
+ std::string etc_dir;
+
+ config::pipeline_type mode;
+ bool uneven_depth;
+
+ bool avoid_rc_connections;
+ bool use_scaffolder;
+ bool traverse_loops;
+ bool detect_repeats_online;
+
+ size_t min_edge_len;
+ size_t max_path_diff;
+ size_t max_polisher_gap;
+ //TODO: move here size_t max_repeat_length;
+};
+
+
+class PELaunchSupport {
+ const config::dataset& dataset_info_;
+ const PathExtendParamsContainer& params_;
+
+public:
+
+ PELaunchSupport(const config::dataset& dataset_info,
+ const PathExtendParamsContainer& params):
+ dataset_info_(dataset_info),
+ params_(params) { }
+
+ pe_config::ParamSetT::ExtensionOptionsT GetExtensionOpts(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT& pset) const;
+
+ double SingleThresholdForLib(const pe_config::ParamSetT &pset, double threshold) const;
+
+ bool HasOnlyMPLibs() const;
+
+ bool IsForSingleReadExtender(const io::SequencingLibrary<config::DataSetData> &lib) const;
+
+ bool IsForSingleReadScaffolder(const io::SequencingLibrary<config::DataSetData> &lib) const;
+
+ bool IsForPEExtender(const io::SequencingLibrary<config::DataSetData> &lib) const;
+
+ bool IsForShortLoopExtender(const io::SequencingLibrary<config::DataSetData> &lib) const;
+
+ bool IsForScaffoldingExtender(const io::SequencingLibrary<config::DataSetData> &lib) const;
+
+ bool UseCoverageResolverForSingleReads(const io::LibraryType& type) const;
+
+ std::string LibStr(size_t count) const;
+
+ pe_config::LongReads GetLongReadsConfig(const io::LibraryType &type) const;
+
+ size_t FindMaxMPIS() const;
+
+ bool HasLongReads() const;
+
+ bool HasLongReadsScaffolding() const;
+
+ bool HasMPReads() const;
+
+ bool SingleReadsMapped() const;
+
+ double EstimateLibCoverage(size_t lib_index) const;
+
+ size_t TotalNuclsInGraph() const;
+
+ bool NeedsUniqueEdgeStorage() const;
+
+};
+
+}
diff --git a/src/common/modules/path_extend/pipeline/launcher.cpp b/src/common/modules/path_extend/pipeline/launcher.cpp
new file mode 100644
index 0000000..98540b6
--- /dev/null
+++ b/src/common/modules/path_extend/pipeline/launcher.cpp
@@ -0,0 +1,448 @@
+//
+// Created by andrey on 14.11.16.
+//
+
+#include "launcher.hpp"
+
+#include "modules/path_extend/path_visualizer.hpp"
+#include "modules/path_extend/loop_traverser.hpp"
+#include "modules/alignment/long_read_storage.hpp"
+#include "modules/path_extend/scaffolder2015/extension_chooser2015.hpp"
+#include "modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp"
+#include "modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp"
+#include "assembly_graph/graph_support/coverage_uniformity_analyzer.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+#include "modules/path_extend/scaffolder2015/path_polisher.hpp"
+
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+using namespace std;
+
+
+vector<shared_ptr<ConnectionCondition>>
+ PathExtendLauncher::ConstructPairedConnectionConditions(const ScaffoldingUniqueEdgeStorage& edge_storage) const {
+
+ vector<shared_ptr<ConnectionCondition>> conditions;
+ const pe_config::ParamSetT::ScaffoldGraphParamsT ¶ms = params_.pset.scaffold_graph_params;
+
+ for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); ++lib_index) {
+ const auto &lib = dataset_info_.reads[lib_index];
+ if (lib.is_paired()) {
+ shared_ptr<PairedInfoLibrary> paired_lib;
+ if (lib.is_mate_pair())
+ paired_lib = MakeNewLib(gp_.g, lib, gp_.paired_indices[lib_index]);
+ else if (lib.type() == io::LibraryType::PairedEnd)
+ paired_lib = MakeNewLib(gp_.g, lib, gp_.clustered_indices[lib_index]);
+ else {
+ INFO("Unusable for scaffold graph paired lib #" << lib_index);
+ continue;
+ }
+ conditions.push_back(make_shared<ScaffoldGraphPairedConnectionCondition>(gp_.g, edge_storage.GetSet(),
+ paired_lib, lib_index,
+ params.always_add,
+ params.never_add,
+ params.relative_threshold));
+ }
+ }
+ return conditions;
+}
+
+shared_ptr<scaffold_graph::ScaffoldGraph> PathExtendLauncher::ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage &edge_storage) const {
+ using namespace scaffold_graph;
+
+ const pe_config::ParamSetT::ScaffoldGraphParamsT ¶ms = params_.pset.scaffold_graph_params;
+
+ INFO("Constructing connections");
+ LengthLowerBound edge_condition(gp_.g, edge_storage.GetMinLength());
+
+ vector<shared_ptr<ConnectionCondition>> conditions =
+ ConstructPairedConnectionConditions(edge_storage);
+
+ if (params.use_graph_connectivity) {
+ auto as_con = make_shared<AssemblyGraphConnectionCondition>(gp_.g, params.max_path_length, edge_storage);
+ as_con->AddInterestingEdges(edge_condition);
+ conditions.push_back(as_con);
+ }
+
+ INFO("Total conditions " << conditions.size());
+
+ INFO("Constructing scaffold graph from set of size " << edge_storage.GetSet().size());
+
+ DefaultScaffoldGraphConstructor constructor(gp_.g, edge_storage.GetSet(), conditions, edge_condition);
+ auto scaffold_graph = constructor.Construct();
+
+ INFO("Scaffold graph contains " << scaffold_graph->VertexCount() << " vertices and " << scaffold_graph->EdgeCount()
+ << " edges");
+ return scaffold_graph;
+}
+
+void PathExtendLauncher::PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph,
+ const set<EdgeId> &main_edge_set,
+ const debruijn_graph::GenomeConsistenceChecker &genome_checker,
+ const string &filename) const {
+ using namespace scaffold_graph;
+
+ INFO("Constructing reference labels");
+ map<debruijn_graph::EdgeId, string> edge_labels;
+ size_t count = 0;
+ for (const auto &edge_coord_pair: genome_checker.ConstructEdgeOrder()) {
+ if (edge_labels.find(edge_coord_pair.first) == edge_labels.end()) {
+ edge_labels[edge_coord_pair.first] = "";
+ }
+ edge_labels[edge_coord_pair.first] += "order: " + ToString(count) +
+ "\n mapped range: " + ToString(edge_coord_pair.second.mapped_range.start_pos) + " : "
+ + ToString(edge_coord_pair.second.mapped_range.end_pos) +
+ "\n init range: " + ToString(edge_coord_pair.second.initial_range.start_pos) + " : "
+ + ToString(edge_coord_pair.second.initial_range.end_pos) + "\n";
+ ++count;
+ }
+
+ auto vertex_colorer = make_shared<ScaffoldVertexSetColorer>(main_edge_set);
+ auto edge_colorer = make_shared<ScaffoldEdgeColorer>();
+ graph_colorer::CompositeGraphColorer<ScaffoldGraph> colorer(vertex_colorer, edge_colorer);
+
+ INFO("Visualizing scaffold graph");
+ ScaffoldGraphVisualizer singleVisualizer(scaffold_graph, edge_labels);
+ std::ofstream single_dot;
+ single_dot.open((filename + "_single.dot").c_str());
+ singleVisualizer.Visualize(single_dot, colorer);
+ single_dot.close();
+
+ INFO("Printing scaffold graph");
+ std::ofstream data_stream;
+ data_stream.open((filename + ".data").c_str());
+ scaffold_graph.Print(data_stream);
+ data_stream.close();
+}
+
+
+void PathExtendLauncher::MakeAndOutputScaffoldGraph() const {
+ //Scaffold graph
+ shared_ptr<scaffold_graph::ScaffoldGraph> scaffold_graph;
+ if (params_.pset.scaffold_graph_params.construct) {
+ debruijn_graph::GenomeConsistenceChecker genome_checker(gp_, unique_data_.main_unique_storage_,
+ params_.pset.genome_consistency_checker.max_gap,
+ params_.pset.genome_consistency_checker.relative_max_gap);
+ scaffold_graph = ConstructScaffoldGraph(unique_data_.main_unique_storage_);
+ if (params_.pset.scaffold_graph_params.output) {
+ PrintScaffoldGraph(*scaffold_graph,
+ unique_data_.main_unique_storage_.GetSet(),
+ genome_checker,
+ params_.etc_dir + "scaffold_graph");
+ }
+ }
+}
+
+void PathExtendLauncher::CountMisassembliesWithReference(const PathContainer &paths) const {
+ if (gp_.genome.size() == 0)
+ return;
+
+ debruijn_graph::GenomeConsistenceChecker genome_checker(gp_, unique_data_.main_unique_storage_,
+ params_.pset.genome_consistency_checker.max_gap,
+ params_.pset.genome_consistency_checker.relative_max_gap);
+
+ size_t total_mis = 0, gap_mis = 0;
+ genome_checker.SpellGenome();
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+ BidirectionalPath *path = iter.get();
+ auto map_res = genome_checker.CountMisassemblies(*path);
+ if (map_res.misassemblies > 0) {
+ INFO ("there are " << map_res.misassemblies << " misassemblies in path: ");
+ path->PrintInfo();
+ total_mis += map_res.misassemblies;
+ }
+ if (map_res.wrong_gap_size > 0) {
+ INFO ("there are " << map_res.wrong_gap_size << " wrong gaps in path: ");
+ path->PrintInfo();
+ gap_mis += map_res.wrong_gap_size;
+ }
+ }
+ INFO ("In total found " << total_mis << " misassemblies " << " and " << gap_mis << " gaps.");
+}
+
+
+void PathExtendLauncher::EstimateUniqueEdgesParams() {
+ bool uniform_coverage = false;
+ if (params_.pset.uniqueness_analyser.enabled) {
+ INFO("Autodetecting unique edge set parameters...");
+ unique_data_.min_unique_length_ = max(unique_data_.min_unique_length_, support_.FindMaxMPIS());
+ INFO("Minimal unique edge length set to the smallest MP library IS: " << unique_data_.min_unique_length_);
+
+ CoverageUniformityAnalyzer coverage_analyzer(gp_.g, unique_data_.min_unique_length_);
+ double median_coverage = coverage_analyzer.CountMedianCoverage();
+ double uniformity_fraction = coverage_analyzer.UniformityFraction(unique_data_.unique_variation_, median_coverage);
+ INFO ("median coverage for edges longer than " << unique_data_.min_unique_length_ << " is " << median_coverage <<
+ " uniformity " << size_t(uniformity_fraction * 100) << "%");
+ if (math::gr(uniformity_fraction, params_.pset.uniqueness_analyser.uniformity_fraction_threshold)) {
+ uniform_coverage = true;
+ }
+ if (!uniform_coverage) {
+ unique_data_.unique_variation_ = params_.pset.uniqueness_analyser.nonuniform_coverage_variation;
+ INFO("Coverage is not uniform, we do not rely on coverage for long edge uniqueness");
+ }
+
+ } else {
+ INFO("Unique edge set constructed with parameters from config : length " << unique_data_.min_unique_length_
+ << " variation " << unique_data_.unique_variation_);
+ }
+}
+
+
+void PathExtendLauncher::FillUniqueEdgeStorage() {
+ ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp_, unique_data_.min_unique_length_, unique_data_.unique_variation_);
+ unique_edge_analyzer.FillUniqueEdgeStorage(unique_data_.main_unique_storage_);
+}
+
+void PathExtendLauncher::DebugOutputPaths(const PathContainer &paths, const string &name) const {
+ if (!params_.pe_cfg.debug_output) {
+ return;
+ }
+ PathInfoWriter path_writer;
+ PathVisualizer visualizer;
+
+ writer_.OutputPaths(paths, params_.etc_dir + name);
+ if (params_.pe_cfg.output.write_paths) {
+ path_writer.WritePaths(paths, params_.etc_dir + name + ".dat");
+ }
+ if (params_.pe_cfg.viz.print_paths) {
+ visualizer.writeGraphWithPathsSimple(gp_, params_.etc_dir + name + ".dot", name, paths);
+ }
+}
+
+void PathExtendLauncher::FinalizePaths(PathContainer &paths,
+ GraphCoverageMap &cover_map,
+ const PathExtendResolver &resolver) const {
+
+ if (params_.pset.remove_overlaps) {
+ resolver.RemoveOverlaps(paths, cover_map, params_.min_edge_len, params_.max_path_diff,
+ params_.pset.cut_all_overlaps,
+ (params_.mode == config::pipeline_type::moleculo));
+ } else if (params_.mode == config::pipeline_type::rna) {
+ resolver.RemoveRNAOverlaps(paths, cover_map, params_.min_edge_len, params_.max_path_diff);
+ } else {
+ resolver.RemoveEqualPaths(paths, cover_map, params_.min_edge_len);
+ }
+
+ if (params_.avoid_rc_connections) {
+ paths.FilterInterstandBulges();
+ }
+ paths.FilterEmptyPaths();
+ resolver.AddUncoveredEdges(paths, cover_map);
+
+ if (params_.pset.path_filtration.enabled) {
+ LengthPathFilter(gp_.g, params_.pset.path_filtration.min_length).filter(paths);
+ IsolatedPathFilter(gp_.g,
+ params_.pset.path_filtration.min_length_for_low_covered,
+ params_.pset.path_filtration.min_coverage).filter(paths);
+ IsolatedPathFilter(gp_.g, params_.pset.path_filtration.isolated_min_length).filter(paths);
+ }
+ paths.SortByLength();
+ for (auto &path : paths) {
+ path.first->ResetOverlaps();
+ }
+}
+
+void PathExtendLauncher::TraverseLoops(PathContainer &paths, GraphCoverageMap &cover_map) const {
+ INFO("Traversing tandem repeats");
+
+ LoopTraverser
+ loopTraverser(cover_map.graph(), cover_map,
+ params_.pset.loop_traversal.min_edge_length,
+ params_.pset.loop_traversal.max_component_size,
+ params_.pset.loop_traversal.max_path_length);
+ size_t res = loopTraverser.TraverseAllLoops();
+ paths.SortByLength();
+
+ INFO("Traversed " << res << " loops");
+}
+
+Extenders PathExtendLauncher::ConstructMPExtender(const ExtendersGenerator &generator, size_t unique_edge_len) {
+ ScaffoldingUniqueEdgeAnalyzer additional_edge_analyzer(gp_, (size_t) unique_edge_len, unique_data_.unique_variation_);
+ unique_data_.unique_storages_.push_back(make_shared<ScaffoldingUniqueEdgeStorage>());
+ additional_edge_analyzer.FillUniqueEdgeStorage(*unique_data_.unique_storages_.back());
+
+ return generator.MakeMPExtenders(*unique_data_.unique_storages_.back());
+}
+
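+// Mate-pair extenders are constructed for progressively shorter unique-edge length thresholds, from min_unique_length_ down to the configured lower bound.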
+Extenders PathExtendLauncher::ConstructMPExtenders(const ExtendersGenerator &generator) {
+ const pe_config::ParamSetT &pset = params_.pset;
+
+ Extenders extenders = generator.MakeMPExtenders(unique_data_.main_unique_storage_);
+ INFO("Using " << extenders.size() << " mate-pair " << support_.LibStr(extenders.size()));
+
+ size_t cur_length = unique_data_.min_unique_length_ - pset.scaffolding2015.unique_length_step;
+ size_t lower_bound = max(pset.scaffolding2015.unique_length_lower_bound, pset.scaffolding2015.unique_length_step);
+
+ while (cur_length > lower_bound) {
+ INFO("Adding extender with length " << cur_length);
+ push_back_all(extenders, ConstructMPExtender(generator, cur_length));
+ cur_length -= pset.scaffolding2015.unique_length_step;
+ }
+ if (unique_data_.min_unique_length_ > lower_bound) {
+ INFO("Adding final extender with length " << lower_bound);
+ push_back_all(extenders, ConstructMPExtender(generator, lower_bound));
+ }
+
+ return extenders;
+}
+
+void PathExtendLauncher::FillPathContainer(size_t lib_index, size_t size_threshold) {
+ std::vector<PathInfo<Graph>> paths;
+ gp_.single_long_reads[lib_index].SaveAllPaths(paths);
+ for (const auto &path: paths) {
+ const auto &edges = path.getPath();
+ if (edges.size() <= size_threshold)
+ continue;
+
+ BidirectionalPath *new_path = new BidirectionalPath(gp_.g, edges);
+ BidirectionalPath *conj_path = new BidirectionalPath(new_path->Conjugate());
+ new_path->SetWeight((float) path.getWeight());
+ conj_path->SetWeight((float) path.getWeight());
+ unique_data_.long_reads_paths_[lib_index]->AddPair(new_path, conj_path);
+ }
+ DEBUG("Long reads paths " << unique_data_.long_reads_paths_[lib_index]->size());
+ unique_data_.long_reads_cov_map_[lib_index]->AddPaths(*unique_data_.long_reads_paths_[lib_index]);
+}
+
+
+void PathExtendLauncher::FillLongReadsCoverageMaps() {
+ for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); lib_index++) {
+ unique_data_.long_reads_paths_.push_back(make_shared<PathContainer>());
+ unique_data_.long_reads_cov_map_.push_back(make_shared<GraphCoverageMap>(gp_.g));
+ if (support_.IsForSingleReadExtender(dataset_info_.reads[lib_index])) {
+ FillPathContainer(lib_index);
+ }
+ }
+}
+
+void PathExtendLauncher::FillPBUniqueEdgeStorages() {
+ //FIXME magic constants
+ ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer_pb(gp_, 500, 0.5);
+
+ INFO("Filling backbone edges for long reads scaffolding...");
+ if (params_.uneven_depth) {
+ INFO(" with long reads paths");
+ //TODO: multiple libraries?
+ for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); lib_index++) {
+ if (support_.IsForSingleReadScaffolder(dataset_info_.reads[lib_index])) {
+ unique_edge_analyzer_pb.FillUniqueEdgesWithLongReads(unique_data_.long_reads_cov_map_[lib_index],
+ unique_data_.unique_pb_storage_,
+ support_.GetLongReadsConfig(dataset_info_.reads[lib_index].type()));
+ }
+ }
+ INFO("Removing fake unique with paired-end libs");
+ for (size_t lib_index = 0; lib_index < dataset_info_.reads.lib_count(); lib_index++) {
+ if (dataset_info_.reads[lib_index].type() == io::LibraryType::PairedEnd) {
+ unique_edge_analyzer_pb.ClearLongEdgesWithPairedLib(lib_index, unique_data_.unique_pb_storage_);
+ }
+ }
+
+ } else {
+ INFO(" with coverage")
+ unique_edge_analyzer_pb.FillUniqueEdgeStorage(unique_data_.unique_pb_storage_);
+ }
+ INFO(unique_data_.unique_pb_storage_.size() << " unique edges");
+}
+
+Extenders PathExtendLauncher::ConstructPBExtenders(const ExtendersGenerator &generator) {
+ FillPBUniqueEdgeStorages();
+ return generator.MakePBScaffoldingExtenders(unique_data_.unique_pb_storage_,
+ unique_data_.long_reads_cov_map_);
+}
+
+
+Extenders PathExtendLauncher::ConstructExtenders(const GraphCoverageMap& cover_map) {
+ INFO("Creating main extenders, unique edge length = " << unique_data_.min_unique_length_);
+ if (support_.SingleReadsMapped() || support_.HasLongReads())
+ FillLongReadsCoverageMaps();
+
+ ExtendersGenerator generator(dataset_info_, params_, gp_, cover_map, support_);
+ Extenders extenders = generator.MakeBasicExtenders(unique_data_.main_unique_storage_,
+ unique_data_.long_reads_cov_map_);
+
+ //long reads scaffolding extenders.
+ if (support_.HasLongReads()) {
+ if (params_.pset.sm == sm_old) {
+ INFO("Will not use new long read scaffolding algorithm in this mode");
+ } else {
+ push_back_all(extenders, ConstructPBExtenders(generator));
+ }
+ }
+
+ if (support_.HasMPReads()) {
+ if (params_.pset.sm == sm_old) {
+ INFO("Will not use mate-pairs is this mode");
+ } else {
+ push_back_all(extenders, ConstructMPExtenders(generator));
+ }
+ }
+
+ if (params_.pset.use_coordinated_coverage)
+ push_back_all(extenders, generator.MakeCoverageExtenders());
+
+ INFO("Total number of extenders is " << extenders.size());
+ return extenders;
+}
+
+void PathExtendLauncher::PolishPaths(const PathContainer &paths, PathContainer &result) const {
+ //Fixes distances for paths gaps and tries to fill them in
+ INFO("Closing gaps in paths");
+ PathPolisher polisher(gp_, dataset_info_, unique_data_.main_unique_storage_, params_.max_polisher_gap);
+ polisher.PolishPaths(paths, result);
+ result.SortByLength();
+ INFO("Gap closing completed")
+}
+
+void PathExtendLauncher::Launch() {
+ INFO("ExSPAnder repeat resolving tool started");
+ make_dir(params_.output_dir);
+ make_dir(params_.etc_dir);
+
+ if (support_.NeedsUniqueEdgeStorage()) {
+ //Fill the storage to enable unique edge check
+ EstimateUniqueEdgesParams();
+ FillUniqueEdgeStorage();
+ }
+
+ MakeAndOutputScaffoldGraph();
+
+ PathExtendResolver resolver(gp_.g);
+
+ auto seeds = resolver.MakeSimpleSeeds();
+ seeds.SortByLength();
+ DebugOutputPaths(seeds, "init_paths");
+
+ GraphCoverageMap cover_map(gp_.g);
+ Extenders extenders = ConstructExtenders(cover_map);
+ shared_ptr<CompositeExtender> composite_extender = make_shared<CompositeExtender>(gp_.g, cover_map, extenders,
+ unique_data_.main_unique_storage_,
+ params_.max_path_diff,
+ params_.pset.extension_options.max_repeat_length,
+ params_.detect_repeats_online);
+
+ auto paths = resolver.ExtendSeeds(seeds, *composite_extender);
+ paths.FilterEmptyPaths();
+ paths.SortByLength();
+ DebugOutputPaths(paths, "raw_paths");
+
+ FinalizePaths(paths, cover_map, resolver);
+ DebugOutputPaths(paths, "before_loop_traversal");
+
+ TraverseLoops(paths, cover_map);
+ DebugOutputPaths(paths, "loop_traveresed");
+
+ PolishPaths(paths, gp_.contig_paths);
+ DebugOutputPaths(gp_.contig_paths, "polished_paths");
+
+ GraphCoverageMap polished_map(gp_.g, gp_.contig_paths, true);
+ FinalizePaths(gp_.contig_paths, polished_map, resolver);
+ DebugOutputPaths(gp_.contig_paths, "final_paths");
+
+ CountMisassembliesWithReference(gp_.contig_paths);
+
+ INFO("ExSPAnder repeat resolving tool finished");
+}
+
+}
diff --git a/src/common/modules/path_extend/pipeline/launcher.hpp b/src/common/modules/path_extend/pipeline/launcher.hpp
new file mode 100644
index 0000000..e936f58
--- /dev/null
+++ b/src/common/modules/path_extend/pipeline/launcher.hpp
@@ -0,0 +1,115 @@
+//
+// Created by andrey on 14.11.16.
+//
+
+#ifndef PROJECT_LAUNCHER_H
+#define PROJECT_LAUNCHER_H
+
+#include "launch_support.hpp"
+#include "extenders_logic.hpp"
+
+#include "modules/path_extend/pe_resolver.hpp"
+#include "modules/genome_consistance_checker.hpp"
+#include "modules/path_extend/scaffolder2015/scaffold_graph.hpp"
+#include "assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp"
+
+namespace path_extend {
+
+using namespace debruijn_graph;
+
+class PathExtendLauncher {
+
+private:
+ const config::dataset& dataset_info_;
+ const PathExtendParamsContainer& params_;
+ conj_graph_pack& gp_;
+ PELaunchSupport support_;
+
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector_;
+ DefaultContigConstructor<ConjugateDeBruijnGraph> constructor_;
+ shared_ptr<ContigNameGenerator> contig_name_generator_;
+ ContigWriter writer_;
+
+ struct {
+ size_t min_unique_length_;
+ double unique_variation_;
+
+ ScaffoldingUniqueEdgeStorage main_unique_storage_;
+ vector<shared_ptr<ScaffoldingUniqueEdgeStorage>> unique_storages_;
+
+ ScaffoldingUniqueEdgeStorage unique_pb_storage_;
+ vector<shared_ptr<PathContainer>> long_reads_paths_;
+ vector<shared_ptr<GraphCoverageMap>> long_reads_cov_map_;
+ } unique_data_;
+
+ vector<shared_ptr<ConnectionCondition>>
+ ConstructPairedConnectionConditions(const ScaffoldingUniqueEdgeStorage& edge_storage) const;
+
+ shared_ptr<scaffold_graph::ScaffoldGraph>
+ ConstructScaffoldGraph(const ScaffoldingUniqueEdgeStorage& edge_storage) const;
+
+ void PrintScaffoldGraph(const scaffold_graph::ScaffoldGraph &scaffold_graph,
+ const set<EdgeId>& main_edge_set,
+ const debruijn_graph::GenomeConsistenceChecker& genome_checker,
+ const string& filename) const;
+
+ void MakeAndOutputScaffoldGraph() const;
+
+ void CountMisassembliesWithReference(const PathContainer& paths) const;
+
+ void EstimateUniqueEdgesParams();
+
+ void FillUniqueEdgeStorage();
+
+ void FillPBUniqueEdgeStorages();
+
+ void FillPathContainer(size_t lib_index, size_t size_threshold = 1);
+
+ void FillLongReadsCoverageMaps();
+
+ void DebugOutputPaths(const PathContainer& paths, const string& name) const;
+
+ void FinalizePaths(PathContainer& paths, GraphCoverageMap &cover_map, const PathExtendResolver&resolver) const;
+
+ void TraverseLoops(PathContainer& paths, GraphCoverageMap& cover_map) const;
+
+ void PolishPaths(const PathContainer &paths, PathContainer &result) const;
+
+ Extenders ConstructExtenders(const GraphCoverageMap& cover_map);
+
+ Extenders ConstructMPExtenders(const ExtendersGenerator &generator);
+
+ Extenders ConstructMPExtender(const ExtendersGenerator &generator, size_t uniqe_edge_len);
+
+ Extenders ConstructPBExtenders(const ExtendersGenerator &generator);
+
+
+public:
+
+ PathExtendLauncher(const config::dataset& dataset_info,
+ const PathExtendParamsContainer& params,
+ conj_graph_pack& gp):
+ dataset_info_(dataset_info),
+ params_(params),
+ gp_(gp),
+ support_(dataset_info, params),
+ corrector_(gp.g),
+ constructor_(gp.g, corrector_),
+ contig_name_generator_(MakeContigNameGenerator(params_.mode, gp)),
+ writer_(gp.g, constructor_, gp_.components, contig_name_generator_),
+ unique_data_()
+ {
+ unique_data_.min_unique_length_ = params.pset.scaffolding2015.unique_length_upper_bound;
+ unique_data_.unique_variation_ = params.pset.uniqueness_analyser.unique_coverage_variation;
+ }
+
+ ~PathExtendLauncher() {
+ }
+
+ void Launch();
+
+};
+
+}
+
+#endif //PROJECT_LAUNCHER_H
diff --git a/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp
new file mode 100644
index 0000000..9149f3c
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.cpp
@@ -0,0 +1,260 @@
+#include "connection_condition2015.hpp"
+namespace path_extend {
+
+
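+//Default implementation: take all connections of e and keep only those leading to edges
+//marked as unique in the given storage.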
+map <debruijn_graph::EdgeId, double> ConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e, const ScaffoldingUniqueEdgeStorage& storage) const {
+ auto all_edges = this->ConnectedWith(e);
+ map <debruijn_graph::EdgeId, double> res;
+ for (auto edge: all_edges) {
+ if (storage.IsUnique(edge.first)){
+ res.insert(edge);
+ }
+ }
+ return res;
+}
+
+PairedLibConnectionCondition::PairedLibConnectionCondition(const debruijn_graph::Graph &graph,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t min_read_count) :
+ graph_(graph),
+ lib_(lib),
+ lib_index_(lib_index),
+ min_read_count_(min_read_count),
+ //FIXME reconsider condition; config!
+ left_dist_delta_(5 * (int) lib_->GetISMax()),
+ right_dist_delta_(max(5 * (int) lib_->GetIsVar(), int(lib_->GetIS()))) {
+}
+
+size_t PairedLibConnectionCondition::GetLibIndex() const {
+ return lib_index_;
+}
+
+map <debruijn_graph::EdgeId, double> PairedLibConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
+ set <debruijn_graph::EdgeId> all_edges;
+ int e_length = (int) graph_.length(e);
+ lib_->FindJumpEdges(e, all_edges, e_length - left_dist_delta_, e_length + right_dist_delta_);
+
+ map <debruijn_graph::EdgeId, double> result;
+ for (auto edge : all_edges) {
+ double w = GetWeight(e, edge);
+ if (edge != e && edge != graph_.conjugate(e) &&
+ math::ge(w, (double) min_read_count_)) {
+ result[edge] = w;
+ }
+ }
+ return result;
+}
+
+double PairedLibConnectionCondition::GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
+ int e_length = (int) graph_.length(e1);
+ return lib_->CountPairedInfo(e1, e2, e_length - left_dist_delta_, e_length + right_dist_delta_);
+}
+
+LongReadsLibConnectionCondition::LongReadsLibConnectionCondition(const debruijn_graph::Graph &graph,
+ size_t lib_index,
+ size_t min_read_count, const GraphCoverageMap& cov_map):graph_(graph), lib_index_(lib_index), min_read_count_(min_read_count), cov_map_(cov_map){}
+
+map<debruijn_graph::EdgeId, double> LongReadsLibConnectionCondition::ConnectedWith(debruijn_graph::EdgeId ) const {
+ return map <debruijn_graph::EdgeId, double>();
+};
+
+bool LongReadsLibConnectionCondition::CheckPath(BidirectionalPath *path, EdgeId e1, EdgeId e2) const {
+ auto pos1 = path->FindAll(e1);
+ if (pos1.size() != 1) return false;
+ auto pos2 = path->FindAll(e2);
+ if (pos2.size() != 1) {
+ if (pos2.size() >= 2) {
+ DEBUG("Something went wrong:: Edge " << graph_.int_id(e2) << "is called unique but presents in path twice! first edge " << graph_.int_id(e1) << " path ");
+ path->Print();
+ }
+ return false;
+ }
+ if (pos1[0] == path->Size() - 1) return false;
+ return true;
+}
+
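+//For every long-read path covering e, walk forward from the (single) occurrence of e and add
+//the path weight to the first unique edge encountered; connections whose total weight is below
+//min_read_count_ are then dropped.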
+map<debruijn_graph::EdgeId, double> LongReadsLibConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e, const ScaffoldingUniqueEdgeStorage& storage) const {
+ map <debruijn_graph::EdgeId, double> res;
+ auto cov_paths = cov_map_.GetCoveringPaths(e);
+ DEBUG("Got cov paths " << cov_paths.size());
+ for (const auto path: cov_paths) {
+ auto pos1 = path->FindAll(e);
+ if (pos1.size() != 1) {
+ DEBUG("***not unique " << graph_.int_id(e) << " len " << graph_.length(e) << "***");
+ continue;
+ }
+ size_t pos = pos1[0];
+ pos++;
+ while (pos < path->Size()){
+ if (storage.IsUnique(path->At(pos))) {
+ if (CheckPath(path, path->At(pos1[0]), path->At(pos))) {
+ res[path->At(pos)] += path->GetWeight();
+ }
+ break;
+ }
+ pos++;
+ }
+ }
+ DEBUG("Before prefiltering " << res.size());
+ auto iter = res.begin();
+ while (iter != res.end()) {
+ if (iter->second < min_read_count_){
+ iter = res.erase(iter);
+ } else {
+ iter++;
+ }
+ }
+ DEBUG("After prefiltering" << res.size());
+ return res;
+}
+
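+//Gap estimate from long reads: collect (gap, weight) pairs from paths containing both edges
+//and return the weighted median gap.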
+int LongReadsLibConnectionCondition::GetMedianGap(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
+ auto cov_paths = cov_map_.GetCoveringPaths(e1);
+ std::vector<pair<int, double> > h;
+ for (const auto path: cov_paths) {
+ if (CheckPath(path, e1, e2)) {
+ auto pos1 = path->FindAll(e1);
+ auto pos2 = path->FindAll(e2);
+ h.push_back(make_pair(path->LengthAt(pos1[0] + 1) - path->LengthAt(pos2[0]), path->GetWeight()));
+ }
+ }
+ std::sort(h.begin(), h.end());
+ double sum = 0.0;
+ double sum2 = 0.0;
+ for (size_t j = 0; j< h.size(); ++j) {
+ sum += h[j].second;
+ }
+ size_t i = 0;
+ for (; i < h.size(); ++i) {
+ sum2 += h[i].second;
+ if (sum2 * 2 > sum)
+ break;
+ }
+ if (h.size() == 0) {
+ WARN("filtering incorrectness");
+ return 0;
+ }
+
+ return h[i].first;
+}
+
+size_t LongReadsLibConnectionCondition::GetLibIndex() const {
+ return lib_index_;
+}
+
+ScaffoldGraphPairedConnectionCondition::ScaffoldGraphPairedConnectionCondition(const debruijn_graph::Graph &graph,
+ const set<debruijn_graph::EdgeId>& graph_edges,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t always_add,
+ size_t never_add,
+ double relative_threshold):
+ PairedLibConnectionCondition(graph, lib, lib_index, never_add),
+ graph_edges_(graph_edges),
+ always_add_(always_add),
+ never_add_(never_add),
+ relative_threshold_(relative_threshold) {}
+
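+//Adaptive threshold: the cutoff is max(never_add_, min(always_add_, relative_threshold_ * best
+//weight towards a scaffold graph edge)), so weak links are discarded relative to the strongest competitor.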
+map <debruijn_graph::EdgeId, double> ScaffoldGraphPairedConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
+ set <debruijn_graph::EdgeId> all_edges;
+ int e_length = (int) graph_.length(e);
+ lib_->FindJumpEdges(e, all_edges, e_length - left_dist_delta_, e_length + right_dist_delta_);
+
+ double max_weight = 0;
+ for (auto edge : all_edges) {
+ if (edge != e && edge != graph_.conjugate(e)) {
+ double w = GetWeight(e, edge);
+ if (graph_edges_.count(edge) > 0 && math::gr(w, max_weight))
+ max_weight = w;
+ }
+ }
+ double threshold = std::max((double) never_add_, std::min((double) always_add_, max_weight * relative_threshold_));
+ map <debruijn_graph::EdgeId, double> result;
+ for (auto edge : all_edges) {
+ double w = GetWeight(e, edge);
+ if (edge != e && edge != graph_.conjugate(e) &&
+ math::ge(w, threshold)) {
+ result[edge] = w;
+ }
+ }
+ return result;
+}
+
+
+//TODO: We use same part of index twice, is it necessary?
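+//Weighted median over the paired-info distance histogram restricted to the allowed range;
+//the edge length is subtracted to convert the distance estimate into a gap estimate.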
+int PairedLibConnectionCondition::GetMedianGap(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
+ std::vector<int> distances;
+ std::vector<double> weights;
+ int e_length = (int) graph_.length(e1);
+ lib_->CountDistances(e1, e2, distances, weights);
+ std::vector<pair<int, double> >h(distances.size());
+ for (size_t i = 0; i< distances.size(); i++) {
+//TODO:: we make same checks twice! That's bad
+ if (distances[i] >= e_length - left_dist_delta_ && distances[i] <= e_length + right_dist_delta_)
+ h.push_back(std::make_pair(distances[i], weights[i]));
+ }
+//TODO: is it really necessary?
+ std::sort(h.begin(), h.end());
+ double sum = 0.0;
+ double sum2 = 0.0;
+ for (size_t j = 0; j< h.size(); ++j) {
+ sum += h[j].second;
+ }
+ size_t i = 0;
+ for (; i < h.size(); ++i) {
+ sum2 += h[i].second;
+ if (sum2 * 2 > sum)
+ break;
+ }
+ if (h.size() == 0) {
+ WARN("filtering incorrectness");
+ return 0;
+ }
+ return (int) round(h[i].first - e_length);
+}
+
+AssemblyGraphConnectionCondition::AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g,
+ size_t max_connection_length, const ScaffoldingUniqueEdgeStorage & unique_edges) :
+ g_(g), max_connection_length_(max_connection_length), interesting_edge_set_(unique_edges.GetSet()), stored_distances_() {
+}
+
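+//Connections reachable in the assembly graph itself: direct successors of e plus edges reachable
+//by a bounded Dijkstra run of length at most max_connection_length_; results are cached in stored_distances_.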
+map <debruijn_graph::EdgeId, double> AssemblyGraphConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
+ VERIFY_MSG(interesting_edge_set_.find(e)!= interesting_edge_set_.end(), " edge "<< e.int_id() << " not applicable for connection condition");
+ if (stored_distances_.find(e) != stored_distances_.end()) {
+ return stored_distances_[e];
+ }
+ stored_distances_.insert(make_pair(e, map<debruijn_graph::EdgeId, double>()));
+ for (auto connected: g_.OutgoingEdges(g_.EdgeEnd(e))) {
+ if (interesting_edge_set_.find(connected) != interesting_edge_set_.end()) {
+ stored_distances_[e].insert(make_pair(connected, 1));
+ }
+ }
+ DijkstraHelper<debruijn_graph::Graph>::BoundedDijkstra dijkstra(
+ DijkstraHelper<debruijn_graph::Graph>::CreateBoundedDijkstra(g_, max_connection_length_));
+ dijkstra.Run(g_.EdgeEnd(e));
+ for (auto v: dijkstra.ReachedVertices()) {
+ for (auto connected: g_.OutgoingEdges(v)) {
+ if (interesting_edge_set_.find(connected) != interesting_edge_set_.end() && dijkstra.GetDistance(v) < max_connection_length_) {
+ stored_distances_[e].insert(make_pair(connected, 1));
+ }
+ }
+ }
+ return stored_distances_[e];
+}
+void AssemblyGraphConnectionCondition::AddInterestingEdges(func::TypedPredicate<typename Graph::EdgeId> edge_condition) {
+ for (auto e_iter = g_.ConstEdgeBegin(); !e_iter.IsEnd(); ++e_iter) {
+ if (edge_condition(*e_iter))
+ interesting_edge_set_.insert(*e_iter);
+ }
+}
+
+size_t AssemblyGraphConnectionCondition::GetLibIndex() const {
+    return (size_t) -1;
+}
+
+int AssemblyGraphConnectionCondition::GetMedianGap (debruijn_graph::EdgeId, debruijn_graph::EdgeId) const {
+ return 0;
+}
+
+}
diff --git a/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp
new file mode 100644
index 0000000..be1f51c
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/connection_condition2015.hpp
@@ -0,0 +1,143 @@
+#pragma once
+#include "modules/genome_consistance_checker.hpp"
+#include "utils/logger/logger.hpp"
+#include "modules/path_extend/paired_library.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+#include "modules/alignment/long_read_storage.hpp"
+#include "modules/path_extend/pe_utils.hpp"
+#include "common/assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include <map>
+#include <set>
+
+
+namespace path_extend {
+using debruijn_graph::EdgeId;
+using debruijn_graph::Graph;
+
+//De Bruijn graph edge condition interface
+class LengthLowerBound : public omnigraph::EdgeCondition<Graph> {
+ typedef Graph::EdgeId EdgeId;
+ typedef EdgeCondition<Graph> base;
+
+ const size_t max_length_;
+
+public:
+
+ LengthLowerBound(const Graph &g, size_t max_length)
+ : base(g),
+ max_length_(max_length) {
+ }
+
+ bool Check(EdgeId e) const {
+ return this->g().length(e) >= max_length_;
+ }
+};
+
+/* Connection conditions are used by both the scaffolder's extension chooser and the scaffold graph */
+
+class ConnectionCondition {
+protected:
+ DECL_LOGGER("ConnectionCondition")
+
+public:
+// Outputs the edges e is connected with.
+//TODO performance issue: think about inside filtering. Return only unique connected edges?
+ virtual map<EdgeId, double> ConnectedWith(EdgeId e) const = 0;
+ virtual map<EdgeId, double> ConnectedWith(EdgeId e, const ScaffoldingUniqueEdgeStorage& storage) const;
+ virtual int GetMedianGap(EdgeId e1, EdgeId e2) const = 0;
+ virtual size_t GetLibIndex() const = 0;
+ virtual ~ConnectionCondition() {
+ }
+};
+
+// Main (mate pair library) connection condition.
+class PairedLibConnectionCondition : public ConnectionCondition {
+protected:
+ const Graph &graph_;
+ shared_ptr <PairedInfoLibrary> lib_;
+ size_t lib_index_;
+//Minimal number of mate pairs to call connection sound
+ size_t min_read_count_;
+public:
+//Only paired info with a gap between e1 and e2 in the range [-left_dist_delta_, right_dist_delta_] is taken into account
+ int left_dist_delta_;
+ int right_dist_delta_;
+
+ PairedLibConnectionCondition(const Graph &graph,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t min_read_count);
+ size_t GetLibIndex() const override;
+ map <EdgeId, double> ConnectedWith(EdgeId e) const override;
+ double GetWeight(EdgeId e1, EdgeId e2) const;
+//Returns median gap size
+ int GetMedianGap (EdgeId e1, EdgeId e2) const override;
+};
+
+class LongReadsLibConnectionCondition : public ConnectionCondition {
+protected:
+ const Graph &graph_;
+ size_t lib_index_;
+//Minimal number of reads to call connection sound
+ size_t min_read_count_;
+ const GraphCoverageMap& cov_map_;
+
+ bool CheckPath(BidirectionalPath *path, EdgeId e1, EdgeId e2) const;
+
+public:
+//Only paired info with a gap between e1 and e2 in the range [-left_dist_delta_, right_dist_delta_] is taken into account
+
+ LongReadsLibConnectionCondition(const Graph &graph,
+ size_t lib_index,
+ size_t min_read_count, const GraphCoverageMap& cov_map);
+ size_t GetLibIndex() const override;
+ map<EdgeId, double> ConnectedWith(EdgeId e) const override;
+ map<EdgeId, double> ConnectedWith(EdgeId e, const ScaffoldingUniqueEdgeStorage& storage) const override;
+// Returns median gap size
+ int GetMedianGap (EdgeId e1, EdgeId e2) const override;
+
+};
+
+
+
+//Should this be removed now that ConnectedWith using the unique storage has been introduced?
+class ScaffoldGraphPairedConnectionCondition: public PairedLibConnectionCondition {
+protected:
+ const set<EdgeId>& graph_edges_;
+
+ size_t always_add_;
+ size_t never_add_;
+ double relative_threshold_;
+
+public:
+ ScaffoldGraphPairedConnectionCondition(const Graph &graph,
+ const set<EdgeId>& graph_edges,
+ shared_ptr <PairedInfoLibrary> lib,
+ size_t lib_index,
+ size_t always_add,
+ size_t never_add,
+ double relative_threshold);
+
+ map<EdgeId, double> ConnectedWith(EdgeId e) const override;
+
+};
+
+/* Condition used to find edges connected in the assembly graph itself. */
+class AssemblyGraphConnectionCondition : public ConnectionCondition {
+protected:
+ const Graph &g_;
+//Maximal gap to the connection.
+ size_t max_connection_length_;
+ set<EdgeId> interesting_edge_set_;
+ mutable map<EdgeId, map<EdgeId, double>> stored_distances_;
+public:
+ AssemblyGraphConnectionCondition(const Graph &g, size_t max_connection_length,
+ const ScaffoldingUniqueEdgeStorage& unique_edges);
+ void AddInterestingEdges(func::TypedPredicate<typename Graph::EdgeId> edge_condition);
+ map<EdgeId, double> ConnectedWith(EdgeId e) const override;
+ size_t GetLibIndex() const override;
+ int GetMedianGap(EdgeId, EdgeId ) const override;
+};
+}
diff --git a/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.cpp b/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.cpp
new file mode 100644
index 0000000..0267d68
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.cpp
@@ -0,0 +1,93 @@
+//
+// Created by lab42 on 8/26/15.
+//
+
+#include "extension_chooser2015.hpp"
+
+namespace path_extend {
+using namespace std;
+
+std::pair<EdgeId, int> ExtensionChooser2015::FindLastUniqueInPath(const BidirectionalPath& path) const {
+ for (int i = (int)path.Size() - 1; i >= 0; --i) {
+ if (unique_edges_.IsUnique(path.At(i))) {
+ return std::make_pair(path.At(i), i);
+ }
+ }
+ return std::make_pair(EdgeId(0), -1);
+}
+
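+//Collect unique candidates connected to 'from' by the library condition, drop those below the
+//absolute weight threshold, optionally boost candidates that are also connected in the assembly
+//graph, and keep only candidates whose weight is within the relative threshold of the previous
+//(stronger) candidate.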
+ExtensionChooser::EdgeContainer ExtensionChooser2015::FindNextUniqueEdge(const EdgeId from) const {
+ VERIFY(unique_edges_.IsUnique(from));
+ EdgeContainer result;
+ map<EdgeId, double> candidate_edges = lib_connection_condition_->ConnectedWith(from, unique_edges_);
+ DEBUG(candidate_edges.size() << " candidate edges");
+ vector<pair<double, pair<EdgeId, int >>> to_sort;
+ for (const auto& pair: candidate_edges) {
+ EdgeId e = pair.first;
+ double sum = pair.second;
+ DEBUG("edge " << g_.int_id(e) << " weight " << sum);
+ if (sum < absolute_weight_threshold_) {
+ DEBUG("Edge " << g_.int_id(e) << " weight " << sum << " failed absolute weight threshold " << absolute_weight_threshold_);
+ continue;
+ }
+ int gap = lib_connection_condition_->GetMedianGap(from, e);
+
+ if (use_graph_connectivity_) {
+ auto connected_with = graph_connection_condition_.ConnectedWith(from);
+ if (connected_with.find(e) != connected_with.end()) {
+ sum *= graph_connection_bonus_;
+ }
+ }
+ to_sort.push_back(make_pair(sum, make_pair(e, gap)));
+ }
+//descending order, reverse iterators;
+ sort(to_sort.rbegin(), to_sort.rend());
+ for(size_t j = 0; j < to_sort.size(); j++) {
+ if (j == 0 || to_sort[j].first* relative_weight_threshold_ > to_sort[j - 1].first) {
+ result.push_back(EdgeWithDistance(to_sort[j].second.first, to_sort[j].second.second));
+ DEBUG("Edge " << g_.int_id(to_sort[j].second.first) << " gap " << to_sort[j].second.second << " weight "<< to_sort[j].first << " passed absolute weight threshold " << absolute_weight_threshold_);
+ } else {
+ DEBUG ("Edge " << g_.int_id(to_sort[j].second.first) << " weight " << to_sort[j].first << " failed relative weight threshold " << relative_weight_threshold_);
+ DEBUG("other removed");
+ break;
+ }
+ }
+ return result;
+}
+void ExtensionChooser2015::InsertAdditionalGaps(ExtensionChooser::EdgeContainer& result) const{
+ for (size_t i = 0; i< result.size(); i++) {
+//At least 10*"N" when scaffolding
+ if (result[i].d_ < MIN_N_QUANTITY + int(g_.k())) {
+ result[i].d_ = MIN_N_QUANTITY + int(g_.k());
+ }
+ }
+}
+
+ExtensionChooser::EdgeContainer ExtensionChooser2015::Filter(const BidirectionalPath& path, const ExtensionChooser::EdgeContainer& /*edges*/) const {
+ DEBUG("filtering started");
+ pair<EdgeId, int> last_unique = FindLastUniqueInPath(path);
+ DEBUG ("last unique found");
+ EdgeContainer result;
+ DEBUG(g_.int_id(last_unique.first)<< " " << last_unique.second << " " << path.Size());
+ if (last_unique.second < 0) {
+// No unique edge found
+ return result;
+ }
+
+ result = FindNextUniqueEdge(last_unique.first);
+ DEBUG("next unique edges found, there are " << result.size() << " of them");
+//Backward check: we connect edges iff each is the best continuation of the other.
+ if (result.size() == 1) {
+        //Reduce the gap size by the length of the edges that follow the last unique edge.
+ result[0].d_ -= int (path.LengthAt(last_unique.second) - g_.length(last_unique.first));
+ DEBUG("For edge " << g_.int_id(last_unique.first) << " unique next edge "<< result[0].e_ <<" found, doing backwards check ");
+ EdgeContainer backwards_check = FindNextUniqueEdge(g_.conjugate(result[0].e_));
+ if ((backwards_check.size() != 1) || (g_.conjugate(backwards_check[0].e_) != last_unique.first)) {
+ result.clear();
+ }
+ }
+ InsertAdditionalGaps(result);
+ return result;
+}
+
+}
diff --git a/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.hpp b/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.hpp
new file mode 100644
index 0000000..18b5721
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/extension_chooser2015.hpp
@@ -0,0 +1,65 @@
+//
+// Created by lab42 on 8/26/15.
+//
+#pragma once
+
+#include "modules/path_extend/extension_chooser.hpp"
+#include "connection_condition2015.hpp"
+#include "modules/genome_consistance_checker.hpp"
+#include "utils/logger/logger.hpp"
+#include <map>
+#include <set>
+namespace path_extend {
+
+class ExtensionChooser2015: public ScaffoldingExtensionChooser {
+ static const int MIN_N_QUANTITY = 10;
+ shared_ptr<ConnectionCondition> lib_connection_condition_;
+ const ScaffoldingUniqueEdgeStorage& unique_edges_;
+
+    // For possible connections e1 and e2: if weight(e1) > relative_weight_threshold_ * weight(e2), then e2 is ignored
+ double relative_weight_threshold_;
+ AssemblyGraphConnectionCondition graph_connection_condition_;
+    // connections with weight < absolute_weight_threshold_ are ignored
+ size_t absolute_weight_threshold_;
+    // multiplier for pairs that are also connected in the assembly graph.
+ double graph_connection_bonus_;
+ bool use_graph_connectivity_;
+
+    //If the path contains no unique edges, returns -1
+ pair<EdgeId, int> FindLastUniqueInPath(const BidirectionalPath& path) const;
+ //Find all possible next unique edges confirmed with mate-pair information. (absolute/relative)_weight_threshold_ used for filtering
+ EdgeContainer FindNextUniqueEdge(const EdgeId from) const;
+public:
+ ExtensionChooser2015(const Graph& g,
+ shared_ptr<WeightCounter> wc,
+ shared_ptr<ConnectionCondition> condition,
+ const ScaffoldingUniqueEdgeStorage& unique_edges,
+ double cl_weight_threshold,
+ double is_scatter_coeff,
+ double relative_threshold,
+ bool use_graph_connectivity = true):
+            //TODO: constants are subject to reconsideration
+ ScaffoldingExtensionChooser(g, wc, cl_weight_threshold, is_scatter_coeff),
+ lib_connection_condition_(condition),
+ unique_edges_(unique_edges),
+ relative_weight_threshold_(relative_threshold),
+ graph_connection_condition_(g, 2 * unique_edges_.GetMinLength(), unique_edges),
+ //TODO to config!
+ absolute_weight_threshold_(2),
+ graph_connection_bonus_(2),
+ use_graph_connectivity_(use_graph_connectivity) {
+ INFO("ExtensionChooser2015 created");
+ }
+
+    /* @param edges is not actually used and is kept for compatibility
+     * @returns the possible next edge if there is a unique one, otherwise an empty container
+ */
+ EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer&) const override;
+ void InsertAdditionalGaps(ExtensionChooser::EdgeContainer& result) const;
+
+private:
+ DECL_LOGGER("ExtensionChooser2015");
+};
+
+
+}
diff --git a/src/common/modules/path_extend/scaffolder2015/path_polisher.cpp b/src/common/modules/path_extend/scaffolder2015/path_polisher.cpp
new file mode 100644
index 0000000..e570749
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/path_polisher.cpp
@@ -0,0 +1,326 @@
+
+#include "path_polisher.hpp"
+
+namespace path_extend {
+
+void PathPolisher::InfoAboutGaps(const PathContainer & result){
+ for (const auto& p_iter: result) {
+ for (size_t i = 1; i < p_iter.first->Size(); ++i) {
+ if (p_iter.first->GapAt(i) > 0) {
+ DEBUG("Gap "<< p_iter.first->GapAt(i) << " left between " << gp_.g.int_id(p_iter.first->At(i-1)) << " and " << gp_.g.int_id(p_iter.first->At(i)));
+ }
+ }
+ }
+}
+
+PathPolisher::PathPolisher(const conj_graph_pack& gp, const config::dataset& dataset_info, const ScaffoldingUniqueEdgeStorage& storage, size_t max_resolvable_len ): gp_(gp) {
+ gap_closers.push_back(make_shared<DijkstraGapCloser>(gp.g, max_resolvable_len));
+ for (size_t i = 0; i < dataset_info.reads.lib_count(); i++) {
+ auto lib = dataset_info.reads[i];
+ if (lib.type() == io::LibraryType::HQMatePairs || lib.type() == io::LibraryType::MatePairs) {
+ shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(gp.g, lib, gp.paired_indices[i]);
+ gap_closers.push_back(make_shared<MatePairGapCloser> (gp.g, max_resolvable_len, paired_lib, storage));
+ }
+ }
+}
+
+void PathPolisher::PolishPaths(const PathContainer &paths, PathContainer &result) {
+ result.clear();
+
+ for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
+
+ BidirectionalPath *path = new BidirectionalPath(Polish(*iter.get()));
+ BidirectionalPath *conjugatePath = new BidirectionalPath(Polish(path->Conjugate()));
+ BidirectionalPath *re_path = new BidirectionalPath(conjugatePath->Conjugate());
+ result.AddPair(re_path, conjugatePath);
+ }
+ InfoAboutGaps(result);
+}
+
+size_t DijkstraGapCloser::MinPathLength(const omnigraph::PathStorageCallback<Graph>& path_storage) const {
+ size_t shortest_len = omnigraph::CumulativeLength(g_, path_storage.paths().front());
+ for (size_t j = 1; j < path_storage.paths().size(); ++j) {
+ size_t cur_len = omnigraph::CumulativeLength(g_, path_storage.paths()[j]);
+ shortest_len = min(shortest_len, cur_len);
+ }
+ return shortest_len;
+}
+
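+//Apply every registered gap closer in turn, restarting whenever one of them changes the path;
+//give up after 5 rounds to avoid cycling.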
+BidirectionalPath PathPolisher::Polish(const BidirectionalPath &path) {
+ if (path.Empty())
+ return path;
+ shared_ptr<BidirectionalPath> current;
+ shared_ptr<BidirectionalPath> prev_step = std::make_shared<BidirectionalPath>(path);
+ bool changed = true;
+ size_t count = 0;
+ while (changed) {
+ changed = false;
+ for (size_t i = 0; i < gap_closers.size(); i++) {
+ current = std::make_shared<BidirectionalPath>(gap_closers[i]->Polish(*prev_step));
+ if (current->Size() != prev_step->Size()){
+ changed = true;
+ std::swap(current, prev_step);
+ break;
+ }
+ }
+ count++;
+ if (count > 5) {
+ INFO("Unexpected cycle while polishing path, stopping polishing " );
+ path.Print();
+ break;
+ }
+ }
+ return *prev_step;
+}
+
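+//For every gap in the path, enumerate all assembly-graph paths of length up to max_path_len_
+//between the flanking edges: a single path closes the gap exactly, several paths contribute
+//their common prefix or a shared "bridge" edge, and no path leaves the gap untouched.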
+BidirectionalPath DijkstraGapCloser::Polish(const BidirectionalPath &path) {
+ BidirectionalPath result(g_);
+ if (path.Empty())
+ return result;
+ result.PushBack(path[0], path.GapAt(0));
+ for (size_t i = 1; i < path.Size(); ++i) {
+ if (g_.EdgeEnd(path[i - 1]) == g_.EdgeStart(path[i])) {
+ result.PushBack(path[i], path.GapAt(i));
+ } else {
+ //Connect edges using Dijkstra
+ omnigraph::PathStorageCallback<Graph> path_storage(g_);
+ omnigraph::ProcessPaths(g_, 0,
+ max_path_len_,
+ g_.EdgeEnd(path[i - 1]),
+ g_.EdgeStart(path[i]),
+ path_storage);
+
+ if (path_storage.size() == 0) {
+ //No paths found, keeping the gap
+ result.PushBack(path[i], path.GapAt(i));
+ } else if (path_storage.size() > 1) {
+ //More than one path, using shortest path for gap length estimation
+ //We cannot use both common paths and bridges in one attempt;
+ if (!FillWithMultiplePaths(path, i, path_storage, result))
+ FillWithBridge(path, i, path_storage, result);
+ } else {
+ //Closing the gap with the unique shortest path
+ for (size_t j = 0; j < path_storage.paths().front().size(); ++j) {
+ result.PushBack(path_storage.paths().front()[j]);
+ }
+ result.PushBack(path[i]);
+ }
+ }
+ }
+ return result;
+}
+
+
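+//A bridge is an edge of length >= 300 shared by every alternative path between the flanks;
+//if one exists, insert the longest such bridge and bound the gaps before and after it by the
+//minimal flank-to-bridge lengths observed across the alternative paths.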
+bool DijkstraGapCloser::FillWithBridge(const BidirectionalPath& path, size_t index,
+ const omnigraph::PathStorageCallback<Graph>& path_storage,
+ BidirectionalPath& result) const {
+//TODO:: constant;
+ auto counts = CountEdgesQuantity(path_storage, 300);
+ size_t path_quantity = path_storage.paths().size();
+ vector<EdgeId> bridges;
+ for (const auto& pair: counts)
+ if (pair.second == path_quantity)
+ bridges.push_back(pair.first);
+ if (bridges.size() > 0) {
+ std::sort(bridges.begin(), bridges.end(), [&] (EdgeId e1, EdgeId e2) {
+ return g_.length(e1) > g_.length(e2); });
+ EdgeId bridge = bridges[0];
+ int min_gap_before = path.GapAt(index);
+ int min_gap_after = path.GapAt(index);
+ for (const auto& path:path_storage.paths()) {
+ int current_before = 0;
+ for(size_t i = 0; i< path.size(); i++) {
+ if (path[i] != bridge)
+ current_before += (int)g_.length(path[i]);
+ else
+ break;
+ }
+ int current_after = (int)CumulativeLength(g_, path) - current_before - int(g_.length(bridge));
+ min_gap_after = std::min(current_after, min_gap_after);
+ min_gap_before = std::min(current_before, min_gap_before);
+ }
+ min_gap_after = std::max(min_gap_after, min_gap_);
+ min_gap_before = std::max(min_gap_before, min_gap_);
+ result.PushBack(bridge, min_gap_before);
+ result.PushBack(path[index], min_gap_after);
+ return true;
+ } else {
+ result.PushBack(path[index], path.GapAt(index));
+ return false;
+ }
+}
+
+bool DijkstraGapCloser::FillWithMultiplePaths(const BidirectionalPath& path, size_t index,
+ const omnigraph::PathStorageCallback<Graph>& path_storage,
+ BidirectionalPath& result) const {
+ bool changed = false;
+ auto left = LCP(path_storage);
+ for (auto e : left) {
+ result.PushBack(e);
+ changed = true;
+ }
+ int middle_gap = (int) max(size_t(min_gap_), MinPathLength(path_storage) -
+ omnigraph::CumulativeLength(g_, left));
+ if (changed)
+ result.PushBack(path[index], middle_gap);
+ return changed;
+}
+
+std::map<EdgeId, size_t> DijkstraGapCloser::CountEdgesQuantity(const omnigraph::PathStorageCallback<Graph>& path_storage, size_t length_limit ) const{
+ map<EdgeId, size_t> res;
+ for (const auto& path: path_storage.paths()) {
+ set<EdgeId> edge_set(path.begin(), path.end());
+ for (const auto& e: edge_set) {
+ if (g_.length(e) >= length_limit) {
+ res[e] += 1;
+ }
+ }
+ }
+ return res;
+};
+
+size_t DijkstraGapCloser::MinPathSize(const omnigraph::PathStorageCallback<Graph>& path_storage) const {
+ size_t size = path_storage.paths().front().size();
+ for (size_t i = 1; i < path_storage.size(); ++i) {
+ size = min(size, path_storage.paths()[i].size());
+ }
+ return size;
+}
+
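+//Longest common prefix of all alternative paths: edges shared by every path up to the first divergence.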
+vector<EdgeId> DijkstraGapCloser::LCP(const omnigraph::PathStorageCallback<Graph>& path_storage) const {
+ bool all_equal = true;
+ size_t index = 0;
+ size_t min_size = MinPathSize(path_storage);
+
+ while (index < min_size && all_equal) {
+ for (size_t i = 1; i < path_storage.size(); ++i) {
+ auto e = path_storage.paths().front()[index];
+ if (e != path_storage.paths()[i][index]) {
+ all_equal = false;
+ break;
+ }
+ }
+ if (all_equal)
+ ++index;
+ }
+
+ vector<EdgeId> result;
+ for (size_t i = 0; i < index; ++i) {
+ result.push_back(path_storage.paths().front()[i]);
+ }
+ return result;
+}
+
+
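+//Choose the next edge of the gap-filling walk from vertex v: among successors of v that occur in
+//some connecting path, pick the single candidate, or, if there are several, the one best supported
+//by mate-pair distances from the last unique edge of the path (subject to weight_priority dominance).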
+EdgeId MatePairGapCloser::FindNext(const BidirectionalPath& path, size_t index,
+ const set<EdgeId>& present_in_paths, VertexId v) const {
+ auto next_edges = g_.OutgoingEdges(v);
+ map<EdgeId, double> candidates;
+ for (const auto edge: next_edges)
+ if (present_in_paths.find(edge) != present_in_paths.end())
+ candidates.insert(make_pair(edge, 0));
+ if (candidates.size() <= 1 ) {
+ if (candidates.size() == 0 || candidates.begin()->first == path[index])
+ return EdgeId(0);
+ else
+ return (candidates.begin()->first);
+ } else {
+ int i = (int) index - 1;
+ for (; i >= 0; i--) {
+ if (storage_.IsUnique(path[i]))
+ break;
+ }
+ if (i < 0) {
+ return EdgeId(0);
+ } else {
+ EdgeId last_unique = path[i];
+ for (auto &pair: candidates){
+ vector<int> d;
+ vector<double> w;
+//TODO:: any filtration?
+ lib_->CountDistances(last_unique, pair.first, d, w);
+ double sum = 0;
+ for (auto weight: w)
+ sum += weight;
+ pair.second = sum / double(g_.length(pair.first));
+ }
+ vector<std::pair<EdgeId, double>> to_sort(candidates.begin(),candidates.end());
+ sort(to_sort.begin(), to_sort.end(), [&] (std::pair<EdgeId, double> a, std::pair<EdgeId, double> b ) {
+ return a.second > b.second;
+ });
+ if (to_sort[0].second > to_sort[1].second * weight_priority && to_sort[0].first != path[index])
+ return to_sort[0].first;
+ else
+ return EdgeId(0);
+ }
+ }
+}
+
+//TODO: make shorter functions
+BidirectionalPath MatePairGapCloser::Polish(const BidirectionalPath& path) {
+ BidirectionalPath result(g_);
+ DEBUG("Path " << path.GetId() << " len "<< path.Length() << " size " << path.Size());
+ result.PushBack(path[0], path.GapAt(0));
+ for (size_t i = 1; i < path.Size(); ++i) {
+ if (g_.EdgeEnd(path[i - 1]) == g_.EdgeStart(path[i]) || path.GapAt(i) <= min_gap_) {
+ result.PushBack(path[i], path.GapAt(i));
+ } else {
+ DEBUG("position "<< i <<" gap between edges " << g_.int_id(path[i-1]) << " and " << g_.int_id(path[i]) << " was " << path.GapAt(i));
+
+ vector<EdgeId> addition;
+ VertexId v = g_.EdgeEnd(path[i - 1]);
+ EdgeId last = path[i - 1];
+ omnigraph::PathStorageCallback<Graph> path_storage(g_);
+ omnigraph::ProcessPaths(g_, 0,
+ max_path_len_,
+ g_.EdgeEnd(path[i - 1]),
+ g_.EdgeStart(path[i]),
+ path_storage);
+ set<EdgeId> present_in_paths;
+ for(const auto &p: path_storage.paths())
+ for(size_t j = 0; j < p.size(); j ++)
+ present_in_paths.insert(p[j]);
+ size_t total = 0;
+ while (last != EdgeId(0)){
+ last = FindNext(path, i, present_in_paths, v);
+ if (last != EdgeId(0)){
+ v = g_.EdgeEnd(last);
+ addition.push_back(last);
+ total += g_.length(last);
+ }
+ if (total > max_path_len_){
+ DEBUG("gap between edges " << g_.int_id(path[i-1]) << " and " << g_.int_id(path[i]) << " was: " << path.GapAt(i) << ", closing path length too long: " << total);
+ break;
+ }
+ }
+ if (total > max_path_len_) {
+ result.PushBack(path[i], path.GapAt(i));
+ continue;
+ }
+ int len = int(CumulativeLength(g_, addition));
+ int new_gap = path.GapAt(i) - len;
+ if (new_gap < min_gap_ && addition.size() > 0) {
+ if (path.GapAt(i) * 3 < len * 2 ) {
+//inserted path significantly longer than estimated gap
+ DEBUG("Gap size estimation problem: gap between edges " << g_.int_id(path[i - 1]) << " and " << g_.int_id(path[i]) << " was " <<
+ path.GapAt(i) << "filled len" << len);
+ }
+ if (g_.EdgeEnd(addition.back()) != g_.EdgeStart(path[i]))
+ new_gap = min_gap_;
+ else
+ new_gap = 0;
+ }
+ DEBUG("filling");
+ for (size_t j = 0; j < addition.size(); j++) {
+ DEBUG(g_.int_id(addition[j]));
+ result.PushBack(addition[j], 0);
+ }
+ result.PushBack(path[i], new_gap);
+ DEBUG("filled");
+ }
+ }
+ DEBUG("result " << result.GetId() << " len "<< result.Length() << " size " << result.Size());
+ return result;
+}
+
+}
diff --git a/src/common/modules/path_extend/scaffolder2015/path_polisher.hpp b/src/common/modules/path_extend/scaffolder2015/path_polisher.hpp
new file mode 100644
index 0000000..c13ddcb
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/path_polisher.hpp
@@ -0,0 +1,85 @@
+#pragma once
+
+#include "assembly_graph/paths/path_processor.hpp"
+#include "assembly_graph/paths/path_utils.hpp"
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
+#include "modules/path_extend/paired_library.hpp"
+#include "assembly_graph/graph_support/scaff_supplementary.hpp"
+#include "common/pipeline/graph_pack.hpp"
+
+namespace path_extend {
+
+class PathGapCloser {
+protected:
+ const Graph& g_;
+ size_t max_path_len_;
+ int min_gap_;
+public:
+ virtual BidirectionalPath Polish(const BidirectionalPath& path) = 0;
+//TODO:: config
+ PathGapCloser(const Graph& g, size_t max_path_len): g_(g), max_path_len_(max_path_len), min_gap_(int(g.k() + 10)) {}
+
+};
+
+class MatePairGapCloser: public PathGapCloser {
+ const shared_ptr<PairedInfoLibrary> lib_;
+ const ScaffoldingUniqueEdgeStorage& storage_;
+
+//TODO: config? somewhere else?
+ static constexpr double weight_priority = 5;
+public:
+ EdgeId FindNext(const BidirectionalPath& path, size_t index,
+ const set<EdgeId>& present_in_paths, VertexId v) const;
+ MatePairGapCloser(const Graph& g, size_t max_path_len, const shared_ptr<PairedInfoLibrary> lib, const ScaffoldingUniqueEdgeStorage& storage):
+ PathGapCloser(g, max_path_len), lib_(lib), storage_(storage) {}
+ BidirectionalPath Polish(const BidirectionalPath& path) override;
+};
+
+class DijkstraGapCloser: public PathGapCloser {
+
+protected:
+
+ BidirectionalPath Polish(const BidirectionalPath& path) override;
+
+ size_t MinPathLength(const omnigraph::PathStorageCallback<Graph>& path_storage) const;
+
+ bool FillWithMultiplePaths(const BidirectionalPath& path, size_t index,
+ const omnigraph::PathStorageCallback<Graph>& path_storage,
+ BidirectionalPath& result) const;
+
+ bool FillWithBridge(const BidirectionalPath& path, size_t index,
+ const omnigraph::PathStorageCallback<Graph>& path_storage,
+ BidirectionalPath& result) const;
+
+ size_t MinPathSize(const omnigraph::PathStorageCallback<Graph>& path_storage) const;
+
+ vector<EdgeId> LCP(const omnigraph::PathStorageCallback<Graph>& path_storage) const;
+
+ std::map<EdgeId, size_t> CountEdgesQuantity(const omnigraph::PathStorageCallback<Graph>& path_storage, size_t length_limit) const;
+
+public:
+ DijkstraGapCloser(const Graph& g, size_t max_path_len):
+ PathGapCloser(g, max_path_len) {}
+
+
+};
+
+class PathPolisher {
+
+private:
+ const conj_graph_pack& gp_;
+ vector<shared_ptr<PathGapCloser>> gap_closers;
+
+private:
+ void InfoAboutGaps(const PathContainer & result);
+ BidirectionalPath Polish(const BidirectionalPath& path);
+
+public:
+ PathPolisher(const conj_graph_pack& gp, const config::dataset& dataset_info, const ScaffoldingUniqueEdgeStorage& storage, size_t max_resolvable_len);
+
+ void PolishPaths(const PathContainer& paths, PathContainer& result);
+};
+
+
+}
diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.cpp
new file mode 100644
index 0000000..f4a6417
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.cpp
@@ -0,0 +1,258 @@
+#include "scaffold_graph.hpp"
+
+
+namespace path_extend {
+namespace scaffold_graph {
+
+std::atomic<ScaffoldGraph::ScaffoldEdgeIdT> ScaffoldGraph::ScaffoldEdge::scaffold_edge_id_{0};
+
+
+void ScaffoldGraph::AddEdgeSimple(const ScaffoldGraph::ScaffoldEdge &e) {
+ edges_.emplace(e.getId(), e);
+ outgoing_edges_.emplace(e.getStart(), e.getId());
+ incoming_edges_.emplace(e.getEnd(), e.getId());
+}
+
+void ScaffoldGraph::DeleteOutgoing(const ScaffoldGraph::ScaffoldEdge &e) {
+ auto e_range = outgoing_edges_.equal_range(e.getStart());
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ if (edges_.at(edge_id->second) == e) {
+ outgoing_edges_.erase(edge_id);
+ }
+ }
+}
+
+void ScaffoldGraph::DeleteIncoming(const ScaffoldGraph::ScaffoldEdge &e) {
+ auto e_range = incoming_edges_.equal_range(e.getEnd());
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ if (edges_.at(edge_id->second) == e) {
+ incoming_edges_.erase(edge_id);
+ }
+ }
+}
+
+void ScaffoldGraph::DeleteAllOutgoingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
+ auto e_range = outgoing_edges_.equal_range(v);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ DeleteIncoming(edges_.at(edge_id->second));
+ }
+ outgoing_edges_.erase(v);
+}
+
+void ScaffoldGraph::DeleteEdgeFromStorage(const ScaffoldGraph::ScaffoldEdge &e) {
+ VERIFY(!Exists(e));
+ edges_.erase(e.getId());
+}
+
+void ScaffoldGraph::DeleteAllIncomingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
+ auto e_range = incoming_edges_.equal_range(v);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ DeleteOutgoing(edges_.at(edge_id->second));
+ }
+ incoming_edges_.erase(v);
+}
+
+bool ScaffoldGraph::Exists(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return vertices_.count(assembly_graph_edge) != 0;
+}
+
+bool ScaffoldGraph::Exists(const ScaffoldGraph::ScaffoldEdge &e) const {
+ auto e_range = outgoing_edges_.equal_range(e.getStart());
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ if (edges_.at(edge_id->second) == e) {
+ return true;
+ }
+ }
+ return false;
+}
+
+ScaffoldGraph::ScaffoldVertex ScaffoldGraph::conjugate(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return assembly_graph_.conjugate(assembly_graph_edge);
+}
+
+ScaffoldGraph::ScaffoldEdge ScaffoldGraph::conjugate(const ScaffoldGraph::ScaffoldEdge &e) const {
+ return ScaffoldEdge(conjugate(e.getEnd()), conjugate(e.getStart()), e.getColor(), e.getWeight());
+}
+
+bool ScaffoldGraph::AddVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
+ if (!Exists(assembly_graph_edge)) {
+ VERIFY(!Exists(conjugate(assembly_graph_edge)));
+ vertices_.insert(assembly_graph_edge);
+ vertices_.insert(conjugate(assembly_graph_edge));
+ return true;
+ }
+ return false;
+}
+
+void ScaffoldGraph::AddVertices(const set<ScaffoldGraph::ScaffoldVertex> &vertices) {
+ for (auto v : vertices) {
+ AddVertex(v);
+ }
+}
+
+bool ScaffoldGraph::AddEdge(ScaffoldGraph::ScaffoldVertex v1, ScaffoldGraph::ScaffoldVertex v2, size_t lib_id, double weight) {
+ VERIFY(Exists(v1));
+ VERIFY(Exists(v2));
+
+ ScaffoldEdge e(v1, v2, lib_id, weight);
+ if (Exists(e)) {
+ return false;
+ }
+
+
+ AddEdgeSimple(e);
+ return true;
+}
+
+void ScaffoldGraph::Print(ostream &os) const {
+ for (auto v: vertices_) {
+ os << "Vertex " << int_id(v) << " ~ " << int_id(conjugate(v))
+ << ": len = " << assembly_graph_.length(v) << ", cov = " << assembly_graph_.coverage(v) << endl;
+ }
+ for (auto e_iter = edges_.begin(); e_iter != edges_.end(); ++e_iter) {
+ os << "Edge " << e_iter->second.getId() <<
+ ": " << int_id(e_iter->second.getStart()) << " -> " << int_id(e_iter->second.getEnd()) <<
+ ", lib index = " << e_iter->second.getColor() << ", weight " << e_iter->second.getWeight() << endl;
+ }
+}
+
+ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ VERIFY(HasUniqueIncoming(assembly_graph_edge));
+ return edges_.at(incoming_edges_.find(assembly_graph_edge)->second);
+}
+
+ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ VERIFY(HasUniqueOutgoing(assembly_graph_edge));
+ return edges_.at(outgoing_edges_.find(assembly_graph_edge)->second);
+}
+
+bool ScaffoldGraph::HasUniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return IncomingEdgeCount(assembly_graph_edge) == 1;
+}
+
+bool ScaffoldGraph::HasUniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return OutgoingEdgeCount(assembly_graph_edge) == 1;
+}
+
+size_t ScaffoldGraph::IncomingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return incoming_edges_.count(assembly_graph_edge);
+}
+
+size_t ScaffoldGraph::OutgoingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ return outgoing_edges_.count(assembly_graph_edge);
+}
+
+vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::IncomingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ vector<ScaffoldEdge> result;
+ auto e_range = incoming_edges_.equal_range(assembly_graph_edge);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ result.push_back(edges_.at(edge_id->second));
+ }
+ return result;
+}
+
+vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::OutgoingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ vector<ScaffoldEdge> result;
+ auto e_range = outgoing_edges_.equal_range(assembly_graph_edge);
+ for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
+ result.push_back(edges_.at(edge_id->second));
+ }
+ return result;
+}
+
+const debruijn_graph::Graph &ScaffoldGraph::AssemblyGraph() const {
+ return assembly_graph_;
+}
+
+size_t ScaffoldGraph::EdgeCount() const {
+ return edges_.size();
+}
+
+size_t ScaffoldGraph::VertexCount() const {
+ return vertices_.size();
+}
+
+ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeEnd(ScaffoldEdge e) const {
+ return e.getEnd();
+}
+
+ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeStart(ScaffoldEdge e) const {
+ return e.getStart();
+}
+
+size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldEdge e) const {
+ return e.getId();
+}
+
+size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldVertex v) const {
+ return assembly_graph_.int_id(v);
+}
+
+ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::eend() const {
+ return ConstScaffoldEdgeIterator(edges_.cend());
+}
+
+ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::ebegin() const {
+ return ConstScaffoldEdgeIterator(edges_.cbegin());
+}
+
+ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vend() const {
+ return vertices_.cend();
+}
+
+ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vbegin() const {
+ return vertices_.cbegin();
+}
+
+adt::iterator_range<ScaffoldGraph::VertexStorage::const_iterator> ScaffoldGraph::vertices() const {
+ return adt::make_range(vbegin(), vend());
+}
+
+adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> ScaffoldGraph::edges() const {
+ return adt::make_range(ebegin(), eend());
+}
+
+bool ScaffoldGraph::IsVertexIsolated(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
+ bool result = incoming_edges_.count(assembly_graph_edge) == 0 && outgoing_edges_.count(assembly_graph_edge) == 0;
+ return result;
+}
+
+bool ScaffoldGraph::RemoveVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
+ if (Exists(assembly_graph_edge)) {
+ VERIFY(Exists(conjugate(assembly_graph_edge)));
+
+ DeleteAllOutgoingEdgesSimple(assembly_graph_edge);
+ DeleteAllIncomingEdgesSimple(assembly_graph_edge);
+ DeleteAllOutgoingEdgesSimple(conjugate(assembly_graph_edge));
+ DeleteAllIncomingEdgesSimple(conjugate(assembly_graph_edge));
+
+ VERIFY(incoming_edges_.count(assembly_graph_edge) == 0);
+ VERIFY(outgoing_edges_.count(assembly_graph_edge) == 0);
+ VERIFY(incoming_edges_.count(conjugate(assembly_graph_edge)) == 0);
+ VERIFY(outgoing_edges_.count(conjugate(assembly_graph_edge)) == 0);
+
+ vertices_.erase(assembly_graph_edge);
+ vertices_.erase(conjugate(assembly_graph_edge));
+
+ return true;
+ }
+ return false;
+}
+
+bool ScaffoldGraph::RemoveEdge(const ScaffoldGraph::ScaffoldEdge &e) {
+ if (Exists(e)) {
+ DeleteOutgoing(e);
+ DeleteIncoming(e);
+ DeleteEdgeFromStorage(e);
+
+ return true;
+ }
+ return false;
+}
+
+bool ScaffoldGraph::AddEdge(const ScaffoldGraph::ScaffoldEdge &e) {
+ return AddEdge(e.getStart(), e.getEnd(), e.getColor(), e.getWeight());
+}
+
+} //scaffold_graph
+} //path_extend
diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp
new file mode 100644
index 0000000..9ac3fdf
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph.hpp
@@ -0,0 +1,228 @@
+//
+// Created by andrey on 17.09.15.
+//
+#pragma once
+
+#include "utils/logger/logger.hpp"
+#include "assembly_graph/core/graph.hpp"
+#include "modules/path_extend/paired_library.hpp"
+#include "connection_condition2015.hpp"
+
+#include "utils/standard_base.hpp"
+#include "common/adt/iterator_range.hpp"
+
+namespace path_extend {
+namespace scaffold_graph {
+
+//do NOT add "using namespace debruijn_graph" to avoid confusion between the EdgeId typedefs
+
+class ScaffoldGraph {
+
+public:
+ //EdgeId in de Bruijn graph is vertex in scaffolding graph
+ typedef debruijn_graph::EdgeId ScaffoldVertex;
+
+ //Unique edge id
+ typedef size_t ScaffoldEdgeIdT;
+
+    //Scaffold edge information class
+ struct ScaffoldEdge {
+ private:
+ //unique id
+ ScaffoldEdgeIdT id_;
+ //id counter
+ static std::atomic<ScaffoldEdgeIdT> scaffold_edge_id_;
+
+ ScaffoldVertex start_;
+ ScaffoldVertex end_;
+ //color = lib#
+ size_t color_;
+ //read pair weight or anything else
+ double weight_;
+
+ public:
+
+ ScaffoldEdge(ScaffoldVertex start, ScaffoldVertex end, size_t lib_id = (size_t) -1, double weight = 0) :
+ id_(scaffold_edge_id_++),
+ start_(start), end_(end),
+ color_(lib_id),
+ weight_(weight) {
+ }
+
+ ScaffoldEdgeIdT getId() const {
+ return id_;
+ }
+
+
+ size_t getColor() const {
+ return color_;
+ }
+
+ double getWeight() const {
+ return weight_;
+ }
+
+ const ScaffoldVertex getStart() const {
+ return start_;
+ }
+
+ const ScaffoldVertex getEnd() const {
+ return end_;
+ }
+
+ bool operator==(const ScaffoldEdge &e) const {
+ return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
+ }
+
+ bool operator==(const ScaffoldEdge &e) {
+ return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
+ }
+ };
+
+    //typedefs that make it possible to use this class in templated graph visualizers
+ typedef ScaffoldVertex VertexId;
+ typedef ScaffoldEdge EdgeId;
+
+ //All vertices are stored in set
+ typedef std::set<ScaffoldVertex> VertexStorage;
+ //Edges are stored in map: Id -> Edge Information
+ typedef std::unordered_map<ScaffoldEdgeIdT, ScaffoldEdge> EdgeStorage;
+    //Adjacency list contains the vertex and the edge id (instead of the whole edge information)
+ typedef std::unordered_multimap<ScaffoldVertex, ScaffoldEdgeIdT> AdjacencyStorage;
+
+ struct ConstScaffoldEdgeIterator: public boost::iterator_facade<ConstScaffoldEdgeIterator,
+ const ScaffoldEdge,
+ boost::forward_traversal_tag> {
+ private:
+ EdgeStorage::const_iterator iter_;
+
+ public:
+ ConstScaffoldEdgeIterator(EdgeStorage::const_iterator iter) : iter_(iter) {
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ ++iter_;
+ }
+
+ bool equal(const ConstScaffoldEdgeIterator &other) const {
+ return iter_ == other.iter_;
+ }
+
+ const ScaffoldEdge& dereference() const {
+ return iter_->second;
+ }
+ };
+
+//TODO:: fix this. Seems that only ebegin and eend are broken.
+private:
+ EdgeStorage edges_;
+
+ VertexStorage vertices_;
+
+ const debruijn_graph::Graph &assembly_graph_;
+
+ AdjacencyStorage outgoing_edges_;
+
+ AdjacencyStorage incoming_edges_;
+
+ void AddEdgeSimple(const ScaffoldEdge &e);
+
+    //Delete outgoing edge from adjacency list without checks
+ void DeleteOutgoing(const ScaffoldEdge &e);
+
+    //Delete incoming edge from adjacency list without checks
+ void DeleteIncoming(const ScaffoldEdge &e);
+
+ //Delete all edge info from storage
+ void DeleteEdgeFromStorage(const ScaffoldEdge &e);
+
+    //Delete all edges outgoing from v from the adjacency lists
+ void DeleteAllOutgoingEdgesSimple(ScaffoldVertex v);
+
+    //Delete all edges incoming to v from the adjacency lists
+ void DeleteAllIncomingEdgesSimple(ScaffoldVertex v);
+
+public:
+ ScaffoldGraph(const debruijn_graph::Graph &g) : assembly_graph_(g) {
+ }
+
+ bool Exists(ScaffoldVertex assembly_graph_edge) const;
+
+ bool Exists(const ScaffoldEdge &e) const;
+
+ ScaffoldVertex conjugate(ScaffoldVertex assembly_graph_edge) const;
+
+    //Return a structure that is equal to the conjugate of e (not exactly the same structure as stored in the graph)
+ ScaffoldEdge conjugate(const ScaffoldEdge &e) const;
+
+    //Add an isolated vertex to the graph if it does not exist
+ bool AddVertex(ScaffoldVertex assembly_graph_edge);
+
+ void AddVertices(const set<ScaffoldVertex> &vertices);
+
+    //Add an edge (and conjugate) if it does not exist
+ //v1 and v2 must exist
+ bool AddEdge(ScaffoldVertex v1, ScaffoldVertex v2, size_t lib_id, double weight);
+
+ bool AddEdge(const ScaffoldEdge &e);
+
+    //Remove the edge from the edge container and all adjacency lists
+ bool RemoveEdge(const ScaffoldEdge &e);
+
+ //Remove vertex and all adjacent edges
+ bool RemoveVertex(ScaffoldVertex assembly_graph_edge);
+
+ bool IsVertexIsolated(ScaffoldVertex assembly_graph_edge) const;
+
+ VertexStorage::const_iterator vbegin() const;
+
+ VertexStorage::const_iterator vend() const;
+
+ adt::iterator_range<VertexStorage::const_iterator> vertices() const;
+
+ ConstScaffoldEdgeIterator ebegin() const;
+
+ ConstScaffoldEdgeIterator eend() const;
+
+ adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> edges() const;
+
+ size_t int_id(ScaffoldVertex v) const;
+
+ size_t int_id(ScaffoldEdge e) const;
+
+ ScaffoldVertex EdgeStart(ScaffoldEdge e) const;
+
+ ScaffoldVertex EdgeEnd(ScaffoldEdge e) const;
+
+ size_t VertexCount() const;
+
+ size_t EdgeCount() const;
+
+ const debruijn_graph::Graph & AssemblyGraph() const;
+
+ vector<ScaffoldEdge> OutgoingEdges(ScaffoldVertex assembly_graph_edge) const;
+
+ vector<ScaffoldEdge> IncomingEdges(ScaffoldVertex assembly_graph_edge) const;
+
+ size_t OutgoingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
+
+ size_t IncomingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
+
+ bool HasUniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
+
+ bool HasUniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
+
+ ScaffoldEdge UniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
+
+ ScaffoldEdge UniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
+
+ void Print(ostream &os) const;
+
+};
+
+} //scaffold_graph
+} //path_extend
+
diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
new file mode 100644
index 0000000..f05f7e7
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
@@ -0,0 +1,75 @@
+//
+// Created by andrey on 04.12.15.
+//
+
+#include "scaffold_graph_constructor.hpp"
+
+namespace path_extend {
+namespace scaffold_graph {
+
+
+void BaseScaffoldGraphConstructor::ConstructFromEdgeConditions(func::TypedPredicate<typename Graph::EdgeId> edge_condition,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only) {
+ for (auto e = graph_->AssemblyGraph().ConstEdgeBegin(); !e.IsEnd(); ++e) {
+ if (edge_condition(*e)) {
+ graph_->AddVertex(*e);
+ }
+ }
+ ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
+}
+
+void BaseScaffoldGraphConstructor::ConstructFromSet(const set<EdgeId> edge_set,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only) {
+ graph_->AddVertices(edge_set);
+ ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
+}
+
+void BaseScaffoldGraphConstructor::ConstructFromConditions(vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only) {
+//TODO:: awful. It depends on the ordering of the connection conditions.
+ for (auto condition : connection_conditions) {
+ if (condition->GetLibIndex() == (size_t) -1)
+ ConstructFromSingleCondition(condition, true);
+ else
+ ConstructFromSingleCondition(condition, use_terminal_vertices_only);
+ }
+}
+
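+//For each vertex, query the connection condition and add scaffold edges towards existing vertices;
+//with use_terminal_vertices_only, only vertices without outgoing (resp. incoming) scaffold edges
+//are considered as edge starts (resp. ends).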
+void BaseScaffoldGraphConstructor::ConstructFromSingleCondition(const shared_ptr<ConnectionCondition> condition,
+ bool use_terminal_vertices_only) {
+ for (const auto& v : graph_->vertices()) {
+ TRACE("Vertex " << graph_->int_id(v));
+
+ if (use_terminal_vertices_only && graph_->OutgoingEdgeCount(v) > 0)
+ continue;
+
+ auto connected_with = condition->ConnectedWith(v);
+ for (const auto& pair : connected_with) {
+ EdgeId connected = pair.first;
+ double w = pair.second;
+ TRACE("Connected with " << graph_->int_id(connected));
+ if (graph_->Exists(connected)) {
+ if (use_terminal_vertices_only && graph_->IncomingEdgeCount(connected) > 0)
+ continue;
+ graph_->AddEdge(v, connected, condition->GetLibIndex(), w);
+ }
+ }
+ }
+}
+
+
+shared_ptr<ScaffoldGraph> SimpleScaffoldGraphConstructor::Construct() {
+ ConstructFromSet(edge_set_, connection_conditions_);
+ return graph_;
+}
+
+shared_ptr<ScaffoldGraph> DefaultScaffoldGraphConstructor::Construct() {
+ ConstructFromSet(edge_set_, connection_conditions_);
+ ConstructFromEdgeConditions(edge_condition_, connection_conditions_);
+ return graph_;
+}
+
+} //scaffold_graph
+} //path_extend
\ No newline at end of file
diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp
new file mode 100644
index 0000000..fe6c34a
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_constructor.hpp
@@ -0,0 +1,80 @@
+//
+// Created by andrey on 04.12.15.
+//
+
+#pragma once
+
+#include "scaffold_graph.hpp"
+
+
+namespace path_extend {
+namespace scaffold_graph {
+
+
+//Interface
+class ScaffoldGraphConstructor {
+
+public:
+ virtual shared_ptr<ScaffoldGraph> Construct() = 0;
+};
+
+//Basic scaffold graph constructor functions
+class BaseScaffoldGraphConstructor: public ScaffoldGraphConstructor {
+protected:
+ shared_ptr<ScaffoldGraph> graph_;
+
+ BaseScaffoldGraphConstructor(const debruijn_graph::Graph& assembly_graph) {
+ graph_ = make_shared<ScaffoldGraph>(assembly_graph);
+ }
+
+ void ConstructFromSingleCondition(const shared_ptr<ConnectionCondition> condition,
+ bool use_terminal_vertices_only);
+
+ void ConstructFromConditions(vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only = false);
+
+ void ConstructFromSet(const set<EdgeId> edge_set,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only = false);
+
+ void ConstructFromEdgeConditions(func::TypedPredicate<typename Graph::EdgeId> edge_condition,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ bool use_terminal_vertices_only = false);
+};
+
+
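+//Builds the scaffold graph from a fixed set of assembly graph edges and the given connection conditions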
+class SimpleScaffoldGraphConstructor: public BaseScaffoldGraphConstructor {
+protected:
+ const set<EdgeId>& edge_set_;
+ vector<shared_ptr<ConnectionCondition>>& connection_conditions_;
+
+public:
+ SimpleScaffoldGraphConstructor(const debruijn_graph::Graph& assembly_graph,
+ const set<EdgeId>& edge_set,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions):
+ BaseScaffoldGraphConstructor(assembly_graph),
+ edge_set_(edge_set), connection_conditions_(connection_conditions) {}
+
+ shared_ptr<ScaffoldGraph> Construct() override;
+};
+
+class DefaultScaffoldGraphConstructor: public SimpleScaffoldGraphConstructor {
+protected:
+ func::TypedPredicate<typename Graph::EdgeId> edge_condition_;
+
+public:
+ DefaultScaffoldGraphConstructor(const debruijn_graph::Graph& assembly_graph,
+ const set<EdgeId>& edge_set,
+ vector<shared_ptr<ConnectionCondition>> &connection_conditions,
+ func::TypedPredicate<typename Graph::EdgeId> edge_condition):
+ SimpleScaffoldGraphConstructor(assembly_graph, edge_set, connection_conditions),
+ edge_condition_(edge_condition)
+ {}
+
+ shared_ptr<ScaffoldGraph> Construct() override;
+};
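+
+//Illustrative usage (sketch only; the variable names below are hypothetical):
+//  DefaultScaffoldGraphConstructor constructor(graph, unique_edges, connection_conditions, edge_condition);
+//  shared_ptr<ScaffoldGraph> scaffold_graph = constructor.Construct();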
+
+
+} //scaffold_graph
+} //path_extend
+
diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
new file mode 100644
index 0000000..8017eee
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
@@ -0,0 +1,69 @@
+//
+// Created by andrey on 21.09.15.
+//
+
+#include "scaffold_graph_visualizer.hpp"
+
+namespace path_extend{ namespace scaffold_graph {
+
+const map<size_t, string> ScaffoldEdgeColorer::color_map =
+ {{(size_t) -1, "black"},
+ {0, "red"},
+ {1, "blue"},
+ {2, "green"},
+ {3, "magenta"},
+ {4, "orange"},
+ {5, "cyan"}};
+
+const string ScaffoldEdgeColorer::default_color = "black";
+
+string ScaffoldGraphLabeler::label(EdgeId e) const {
+ return "ID: " + ToString(e.getId()) +
+ "\\n Weight: " + ToString(e.getWeight()) +
+ "\\n Lib#: " + ToString(e.getColor());
+}
+
+string ScaffoldGraphLabeler::label(VertexId v) const {
+ auto it = additional_vertex_labels_.find(v);
+ string additional_label = it == additional_vertex_labels_.end() ? "" : it->second + "\n";
+ return "ID: " + ToString(graph_.int_id(v)) +
+ "\\n Len: " + ToString(graph_.AssemblyGraph().length(v)) +
+ "\\n Cov: " + ToString(graph_.AssemblyGraph().coverage(v)) + "\n" +
+ additional_label;
+}
+
+void ScaffoldGraphVisualizer::Visualize(graph_printer::GraphPrinter<ScaffoldGraph> &printer) {
+ printer.open();
+ printer.AddVertices(graph_.vbegin(), graph_.vend());
+ for (const auto& e : graph_.edges()) {
+ printer.AddEdge(e);
+ }
+ printer.close();
+}
+
+void ScaffoldGraphVisualizer::Visualize(ostream &os, graph_colorer::CompositeGraphColorer<ScaffoldGraph>& colorer) {
+ ScaffoldGraphLabeler labeler(graph_, additional_vertex_labels_);
+ vertex_linker::EmptyGraphLinker<ScaffoldGraph> linker;
+
+ graph_printer::SingleGraphPrinter <ScaffoldGraph> printer(graph_, os, labeler, colorer, linker);
+ Visualize(printer);
+}
+
+string ScaffoldEdgeColorer::GetValue(ScaffoldGraph::EdgeId e) const {
+ auto it = color_map.find(e.getColor());
+ if (it != color_map.end()) {
+ return it->second;
+ }
+ return default_color;
+}
+
+string ScaffoldVertexSetColorer::GetValue(ScaffoldGraph::VertexId v) const {
+ if (vertex_set_.count(v) > 0)
+ return "white";
+ return "yellow";
+}
+} //scaffold_graph
+} //path_extend
+
+
+
diff --git a/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
new file mode 100644
index 0000000..51d40ef
--- /dev/null
+++ b/src/common/modules/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
@@ -0,0 +1,79 @@
+//
+// Created by andrey on 21.09.15.
+//
+
+#ifndef PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
+#define PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
+
+#include "pipeline/graphio.hpp"
+#include "scaffold_graph.hpp"
+
+namespace path_extend { namespace scaffold_graph {
+
+using namespace visualization;
+
+
+class ScaffoldGraphLabeler : public graph_labeler::GraphLabeler<ScaffoldGraph> {
+
+private:
+ const ScaffoldGraph &graph_;
+
+ const map<VertexId, string>& additional_vertex_labels_;
+
+public:
+ ScaffoldGraphLabeler(const ScaffoldGraph &graph, const map<VertexId, string>& additional_vertex_labels):
+ graph_(graph), additional_vertex_labels_(additional_vertex_labels) {
+ }
+
+ string label(VertexId v) const;
+
+ string label(EdgeId e) const;
+};
+
+
+class ScaffoldEdgeColorer : public graph_colorer::ElementColorer<ScaffoldGraph::EdgeId> {
+private:
+ static const map<size_t, string> color_map;
+
+ static const string default_color;
+
+public:
+ string GetValue(ScaffoldGraph::EdgeId e) const;
+};
+
+
+class ScaffoldVertexSetColorer : public graph_colorer::ElementColorer<ScaffoldGraph::VertexId> {
+ private:
+ set<ScaffoldGraph::VertexId> vertex_set_;
+
+ public:
+ ScaffoldVertexSetColorer(const set<ScaffoldGraph::VertexId>& vertex_set): vertex_set_(vertex_set) {
+ }
+
+ string GetValue(ScaffoldGraph::VertexId v) const;
+};
+
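+//Prints the scaffold graph via the common visualization framework using the labeler and colorers above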
+class ScaffoldGraphVisualizer {
+private:
+ const ScaffoldGraph &graph_;
+
+ const map<ScaffoldGraph::VertexId, string>& additional_vertex_labels_;
+
+private:
+ void Visualize(graph_printer::GraphPrinter<ScaffoldGraph> &printer);
+
+public:
+ ScaffoldGraphVisualizer(const ScaffoldGraph &graph,
+ const map<ScaffoldGraph::VertexId, string>& additional_vertex_labels) :
+ graph_(graph),
+ additional_vertex_labels_(additional_vertex_labels){
+ }
+
+ void Visualize(ostream &os, graph_colorer::CompositeGraphColorer<ScaffoldGraph>& colorer);
+};
+
+} //scaffold_graph
+} //path_extend
+
+
+#endif //PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
diff --git a/src/common/modules/path_extend/split_graph_pair_info.hpp b/src/common/modules/path_extend/split_graph_pair_info.hpp
new file mode 100644
index 0000000..007495c
--- /dev/null
+++ b/src/common/modules/path_extend/split_graph_pair_info.hpp
@@ -0,0 +1,432 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SPLIT_GRAPH_PAIR_INFO_HPP_
+#define SPLIT_GRAPH_PAIR_INFO_HPP_
+
+#include <paired_info/weights.hpp>
+#include "modules/alignment/sequence_mapper_notifier.hpp"
+#include "io/dataset_support/read_converter.hpp"
+#include "ideal_pair_info.hpp"
+
+using namespace debruijn_graph;
+
+namespace path_extend {
+
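+//Returns the weight at which the fraction of pi1 values below it first reaches the fraction of
+//pi2 values above it, i.e. an approximate intersection point of the two weight distributions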
+inline double FindIntersection(vector<double>& pi1, vector<double>& pi2) {
+ std::sort(pi1.begin(), pi1.end());
+ std::sort(pi2.begin(), pi2.end());
+ size_t iter1 = 0;
+ size_t iter2 = 0;
+ double threshold = 0.0;
+ double percent1 = 0.0;
+ double percent2 = 1.0;
+ while (percent1 < percent2 and iter1 < pi1.size() and iter2 < pi2.size()) {
+ threshold = pi1[iter1];
+ while (iter2 < pi2.size() and pi2[iter2] <= threshold) {
+ iter2++;
+ }
+ percent1 = (double) iter1 / (double) pi1.size();
+ percent2 = 1.0 - (double) iter2 / (double) pi2.size();
+ iter1 += 1;
+ }
+ return threshold;
+}
+
+class Basket {
+ EdgeId edgeId_;
+ size_t index_;
+
+public:
+ Basket(EdgeId edgeId, size_t index)
+ : edgeId_(edgeId), index_(index) { }
+
+ Basket(const Basket& b)
+ : edgeId_(b.edgeId_), index_(b.index_) {}
+
+ const EdgeId edgeId() const {
+ return edgeId_;
+ }
+
+ size_t index() const {
+ return index_;
+ }
+
+ bool operator<(const Basket& rhs) const {
+ if (edgeId() != rhs.edgeId()) {
+ return edgeId() < rhs.edgeId();
+ }
+ return index() < rhs.index();
+ }
+
+ bool operator==(const Basket& rhs) const {
+ return edgeId() == rhs.edgeId() && index() == rhs.index();
+ }
+};
+
+struct PairInfo {
+ double weight_;
+ double distance_;
+ size_t count_;
+
+ PairInfo()
+ : weight_(0.), distance_(0.), count_(0) {}
+
+ PairInfo(double weight, double distance, size_t count = 0)
+ : weight_(weight), distance_(distance), count_(count) {}
+
+};
+
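+//Paired info for a single edge, binned into fixed-size baskets along its length;
+//each basket maps partner baskets to accumulated weight, averaged distance and count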
+class EdgePairInfo {
+ EdgeId edgeId_;
+ size_t basket_size_;
+ vector<map<Basket, PairInfo> > pair_info_;
+
+public:
+ EdgePairInfo() {
+ basket_size_ = 0;
+ }
+
+ EdgePairInfo(size_t length, EdgeId edgeId, size_t basket_size)
+ : edgeId_(edgeId),
+ basket_size_(basket_size) {
+ size_t count_baskets = length / basket_size_ + 1;
+ for (size_t index = 0; index < count_baskets; ++index) {
+ pair_info_.push_back(map<Basket, PairInfo>());
+ }
+ }
+
+ EdgePairInfo(const EdgePairInfo& pairInfo)
+ : edgeId_(pairInfo.edgeId_),
+ basket_size_(pairInfo.basket_size_) {
+ for (size_t index = 0; index < pairInfo.pair_info_.size(); ++index) {
+ pair_info_.push_back(pairInfo.pair_info_[index]);
+ }
+ }
+
+ void AddPairInfo(size_t pos_begin1, size_t pos_end1, EdgeId edgeId2,
+ size_t pos_begin2, size_t pos_end2, double weight,
+ double edge_distance) {
+ size_t begin_basket_index1 = GetBasketIndex(pos_begin1);
+ size_t end_basket_index1 = GetBasketIndex(pos_end1);
+ size_t begin_basket_index2 = GetBasketIndex(pos_begin2);
+ size_t end_basket_index2 = GetBasketIndex(pos_end2);
+ for (size_t index1 = begin_basket_index1; index1 <= end_basket_index1;
+ ++index1) {
+ for (size_t index2 = begin_basket_index2;
+ index2 <= end_basket_index2; ++index2) {
+ AddPairInfoToBasket(index1, edgeId2, index2, weight,
+ edge_distance);
+ }
+ }
+ }
+
+ void AddPairInfo(const EdgePairInfo& edgePairInfo) {
+ for (size_t index = 0; index < pair_info_.size(); ++index) {
+ const map<Basket, PairInfo>& basketInfoToAdd = edgePairInfo
+ .pair_info_[index];
+ map<Basket, PairInfo>& oldBasketInfo = pair_info_[index];
+ for (auto iter = basketInfoToAdd.begin();
+ iter != basketInfoToAdd.end(); ++iter) {
+ if (oldBasketInfo.find(iter->first) == oldBasketInfo.end()) {
+ oldBasketInfo[iter->first] = iter->second;
+ } else {
+ PairInfo& pairInfo = oldBasketInfo[iter->first];
+ oldBasketInfo[iter->first] = PairInfo(
+ pairInfo.weight_ + iter->second.weight_,
+ CountNewDistance(pairInfo, iter->second.distance_,
+ iter->second.count_),
+ iter->second.count_ + pairInfo.count_);
+ }
+ }
+ }
+ }
+
+ map<Basket, PairInfo>& GetInfo(size_t index) {
+ return pair_info_.at(index);
+ }
+
+ size_t size() {
+ return pair_info_.size();
+ }
+
+private:
+ size_t GetBasketIndex(size_t pos) const {
+ return pos / basket_size_;
+ }
+
+ void AddPairInfoToBasket(size_t index1, EdgeId edgeId2, size_t index2,
+ double weight, double edge_distance) {
+ Basket basket2(edgeId2, index2);
+ if (pair_info_[index1].find(basket2) == pair_info_[index1].end()) {
+ pair_info_[index1][basket2] = PairInfo(0.0, 0);
+ }
+ PairInfo oldPairInfo = pair_info_[index1][basket2];
+ double basket_distance = GetBasketDistance(edge_distance, index1,
+ index2);
+ pair_info_[index1][basket2] = PairInfo(
+ oldPairInfo.weight_ + weight,
+ CountNewDistance(oldPairInfo, basket_distance),
+ oldPairInfo.count_ + 1);
+ }
+
+ double CountNewDistance(PairInfo& oldPairInfo, double distance,
+ size_t count = 1) {
+ return (oldPairInfo.distance_ * (double) oldPairInfo.count_
+ + distance * (double) count)
+ / (double) (oldPairInfo.count_ + count);
+ }
+
+ double GetBasketDistance(double edge_distance, size_t index1,
+ size_t index2) {
+ return edge_distance - (double) index1 * (double) basket_size_
+ + (double) index2 * (double) basket_size_;
+ }
+};
+
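+//Per-edge storage of basket-level paired info; per-thread buffers are merged via AddAll()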
+class BasketsPairInfoIndex {
+ const conj_graph_pack& gp_;
+ size_t basket_size_;
+ map<EdgeId, EdgePairInfo> pair_info_;
+
+public:
+ BasketsPairInfoIndex(const conj_graph_pack& gp, size_t basket_size)
+ : gp_(gp),
+ basket_size_(basket_size) {
+ }
+
+ void AddPairInfo(EdgeId edgeId1, size_t pos_begin1, size_t pos_end1,
+ EdgeId edgeId2, size_t pos_begin2, size_t pos_end2,
+ double weight, double edge_distance) {
+ if (pair_info_.find(edgeId1) == pair_info_.end()) {
+ EdgePairInfo edgePairInfo2(gp_.g.length(edgeId1), edgeId1,
+ basket_size_);
+ pair_info_.insert(make_pair(edgeId1, edgePairInfo2));
+ }
+ pair_info_[edgeId1].AddPairInfo(pos_begin1, pos_end1, edgeId2,
+ pos_begin2, pos_end2, weight,
+ edge_distance);
+ }
+
+ EdgePairInfo& GetEdgePairInfo(EdgeId edgeId) {
+ return pair_info_[edgeId];
+ }
+
+ void AddAll(const BasketsPairInfoIndex& index) {
+ for (auto it = index.pair_info_.begin(); it != index.pair_info_.end();
+ ++it) {
+ if (pair_info_.find(it->first) == pair_info_.end()) {
+ pair_info_.insert(make_pair(it->first, it->second));
+ } else {
+ pair_info_[it->first].AddPairInfo(it->second);
+ }
+ }
+ }
+
+ void Clear() {
+ pair_info_.clear();
+ }
+
+ size_t size() const {
+ return pair_info_.size();
+ }
+
+};
+
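+//Listener that accumulates basket-level paired info from mapped read pairs and, once the
+//library is processed, derives a weight threshold separating consistent and inconsistent pair info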
+class SplitGraphPairInfo : public SequenceMapperListener {
+
+public:
+ //TODO: d_min = ? d_max = ? for ideal_pi_counter_
+ SplitGraphPairInfo(conj_graph_pack& gp, size_t is,
+ size_t is_var,
+ size_t is_min, size_t is_max,
+ size_t read_size, size_t /* k */, size_t basket_size,
+ const std::map<int, size_t>& is_distribution)
+ : gp_(gp),
+ is_(is),
+ is_var_(is_var),
+ is_min_(is_min),
+ is_max_(is_max),
+ basket_size_(basket_size),
+ basket_index_(gp, basket_size),
+ threshold_(-1),
+ ideal_pi_counter_(gp.g, (int)is_min_,
+ (int)is_max_, read_size, is_distribution) {
+
+ }
+
+ void StartProcessLibrary(size_t threads_count) override {
+ baskets_buffer_.clear();
+ for (size_t i = 0; i < threads_count; ++i)
+ baskets_buffer_.emplace_back(gp_, basket_size_);
+ }
+
+ void ProcessPairedRead(size_t thread_index,
+ const io::PairedRead& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
+ read1, read2, r.distance());
+ }
+
+ void ProcessPairedRead(size_t thread_index,
+ const io::PairedReadSeq& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
+ read1, read2, r.distance());
+ }
+
+ void MergeBuffer(size_t thread_index) override {
+ basket_index_.AddAll(baskets_buffer_[thread_index]);
+ baskets_buffer_[thread_index].Clear();
+ }
+
+ void StopProcessLibrary() override {
+ FindThreshold();
+
+ baskets_buffer_.clear();
+ }
+
+ double GetThreshold() const {
+ return threshold_;
+ }
+
+private:
+ void FindThreshold() {
+ size_t min_long_edge = basket_size_;
+ const Graph& g = gp_.g;
+ vector<double> good_pi;
+ vector<double> bad_pi;
+ double insert_size_min = (double) is_ - 2. * (double) is_var_;
+ double insert_size_max = (double) is_ + 2. * (double) is_var_;
+ for (auto e = g.ConstEdgeBegin(); !e.IsEnd(); ++e) {
+ EdgeId edge = *e;
+
+ if (g.length(edge) > min_long_edge) {
+ if (g.int_id(edge) <= 0)
+ continue;
+
+ EdgePairInfo& edge_pi = basket_index_.GetEdgePairInfo(edge);
+ if (edge_pi.size() == 0)
+ continue;
+ size_t count_baskets = LastBasketIndex(edge, (int) insert_size_max,
+ edge_pi);
+ for (size_t index = 0; index <= count_baskets; ++index) {
+ map<Basket, PairInfo>& basket_info = edge_pi.GetInfo(index);
+ set<size_t> pair_baskets = GetBaskets(index,
+ (int) insert_size_min,
+ (int) insert_size_max,
+ edge_pi);
+ for (auto iter = basket_info.begin(); iter != basket_info.end(); ++iter) {
+ PairInfo& pi = iter->second;
+ if (iter->first.edgeId() == edge &&
+ pair_baskets.find(iter->first.index()) != pair_baskets.end()) {
+ good_pi.push_back(GetNormalizedWeight(pi));
+ } else {
+ bad_pi.push_back(GetNormalizedWeight(pi));
+ }
+ }
+ }
+ }
+ }
+ DEBUG("good pi size " << good_pi.size() << " bad pi size " << bad_pi.size());
+ threshold_ = FindIntersection(good_pi, bad_pi);
+ INFO("Threshold for paired information " << threshold_);
+ }
+
+ size_t LastBasketIndex(EdgeId edgeId, int insert_size_max,
+ EdgePairInfo& edge_pair_info) {
+ return min((gp_.g.length(edgeId) - insert_size_max) / basket_size_,
+ edge_pair_info.size() - 1);
+ }
+
+ size_t FindBeginPairBasket(size_t index, int insert_size_min,
+ EdgePairInfo& edge_pair_info) {
+ return min(index + insert_size_min / basket_size_,
+ edge_pair_info.size() - 1);
+ }
+
+ size_t FindEndPairBasket(size_t index, int insert_size_max,
+ EdgePairInfo& edge_pair_info) {
+ return min(index + insert_size_max / basket_size_,
+ edge_pair_info.size() - 1);
+ }
+
+ set<size_t> GetBaskets(size_t index, int insert_size_min,
+ int insert_size_max, EdgePairInfo& edge_pair_info) {
+ set<size_t> result;
+ size_t begin = FindBeginPairBasket(index, insert_size_min,
+ edge_pair_info);
+ size_t end = FindEndPairBasket(index, insert_size_max, edge_pair_info);
+ for (size_t pair_index = begin; pair_index <= end; ++pair_index) {
+ result.insert(pair_index);
+ }
+ return result;
+ }
+
+ double GetNormalizedWeight(PairInfo& pi) {
+ return pi.weight_
+ / ideal_pi_counter_.IdealPairedInfo(basket_size_, basket_size_,
+ (int) pi.distance_);
+ }
+
+ void InnerProcess(BasketsPairInfoIndex& basket_index,
+ const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2,
+ size_t read_distance) {
+ for (size_t i = 0; i < path1.size(); ++i) {
+ pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
+ for (size_t j = 0; j < path2.size(); ++j) {
+ pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
+ double weight = PairedReadCountWeight(std::make_pair(mapping_edge_1.first, mapping_edge_2.first),
+ mapping_edge_1.second, mapping_edge_2.second);
+ size_t kmer_distance = read_distance
+ + mapping_edge_2.second.initial_range.end_pos
+ - mapping_edge_1.second.initial_range.start_pos;
+ int edge_distance = (int) kmer_distance
+ + (int) mapping_edge_1.second.mapped_range.start_pos
+ - (int) mapping_edge_2.second.mapped_range.end_pos;
+
+ basket_index.AddPairInfo(
+ mapping_edge_1.first,
+ mapping_edge_1.second.mapped_range.start_pos,
+ mapping_edge_1.second.mapped_range.end_pos,
+ mapping_edge_2.first,
+ mapping_edge_2.second.mapped_range.start_pos,
+ mapping_edge_2.second.mapped_range.end_pos, weight,
+ (double) edge_distance);
+ }
+ }
+ }
+
+ void ProcessPairedRead(BasketsPairInfoIndex& basket_index,
+ size_t r1_length,
+ size_t r2_length,
+ const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2,
+ size_t read_distance) {
+ InnerProcess(basket_index, path1, path2, read_distance);
+ InnerProcess(basket_index, ConjugateMapping(gp_.g, path2, r2_length),
+ ConjugateMapping(gp_.g, path1, r1_length), read_distance);
+ }
+
+ const conj_graph_pack& gp_;
+ size_t is_;
+ size_t is_var_;
+ size_t is_min_;
+ size_t is_max_;
+ size_t basket_size_;
+ BasketsPairInfoIndex basket_index_;
+ vector<BasketsPairInfoIndex> baskets_buffer_;
+ double threshold_;
+ IdealPairInfoCounter ideal_pi_counter_;
+};
+
+} /* path_extend */
+
+#endif /* SPLIT_GRAPH_PAIR_INFO_HPP_ */
diff --git a/src/common/modules/path_extend/weight_counter.hpp b/src/common/modules/path_extend/weight_counter.hpp
new file mode 100644
index 0000000..d031bb2
--- /dev/null
+++ b/src/common/modules/path_extend/weight_counter.hpp
@@ -0,0 +1,357 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * weight_counter.hpp
+ *
+ * Created on: Feb 19, 2012
+ * Author: andrey
+ */
+
+#ifndef WEIGHT_COUNTER_HPP_
+#define WEIGHT_COUNTER_HPP_
+
+#include "assembly_graph/paths/bidirectional_path.hpp"
+#include "paired_library.hpp"
+#include <algorithm>
+#include <boost/math/special_functions/fpclassify.hpp>
+
+namespace path_extend {
+
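+//Weighted median of the (dist, w) histogram restricted to [min, max]; returns 0 for an empty histogram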
+inline int median(const vector<int>& dist, const vector<double>& w, int min, int max) {
+ VERIFY(dist.size() == w.size());
+ double S = 0;
+ for (size_t i = 0; i < w.size(); ++i) {
+ if (dist[i] >= min && dist[i] <= max)
+ S += w[i];
+ }
+ if (S == 0) {
+ DEBUG("Empty histogram");
+ return 0;
+ }
+
+ double sum = S;
+ for (size_t i = 0; i < w.size(); ++i) {
+ if (dist[i] >= min && dist[i] <= max) {
+ sum -= w[i];
+ if (sum <= S / 2) {
+ return dist[i];
+ }
+ }
+ }
+ VERIFY(false);
+ return -1;
+}
+
+struct EdgeWithPairedInfo {
+ size_t e_;
+ double pi_;
+
+ EdgeWithPairedInfo(size_t e_, double pi) :
+ e_(e_), pi_(pi) {
+
+ }
+};
+
+struct EdgeWithDistance {
+ EdgeId e_;
+ int d_;
+
+ EdgeWithDistance(EdgeId e, size_t d) :
+ e_(e), d_((int) d) {
+ }
+
+ struct DistanceComparator {
+ bool operator()(const EdgeWithDistance& e1, const EdgeWithDistance& e2) {
+ if (e1.d_ == e2.d_)
+ return e1.e_ < e2.e_;
+ return e1.d_ > e2.d_;
+ }
+ };
+
+ //static DistanceComparator comparator;
+};
+
+class IdealInfoProvider {
+public:
+ virtual ~IdealInfoProvider() {}
+
+ virtual std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const = 0;
+};
+
+class BasicIdealInfoProvider : public IdealInfoProvider {
+ const shared_ptr<PairedInfoLibrary> lib_;
+public:
+ BasicIdealInfoProvider(const shared_ptr<PairedInfoLibrary>& lib) : lib_(lib) {
+ }
+
+ std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
+ std::vector<EdgeWithPairedInfo> covered;
+ for (int i = (int) path.Size() - 1; i >= 0; --i) {
+ double w = lib_->IdealPairedInfo(path[i], candidate,
+ (int) path.LengthAt(i));
+ //FIXME think if we need extremely low ideal weights
+ if (math::gr(w, 0.)) {
+ covered.push_back(EdgeWithPairedInfo(i, w));
+ }
+ }
+ return covered;
+ }
+};
+
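+//Base class for estimating paired-end support of a candidate edge continuing a path;
+//concrete counters implement CountWeight() and PairInfoExist()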
+class WeightCounter {
+
+protected:
+ const Graph& g_;
+ const shared_ptr<PairedInfoLibrary> lib_;
+ bool normalize_weight_;
+ shared_ptr<IdealInfoProvider> ideal_provider_;
+
+public:
+
+ WeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ bool normalize_weight = true,
+ shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
+ g_(g), lib_(lib), normalize_weight_(normalize_weight), ideal_provider_(ideal_provider) {
+ if (!ideal_provider_) {
+ ideal_provider_ = make_shared<BasicIdealInfoProvider>(lib);
+ }
+ }
+
+ virtual std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+ int gap = 0) const = 0;
+
+ virtual double CountWeight(const BidirectionalPath& path, EdgeId e,
+ const std::set<size_t>& excluded_edges = std::set<size_t>(), int gapLength = 0) const = 0;
+
+ const PairedInfoLibrary& lib() const {
+ return *lib_;
+ }
+
+ const shared_ptr<PairedInfoLibrary> get_libptr() const {
+ return lib_;
+ };
+
+private:
+ DECL_LOGGER("WeightCounter");
+};
+
+class ReadCountWeightCounter: public WeightCounter {
+
+ std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
+ int add_gap = 0) const {
+ std::vector<EdgeWithPairedInfo> answer;
+
+ for (const EdgeWithPairedInfo& e_w_pi : ideal_provider_->FindCoveredEdges(path, e)) {
+ double w = lib_->CountPairedInfo(path[e_w_pi.e_], e,
+ (int) path.LengthAt(e_w_pi.e_) + add_gap);
+
+ if (normalize_weight_) {
+ w /= e_w_pi.pi_;
+ }
+ answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, w));
+ }
+
+ return answer;
+ }
+
+public:
+
+ ReadCountWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ bool normalize_weight = true,
+ shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
+ WeightCounter(g, lib, normalize_weight, ideal_provider) {
+ }
+
+ double CountWeight(const BidirectionalPath& path, EdgeId e,
+ const std::set<size_t>& excluded_edges, int gap) const override {
+ double weight = 0.0;
+
+ for (const auto& e_w_pi : CountLib(path, e, gap)) {
+ if (!excluded_edges.count(e_w_pi.e_)) {
+ weight += e_w_pi.pi_;
+ }
+ }
+
+ return weight;
+ }
+
+ std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+ int gap = 0) const override {
+ std::set<size_t> answer;
+ for (const auto& e_w_pi : CountLib(path, e, gap)) {
+ if (math::gr(e_w_pi.pi_, 0.)) {
+ answer.insert(e_w_pi.e_);
+ }
+ }
+
+ return answer;
+ }
+
+};
+
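+//Normalizes observed paired info by the ideal one, keeps path edges passing single_threshold_
+//and reports the supported fraction of the total non-excluded ideal weight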
+class PathCoverWeightCounter: public WeightCounter {
+ double single_threshold_;
+
+ double TotalIdealNonExcluded(const std::vector<EdgeWithPairedInfo>& ideally_covered_edges,
+ const std::set<size_t>& excluded_edges) const {
+ double ideal_total = 0.0;
+
+ for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
+ if (!excluded_edges.count(e_w_pi.e_))
+ ideal_total += e_w_pi.pi_;
+ }
+
+ return ideal_total;
+ }
+
+ std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
+ const std::vector<EdgeWithPairedInfo>& ideally_covered_edges, int add_gap = 0) const {
+ std::vector<EdgeWithPairedInfo> answer;
+
+ for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
+ double ideal_weight = e_w_pi.pi_;
+
+ double weight = lib_->CountPairedInfo(
+ path[e_w_pi.e_], e,
+ (int) path.LengthAt(e_w_pi.e_) + add_gap);
+
+ if (normalize_weight_) {
+ weight /= ideal_weight;
+ }
+
+ if (math::ge(weight, single_threshold_)) {
+ answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, ideal_weight));
+ }
+ }
+
+ return answer;
+ }
+
+public:
+
+ PathCoverWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ bool normalize_weight,
+ double single_threshold,
+ shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
+ WeightCounter(g, lib, normalize_weight, ideal_provider),
+ single_threshold_(single_threshold) {
+ VERIFY_MSG(math::gr(single_threshold_, 0.), "Threshold value not initialized");
+ }
+
+ double CountWeight(const BidirectionalPath& path, EdgeId e,
+ const std::set<size_t>& excluded_edges, int gap) const override {
+ double lib_weight = 0.;
+ const auto ideal_coverage = ideal_provider_->FindCoveredEdges(path, e);
+
+ for (const auto& e_w_pi : CountLib(path, e, ideal_coverage, gap)) {
+ if (!excluded_edges.count(e_w_pi.e_)) {
+ lib_weight += e_w_pi.pi_;
+ }
+ }
+
+ double total_ideal_coverage = TotalIdealNonExcluded(ideal_coverage, excluded_edges);
+ return math::eq(total_ideal_coverage, 0.) ? 0. : lib_weight / total_ideal_coverage;
+ }
+
+ std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+ int gap = 0) const override {
+ std::set<size_t> answer;
+ for (const auto& e_w_pi : CountLib(path, e, ideal_provider_->FindCoveredEdges(path, e), gap)) {
+ if (math::gr(e_w_pi.pi_, 0.)) {
+ answer.insert(e_w_pi.e_);
+ }
+ }
+ return answer;
+ }
+};
+
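+//Scales ideal paired info by the estimated path coverage (minimum coverage over its edges)
+//normalized by (read length - k) * MAGIC_COEFF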
+class CoverageAwareIdealInfoProvider : public BasicIdealInfoProvider {
+ static constexpr double MAGIC_COEFF = 2.;
+ const Graph& g_;
+ size_t read_length_;
+
+public:
+ //works for single lib only!!!
+ virtual double EstimatePathCoverage(const BidirectionalPath& path) const {
+ VERIFY(path.Length() > 0);
+ double answer = std::numeric_limits<double>::max();
+ for (size_t i = 0; i < path.Size(); ++i) {
+ answer = std::min(g_.coverage(path.At(i)), answer);
+ }
+ return answer;
+ }
+
+ CoverageAwareIdealInfoProvider(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+ size_t read_length) :
+ BasicIdealInfoProvider(lib), g_(g), read_length_(read_length) {
+ VERIFY(read_length_ > g_.k());
+ }
+
+ std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
+ VERIFY(read_length_ != -1ul);
+ //bypassing problems with ultra-low coverage estimates
+ double estimated_coverage = max(EstimatePathCoverage(path), 1.0);
+ double correction_coeff = estimated_coverage / ((double(read_length_) - double(g_.k())) * MAGIC_COEFF);
+
+ std::vector<EdgeWithPairedInfo> answer = BasicIdealInfoProvider::FindCoveredEdges(path, candidate);
+ for (auto& e_w_pi : answer) {
+ e_w_pi.pi_ *= correction_coeff;
+ }
+ return answer;
+ }
+};
+
+class GlobalCoverageAwareIdealInfoProvider : public CoverageAwareIdealInfoProvider {
+ double lib_coverage_;
+
+public:
+
+ GlobalCoverageAwareIdealInfoProvider(const Graph& g,
+ const shared_ptr<PairedInfoLibrary>& lib,
+ size_t read_length,
+ double lib_coverage):
+ CoverageAwareIdealInfoProvider(g, lib, read_length),
+ lib_coverage_(lib_coverage) {
+ }
+
+ double EstimatePathCoverage(const BidirectionalPath&) const override {
+ return lib_coverage_;
+ }
+};
+
+//TODO optimize number of calls of EstimatePathCoverage(path)
+//class MetagenomicWeightCounter: public WeightCounter {
+// shared_ptr<CoverageAwareIdealInfoProvider> cov_info_provider_;
+// shared_ptr<WeightCounter> normalizing_wc_;
+//
+//public:
+//
+// //negative raw_threshold leads to the halt if no sufficiently long edges are in the path
+// MetagenomicWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
+// size_t read_length, double weight_threshold) :
+// WeightCounter(g, lib) {
+// cov_info_provider_ = make_shared<CoverageAwareIdealInfoProvider>(g, lib, read_length);
+// normalizing_wc_ = make_shared<PathCoverWeightCounter>(g, lib,
+// /*normalize weight*/true, weight_threshold, cov_info_provider_);
+// }
+//
+// double CountWeight(const BidirectionalPath& path, EdgeId e,
+// const std::set<size_t>& excluded_edges, int gap = 0) const override {
+// VERIFY(path.Length() > 0);
+// return normalizing_wc_->CountWeight(path, e, excluded_edges, gap);
+// }
+//
+// std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
+// int gap = 0) const override {
+// return normalizing_wc_->PairInfoExist(path, e, gap);
+// }
+//};
+
+};
+
+#endif /* WEIGHT_COUNTER_HPP_ */
diff --git a/src/common/modules/simplification/bulge_remover.hpp b/src/common/modules/simplification/bulge_remover.hpp
new file mode 100644
index 0000000..73254b1
--- /dev/null
+++ b/src/common/modules/simplification/bulge_remover.hpp
@@ -0,0 +1,680 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * bulge_remover.hpp
+ *
+ * Created on: Apr 13, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "assembly_graph/graph_support/comparators.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "sequence/sequence_tools.hpp"
+#include "utils/standard_base.hpp"
+#include <cmath>
+#include <stack>
+#include "math/xmath.h"
+
+namespace omnigraph {
+
+template<class Graph>
+struct SimplePathCondition {
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+
+ SimplePathCondition(const Graph& g) :
+ g_(g) {
+
+ }
+
+ bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
+ if (edge == g_.conjugate(edge))
+ return false;
+ for (size_t i = 0; i < path.size(); ++i)
+ if (edge == path[i] || edge == g_.conjugate(path[i]))
+ return false;
+ for (size_t i = 0; i < path.size(); ++i) {
+ if (path[i] == g_.conjugate(path[i])) {
+ return false;
+ }
+ for (size_t j = i + 1; j < path.size(); ++j)
+ if (path[i] == path[j] || path[i] == g_.conjugate(path[j]))
+ return false;
+ }
+ return true;
+ }
+};
+
+template<class Graph>
+bool TrivialCondition(typename Graph::EdgeId,
+ const vector<typename Graph::EdgeId>& path) {
+ for (size_t i = 0; i < path.size(); ++i)
+ for (size_t j = i + 1; j < path.size(); ++j)
+ if (path[i] == path[j])
+ return false;
+ return true;
+}
+
+template<class Graph>
+class MostCoveredSimpleAlternativePathChooser: public PathProcessor<Graph>::Callback {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ EdgeId forbidden_edge_;
+
+ double max_coverage_;
+ vector<EdgeId> most_covered_path_;
+
+public:
+
+ MostCoveredSimpleAlternativePathChooser(const Graph& g, EdgeId edge) :
+ g_(g), forbidden_edge_(edge), max_coverage_(-1.0) {
+
+ }
+
+ void HandleReversedPath(const vector<EdgeId>& reversed_path) override {
+ vector<EdgeId> path = this->ReversePath(reversed_path);
+ double path_cov = AvgCoverage(g_, path);
+ for (size_t i = 0; i < path.size(); i++) {
+ if (path[i] == forbidden_edge_)
+ return;
+ }
+ if (path_cov > max_coverage_ && SimplePathCondition<Graph>(g_)(forbidden_edge_, path)) {
+ max_coverage_ = path_cov;
+ most_covered_path_ = path;
+ }
+ }
+
+ double max_coverage() {
+ return max_coverage_;
+ }
+
+ const vector<EdgeId>& most_covered_path() {
+ return most_covered_path_;
+ }
+};
+
+inline size_t CountMaxDifference(size_t absolute_diff, size_t length, double relative_diff) {
+ return std::max((size_t) std::floor(relative_diff * (double) length), absolute_diff);
+}
+
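+//Performs the actual bulge projection: splits the bulge edge at positions aligned with the
+//alternative path, glues the pieces onto the corresponding path edges and compresses the end vertices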
+template<class Graph>
+class BulgeGluer {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
+ Graph& g_;
+ BulgeCallbackF opt_callback_;
+ std::function<void(EdgeId)> removal_handler_;
+
+ void InnerProcessBulge(EdgeId edge, const vector<EdgeId>& path) {
+
+ EnsureEndsPositionAligner aligner(CumulativeLength(g_, path),
+ g_.length(edge));
+ double prefix_length = 0.;
+ vector<size_t> bulge_prefix_lengths;
+
+ for (EdgeId e : path) {
+ prefix_length += (double) g_.length(e);
+ bulge_prefix_lengths.push_back(aligner.GetPosition((size_t) prefix_length));
+ }
+
+ EdgeId edge_to_split = edge;
+ size_t prev_length = 0;
+
+ TRACE("Process bulge " << path.size() << " edges");
+
+ VERIFY(bulge_prefix_lengths.back() == g_.length(edge));
+
+ for (size_t i = 0; i < path.size(); ++i) {
+ if (bulge_prefix_lengths[i] > prev_length) {
+ if (bulge_prefix_lengths[i] - prev_length
+ != g_.length(edge_to_split)) {
+
+ TRACE("SplitEdge " << g_.str(edge_to_split));
+ TRACE(
+ "Start: " << g_.str(g_.EdgeStart(edge_to_split)));
+ TRACE(
+ "End: " << g_.str(g_.EdgeEnd(edge_to_split)));
+
+ pair<EdgeId, EdgeId> split_result = g_.SplitEdge(
+ edge_to_split,
+ bulge_prefix_lengths[i] - prev_length);
+
+ edge_to_split = split_result.second;
+
+ TRACE("GlueEdges " << g_.str(split_result.first));
+ g_.GlueEdges(split_result.first, path[i]);
+
+ } else {
+ TRACE("GlueEdges " << g_.str(edge_to_split));
+ g_.GlueEdges(edge_to_split, path[i]);
+ }
+ }
+ prev_length = bulge_prefix_lengths[i];
+ }
+ }
+
+public:
+
+ BulgeGluer(Graph& g, BulgeCallbackF opt_callback = 0,
+ std::function<void(EdgeId)> removal_handler = 0) :
+ g_(g),
+ opt_callback_(opt_callback),
+ removal_handler_(removal_handler) {
+
+ }
+
+ void operator()(EdgeId edge, const vector<EdgeId>& path) {
+ if (opt_callback_)
+ opt_callback_(edge, path);
+
+ if (removal_handler_)
+ removal_handler_(edge);
+
+ VertexId start = g_.EdgeStart(edge);
+ VertexId end = g_.EdgeEnd(edge);
+
+ TRACE("Projecting edge " << g_.str(edge));
+ InnerProcessBulge(edge, path);
+
+ TRACE("Compressing start vertex " << g_.str(start));
+ g_.CompressVertex(start);
+
+ TRACE("Compressing end vertex " << g_.str(end));
+ g_.CompressVertex(end);
+ }
+
+};
+
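+//Searches for the most covered simple alternative path between the endpoints of a candidate
+//bulge edge (within the allowed length delta) and returns it if the bulge condition holds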
+template<class Graph>
+class AlternativesAnalyzer {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph& g_;
+ double max_coverage_;
+ size_t max_length_;
+ double max_relative_coverage_;
+ size_t max_delta_;
+ double max_relative_delta_;
+ size_t max_edge_cnt_;
+
+ static vector<EdgeId> EmptyPath() {
+ static vector<EdgeId> vec = {};
+ return vec;
+ }
+
+ /**
+ * Checks that the alternative path is simple (no repeated, mutually conjugate or self-conjugate
+ * edges, and it contains neither e nor conjugate(e)) and that
+ * path_coverage * max_relative_coverage_ is at least g_.coverage(e)
+ */
+ bool BulgeCondition(EdgeId e, const vector<EdgeId>& path,
+ double path_coverage) const {
+ return math::ge(path_coverage * max_relative_coverage_,
+ g_.coverage(e)) && SimplePathCondition<Graph>(g_)(e, path);
+ }
+
+public:
+ AlternativesAnalyzer(const Graph& g, double max_coverage, size_t max_length,
+ double max_relative_coverage, size_t max_delta,
+ double max_relative_delta, size_t max_edge_cnt) :
+ g_(g),
+ max_coverage_(max_coverage),
+ max_length_(max_length),
+ max_relative_coverage_(max_relative_coverage),
+ max_delta_(max_delta),
+ max_relative_delta_(max_relative_delta),
+ max_edge_cnt_(max_edge_cnt) {
+ DEBUG("Created alternatives analyzer max_length=" << max_length
+ << " max_coverage=" << max_coverage
+ << " max_relative_coverage=" << max_relative_coverage
+ << " max_delta=" << max_delta
+ << " max_relative_delta=" << max_relative_delta);
+ }
+
+ vector<EdgeId> operator()(EdgeId e) const {
+ if (g_.length(e) > max_length_ || math::gr(g_.coverage(e), max_coverage_)) {
+ return EmptyPath();
+ }
+
+ size_t kplus_one_mer_coverage = (size_t) math::round((double) g_.length(e) * g_.coverage(e));
+ TRACE("Processing edge " << g_.str(e) << " and coverage " << kplus_one_mer_coverage);
+
+ size_t delta = CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
+
+ MostCoveredSimpleAlternativePathChooser<Graph> path_chooser(g_, e);
+
+ VertexId start = g_.EdgeStart(e);
+ TRACE("Start " << g_.str(start));
+ VertexId end = g_.EdgeEnd(e);
+ TRACE("End " << g_.str(end));
+
+ ProcessPaths(g_, (g_.length(e) > delta) ? g_.length(e) - delta : 0,
+ g_.length(e) + delta, start, end, path_chooser, max_edge_cnt_);
+
+ const vector<EdgeId>& path = path_chooser.most_covered_path();
+ if (!path.empty()) {
+ VERIFY(g_.EdgeStart(path[0]) == start);
+ VERIFY(g_.EdgeEnd(path.back()) == end);
+ }
+
+ double path_coverage = path_chooser.max_coverage();
+ if (math::gr(path_coverage, 0.)) {
+ TRACE("Best path with coverage " << path_coverage << " is " << PrintPath(g_, path));
+
+ if (BulgeCondition(e, path, path_coverage)) {
+ TRACE("Satisfied condition");
+ return path;
+ } else {
+ TRACE("Didn't satisfy condition");
+ return EmptyPath();
+ }
+ } else {
+ TRACE("Didn't find alternative");
+ return EmptyPath();
+ }
+ }
+
+ double max_coverage() const {
+ return max_coverage_;
+ }
+
+ size_t max_length() const {
+ return max_length_;
+ }
+
+private:
+ DECL_LOGGER("AlternativesAnalyzer");
+};
+
+template<class Graph>
+func::TypedPredicate<typename Graph::EdgeId>
+NecessaryBulgeCondition(const Graph& g, size_t max_length, double max_coverage) {
+ return AddAlternativesPresenceCondition(g,
+ func::And(LengthUpperBound<Graph>(g, max_length),
+ CoverageUpperBound<Graph>(g, max_coverage)));
+}
+
+/**
+ * This class removes simple bulges from the given graph with the following algorithm: it iterates through all
+ * edges of the graph and for each edge checks whether it is likely to be a simple bulge;
+ * if the edge is judged to be one, it is removed (glued onto its alternative path).
+ */
+template<class Graph>
+class BulgeRemover: public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId,
+ CoverageComparator<Graph>> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId, CoverageComparator<Graph>> base;
+
+protected:
+
+ /*virtual*/
+ bool Process(EdgeId e) {
+ TRACE("Considering edge " << this->g().str(e)
+ << " of length " << this->g().length(e)
+ << " and avg coverage " << this->g().coverage(e));
+
+ if (!HasAlternatives(this->g(), e)) {
+ TRACE("Not possible bulge edge");
+ return false;
+ }
+
+ vector<EdgeId> alternative = alternatives_analyzer_(e);
+ if (!alternative.empty()) {
+ gluer_(e, alternative);
+ return true;
+ }
+ return false;
+ }
+
+public:
+
+ typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
+
+ BulgeRemover(Graph& g, const std::shared_ptr<InterestingElementFinder<Graph, EdgeId>>& interesting_finder,
+ const AlternativesAnalyzer<Graph>& alternatives_analyzer,
+ BulgeCallbackF opt_callback = 0,
+ std::function<void(EdgeId)> removal_handler = 0,
+ bool track_changes = true) :
+ base(g,
+ interesting_finder,
+ /*canonical_only*/true,
+ CoverageComparator<Graph>(g),
+ track_changes),
+ alternatives_analyzer_(alternatives_analyzer),
+ gluer_(g, opt_callback, removal_handler) {
+ }
+
+private:
+ AlternativesAnalyzer<Graph> alternatives_analyzer_;
+ BulgeGluer<Graph> gluer_;
+private:
+ DECL_LOGGER("BulgeRemover")
+};
+
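+//Parallel bulge removal: buffers candidate edges of similar coverage, finds alternative paths
+//in parallel, glues independent bulges immediately and processes interacting ones sequentially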
+template<class Graph>
+class ParallelBulgeRemover : public PersistentAlgorithmBase<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::shared_ptr<InterestingElementFinder<Graph, EdgeId>> CandidateFinderPtr;
+ typedef SmartSetIterator<Graph, EdgeId, CoverageComparator<Graph>> SmartEdgeSet;
+
+ size_t buff_size_;
+ double buff_cov_diff_;
+ double buff_cov_rel_diff_;
+ AlternativesAnalyzer<Graph> alternatives_analyzer_;
+ BulgeGluer<Graph> gluer_;
+ CandidateFinderPtr interesting_edge_finder_;
+ //todo remove
+ bool tracking_;
+
+ size_t curr_iteration_;
+
+ SmartEdgeSet it_;
+
+ static vector<EdgeId> EmptyPath() {
+ static vector<EdgeId> vec = {};
+ return vec;
+ }
+
+ struct BulgeInfo : private boost::noncopyable {
+ size_t id;
+ EdgeId e;
+ std::vector<EdgeId> alternative;
+
+ BulgeInfo() :
+ id(-1ul) {
+ }
+
+ //passing by value is not a mistake!
+ BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId> alternative_) :
+ id(id_), e(e_), alternative(std::move(alternative_)) {
+
+ }
+
+ BulgeInfo(BulgeInfo&& that) {
+ *this = std::move(that);
+ }
+
+ BulgeInfo& operator= (BulgeInfo&& that) {
+ id = that.id;
+ e = that.e;
+ alternative = std::move(that.alternative);
+ return *this;
+ }
+
+// BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId>&& alternative_) :
+// id(id_), e(e_), alternative(std::move(alternative_)) {
+//
+// }
+//
+ bool operator< (const BulgeInfo& that) const {
+// VERIFY_MSG(id != that.id, "Ooops " << id);
+ return id < that.id;
+ }
+
+ std::string str(const Graph& g) const {
+ std::stringstream ss;
+ ss << "BulgeInfo " << id
+ << " e: " << g.str(e)
+ << " path: " << PrintPath(g, alternative);
+ return ss.str();
+ }
+
+ };
+
+ bool CheckInteracting(const BulgeInfo& info, const std::unordered_set<EdgeId>& involved_edges) const {
+ if (involved_edges.count(info.e))
+ return true;
+ for (EdgeId e : info.alternative)
+ if (involved_edges.count(e))
+ return true;
+ return false;
+ }
+
+ void AccountEdge(EdgeId e, std::unordered_set<EdgeId>& involved_edges) const {
+ TRACE("Pushing edge " << this->g().str(e));
+ involved_edges.insert(e);
+ EdgeId conj = this->g().conjugate(e);
+ TRACE("Pushing edge " << this->g().str(conj));
+ involved_edges.insert(conj);
+ }
+
+ void AccountEdges(const BulgeInfo& info, std::unordered_set<EdgeId>& involved_edges) const {
+ AccountEdge(info.e, involved_edges);
+ for (EdgeId e : info.alternative) {
+ AccountEdge(e, involved_edges);
+ }
+ }
+
+ //false if time to stop
+ bool FillEdgeBuffer(vector<EdgeId>& buffer, func::TypedPredicate<EdgeId> proceed_condition) {
+ VERIFY(buffer.empty());
+ DEBUG("Filling edge buffer of size " << buff_size_);
+ perf_counter perf;
+ double low_cov = 0.;
+ double cov_diff = 0.;
+ while (!it_.IsEnd() && buffer.size() < buff_size_) {
+ EdgeId e = *it_;
+ TRACE("Current edge " << this->g().str(e));
+ if (!proceed_condition(e)) {
+ TRACE("Stop condition was reached.");
+ //need to release last element of the iterator to make it replaceable by new elements
+ it_.ReleaseCurrent();
+ return false;
+ }
+
+ double cov = this->g().coverage(e);
+ if (buffer.empty()) {
+ low_cov = cov;
+ cov_diff = max(buff_cov_diff_, buff_cov_rel_diff_ * low_cov);
+ } else {
+ if (math::gr(cov, low_cov + cov_diff)) {
+ //need to release last element of the iterator to make it replaceable by new elements
+ it_.ReleaseCurrent();
+ return true;
+ }
+ }
+ TRACE("Potential bulge edge");
+ buffer.push_back(e);
+ ++it_;
+ }
+
+ DEBUG("Filled in " << perf.time() << " seconds");
+ if (buffer.size() == buff_size_) {
+ TRACE("Buffer filled");
+ return true;
+ } else {
+ TRACE("No more edges in iterator");
+ return false;
+ }
+ }
+
+ std::vector<std::vector<BulgeInfo>> FindBulges(const std::vector<EdgeId>& edge_buffer) const {
+ DEBUG("Looking for bulges (in parallel). Edge buffer size " << edge_buffer.size());
+ perf_counter perf;
+ std::vector<std::vector<BulgeInfo>> bulge_buffers(omp_get_max_threads());
+ size_t n = edge_buffer.size();
+ //order is in agreement with coverage
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < n; ++i) {
+ EdgeId e = edge_buffer[i];
+ auto alternative = alternatives_analyzer_(e);
+ if (!alternative.empty()) {
+ bulge_buffers[omp_get_thread_num()].push_back(BulgeInfo(i, e, std::move(alternative)));
+ }
+ }
+ DEBUG("Bulges found in " << perf.time() << " seconds");
+ return bulge_buffers;
+ }
+
+ std::vector<BulgeInfo> MergeBuffers(std::vector<std::vector<BulgeInfo>>&& buffers) const {
+ DEBUG("Merging bulge buffers");
+ perf_counter perf;
+
+ std::vector<BulgeInfo> merged_bulges;
+ for (auto& bulge_buffer : buffers) {
+ std::copy(std::make_move_iterator(bulge_buffer.begin()),
+ std::make_move_iterator(bulge_buffer.end()),
+ std::back_inserter(merged_bulges));
+ }
+
+ DEBUG("Sorting");
+ //order is in agreement with coverage
+ std::sort(merged_bulges.begin(), merged_bulges.end());
+ DEBUG("Total bulges " << merged_bulges.size());
+ DEBUG("Buffers merged in " << perf.time() << " seconds");
+ return merged_bulges;
+ }
+
+ SmartEdgeSet RetainIndependentBulges(std::vector<BulgeInfo>& bulges) const {
+ DEBUG("Looking for independent bulges");
+ size_t total_cnt = bulges.size();
+ perf_counter perf;
+
+ std::vector<BulgeInfo> filtered;
+ filtered.reserve(bulges.size());
+ //fixme switch to involved vertices to bring fully parallel glueing closer
+ std::unordered_set<EdgeId> involved_edges;
+ SmartEdgeSet interacting_edges(this->g(), false, CoverageComparator<Graph>(this->g()));
+
+ for (BulgeInfo& info : bulges) {
+ TRACE("Analyzing interactions of " << info.str(this->g()));
+ if (CheckInteracting(info, involved_edges)) {
+ TRACE("Interacting");
+ interacting_edges.push(info.e);
+ } else {
+ TRACE("Independent");
+ AccountEdges(info, involved_edges);
+ filtered.push_back(std::move(info));
+ }
+ }
+ bulges = std::move(filtered);
+
+ DEBUG("Independent bulges identified in " << perf.time() << " seconds");
+ DEBUG("Independent cnt " << bulges.size());
+ DEBUG("Interacting cnt " << interacting_edges.size());
+ VERIFY(bulges.size() + interacting_edges.size() == total_cnt);
+
+ return interacting_edges;
+ }
+
+ size_t ProcessBulges(const std::vector<BulgeInfo>& independent_bulges, SmartEdgeSet&& interacting_edges) {
+ DEBUG("Processing bulges");
+ perf_counter perf;
+
+ size_t triggered = 0;
+
+ for (const BulgeInfo& info : independent_bulges) {
+ TRACE("Processing bulge " << info.str(this->g()));
+ triggered++;
+ gluer_(info.e, info.alternative);
+ }
+
+ DEBUG("Independent bulges glued in " << perf.time() << " seconds");
+ perf.reset();
+
+ DEBUG("Processing remaining interacting bulges " << interacting_edges.size());
+ //usual br strategy
+ for (; !interacting_edges.IsEnd(); ++interacting_edges) {
+ EdgeId e = *interacting_edges;
+ TRACE("Processing edge " << this->g().str(e));
+ std::vector<EdgeId> alternative = alternatives_analyzer_(e);
+ if (!alternative.empty()) {
+ gluer_(e, alternative);
+ triggered++;
+ }
+ }
+ DEBUG("Interacting edges processed in " << perf.time() << " seconds");
+ return triggered;
+ }
+
+public:
+
+ typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
+
+ ParallelBulgeRemover(Graph& g, const CandidateFinderPtr& interesting_edge_finder,
+ size_t buff_size, double buff_cov_diff,
+ double buff_cov_rel_diff, const AlternativesAnalyzer<Graph>& alternatives_analyzer,
+ BulgeCallbackF opt_callback = 0,
+ std::function<void(EdgeId)> removal_handler = 0,
+ bool track_changes = true) :
+ PersistentAlgorithmBase<Graph>(g),
+ buff_size_(buff_size),
+ buff_cov_diff_(buff_cov_diff),
+ buff_cov_rel_diff_(buff_cov_rel_diff),
+ alternatives_analyzer_(alternatives_analyzer),
+ gluer_(g, opt_callback, removal_handler),
+ interesting_edge_finder_(interesting_edge_finder),
+ tracking_(track_changes),
+ curr_iteration_(0),
+ it_(g, true, CoverageComparator<Graph>(g), true) {
+ VERIFY(buff_size_ > 0);
+ it_.Detach();
+ }
+
+ size_t Run(bool force_primary_launch = false) override {
+ bool primary_launch = force_primary_launch ? true : curr_iteration_ == 0;
+ //todo remove if not needed;
+ //potentially can vary coverage threshold in coordination with ec threshold
+ auto proceed_condition = func::AlwaysTrue<EdgeId>();
+
+ if (!it_.IsAttached()) {
+ it_.Attach();
+ }
+ if (primary_launch) {
+ it_.clear();
+ TRACE("Primary launch.");
+ TRACE("Start search for interesting edges");
+ interesting_edge_finder_->Run(this->g(), [&](EdgeId e) {it_.push(e);});
+ TRACE(it_.size() << " interesting edges to process");
+ } else {
+ VERIFY(tracking_);
+ TRACE(it_.size() << " edges to process");
+ }
+
+ size_t triggered = 0;
+ bool proceed = true;
+ while (proceed) {
+ std::vector<EdgeId> edge_buffer;
+ edge_buffer.reserve(buff_size_);
+ proceed = FillEdgeBuffer(edge_buffer, proceed_condition);
+
+ std::vector<BulgeInfo> bulges = MergeBuffers(FindBulges(edge_buffer));
+
+ auto interacting_edges = RetainIndependentBulges(bulges);
+
+ size_t inner_triggered = ProcessBulges(bulges, std::move(interacting_edges));
+ proceed |= (inner_triggered > 0);
+ triggered += inner_triggered;
+ }
+
+ TRACE("Finished processing. Triggered = " << triggered);
+ if (!tracking_)
+ it_.Detach();
+
+ curr_iteration_++;
+
+ return triggered;
+ }
+
+private:
+ DECL_LOGGER("ParallelBulgeRemover")
+};
+
+}
diff --git a/src/common/modules/simplification/cleaner.hpp b/src/common/modules/simplification/cleaner.hpp
new file mode 100644
index 0000000..ce3eac5
--- /dev/null
+++ b/src/common/modules/simplification/cleaner.hpp
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+
+namespace omnigraph {
+
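+//Removes isolated vertices (satisfying IsolatedVertexCondition) from the graph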
+template<class Graph>
+class Cleaner : public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm<Graph, VertexId> base;
+ typedef IsolatedVertexCondition<Graph> ConditionT;
+
+ Graph &g_;
+ ConditionT isolated_condition_;
+
+public:
+ Cleaner(Graph &g, size_t chunk_cnt = 1) :
+ base(g,
+ std::make_shared<ParallelInterestingElementFinder<Graph, VertexId>>(ConditionT(g), chunk_cnt),
+ /*canonical only*/true),
+ g_(g), isolated_condition_(g) {
+ }
+
+protected:
+
+ bool Process(VertexId v) {
+ if (isolated_condition_.Check(v)) {
+ g_.DeleteVertex(v);
+ return true;
+ } else {
+ return false;
+ }
+ }
+};
+
+}
diff --git a/src/common/modules/simplification/complex_bulge_remover.hpp b/src/common/modules/simplification/complex_bulge_remover.hpp
new file mode 100644
index 0000000..2abed3d
--- /dev/null
+++ b/src/common/modules/simplification/complex_bulge_remover.hpp
@@ -0,0 +1,1215 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <cmath>
+#include <stack>
+#include <queue>
+#include "common/adt/concurrent_dsu.hpp"
+#include "utils/standard_base.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "math/xmath.h"
+#include "sequence/sequence_tools.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "visualization/visualization.hpp"
+#include "dominated_set_finder.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+
+
+namespace omnigraph {
+
+namespace complex_br {
+
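+//Localized subgraph rooted at start_vertex_; tracks the distance range of every added vertex
+//from the root and maintains the current set of end vertices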
+template<class Graph>
+class LocalizedComponent: public GraphActionHandler<Graph> /*: public GraphComponent<Graph>*/{
+ typedef GraphActionHandler<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+ VertexId start_vertex_;
+ set<VertexId> end_vertices_;
+ //usage of inclusive-inclusive range!!!
+ map<VertexId, Range> vertex_depth_;
+ multimap<size_t, VertexId> height_2_vertices_;
+
+ bool AllEdgeOut(VertexId v) const {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (contains(g_.EdgeEnd(e)))
+ return false;
+ }
+ return true;
+ }
+
+ bool AllEdgeIn(VertexId v) const {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (!contains(g_.EdgeEnd(e)))
+ return false;
+ }
+ return true;
+ }
+
+ size_t Average(Range r) const {
+ return r.start_pos;
+ }
+
+public:
+
+// template <class It>
+ LocalizedComponent(const Graph& g, //It begin, It end,
+ VertexId start_vertex/*, const vector<VertexId>& end_vertices*/) :
+ base(g, "br_component"), g_(g), start_vertex_(start_vertex) {
+ end_vertices_.insert(start_vertex);
+ vertex_depth_.insert(make_pair(start_vertex_, Range(0, 0)));
+ height_2_vertices_.insert(make_pair(0, start_vertex));
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ bool IsEndVertex(VertexId v) const {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (contains(g_.EdgeEnd(e)))
+ return false;
+ }
+ return true;
+ }
+
+ void AddVertex(VertexId v, Range dist_range) {
+// VERIFY(CheckCloseNeighbour(v));
+// Range r = NeighbourDistanceRange(v);
+ DEBUG("Adding vertex " << g_.str(v) << " to the component");
+ vertex_depth_.insert(make_pair(v, dist_range));
+ height_2_vertices_.insert(make_pair(Average(dist_range), v));
+ DEBUG("Range " << dist_range << " Average height " << Average(dist_range));
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ end_vertices_.erase(g_.EdgeStart(e));
+ }
+ if (IsEndVertex(v)) {
+ end_vertices_.insert(v);
+ }
+ }
+
+ //todo what if path processor will fail inside
+// size_t TotalPathCount() const {
+// size_t answer = 0;
+// size_t max_len = 0;
+// for (VertexId end_v : end_vertices_) {
+// max_len = std::max(max_len, vertex_depth_.find(end_v)->second.end_pos);
+// }
+// PathProcessor<Graph> processor(g_, start_vertex_, max_len);
+// for (VertexId end_v : end_vertices_) {
+// PathStorageCallback<Graph> path_storage(g_);
+// Range r = vertex_depth_.find(end_v)->second;
+// processor.Process(end_v, r.start_pos, r.end_pos, path_storage, /*max_edge_cnt*/ -1ul);
+// answer += path_storage.size();
+// }
+// return answer;
+// }
+
+ bool CheckCompleteness() const {
+ for (VertexId v : key_set(vertex_depth_)) {
+ if (v == start_vertex_)
+ continue;
+ if (!AllEdgeIn(v) && !AllEdgeOut(v))
+ return false;
+ }
+ return true;
+ }
+
+ bool NeedsProjection() const {
+ DEBUG("Checking if component needs projection");
+ for (VertexId v : key_set(vertex_depth_)) {
+ if (v == start_vertex_)
+ continue;
+ vector<EdgeId> filtered_incoming;
+ std::copy_if(g_.in_begin(v), g_.in_end(v), std::back_inserter(filtered_incoming),
+ [&] (EdgeId e) {return contains(g_.EdgeStart(e));});
+ VERIFY_MSG(filtered_incoming.size() == g_.IncomingEdgeCount(v), "Strange component");
+ if (g_.IncomingEdgeCount(v) > 1) {
+ DEBUG("Needs projection");
+ return true;
+ }
+ }
+ DEBUG("Doesn't need projection");
+ return false;
+ }
+
+ bool contains(VertexId v) const {
+ return vertex_depth_.count(v) > 0;
+ }
+
+ bool contains(EdgeId e) const {
+ return contains(g_.EdgeStart(e)) && contains(g_.EdgeEnd(e));
+ }
+
+ Range distance_range(VertexId v) const {
+ VERIFY(contains(v));
+ return vertex_depth_.find(v)->second;
+ }
+
+ size_t avg_distance(VertexId v) const {
+ VERIFY(contains(v));
+ return Average(vertex_depth_.find(v)->second);
+ }
+
+ set<size_t> avg_distances() const {
+ set<size_t> distances;
+ for (VertexId v : key_set(vertex_depth_)) {
+ distances.insert(avg_distance(v));
+ }
+ return distances;
+ }
+
+ size_t v_size() const {
+ return vertex_depth_.size();
+ }
+
+ VertexId start_vertex() const {
+ return start_vertex_;
+ }
+
+ const set<VertexId>& end_vertices() const {
+ return end_vertices_;
+ }
+
+ bool CheckCloseNeighbour(VertexId v) const {
+ DEBUG("Check if vertex " << g_.str(v) << " can be processed");
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (!contains(g_.EdgeStart(e))) {
+ DEBUG(
+ "Blocked by unprocessed or external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
+ DEBUG("Check fail");
+ return false;
+ }
+ }
+ DEBUG("Check ok");
+ return true;
+ }
+
+ GraphComponent<Graph> AsGraphComponent() const {
+ return GraphComponent<Graph>::FromVertices(g_, key_set(vertex_depth_));
+ }
+
+ bool ContainsConjugateVertices() const {
+ set<VertexId> conjugate_vertices;
+ for (VertexId v : key_set(vertex_depth_)) {
+ if (conjugate_vertices.count(v) == 0) {
+ conjugate_vertices.insert(g_.conjugate(v));
+ } else {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ VERIFY(end_vertices_.count(v) == 0);
+ if (contains(v)) {
+ DEBUG("Deleting vertex " << g_.str(v) << " from the component");
+ size_t depth = avg_distance(v);
+ vertex_depth_.erase(v);
+ for (auto it = height_2_vertices_.lower_bound(depth);
+ it != height_2_vertices_.upper_bound(depth); ++it) {
+ if (it->second == v) {
+ height_2_vertices_.erase(it);
+ return;
+ }
+ }
+ VERIFY(false);
+ }
+
+ }
+
+ virtual void HandleDelete(EdgeId /*e*/) {
+ //empty for now
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
+ VERIFY(false);
+ }
+
+ virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) {
+ //empty for now
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1, EdgeId /*new_edge_2*/) {
+ VERIFY(old_edge != g_.conjugate(old_edge));
+ VertexId start = g_.EdgeStart(old_edge);
+ VertexId end = g_.EdgeEnd(old_edge);
+ if (contains(start)) {
+ VERIFY(vertex_depth_.count(end) > 0);
+ VERIFY(avg_distance(end) > avg_distance(start));
+ VertexId new_vertex = g_.EdgeEnd(new_edge_1);
+ Range new_vertex_depth(distance_range(start));
+ new_vertex_depth.shift((int) g_.length(new_edge_1));
+ //todo do better later (needs to be synched with splitting strategy)
+// + (vertex_depth_[end] - vertex_depth_[start])
+// * g_.length(new_edge_1) / g_.length(old_edge);
+ DEBUG(
+ "Inserting vertex " << g_.str(new_vertex) << " to component during split");
+ vertex_depth_.insert(make_pair(new_vertex, new_vertex_depth));
+ height_2_vertices_.insert(
+ make_pair(Average(new_vertex_depth), new_vertex));
+ }
+ }
+
+ const multimap<size_t, VertexId>& height_2_vertices() const {
+ return height_2_vertices_;
+ }
+
+ const set<VertexId> vertices_on_height(size_t height) const {
+ set<VertexId> answer;
+ for (auto it = height_2_vertices_.lower_bound(height);
+ it != height_2_vertices_.upper_bound(height); ++it) {
+ answer.insert(it->second);
+ }
+ return answer;
+ }
+
+private:
+ DECL_LOGGER("LocalizedComponent");
+};
+
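+//Edges chosen by SkeletonTreeFinder together with their endpoints; ComponentProjector later
+//glues the rest of the component onto these edges. The handler callbacks keep the stored
+//edge and vertex sets valid under glue and split operations.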
+template<class Graph>
+class SkeletonTree: public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+public:
+
+ const set<EdgeId>& edges() const {
+ return edges_;
+ }
+
+ const set<VertexId>& vertices() const {
+ return vertices_;
+ }
+
+ bool Contains(EdgeId e) const {
+// VertexId start = br_comp_.g().EdgeStart(e);
+// if (next_edges_.count(start) > 0) {
+// const vector<EdgeId> edges = next_edges_.find(start)->second;
+// return find(e, next_edges_.lower_bound(start), next_edges_.upper_bound(start)) != edges.end();
+// }
+// return false;
+ return edges_.count(e) > 0;
+ }
+
+ bool Contains(VertexId v) const {
+// return next_edges_.count(v) > 0;
+ return vertices_.count(v) > 0;
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ //verify v not in the tree
+ VERIFY(!Contains(v));
+ }
+
+ virtual void HandleDelete(EdgeId e) {
+ //verify e not in the tree
+ DEBUG("Trying to delete " << br_comp_.g().str(e));
+ VERIFY(!Contains(e));
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId /*new_edge*/) {
+ //verify false
+ for (EdgeId e : old_edges) {
+ VERIFY(!Contains(e));
+ }
+ }
+
+ virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
+// verify edge2 in tree
+// put new_edge instead of edge2
+ DEBUG("Glueing " << br_comp_.g().str(new_edge) << " " << br_comp_.g().str(edge1) << " " << br_comp_.g().str(edge2));
+ if (Contains(edge2)) {
+ DEBUG("Erasing from tree: " << br_comp_.g().str(edge2));
+ DEBUG("Inserting to tree: " << br_comp_.g().str(new_edge));
+ edges_.erase(edge2);
+ edges_.insert(new_edge);
+ }
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId new_edge_2) {
+ VERIFY(old_edge != br_comp_.g().conjugate(old_edge));
+ if (Contains(old_edge)) {
+ edges_.erase(old_edge);
+ vertices_.insert(br_comp_.g().EdgeEnd(new_edge_1));
+ edges_.insert(new_edge_1);
+ edges_.insert(new_edge_2);
+ }
+ }
+
+ SkeletonTree(const LocalizedComponent<Graph>& br_comp,
+ const set<EdgeId>& edges) :
+ base(br_comp.g(), "br_tree"), br_comp_(br_comp), edges_(edges) {
+ DEBUG("Tree edges " << br_comp.g().str(edges));
+ for (EdgeId e : edges_) {
+ vertices_.insert(br_comp_.g().EdgeStart(e));
+ vertices_.insert(br_comp_.g().EdgeEnd(e));
+ }
+ }
+
+private:
+ const LocalizedComponent<Graph>& br_comp_;
+ set<EdgeId> edges_;
+ set<VertexId> vertices_;
+
+private:
+ DECL_LOGGER("SkeletonTree");
+};
+
+typedef size_t mask;
+typedef mask mixed_color_t;
+typedef unsigned primitive_color_t;
+
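+//Colors the component: each exit vertex gets a distinct bit ("primitive" color), and every
+//other vertex gets the bitwise OR of the colors of its outgoing edges, i.e. the set of exits
+//reachable from it (an edge inherits the color of its end vertex). Illustration only: with
+//three exits colored 0b001, 0b010 and 0b100, a vertex reaching the first and third exits is
+//colored 0b101.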
+template<class Graph>
+class ComponentColoring: public GraphActionHandler<Graph> {
+ typedef GraphActionHandler<Graph> base;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+public:
+
+ size_t CountPrimitiveColors(mixed_color_t color) const {
+ size_t cnt = 0;
+ for (size_t shift = 0; shift < color_cnt_; ++shift) {
+            mixed_color_t prim_color = mixed_color_t(1) << shift;
+ if ((prim_color & color) != 0) {
+ cnt++;
+ }
+ }
+ VERIFY(cnt > 0);
+ return cnt;
+ }
+
+ primitive_color_t GetAnyPrimitiveColor(mixed_color_t color) const {
+ for (size_t shift = 0; shift < color_cnt_; ++shift) {
+            if (((mixed_color_t(1) << shift) & color) != 0) {
+ return primitive_color_t(shift);
+ }
+ }
+ VERIFY(false);
+ return 0;
+ }
+
+ bool IsSubset(mixed_color_t super_set, mixed_color_t sub_set) const {
+ return (super_set | sub_set) == super_set;
+ }
+
+private:
+
+ const LocalizedComponent<Graph>& comp_;
+ const size_t color_cnt_;
+ map<VertexId, mixed_color_t> vertex_colors_;
+
+ mixed_color_t CountVertexColor(VertexId v) const {
+ mixed_color_t answer = mixed_color_t(0);
+ for (EdgeId e : comp_.g().OutgoingEdges(v)) {
+ answer |= color(e);
+ }
+ return answer;
+ }
+
+ void CountAndSetVertexColor(VertexId v) {
+ vertex_colors_.insert(make_pair(v, CountVertexColor(v)));
+ }
+
+ void ColorComponent() {
+ DEBUG("Coloring component");
+ size_t cnt = 0;
+ for (VertexId v : comp_.end_vertices()) {
+            mixed_color_t color = mixed_color_t(1) << cnt;
+ DEBUG("Coloring exit " << comp_.g().str(v));
+ vertex_colors_.insert(make_pair(v, color));
+ cnt++;
+ }
+ for (auto it = comp_.height_2_vertices().rbegin();
+ it != comp_.height_2_vertices().rend(); ++it) {
+ if (vertex_colors_.count(it->second) == 0) {
+ DEBUG("Coloring vertex " << comp_.g().str(it->second));
+ CountAndSetVertexColor(it->second);
+ }
+ }
+ DEBUG("Component colored");
+ }
+
+public:
+
+ ComponentColoring(const LocalizedComponent<Graph>& comp) :
+ base(comp.g(), "br_comp_coloring"), comp_(comp), color_cnt_(
+ comp_.end_vertices().size()) {
+ VERIFY(comp.end_vertices().size() <= sizeof(size_t) * 8);
+ ColorComponent();
+ }
+
+ mixed_color_t color(VertexId v) const {
+ auto it = vertex_colors_.find(v);
+ if (it == vertex_colors_.end()) {
+ DEBUG("No color for vertex " << comp_.g().str(v));
+ DEBUG("Incoming edges " << comp_.g().str(comp_.g().IncomingEdges(v)));
+ DEBUG("Outgoing edges " << comp_.g().str(comp_.g().OutgoingEdges(v)));
+ }
+ VERIFY(it != vertex_colors_.end());
+ return it->second;
+ }
+
+ mixed_color_t color(EdgeId e) const {
+ return color(comp_.g().EdgeEnd(e));
+ }
+
+ virtual void HandleDelete(VertexId v) {
+ vertex_colors_.erase(v);
+ }
+
+ virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
+ VERIFY(false);
+ }
+
+ virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId edge1, EdgeId edge2) {
+ if (comp_.contains(edge1)) {
+ VERIFY(comp_.contains(edge2));
+ VERIFY(IsSubset(color(edge2), color(edge1)));
+ }
+ }
+
+ virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId /*new_edge_2*/) {
+ VERIFY(old_edge != comp_.g().conjugate(old_edge));
+ if (comp_.contains(old_edge)) {
+ CountAndSetVertexColor(comp_.g().EdgeEnd(new_edge_1));
+ }
+ }
+
+private:
+ DECL_LOGGER("ComponentColoring");
+};
+
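+//Searches for a skeleton tree of the component. Vertices are processed level by level, from
+//the deepest height up to the start vertex, while a disjoint-set structure over primitive
+//colors is maintained; an edge or vertex is kept ("good") only if its color is consistent with
+//the current partition, and among consistent alternatives the subtree of maximal absolute
+//coverage is preferred. FindTree() succeeds iff the start vertex itself ends up good.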
+template<class Graph>
+class SkeletonTreeFinder {
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef ConcurrentDSU color_partition_ds_t;
+
+ const LocalizedComponent<Graph>& component_;
+ const ComponentColoring<Graph>& coloring_;
+
+ vector<size_t> level_heights_;
+
+ int current_level_;
+ color_partition_ds_t current_color_partition_;
+
+ set<VertexId> good_vertices_;
+ set<EdgeId> good_edges_;
+ map<VertexId, vector<EdgeId>> next_edges_;
+ map<VertexId, size_t> subtree_coverage_;
+
+ bool ConsistentWithPartition(mixed_color_t color) const {
+ return current_color_partition_.set_size(
+ GetCorrespondingDisjointSet(color))
+ == coloring_.CountPrimitiveColors(color);
+ }
+
+ bool IsGoodEdge(EdgeId e) const {
+// VertexId start = component_.g().EdgeStart(e);
+ VertexId end = component_.g().EdgeEnd(e);
+ //check if end is good
+ if (good_vertices_.count(end) == 0)
+ return false;
+
+// is subcase of next case
+// //check if end is from previous level
+// if (component_.avg_distance(end) == level_heights_[current_level_+1])
+// return true;
+
+ //check if end color is consistent with partition
+ //on level before the start
+ return ConsistentWithPartition(coloring_.color(end));
+ }
+
+ vector<EdgeId> GoodOutgoingEdges(VertexId v) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ if (IsGoodEdge(e)) {
+ DEBUG("Edge " << component_.g().str(e) << " is classified as good");
+ answer.push_back(e);
+ } else {
+ DEBUG("Edge " << component_.g().str(e) << " is classified as NOT good");
+ }
+ }
+ return answer;
+ }
+
+ vector<EdgeId> GoodOutgoingEdges(const vector<VertexId>& vertices) const {
+ vector<EdgeId> answer;
+ for (VertexId v : vertices) {
+ if (component_.end_vertices().count(v) == 0) {
+ push_back_all(answer, GoodOutgoingEdges(v));
+ }
+ }
+ return answer;
+ }
+
+ set<EdgeId> VectorAsSet(const vector<EdgeId>& edges) const {
+ return set<EdgeId>(edges.begin(), edges.end());
+ }
+
+ template<class T>
+ vector<T> SetAsVector(const set<T>& edges) const {
+ return vector<T>(edges.begin(), edges.end());
+ }
+
+ primitive_color_t GetCorrespondingDisjointSet(mixed_color_t color) const {
+ return (primitive_color_t) current_color_partition_.find_set(
+ coloring_.GetAnyPrimitiveColor(color));
+ }
+
+ void UpdateColorPartitionWithVertex(VertexId v) {
+ VERIFY(component_.g().OutgoingEdgeCount(v) > 0);
+ primitive_color_t ds = GetCorrespondingDisjointSet(
+ coloring_.color(*(component_.g().OutgoingEdges(v).begin())));
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ current_color_partition_.unite(ds,
+ GetCorrespondingDisjointSet(coloring_.color(e)));
+ }
+ }
+
+ bool IsGoodVertex(VertexId v) const {
+ if (!ConsistentWithPartition(coloring_.color(v)))
+ return false;
+ mixed_color_t union_color_of_good_children = mixed_color_t(0);
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ if (good_edges_.count(e) > 0) {
+ union_color_of_good_children |= coloring_.color(e);
+ }
+ }
+ return coloring_.color(v) == union_color_of_good_children;
+ }
+
+ void Init() {
+ current_level_ = (int) level_heights_.size() - 1;
+ size_t end_cnt = 0;
+ for (VertexId v : component_.end_vertices()) {
+ good_vertices_.insert(v);
+ subtree_coverage_[v] = 0;
+ end_cnt++;
+ }
+ }
+
+ size_t absolute_coverage(EdgeId e) {
+ return (size_t) (component_.g().coverage(e) * (double) component_.g().length(e));
+ }
+
+ void UpdateNextEdgesAndCoverage(VertexId v) {
+ map<mixed_color_t, size_t> best_subtrees_coverage;
+ map<mixed_color_t, EdgeId> best_alternatives;
+ for (EdgeId e : component_.g().OutgoingEdges(v)) {
+ if (good_edges_.count(e) > 0) {
+ VertexId end = component_.g().EdgeEnd(e);
+ mixed_color_t color = coloring_.color(e);
+ VERIFY(subtree_coverage_.count(end) > 0);
+ if (subtree_coverage_[end] + absolute_coverage(e)
+ >= best_subtrees_coverage[color]) {
+ best_subtrees_coverage[color] = subtree_coverage_[end]
+ + absolute_coverage(e);
+ best_alternatives[color] = e;
+ }
+ }
+ }
+ size_t coverage = 0;
+ for (size_t cov : value_set(best_subtrees_coverage)) {
+ coverage += cov;
+ }
+ next_edges_[v] = SetAsVector<EdgeId>(value_set(best_alternatives));
+ subtree_coverage_[v] = coverage;
+ }
+
+public:
+ SkeletonTreeFinder(const LocalizedComponent<Graph>& component,
+ const ComponentColoring<Graph>& coloring) :
+ component_(component),
+ coloring_(coloring),
+ level_heights_(SetAsVector<size_t>(component_.avg_distances())),
+ current_level_((int) level_heights_.size() - 1),
+ current_color_partition_(component_.end_vertices().size()) {
+
+ Init();
+ }
+
+ const set<EdgeId> GetTreeEdges() const {
+ set<EdgeId> answer;
+ std::queue<VertexId> vertex_queue;
+ vertex_queue.push(component_.start_vertex());
+ while (!vertex_queue.empty()) {
+ VertexId v = vertex_queue.front();
+ vertex_queue.pop();
+ if (next_edges_.count(v) == 0)
+ continue;
+ for (EdgeId e : next_edges_.find(v)->second) {
+ answer.insert(e);
+ vertex_queue.push(component_.g().EdgeEnd(e));
+ }
+ }
+ return answer;
+ }
+
+ const map<VertexId, vector<EdgeId>>& GetTree() const {
+ return next_edges_;
+ }
+
+ bool FindTree() {
+ DEBUG("Looking for tree");
+ while (current_level_ >= 0) {
+ size_t height = level_heights_[current_level_];
+ DEBUG("Processing level " << current_level_ << " on height " << height);
+ set<VertexId> level_vertices = component_.vertices_on_height(
+ height);
+ VERIFY(!level_vertices.empty());
+
+ //looking for good edges
+ insert_all(good_edges_,
+ GoodOutgoingEdges(
+ vector<VertexId>(level_vertices.begin(),
+ level_vertices.end())));
+
+
+
+ //counting colors and color partitions
+ for (VertexId v : level_vertices) {
+ if (component_.end_vertices().count(v) == 0) {
+ UpdateColorPartitionWithVertex(v);
+ if (IsGoodVertex(v)) {
+ DEBUG("Vertex " << component_.g().str(v) << " is classified as good");
+ good_vertices_.insert(v);
+ UpdateNextEdgesAndCoverage(v);
+ } else {
+ DEBUG("Vertex " << component_.g().str(v) << " is classified as NOT good");
+ }
+ }
+ }
+ current_level_--;
+ }
+ if (good_vertices_.count(component_.start_vertex()) > 0) {
+ DEBUG("Looking for tree was successful");
+ return true;
+ } else {
+ DEBUG("Looking for tree failed");
+ return false;
+ }
+ }
+
+private:
+ DECL_LOGGER("SkeletonTreeFinder")
+ ;
+};
+
+template<class Graph>
+void PrintComponent(const LocalizedComponent<Graph>& component,
+ const SkeletonTree<Graph>& tree, const string& file_name) {
+ typedef typename Graph::EdgeId EdgeId;
+ const set<EdgeId> tree_edges = tree.edges();
+ shared_ptr<visualization::graph_colorer::ElementColorer<typename Graph::EdgeId>> edge_colorer =
+ make_shared<visualization::graph_colorer::MapColorer<EdgeId>>(
+ tree_edges.begin(), tree_edges.end(),"green", ""
+ );
+ visualization::visualization_utils::WriteComponentSinksSources(component.AsGraphComponent(), file_name,
+ visualization::graph_colorer::DefaultColorer(component.g(), edge_colorer),
+ *visualization::graph_labeler::StrGraphLabelerInstance(component.g()));
+}
+
+template<class Graph>
+void PrintComponent(const LocalizedComponent<Graph>& component,
+ const string& file_name) {
+ visualization::visualization_utils::WriteComponent(component.AsGraphComponent(), file_name,
+ visualization::graph_colorer::DefaultColorer(component.g()),
+ *visualization::graph_labeler::StrGraphLabelerInstance(component.g()));
+}
+
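+//Projects the component onto its skeleton tree: SplitComponent() first splits every edge at
+//all intermediate level heights, then ProjectComponent() glues each remaining edge onto the
+//tree edge spanning the same heights whose color contains the edge's color.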
+template<class Graph>
+class ComponentProjector {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ Graph& g_;
+ const LocalizedComponent<Graph>& component_;
+ const ComponentColoring<Graph>& coloring_;
+ const SkeletonTree<Graph>& tree_;
+
+// DEBUG("Result: edges " << g_.str(split_res.first) << " " << g_.str(split_res.second));
+// DEBUG("New vertex" << g_.str(inner_v) << " ");
+
+ bool SplitComponent() {
+ DEBUG("Splitting component");
+ set<size_t> level_heights(component_.avg_distances());
+ DEBUG("Level heights " << ToString<size_t>(level_heights));
+
+ GraphComponent<Graph> gc = component_.AsGraphComponent();
+
+ for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
+ VertexId start_v = g_.EdgeStart(*it);
+ VertexId end_v = g_.EdgeEnd(*it);
+ size_t start_dist = component_.avg_distance(start_v);
+ size_t end_dist = component_.avg_distance(end_v);
+ DEBUG("Processing edge " << g_.str(*it) << " avg_start " << start_dist << " avg_end " << end_dist);
+ set<size_t> dist_to_split(level_heights.lower_bound(start_dist),
+ level_heights.upper_bound(end_dist));
+ DEBUG("Distances to split " << ToString<size_t>(dist_to_split));
+
+ size_t offset = start_dist;
+ EdgeId e = *it;
+ for (auto split_it = dist_to_split.begin();
+ split_it != dist_to_split.end(); ++split_it) {
+ size_t curr = *split_it;
+ if (curr == start_dist || curr == end_dist)
+ continue;
+ DEBUG("Splitting on " << curr);
+ size_t pos = curr - offset;
+ if (pos >= g_.length(e)) {
+ return false;
+ }
+ DEBUG("Splitting edge " << g_.str(e) << " on position " << pos);
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, pos);
+ //checks accordance
+ VertexId inner_v = g_.EdgeEnd(split_res.first);
+ VERIFY(component_.avg_distance(inner_v) == curr);
+ e = split_res.second;
+ offset = curr;
+ }
+ }
+ DEBUG("Component split");
+ return true;
+ }
+
+ EdgeId CorrespondingTreeEdge(EdgeId e) const {
+ DEBUG("Getting height of vertex " << g_.str(g_.EdgeStart(e)));
+ size_t start_height = component_.avg_distance(g_.EdgeStart(e));
+ DEBUG("Done");
+ mixed_color_t color = coloring_.color(e);
+ DEBUG("Getting height of vertex " << g_.str(g_.EdgeEnd(e)));
+ size_t end_height = component_.avg_distance(g_.EdgeEnd(e));
+ DEBUG("Done");
+ for (VertexId v : component_.vertices_on_height(start_height)) {
+ if (component_.end_vertices().count(v) == 0) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VERIFY(component_.avg_distance(g_.EdgeEnd(e)) == end_height);
+ if (tree_.Contains(e)
+ && coloring_.IsSubset(coloring_.color(e), color)) {
+ return e;
+ }
+ }
+ }
+ }
+ VERIFY(false);
+ return EdgeId(NULL);
+ }
+
+public:
+
+ bool ProjectComponent() {
+ if (!SplitComponent()) {
+ DEBUG("Component can't be split");
+ return false;
+ }
+
+ DEBUG("Projecting split component");
+ GraphComponent<Graph> gc = component_.AsGraphComponent();
+
+ for (auto it = SmartSetIterator<Graph, EdgeId>(g_, gc.e_begin(),
+ gc.e_end()); !it.IsEnd(); ++it) {
+ DEBUG("Trying to project edge " << g_.str(*it));
+ EdgeId target = CorrespondingTreeEdge(*it);
+ DEBUG("Target found " << g_.str(target));
+ if (target != *it) {
+ DEBUG("Glueing " << g_.str(*it) << " to target " << g_.str(target));
+ g_.GlueEdges(*it, target);
+ DEBUG("Glued");
+ }
+ DEBUG("Edge processed");
+ }
+ DEBUG("Component projected");
+ return true;
+ }
+
+ ComponentProjector(Graph& g, const LocalizedComponent<Graph>& component,
+ const ComponentColoring<Graph>& coloring,
+ const SkeletonTree<Graph>& tree) :
+ g_(g), component_(component), coloring_(coloring), tree_(tree) {
+
+ }
+
+private:
+ DECL_LOGGER("ComponentProjector");
+};
+
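+//Grows a LocalizedComponent from a start vertex by repeatedly absorbing the closest dominated
+//neighbour (together with its backward paths) until the component needs projection; the result
+//is then checked for exit path-length spread, positive height difference of every edge, absence
+//of conjugate vertex pairs and the bound on the number of exits.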
+template<class Graph>
+class LocalizedComponentFinder {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ static const size_t exit_bound = 32;
+ static const size_t inf = -1ul;
+
+ const Graph& g_;
+ size_t max_length_;
+ size_t length_diff_threshold_;
+
+ LocalizedComponent<Graph> comp_;
+
+ map<VertexId, Range> dominated_;
+ set<VertexId> interfering_;
+
+ std::string ToString(EdgeId e) const {
+ std::stringstream ss;
+ ss << g_.str(e)
+ << " start: "
+ << g_.str(g_.EdgeStart(e))
+ << " end: "
+ << g_.str(g_.EdgeEnd(e));
+ return ss.str();
+ }
+
+ bool CheckCompleteness() const {
+ if (interfering_.size() == 0) {
+ VERIFY(comp_.CheckCompleteness());
+ return true;
+ }
+ return false;
+ }
+
+ //false if new interfering vertex is not dominated
+ //can be slightly modified in new algorithm
+ bool ProcessLocality(VertexId processing_v) {
+ vector<VertexId> processed_neighb;
+ vector<VertexId> unprocessed_neighb;
+ for (EdgeId e : g_.OutgoingEdges(processing_v)) {
+ VertexId v = g_.EdgeEnd(e);
+ if (!comp_.contains(v)) {
+ unprocessed_neighb.push_back(v);
+ } else {
+ processed_neighb.push_back(v);
+ }
+ }
+ if (!processed_neighb.empty()) {
+ for (VertexId v : unprocessed_neighb) {
+ if (dominated_.count(v) > 0) {
+ interfering_.insert(v);
+ } else {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ bool AddVertexWithBackwardPaths(VertexId v) {
+ DEBUG("Adding vertex with backward paths");
+ std::queue<VertexId> q;
+ q.push(v);
+ while (!q.empty()) {
+ VertexId next_v = q.front();
+ q.pop();
+ if (!ProcessLocality(next_v)) {
+ return false;
+ }
+ if (!comp_.contains(next_v)) {
+                VERIFY(dominated_.count(next_v) > 0);
+ comp_.AddVertex(next_v, dominated_.find(next_v)->second);
+ for (EdgeId e : g_.IncomingEdges(next_v)) {
+ q.push(g_.EdgeStart(e));
+ }
+ }
+ }
+ return true;
+ }
+
+ //todo optimize
+    boost::optional<VertexId> ClosestNeighbour() const {
+ size_t min_dist = inf;
+ boost::optional<VertexId> answer = boost::none;
+ for (auto it = dominated_.begin(); it != dominated_.end(); ++it) {
+ if (!comp_.contains(it->first) && it->second.start_pos < min_dist) {
+ min_dist = it->second.start_pos;
+ answer = boost::optional<VertexId>(it->first);
+ }
+ }
+ return answer;
+ }
+
+ bool ProcessInterferingVertex(VertexId v) {
+ interfering_.erase(v);
+ return AddVertexWithBackwardPaths(v);
+ }
+
+ bool CheckPathLengths() const {
+ VERIFY(CheckCompleteness());
+ for (VertexId v : comp_.end_vertices()) {
+ if (comp_.distance_range(v).size() > length_diff_threshold_)
+ return false;
+ }
+ return true;
+ }
+
+ bool CheckPositiveHeightDiff() const {
+ DEBUG("Checking for positive height diff of each edge");
+ GraphComponent<Graph> gc = comp_.AsGraphComponent();
+ for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
+ size_t start_height = comp_.avg_distance(g_.EdgeStart(*it));
+ size_t end_height = comp_.avg_distance(g_.EdgeEnd(*it));
+ //VERIFY(end_height >= start_height);
+ if (end_height <= start_height) {
+ DEBUG("Check failed for edge " << g_.str(*it) << " start_height " << start_height << " end_height " << end_height);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool CloseComponent() {
+ while (!interfering_.empty()) {
+ VertexId v = *interfering_.begin();
+ DEBUG("Processing interfering vertex " << g_.str(v));
+ if (!ProcessInterferingVertex(v)) {
+ DEBUG("Vertex processing failed");
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ LocalizedComponentFinder(const Graph& g, size_t max_length,
+ size_t length_diff_threshold, VertexId start_v) :
+ g_(g), max_length_(max_length), length_diff_threshold_(
+ length_diff_threshold), comp_(g, start_v) {
+ DEBUG("Component finder from vertex " << g_.str(comp_.start_vertex()) << " created");
+ //todo introduce reasonable vertex bound
+ DominatedSetFinder<Graph> dominated_set_finder(g_, start_v, max_length/*, 1000*/);
+ dominated_set_finder.FillDominated();
+ dominated_ = dominated_set_finder.dominated();
+ }
+
+ bool ProceedFurther() {
+ DEBUG("Processing further");
+
+ DEBUG("Choosing closest vertex");
+ do {
+            optional<VertexId> next_v = ClosestNeighbour();
+
+ if (next_v) {
+ DEBUG("Vertex " << g_.str(*next_v) << " was chosen as closest neighbour");
+ interfering_.insert(*next_v);
+ DEBUG("Trying to construct closure");
+ if (!CloseComponent()) {
+ DEBUG("Failed to close component");
+ return false;
+ } else {
+ DEBUG("Component closed");
+ }
+ } else {
+ DEBUG("No more vertices can be added");
+ return false;
+ }
+ } while (!comp_.NeedsProjection());
+
+ if (!CheckPathLengths()) {
+ DEBUG("Path lengths check failed");
+ return false;
+ }
+ if (!CheckPositiveHeightDiff()) {
+ DEBUG("Check for positive height diff of each edge failed");
+ return false;
+ }
+ if (comp_.ContainsConjugateVertices()) {
+ DEBUG("Found component contains conjugate vertices");
+ return false;
+ }
+ if (comp_.end_vertices().size() > exit_bound) {
+ DEBUG("Too many exits:" << comp_.end_vertices().size());
+ return false;
+ }
+ GraphComponent<Graph> gc = comp_.AsGraphComponent();
+ DEBUG("Found component candidate. Vertices: " << g_.str(gc.vertices()));
+ return true;
+ }
+
+ const LocalizedComponent<Graph>& component() {
+ return comp_;
+ }
+
+private:
+ DECL_LOGGER("LocalizedComponentFinder");
+};
+
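+//Vertex condition used by the parallel interesting-element finder of ComplexBulgeRemover:
+//a vertex is a candidate iff some localized component grown from it admits a skeleton tree.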
+template<class Graph>
+class CandidateFinder : public VertexCondition<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ size_t max_length_;
+ size_t length_diff_;
+
+public:
+ CandidateFinder(const Graph& g, size_t max_length, size_t length_diff) :
+ VertexCondition<Graph>(g), max_length_(max_length), length_diff_(length_diff) {
+ }
+
+ bool Check(VertexId v) const override {
+ const Graph& g = this->g();
+ LocalizedComponentFinder<Graph> comp_finder(g, max_length_,
+ length_diff_, v);
+ while (comp_finder.ProceedFurther()) {
+ DEBUG("Found component candidate start_v " << g.str(v));
+ LocalizedComponent<Graph> component = comp_finder.component();
+ //todo introduce reasonable size bound
+ //if (component.size() > 1000) {
+ // return false;
+ //}
+ ComponentColoring<Graph> coloring(component);
+ SkeletonTreeFinder<Graph> tree_finder(component, coloring);
+ DEBUG("Looking for a tree");
+ if (tree_finder.FindTree()) {
+ return true;
+ }
+ }
+ return false;
+ }
+private:
+ DECL_LOGGER("CBRCandidateFinder");
+};
+
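+//Persistent vertex-processing algorithm removing complex bulges: for each candidate start
+//vertex it grows a localized component, finds its skeleton tree, projects the component onto
+//the tree and finally compresses the affected vertices.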
+template<class Graph>
+class ComplexBulgeRemover : public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, VertexId> base;
+
+ size_t max_length_;
+ size_t length_diff_;
+ string pics_folder_;
+
+ bool ProcessComponent(LocalizedComponent<Graph>& component,
+ size_t candidate_cnt) {
+ DEBUG("Processing component");
+ ComponentColoring<Graph> coloring(component);
+ SkeletonTreeFinder<Graph> tree_finder(component, coloring);
+ DEBUG("Looking for a tree");
+ if (tree_finder.FindTree()) {
+ DEBUG("Tree found");
+ SkeletonTree<Graph> tree(component, tree_finder.GetTreeEdges());
+
+ if (!pics_folder_.empty()) {
+ PrintComponent(component, tree,
+ pics_folder_ + "success/"
+ + ToString(this->g().int_id(component.start_vertex()))
+ + "_" + ToString(candidate_cnt) + ".dot");
+ }
+
+ ComponentProjector<Graph> projector(this->g(), component, coloring, tree);
+ if (!projector.ProjectComponent()) {
+ //todo think of stopping the whole process
+ DEBUG("Component can't be projected");
+ return false;
+ }
+ DEBUG("Successfully processed component candidate " << candidate_cnt << " start_v " << this->g().str(component.start_vertex()));
+ return true;
+ } else {
+ DEBUG("Failed to find skeleton tree for candidate " << candidate_cnt << " start_v " << this->g().str(component.start_vertex()));
+ if (!pics_folder_.empty()) {
+ //todo check if we rewrite all of the previous pics!
+ PrintComponent(component,
+ pics_folder_ + "fail/"
+ + ToString(this->g().int_id(component.start_vertex())) //+ "_" + ToString(candidate_cnt)
+ + ".dot");
+ }
+ return false;
+ }
+ }
+
+ bool InnerProcess(VertexId v, std::vector<VertexId>& vertices_to_post_process) {
+ size_t candidate_cnt = 0;
+ LocalizedComponentFinder<Graph> comp_finder(this->g(), max_length_,
+ length_diff_, v);
+ while (comp_finder.ProceedFurther()) {
+ candidate_cnt++;
+ DEBUG("Found component candidate " << candidate_cnt << " start_v " << this->g().str(v));
+ LocalizedComponent<Graph> component = comp_finder.component();
+ if (ProcessComponent(component, candidate_cnt)) {
+ GraphComponent<Graph> gc = component.AsGraphComponent();
+ std::copy(gc.v_begin(), gc.v_end(), std::back_inserter(vertices_to_post_process));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ //todo shrink this set if needed
+ set<VertexId> Neighbours(VertexId v) const {
+ set<VertexId> answer;
+ for (EdgeId e : this->g().IncidentEdges(v)) {
+ answer.insert(this->g().EdgeStart(e));
+ answer.insert(this->g().EdgeEnd(e));
+ }
+ return answer;
+ }
+
+public:
+
+    //track_changes=false leads to every iteration running from scratch
+ ComplexBulgeRemover(Graph& g, size_t max_length, size_t length_diff,
+ size_t chunk_cnt, const string& pics_folder = "") :
+ base(g, std::make_shared<omnigraph::ParallelInterestingElementFinder<Graph, VertexId>>(
+ CandidateFinder<Graph>(g, max_length, length_diff), chunk_cnt),
+ false, std::less<VertexId>(), /*track changes*/false),
+ max_length_(max_length),
+ length_diff_(length_diff),
+ pics_folder_(pics_folder) {
+ if (!pics_folder_.empty()) {
+// remove_dir(pics_folder_);
+ make_dir(pics_folder_);
+ make_dir(pics_folder_ + "success/");
+ make_dir(pics_folder_ + "fail/");
+ }
+
+ }
+
+ bool Process(VertexId v) override {
+ DEBUG("Processing vertex " << this->g().str(v));
+ vector<VertexId> vertices_to_post_process;
+        //a bit of a hack (see below)
+ SmartSetIterator<Graph, VertexId> added_vertices(this->g(), true);
+
+ if (InnerProcess(v, vertices_to_post_process)) {
+ for (VertexId p_p : vertices_to_post_process) {
+ //Neighbours(p_p) includes p_p
+ for (VertexId n : Neighbours(p_p)) {
+ this->ReturnForConsideration(n);
+ }
+ this->g().CompressVertex(p_p);
+ }
+ return true;
+ } else {
+ //a bit of hacking:
+ //reverting changes resulting from potentially attempted, but failed split
+ Compressor<Graph> compressor(this->g());
+ for (; !added_vertices.IsEnd(); ++added_vertices) {
+ compressor.CompressVertex(*added_vertices);
+ }
+ return false;
+ }
+ }
+
+private:
+ DECL_LOGGER("ComplexBulgeRemover");
+};
+
+}
+
+}
diff --git a/src/common/modules/simplification/complex_tip_clipper.hpp b/src/common/modules/simplification/complex_tip_clipper.hpp
new file mode 100644
index 0000000..5da0d68
--- /dev/null
+++ b/src/common/modules/simplification/complex_tip_clipper.hpp
@@ -0,0 +1,178 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <limits>
+
+#include "visualization/visualization.hpp"
+#include "compressor.hpp"
+#include "dominated_set_finder.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+
+
+namespace omnigraph{
+
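+//Detects complex tips: for a source vertex the dominated component is built and extended by one
+//outgoing edge from each exit; the result is accepted only if it is not a plain tip, all its
+//edges are short enough and its coverage is low relative to the adjacent outside edges.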
+template<class Graph>
+class ComplexTipFinder {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+
+    double relative_coverage_threshold_;
+    size_t edge_length_threshold_;
+ size_t max_path_length_;
+
+ double GetTipCoverage(const GraphComponent<Graph>& component) const {
+ double cov = numeric_limits<double>::max();
+ for (auto edge : component.edges()) {
+ cov = std::min(cov, g_.coverage(edge));
+ }
+ return cov;
+ }
+
+ double GetOutwardCoverage(const GraphComponent<Graph>& component) const {
+ double cov = 0.0;
+ for (auto v : component.vertices()) {
+ for (auto edge : g_.IncidentEdges(v)) {
+ if (!component.contains(edge)) {
+ cov = std::max(cov, g_.coverage(edge));
+ }
+ }
+ }
+ return cov;
+ }
+
+ double GetRelativeTipCoverage(const GraphComponent<Graph>& component) const {
+ return GetTipCoverage(component) / GetOutwardCoverage(component);
+ }
+
+ bool ComponentCheck(const GraphComponent<Graph>& component) const {
+ if (component.empty() || component.e_size() == 0)
+ return false;
+
+ //check if usual tip
+ if (component.vertices().size() == 2) {
+ DEBUG("Component is a tip! Exiting...");
+ return false;
+ }
+
+ //checking edge lengths
+        if (std::any_of(component.e_begin(), component.e_end(), [&](EdgeId e) {return g_.length(e) > edge_length_threshold_;})) {
+ DEBUG("Tip contains too long edges");
+ return false;
+ }
+
+        if (math::ge(GetRelativeTipCoverage(component), relative_coverage_threshold_)) {
+ DEBUG("Tip is too high covered with respect to external edges");
+ return false;
+ }
+
+ return true;
+ }
+
+public:
+ ComplexTipFinder(const Graph& g, double relative_coverage,
+ size_t max_edge_length, size_t max_path_length)
+ : g_(g),
+              relative_coverage_threshold_(math::ge(relative_coverage, 0.0) ?
+                                        relative_coverage : std::numeric_limits<double>::max()),
+              edge_length_threshold_(max_edge_length), max_path_length_(max_path_length)
+ { }
+
+ GraphComponent<Graph> operator()(VertexId v) const {
+ GraphComponent<Graph> empty(g_);
+ VERIFY(empty.empty());
+ if (g_.IncomingEdgeCount(v) != 0) {
+ return empty;
+ }
+
+ DominatedSetFinder<Graph> finder(g_, v, max_path_length_);
+ if (finder.FillDominated()) {
+ auto ranges = finder.dominated();
+ auto dom_component = finder.AsGraphComponent();
+ std::set<EdgeId> component_edges(dom_component.edges());
+ for (auto v : dom_component.exits()) {
+ size_t current_path_length = ranges[v].end_pos;
+ for (auto e : g_.OutgoingEdges(v)) {
+ if (current_path_length + g_.length(e) > max_path_length_) {
+ DEBUG("Component contains too long paths");
+ return empty;
+ }
+ component_edges.insert(e);
+ }
+ }
+ auto extended_component = GraphComponent<Graph>::FromEdges(g_, component_edges);
+ if (ComponentCheck(extended_component))
+ return extended_component;
+ else
+ return empty;
+ } else {
+ DEBUG("Failed to find dominated component");
+ return empty;
+ }
+ }
+
+private:
+ DECL_LOGGER("ComplexTipClipper")
+};
+
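+//Persistent vertex-processing algorithm that removes the components reported by
+//ComplexTipFinder via ComponentRemover, optionally dumping their pictures first.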
+template<class Graph>
+class ComplexTipClipper : public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, VertexId> base;
+ typedef typename ComponentRemover<Graph>::HandlerF HandlerF;
+
+ string pics_folder_;
+ ComplexTipFinder<Graph> finder_;
+ ComponentRemover<Graph> component_remover_;
+
+public:
+    //track_changes=false leads to every iteration running from scratch
+ ComplexTipClipper(Graph& g, double relative_coverage,
+ size_t max_edge_len, size_t max_path_len,
+ size_t chunk_cnt,
+                     const string& pics_folder = "",
+ HandlerF removal_handler = nullptr) :
+ base(g, nullptr, false, std::less<VertexId>(), /*track changes*/false),
+ pics_folder_(pics_folder),
+ finder_(g, relative_coverage, max_edge_len, max_path_len),
+ component_remover_(g, removal_handler) {
+ if (!pics_folder_.empty()) {
+ make_dir(pics_folder_);
+ }
+ this->interest_el_finder_ = std::make_shared<ParallelInterestingElementFinder<Graph, VertexId>>(
+ [&](VertexId v) {return !finder_(v).empty();}, chunk_cnt);
+ }
+
+ bool Process(VertexId v) override {
+ DEBUG("Processing vertex " << this->g().str(v));
+ auto component = finder_(v);
+ if (component.empty()) {
+ DEBUG("Failed to detect complex tip starting with vertex " << this->g().str(v));
+ return false;
+ }
+
+ if (!pics_folder_.empty()) {
+ visualization::visualization_utils::WriteComponentSinksSources(component,
+ pics_folder_
+ + ToString(this->g().int_id(v)) //+ "_" + ToString(candidate_cnt)
+ + ".dot");
+ }
+
+ VERIFY(component.e_size() && component.v_size());
+ DEBUG("Detected tip component edge cnt: " << component.e_size());
+ component_remover_.DeleteComponent(component.e_begin(), component.e_end());
+ DEBUG("Complex tip removed");
+ return true;
+ }
+
+private:
+ DECL_LOGGER("ComplexTipClipper")
+};
+
+}
diff --git a/src/common/modules/simplification/compressor.hpp b/src/common/modules/simplification/compressor.hpp
new file mode 100644
index 0000000..7d210fd
--- /dev/null
+++ b/src/common/modules/simplification/compressor.hpp
@@ -0,0 +1,125 @@
+#pragma once
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+namespace omnigraph {
+
+/**
+* Compressor compresses vertices with unique incoming and unique outgoing edge in linear time while
+* simple one-by-one compressing has square complexity.
+*/
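+//A minimal usage sketch (illustration only; Graph stands for any graph type providing the
+//CheckUnique*/GetUnique*/MergePath interface used below, v for one of its vertices):
+//    Compressor<Graph> compressor(g);
+//    compressor.CompressVertex(v);   //compresses the longest path through v, if possible
+//or use CompressAllVertices(g) (defined below) to process every compressible vertex.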
+template<class Graph>
+class Compressor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef CompressCondition<Graph> ConditionT;
+
+ Graph &graph_;
+ ConditionT compress_condition_;
+ bool safe_merging_;
+
+ bool GoUniqueWayForward(EdgeId &e) {
+ VertexId u = graph_.EdgeEnd(e);
+ if (!graph_.CheckUniqueOutgoingEdge(u)
+ || !graph_.CheckUniqueIncomingEdge(u)) {
+ return false;
+ }
+ e = graph_.GetUniqueOutgoingEdge(u);
+ return true;
+ }
+
+ bool GoUniqueWayBackward(EdgeId &e) {
+ VertexId u = graph_.EdgeStart(e);
+ if (!graph_.CheckUniqueOutgoingEdge(u)
+ || !graph_.CheckUniqueIncomingEdge(u)) {
+ return false;
+ }
+ e = graph_.GetUniqueIncomingEdge(u);
+ return true;
+ }
+
+ //do not use without checks:)
+ EdgeId CompressWithoutChecks(VertexId v) {
+ EdgeId e = graph_.GetUniqueOutgoingEdge(v);
+ EdgeId start_edge = e;
+ while (GoUniqueWayBackward(e) && e != start_edge
+ && !graph_.RelatedVertices(graph_.EdgeStart(e),
+ graph_.EdgeEnd(e))) {
+ }
+        vector<EdgeId> mergeList;
+ start_edge = e;
+ do {
+ mergeList.push_back(e);
+ } while (GoUniqueWayForward(e) && e != start_edge
+ && !graph_.RelatedVertices(graph_.EdgeStart(e),
+ graph_.EdgeEnd(e)));
+ EdgeId new_edge = graph_.MergePath(mergeList, safe_merging_);
+ TRACE("Vertex compressed and is now part of edge "
+ << graph_.str(new_edge));
+ return new_edge;
+
+ }
+
+public:
+ Compressor(Graph& graph, bool safe_merging = true) :
+ graph_(graph),
+ compress_condition_(graph),
+ safe_merging_(safe_merging) {
+ }
+
+ /**
+ * Method compresses longest possible path, containing given vertex.
+ * @param vertex to be compressed as part of a path
+ * @return true if vertex can be compressed and false otherwise
+ */
+ bool CompressVertex(VertexId v) {
+ return CompressVertexEdgeId(v) != EdgeId(0);
+ }
+
+ EdgeId CompressVertexEdgeId(VertexId v) {
+ TRACE("Processing vertex " << graph_.str(v) << " started");
+ if (!compress_condition_.Check(v)) {
+ return EdgeId(0);
+ }
+ TRACE("Vertex " << graph_.str(v) << " judged compressible");
+ return CompressWithoutChecks(v);
+ }
+
+// bool IsOfInterest(VertexId v) const {
+// return CanCompressVertex(v);
+// }
+
+private:
+ DECL_LOGGER("Compressor")
+};
+
+template<class Graph>
+class CompressingProcessor : public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm<Graph, VertexId> base;
+ typedef CompressCondition<Graph> ConditionT;
+
+ Compressor<Graph> compressor_;
+public:
+ CompressingProcessor(Graph &graph, size_t chunk_cnt = 1, bool safe_merging = true) :
+ base(graph,
+ std::make_shared<ParallelInterestingElementFinder<Graph, VertexId>>(ConditionT(graph), chunk_cnt),
+ /*canonical only*/true),
+ compressor_(graph, safe_merging) {
+ }
+
+protected:
+ bool Process(VertexId v) override {
+ return compressor_.CompressVertex(v);
+ }
+};
+
+/**
+* Method compresses all vertices which can be compressed.
+*/
+template<class Graph>
+bool CompressAllVertices(Graph &g, bool safe_merging = true, size_t chunk_cnt = 1) {
+ CompressingProcessor<Graph> compressor(g, chunk_cnt, safe_merging);
+ return compressor.Run();
+}
+}
diff --git a/src/common/modules/simplification/dominated_set_finder.hpp b/src/common/modules/simplification/dominated_set_finder.hpp
new file mode 100644
index 0000000..b7e779a
--- /dev/null
+++ b/src/common/modules/simplification/dominated_set_finder.hpp
@@ -0,0 +1,136 @@
+#pragma once
+
+namespace omnigraph {
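+//BFS from start_vertex_ over vertices all of whose predecessors are already dominated; for
+//each such vertex an inclusive range of possible path lengths from the start is recorded.
+//FillDominated() returns false as soon as the max_length_ or max_count_ threshold is exceeded.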
+template<class Graph>
+class DominatedSetFinder {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+ VertexId start_vertex_;
+ size_t max_length_;
+ size_t max_count_;
+
+ size_t cnt_;
+ std::map<VertexId, Range> dominated_;
+
+ bool CheckCanBeProcessed(VertexId v) const {
+ DEBUG( "Check if vertex " << g_.str(v) << " is dominated close neighbour");
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (dominated_.count(g_.EdgeStart(e)) == 0) {
+ DEBUG( "Blocked by external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
+ DEBUG("Check fail");
+ return false;
+ }
+ }
+ DEBUG("Check ok");
+ return true;
+ }
+
+ void UpdateCanBeProcessed(VertexId v,
+ std::queue<VertexId>& can_be_processed) const {
+ DEBUG("Updating can be processed");
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ DEBUG("Considering edge " << ToString(e));
+ VertexId neighbour_v = g_.EdgeEnd(e);
+ if (CheckCanBeProcessed(neighbour_v)) {
+ can_be_processed.push(neighbour_v);
+ }
+ }
+ }
+
+ Range NeighbourDistanceRange(VertexId v, bool dominated_only = true) const {
+ DEBUG("Counting distance range for vertex " << g_.str(v));
+ size_t min = numeric_limits<size_t>::max();
+ size_t max = 0;
+ VERIFY(g_.IncomingEdgeCount(v) > 0);
+ VERIFY(!dominated_only || CheckCanBeProcessed(v));
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ //in case of dominated_only == false
+ if (dominated_.count(g_.EdgeStart(e)) == 0)
+ continue;
+ Range range = dominated_.find(g_.EdgeStart(e))->second;
+ range.shift((int) g_.length(e));
+ DEBUG("Edge " << g_.str(e) << " provide distance range " << range);
+ if (range.start_pos < min)
+ min = range.start_pos;
+ if (range.end_pos > max)
+ max = range.end_pos;
+ }
+ VERIFY((max > 0) && (min < numeric_limits<size_t>::max()) && (min <= max));
+ Range answer(min, max);
+ DEBUG("Range " << answer);
+ return answer;
+ }
+
+ bool CheckNoEdgeToStart(VertexId v) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (g_.EdgeEnd(e) == start_vertex_) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ DominatedSetFinder(const Graph& g, VertexId v, size_t max_length = -1ul,
+ size_t max_count = -1ul)
+ : g_(g),
+ start_vertex_(v),
+ max_length_(max_length),
+ max_count_(max_count),
+ cnt_(0) {
+
+ }
+
+ //true if no thresholds exceeded
+ bool FillDominated() {
+ DEBUG("Adding starting vertex " << g_.str(start_vertex_) << " to dominated set");
+ dominated_.insert(make_pair(start_vertex_, Range(0, 0)));
+ cnt_++;
+ std::queue<VertexId> can_be_processed;
+ UpdateCanBeProcessed(start_vertex_, can_be_processed);
+ while (!can_be_processed.empty()) {
+ if (++cnt_ > max_count_) {
+ return false;
+ }
+ VertexId v = can_be_processed.front();
+ can_be_processed.pop();
+ Range r = NeighbourDistanceRange(v);
+ if (r.start_pos > max_length_) {
+ return false;
+ }
+ //Currently dominated vertices cannot have edge to start vertex
+ if (CheckNoEdgeToStart(v)) {
+ DEBUG("Adding vertex " << g_.str(v) << " to dominated set");
+ dominated_.insert(make_pair(v, r));
+ UpdateCanBeProcessed(v, can_be_processed);
+ }
+ }
+ return true;
+ }
+
+ const map<VertexId, Range>& dominated() const {
+ return dominated_;
+ }
+
+ GraphComponent<Graph> AsGraphComponent() const {
+ return GraphComponent<Graph>::FromVertices(g_, key_set(dominated_));
+ }
+
+ //little meaning if FillDominated returned false
+ const map<VertexId, Range> CountBorder() const {
+ map<VertexId, Range> border;
+        for (VertexId v : key_set(dominated_)) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VertexId e_end = g_.EdgeEnd(e);
+ if (dominated_.count(e_end) == 0) {
+ border[e_end] = NeighbourDistanceRange(e_end, false);
+ }
+ }
+ }
+ return border;
+ }
+
+};
+}
diff --git a/src/common/modules/simplification/ec_threshold_finder.hpp b/src/common/modules/simplification/ec_threshold_finder.hpp
new file mode 100644
index 0000000..f0e27eb
--- /dev/null
+++ b/src/common/modules/simplification/ec_threshold_finder.hpp
@@ -0,0 +1,152 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef OMNI_TOOLS_HPP_
+#define OMNI_TOOLS_HPP_
+
+#include "utils/simple_tools.hpp"
+
+#include "utils/path_helper.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
+
+#ifdef USE_GLIBCXX_PARALLEL
+#include "parallel/algorithm"
+#endif
+
+namespace omnigraph {
+
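+//Derives a coverage threshold for erroneous connections. A coverage histogram is built over
+//short "interesting" edges (length <= k+1 with alternatives on both sides) and a triangularly
+//weighted bucket is slid over it; the threshold is the first coverage value at which the
+//smoothed histogram is mostly increasing again within the last bucket (roughly, the trough
+//between the low-coverage error peak and the main coverage peak). The public FindThreshold()
+//never returns a value below the average edge coverage.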
+template<class Graph>
+class ErroneousConnectionThresholdFinder {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+    size_t bucket_width_;
+
+ bool IsInteresting(EdgeId e) const {
+ if (graph_.length(e) > graph_.k() + 1)
+ return false;
+
+ if (graph_.OutgoingEdgeCount(graph_.EdgeStart(e)) < 2 ||
+ graph_.IncomingEdgeCount(graph_.EdgeEnd(e)) < 2)
+ return false;
+
+ std::vector<EdgeId> v1;
+ push_back_all(v1, graph_.OutgoingEdges(graph_.EdgeStart(e)));
+ std::vector<EdgeId> v2;
+ push_back_all(v2, graph_.IncomingEdges(graph_.EdgeEnd(e)));
+        bool eq = (v1.size() == 2 && v2.size() == 2) && ((v1[0] == v2[0] && v1[1] == v2[1]) || (v1[0] == v2[1] && v1[1] == v2[0]));
+ return !eq;
+ }
+
+ double weight(size_t value, const map<size_t, size_t> &histogram,
+                  size_t bucket_width) const {
+        double result = 0;
+        for (size_t i = 0; i < bucket_width && value + i < histogram.size(); i++) {
+            result += (double) (getValue(value + i, histogram) * std::min(i + 1, bucket_width - i));
+ }
+ return result;
+ }
+
+ double Median(double thr = 500.0) const {
+ vector<double> coverages;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (graph_.length(*it) > thr)
+ coverages.push_back(graph_.coverage(*it));
+ }
+
+ auto middle_it = coverages.begin() + coverages.size() / 2;
+#ifdef USE_GLIBCXX_PARALLEL
+ __gnu_parallel::nth_element(coverages.begin(), middle_it, coverages.end());
+#else
+ std::nth_element(coverages.begin(), middle_it, coverages.end());
+#endif
+ return coverages[coverages.size() / 2];
+ }
+
+ size_t getValue(size_t arg, const map<size_t, size_t> &ssmap) const {
+ auto it = ssmap.find(arg);
+ if (it == ssmap.end())
+ return 0;
+ else
+ return it->second;
+ }
+
+public:
+    ErroneousConnectionThresholdFinder(const Graph &graph, size_t bucket_width = 0) :
+            graph_(graph), bucket_width_(bucket_width) {
+ }
+
+ double AvgCoverage() const {
+ double cov = 0;
+ double length = 0;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ cov += graph_.coverage(*it) * (double) graph_.length(*it);
+ length += (double) graph_.length(*it);
+ }
+ return cov / length;
+ }
+
+ std::map<size_t, size_t> ConstructHistogram() const {
+ std::map<size_t, size_t> result;
+ for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
+ if (IsInteresting(*it))
+ result[(size_t)graph_.coverage(*it)]++;
+ }
+ return result;
+ }
+
+ double FindThreshold(const map<size_t, size_t> &histogram) const {
+        size_t bucket_width = bucket_width_;
+        if (bucket_width == 0) {
+            bucket_width = (size_t)(0.3 * AvgCovereageCounter<Graph>(graph_).Count() + 5);
+        }
+        size_t size = 0;
+        if (histogram.size() != 0)
+            size = histogram.rbegin()->first + 1;
+        INFO("Bucket size: " << bucket_width);
+        size_t cnt = 0;
+        for (size_t i = 1; i + bucket_width < size; i++) {
+            if (weight(i, histogram, bucket_width) > weight(i - 1, histogram, bucket_width))
+                cnt++;
+
+            if (i > bucket_width &&
+                weight(i - bucket_width, histogram, bucket_width) >
+                weight(i - bucket_width - 1, histogram, bucket_width)) {
+                cnt--;
+            }
+            if (2 * cnt >= bucket_width)
+ return (double) i;
+
+ }
+ INFO("Proper threshold was not found. Threshold set to 0.1 of average coverage");
+ return 0.1 * AvgCovereageCounter<Graph>(graph_).Count();
+ }
+
+ double FindThreshold() const {
+ INFO("Finding threshold started");
+ std::map<size_t, size_t> histogram = ConstructHistogram(/*weights*/);
+        for (const auto &entry : histogram) {
+            TRACE(entry.first << " " << entry.second);
+        }
+ double result = FindThreshold(histogram);
+ INFO("Average edge coverage: " << AvgCoverage());
+ INFO("Graph threshold: " << result);
+ result = std::max(AvgCoverage(), result);
+ INFO("Threshold finding finished. Threshold is set to " << result);
+ return result;
+ }
+private:
+ DECL_LOGGER("ThresholdFinder");
+};
+
+}
+
+#endif /* OMNI_TOOLS_HPP_ */
diff --git a/src/common/modules/simplification/erroneous_connection_remover.hpp b/src/common/modules/simplification/erroneous_connection_remover.hpp
new file mode 100644
index 0000000..f841913
--- /dev/null
+++ b/src/common/modules/simplification/erroneous_connection_remover.hpp
@@ -0,0 +1,659 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * erroneous_connection_remover.hpp
+ *
+ * Created on: May 31, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "func/func.hpp"
+#include "math/xmath.h"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/core/coverage.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "modules/simplification/topological_edge_conditions.hpp"
+
+namespace omnigraph {
+
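+//Edge condition satisfied when the edge coverage is at most rcec_ratio_ times the average
+//coverage of the competing (non-loop) edges incident to its endpoints.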
+//todo move to rnaSPAdes project
+template<class Graph>
+class RelativeCoverageECCondition: public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ const double rcec_ratio_;
+
+ template<class ContainerType>
+ double SumCompetitorCoverage(EdgeId ec_edge, const ContainerType& edges) const {
+ const Graph &g = this->g();
+ double sum = 0;
+ for (EdgeId e : edges) {
+ //update if competitor edge is not loop
+ if (e != ec_edge && g.EdgeStart(e) != g.EdgeEnd(e))
+ sum += g.coverage(e);
+ }
+ return sum;
+ }
+
+ double AvgLocalityCoverage(EdgeId ec_edge) const {
+ const Graph &g = this->g();
+ VertexId start = g.EdgeStart(ec_edge), end = g.EdgeEnd(ec_edge);
+ auto in_start = g.IncomingEdges(start);
+ auto out_start = g.OutgoingEdges(start);
+ auto in_end = g.IncomingEdges(end);
+ auto out_end = g.OutgoingEdges(end);
+ double total_edges = double(g.IncomingEdgeCount(start) + g.OutgoingEdgeCount(start) +
+ g.IncomingEdgeCount(end) + g.OutgoingEdgeCount(end) - 2);
+ return (SumCompetitorCoverage(ec_edge, in_start) +
+ SumCompetitorCoverage(ec_edge, out_start) +
+ SumCompetitorCoverage(ec_edge, in_end) +
+ SumCompetitorCoverage(ec_edge, out_end)) / total_edges;
+ }
+
+ template<class ContainerType>
+ double MaxCompetitorCoverage(EdgeId ec_edge, const ContainerType& edges) const {
+ const Graph &g = this->g();
+ double result = 0;
+ for (EdgeId e : edges) {
+ //update if competitor edge is not loop
+ if (e != ec_edge && g.EdgeStart(e) != g.EdgeEnd(e))
+ result = std::max(result, g.coverage(e));
+ }
+ return result;
+ }
+
+ double MaxCompetitorCoverage(EdgeId ec_edge) const {
+ const Graph &g = this->g();
+ VertexId start = g.EdgeStart(ec_edge), end = g.EdgeEnd(ec_edge);
+ auto in_start = g.IncomingEdges(start);
+ auto out_start = g.OutgoingEdges(start);
+ auto in_end = g.IncomingEdges(end);
+ auto out_end = g.OutgoingEdges(end);
+ return std::max(
+ std::max(MaxCompetitorCoverage(ec_edge, in_start),
+ MaxCompetitorCoverage(ec_edge, out_start)),
+ std::max(MaxCompetitorCoverage(ec_edge, in_end),
+ MaxCompetitorCoverage(ec_edge, out_end)));
+ }
+
+public:
+
+ RelativeCoverageECCondition(const Graph& g, double rcec_ratio) :
+ base(g), rcec_ratio_(rcec_ratio) {
+ }
+
+ bool Check(EdgeId e) const override {
+ //+1 is a trick to deal with edges of 0 coverage from iterative run
+ double locality_coverage = AvgLocalityCoverage(e) + 1;
+ return math::le(this->g().coverage(e), rcec_ratio_ * locality_coverage);
+ }
+
+};
+
+//todo move to rnaSPAdes project
+template<class Graph>
+func::TypedPredicate<typename Graph::EdgeId> AddRelativeCoverageECCondition(const Graph &g, double rcec_ratio,
+ func::TypedPredicate<typename Graph::EdgeId> condition) {
+ return func::And(RelativeCoverageECCondition<Graph>(g, rcec_ratio), condition);
+}
+
+//todo move to rnaSPAdes project
+template<class Graph>
+inline bool IsSimpleBulge(const Graph &g, typename Graph::EdgeId e){
+ size_t edge_count = g.GetEdgesBetween(g.EdgeStart(e), g.EdgeEnd(e)).size();
+
+ return edge_count == g.OutgoingEdgeCount(g.EdgeStart(e)) &&
+ edge_count == g.IncomingEdgeCount(g.EdgeEnd(e)) &&
+ edge_count >= 2;
+}
+
+template<class Graph>
+inline bool IsAlternativePathExist(const Graph &g, typename Graph::EdgeId e){
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ MostCoveredSimpleAlternativePathChooser<Graph> path_chooser(g, e);
+
+ VertexId start = g.EdgeStart(e);
+ TRACE("Start " << g.str(start));
+ VertexId end = g.EdgeEnd(e);
+ TRACE("End " << g.str(end));
+
+ ProcessPaths(g, 0, std::numeric_limits<std::size_t>::max(), start, end, path_chooser, std::numeric_limits<std::size_t>::max());
+
+ const vector<EdgeId>& path = path_chooser.most_covered_path();
+ double path_coverage = path_chooser.max_coverage();
+ if (!path.empty() && math::gr(path_coverage, 0.)) {
+ VERIFY(g.EdgeStart(path[0]) == start);
+ VERIFY(g.EdgeEnd(path.back()) == end);
+
+ return true;
+ }
+ else
+ return false;
+}
+
+template<class Graph>
+inline bool IsAlternativeInclusivePathExist(const Graph &g, typename Graph::EdgeId forbidden_edge, typename Graph::EdgeId compulsory_edge){
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ MostCoveredSimpleAlternativePathChooser<Graph> path_chooser(g, forbidden_edge);
+
+ VertexId start = g.EdgeStart(forbidden_edge);
+ TRACE("Start " << g.str(start));
+ VertexId end = g.EdgeEnd(forbidden_edge);
+ TRACE("End " << g.str(end));
+
+ ProcessPaths(g, 0, std::numeric_limits<std::size_t>::max(), start, end, path_chooser, std::numeric_limits<std::size_t>::max());
+
+ const vector<EdgeId>& path = path_chooser.most_covered_path();
+ double path_coverage = path_chooser.max_coverage();
+ if (!path.empty() && math::gr(path_coverage, 0.)) {
+ VERIFY(g.EdgeStart(path[0]) == start);
+ VERIFY(g.EdgeEnd(path.back()) == end);
+
+ if(std::find(path.begin(), path.end(), compulsory_edge) != path.end()){
+ return true;
+ }
+ }
+ return false;
+}
+
+template<class Graph>
+inline bool IsReachableBulge(const Graph &g, typename Graph::EdgeId e){
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ bool res = IsAlternativePathExist(g, e);
+ if(res)
+ return res;
+ else{
+ VertexId start = g.EdgeStart(e), end = g.EdgeEnd(e);
+ vector<EdgeId> incident;
+ push_back_all(incident, g.IncomingEdges(end));
+ push_back_all(incident, g.OutgoingEdges(start));
+ for (auto it = incident.begin(); it != incident.end(); ++it){
+ res = IsAlternativeInclusivePathExist(g, *it, e);
+ if(res){
+ return res;
+ }
+ }
+ }
+ return false;
+}
+
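+//Edge condition satisfied when the edge is not a "reachable bulge" in the sense of
+//IsReachableBulge() above, so that it can still be considered an erroneous connection.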
+//todo move to rnaSPAdes project
+template<class Graph>
+class NotBulgeECCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+public:
+
+ NotBulgeECCondition(const Graph &g)
+ : base(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ if (HasAlternatives(this->g(), e) && !IsSimpleBulge(this->g(), e)){
+ DEBUG("edge id = " << this->g().int_id(e)
+ << " between = " << this->g().GetEdgesBetween(this->g().EdgeStart(e), this->g().EdgeEnd(e)).size()
+ << " between ids: " << this->g().GetEdgesBetween(this->g().EdgeStart(e), this->g().EdgeEnd(e))
+ << " outgoing s = " << this->g().OutgoingEdgeCount(this->g().EdgeStart(e))
+ << " incoming e = " << this->g().IncomingEdgeCount(this->g().EdgeEnd(e)));
+ }
+// return !IsSimpleBulge(this->g(), e);
+ return !IsReachableBulge(this->g(), e);
+ }
+
+private:
+ DECL_LOGGER("NotBulgeECCondition");
+
+};
+
+//todo move to rnaSPAdes project
+template<class Graph>
+func::TypedPredicate<typename Graph::EdgeId> AddNotBulgeECCondition(const Graph &g,
+ func::TypedPredicate<typename Graph::EdgeId> condition) {
+ return func::And(NotBulgeECCondition<Graph>(g), condition);
+}
+
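+//Thorn detection: checks the local degree pattern around e (2 outgoing/1 incoming at the start,
+//1 outgoing/2 incoming at the end) and then searches for a bounded-length path from the start of e
+//to the conjugate of its end.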
+template<class Graph>
+class TopologicalThornCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+ typedef std::vector<EdgeId> Path;
+
+ size_t max_jump_distance_;
+ size_t max_edge_cnt_;
+
+ bool CheckEdgeCounts(EdgeId e) const {
+ if (this->g().EdgeStart(e) == this->g().EdgeEnd(e))
+ return false;
+ if (this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) != 2)
+ return false;
+ if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) != 1)
+ return false;
+ if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) != 1)
+ return false;
+ if (this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) != 2)
+ return false;
+ return true;
+ }
+
+public:
+
+ TopologicalThornCondition(Graph& g,
+ size_t max_jump_dist,
+ size_t max_edge_cnt = -1ul)
+ : base(g),
+ max_jump_distance_(max_jump_dist),
+ max_edge_cnt_(max_edge_cnt) {
+ }
+
+ bool Check(EdgeId e) const override {
+ const Graph& g = this->g();
+ if (!CheckEdgeCounts(e))
+ return false;
+
+ //fixme micro-optimization to be removed
+ if (g.conjugate(g.EdgeStart(e)) == g.EdgeEnd(e)) {
+ return true;
+ }
+
+ auto comparator = [](const Path& a, const Path& b) {return a.size() >= b.size();};
+
+ BestPathStorage<Graph, decltype(comparator)> callback(g, comparator);
+ ProcessPaths(g, 0, max_jump_distance_, g.EdgeStart(e), g.conjugate(g.EdgeEnd(e)), callback, max_edge_cnt_);
+ return (bool) callback.best_path();
+ }
+};
+
+template<class Graph>
+class AdditionalMDAThornCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+ typedef std::vector<EdgeId> Path;
+
+ size_t uniqueness_length_;
+
+ bool CheckUniqueness(EdgeId e) const {
+ return this->g().length(e) >= uniqueness_length_;
+ }
+
+ bool CheckUnique(VertexId v) const {
+ return this->g().CheckUniqueIncomingEdge(v) &&
+ CheckUniqueness(this->g().GetUniqueIncomingEdge(v));
+ }
+
+ bool CheckUniqueCondition(EdgeId e) const {
+ TRACE("Checking conditions for edge start");
+ return CheckUnique(this->g().EdgeStart(e)) ||
+ CheckUnique(this->g().conjugate(this->g().EdgeEnd(e)));
+ }
+
+ template<class EdgeContainer>
+ bool CheckAlternativesForEC(const EdgeContainer& edges, EdgeId base) const {
+ for (EdgeId e: edges) {
+ if (e != base && this->g().length(e) < 400
+ && math::ls(this->g().coverage(e) / this->g().coverage(base), 15.)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool CheckForECAround(EdgeId e) const {
+ return CheckAlternativesForEC(
+ this->g().IncidentEdges(this->g().EdgeStart(e)), e)
+ && CheckAlternativesForEC(
+ this->g().IncidentEdges(this->g().EdgeEnd(e)), e);
+ }
+
+ public:
+
+ AdditionalMDAThornCondition(Graph& g, size_t uniqueness_length)
+ : base(g),
+ uniqueness_length_(uniqueness_length) {
+ }
+
+ bool Check(EdgeId e) const override {
+ return CheckUniqueCondition(e) || CheckForECAround(e);
+ }
+
+ private:
+ DECL_LOGGER("AdditionalMDAThornCondition");
+};
+
+//todo move to rnaSPAdes simplification
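+//Removes loop and reverse-complement loop edges that look like erroneous connections: if flanking coverage
+//marks one end of the loop as unreliable, the loop is trimmed on that side (or removed completely if shorter
+//than k); otherwise the whole loop edge is deleted.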
+template<class Graph>
+class ECLoopRemover : public EdgeProcessingAlgorithm<Graph> {
+ typedef std::less<typename Graph::EdgeId> Comparator;
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ double ec_threshold_;
+ double relative_threshold_;
+ const FlankingCoverage<Graph> &flanking_coverage_;
+ EdgeRemover<Graph> edge_remover_;
+ size_t coverage_loops_removed = 0;
+ size_t dead_loops_removed = 0;
+ size_t not_dead_loops_removed = 0;
+ size_t coverage_rc_loops_removed = 0;
+ size_t dead_rc_loops_removed = 0;
+ size_t not_dead_rc_loops_removed = 0;
+
+ bool IsLoop(EdgeId e) {
+ return this->g().EdgeStart(e) == this->g().EdgeEnd(e);
+ }
+
+ bool IsRCLoop(EdgeId e) {
+ return this->g().EdgeStart(e) == this->g().conjugate(this->g().EdgeEnd(e));
+ }
+
+ bool IsAnyLoop(EdgeId e) {
+ return IsRCLoop(e) || IsLoop(e);
+ }
+
+ void RemoveHiddenLoopEC(EdgeId e, bool break_on_end) {
+ if (IsLoop(e))
+ coverage_loops_removed++;
+ else
+ coverage_rc_loops_removed++;
+ if (this->g().length(e) <= this->g().k())
+ edge_remover_.DeleteEdge(e);
+ else {
+ if (break_on_end) {
+ auto split_result = this->g().SplitEdge(e, this->g().length(e) - this->g().k());
+ edge_remover_.DeleteEdge(split_result.second);
+ } else {
+ auto split_result = this->g().SplitEdge(e, this->g().k());
+ edge_remover_.DeleteEdge(split_result.first);
+ }
+ }
+
+ }
+ void RemoveLoopWithNoCheck(EdgeId e) {
+ if (IsLoop(e)) {
+ if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 1 || this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) == 1)
+ dead_loops_removed++;
+ else
+ not_dead_loops_removed++;
+ } else {
+ if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 2)
+ dead_rc_loops_removed++;
+ else
+ not_dead_rc_loops_removed++;
+
+ }
+ edge_remover_.DeleteEdge(e);
+ }
+
+ bool FindHiddenLoopEC(EdgeId e) {
+ if(flanking_coverage_.GetInCov(e) * relative_threshold_ < flanking_coverage_.GetOutCov(e) && flanking_coverage_.GetInCov(e) < ec_threshold_) {
+ //start is bad, end is OK.
+ RemoveHiddenLoopEC(e, false);
+ return true;
+ } else if(flanking_coverage_.GetOutCov(e) * relative_threshold_ < flanking_coverage_.GetInCov(e) && flanking_coverage_.GetOutCov(e) < ec_threshold_) {
+ //end is bad, start is OK.
+ RemoveHiddenLoopEC(e, true);
+ return true;
+ }
+ RemoveLoopWithNoCheck(e);
+ return false;
+ }
+
+ bool ProcessEdge(EdgeId e) {
+ if (IsAnyLoop(e)) {
+ DEBUG("Susp loop: " << this->g().int_id(e) << endl);
+ bool res = FindHiddenLoopEC(e);
+ if (res) {DEBUG ("was removed");} else {DEBUG("was not removed"); }
+ return res;
+ }
+ return false;
+ }
+
+
+public:
+ ECLoopRemover(Graph &g, const FlankingCoverage<Graph> &flanking_coverage, double ec_threshold, double relative_threshold,
+ EdgeRemovalHandlerF<Graph> removal_handler = 0): base(g),ec_threshold_(ec_threshold),
+ relative_threshold_(relative_threshold), flanking_coverage_(flanking_coverage),
+ edge_remover_(g, removal_handler){
+ }
+    void PrintLoopStats() {
+        INFO("Loops: accurately removed/dead-end removed/other: " << coverage_loops_removed << "/" << dead_loops_removed << "/" << not_dead_loops_removed);
+        INFO("RC loops: accurately removed/dead-end removed/other: " << coverage_rc_loops_removed << "/" << dead_rc_loops_removed << "/" << not_dead_rc_loops_removed);
+ }
+private:
+ DECL_LOGGER("ECLoopRemover");
+};
+
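+//Looks for vertices with a single incoming edge that passes a unique-path length check and exactly two
+//outgoing edges that are conjugates of each other; based on flanking coverage, either the weaker outgoing
+//edge or both of them are disconnected.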
+template<class Graph>
+class MetaHiddenECRemover: public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm<Graph, VertexId> base;
+ const FlankingCoverage<Graph>& flanking_coverage_;
+ size_t uniqueness_length_;
+ double relative_threshold_;
+
+ EdgeDisconnector<Graph> disconnector_;
+
+ void DisconnectEdges(VertexId v) {
+ while (!this->g().IsDeadEnd(v)) {
+ disconnector_(*(this->g().out_begin(v)), /*compress*/false);
+ }
+ }
+
+ bool CheckUniqueness(EdgeId e) {
+ return UniquePathLengthLowerBound(this->g(), uniqueness_length_)(e);
+ }
+
+ void ProcessHiddenEC(VertexId v) {
+ VERIFY(this->g().OutgoingEdgeCount(v) == 2);
+ vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
+ if (math::gr(flanking_coverage_.CoverageOfStart(edges.front()),
+ flanking_coverage_.CoverageOfStart(edges.back()))) {
+ std::swap(edges.front(), edges.back());
+ }
+ double c1 = flanking_coverage_.CoverageOfStart(edges.front());
+ double c2 = flanking_coverage_.CoverageOfStart(edges.back());
+ TRACE("c1 " << c1 << "; c2 " << c2);
+ if (math::ls(c1 * relative_threshold_, c2)) {
+ TRACE("Disconnecting " << this->g().str(edges.front()));
+ disconnector_(edges.front());
+ } else {
+ TRACE("Disconnecting " << this->g().str(edges.front()) << " and " << this->g().str(edges.back()));
+ DisconnectEdges(v);
+ }
+ }
+
+ bool CheckSuspicious(VertexId v) {
+ if (this->g().IncomingEdgeCount(v) != 1 || this->g().OutgoingEdgeCount(v) != 2) {
+ return false;
+ }
+ vector<EdgeId> edges;
+ push_back_all(edges, this->g().OutgoingEdges(v));
+ VERIFY(edges.size() == 2);
+ if (this->g().conjugate(edges[0]) != edges[1]) {
+ return false;
+ }
+ return CheckUniqueness(this->g().GetUniqueIncomingEdge(v));
+ }
+
+protected:
+
+ bool Process(VertexId v) override {
+ if (CheckSuspicious(v)) {
+ ProcessHiddenEC(v);
+ return true;
+ }
+ return false;
+ }
+
+public:
+ MetaHiddenECRemover(Graph& g, size_t chunk_cnt,
+ const FlankingCoverage<Graph> &flanking_coverage,
+ size_t uniqueness_length,
+ double relative_threshold,
+ EdgeRemovalHandlerF<Graph> removal_handler = 0)
+ : base(g, nullptr, /*canonical only*/ false, std::less<VertexId>(), /*track changes*/false),
+ flanking_coverage_(flanking_coverage),
+ uniqueness_length_(uniqueness_length),
+ relative_threshold_(relative_threshold),
+ disconnector_(g, removal_handler, g.k() + 1) {
+ this->interest_el_finder_ = std::make_shared<ParallelInterestingElementFinder<Graph, VertexId>>(
+ [&](VertexId v) {
+ return CheckSuspicious(v);
+ }, chunk_cnt);
+ }
+
+private:
+ DECL_LOGGER("MetaHiddenECRemover");
+};
+
+//be careful: unreliability_threshold_ depends on ec_threshold_!
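+//Looks for vertices with a single incoming edge judged unique (by length or multiplicity) and two outgoing
+//edges; if the stronger flank is below unreliability_threshold_ both outgoing edges are disconnected,
+//otherwise only the weaker edge is cut when it is both relatively and absolutely (below ec_threshold_) low-covered.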
+template<class Graph>
+class HiddenECRemover: public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PersistentProcessingAlgorithm<Graph, VertexId> base;
+ const FlankingCoverage<Graph>& flanking_coverage_;
+ size_t uniqueness_length_;
+ double unreliability_threshold_;
+ double ec_threshold_;
+ double relative_threshold_;
+
+ EdgeDisconnector<Graph> disconnector_;
+
+ void DisconnectEdges(VertexId v) {
+ while (!this->g().IsDeadEnd(v)) {
+ disconnector_(*(this->g().out_begin(v)), /*compress*/false);
+ }
+ }
+
+ bool CheckUniqueness(EdgeId e) {
+ //todo why 8???
+ omnigraph::MultiplicityCounter<Graph> mult_counter(this->g(), uniqueness_length_, 8);
+
+ vector<EdgeId> edges;
+ push_back_all(edges, this->g().OutgoingEdges(this->g().EdgeEnd(e)));
+ VERIFY(edges.size() == 2);
+ return (this->g().conjugate(edges[0]) == edges[1] && mult_counter.count(e, this->g().EdgeStart(e)) <= 1) ||
+ this->g().length(e) >= uniqueness_length_;
+ }
+
+ bool ProcessHiddenEC(VertexId v) {
+ TRACE("Processing outgoing edges for vertex " << this->g().str(v));
+        VERIFY(this->g().OutgoingEdgeCount(v) == 2);
+ vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
+ if (math::gr(flanking_coverage_.CoverageOfStart(edges.front()),
+ flanking_coverage_.CoverageOfStart(edges.back()))) {
+ std::swap(edges.front(), edges.back());
+ }
+ double c1 = flanking_coverage_.CoverageOfStart(edges.front());
+ TRACE("Flank start of e1 " << this->g().str(edges.front()) << ": " << c1);
+ double c2 = flanking_coverage_.CoverageOfStart(edges.back());
+ TRACE("Flank start of e1 " << this->g().str(edges.back()) << ": " << c2);
+ if (math::ls(c2, unreliability_threshold_)) {
+ TRACE("Disconnecting both edges from vertex " << this->g().str(v));
+ DisconnectEdges(v);
+ return true;
+ }
+ if (math::ls(c1 * relative_threshold_, c2) && math::ls(c1, ec_threshold_)) {
+ TRACE("Disconnecting edge " << this->g().str(edges.front()) << " from vertex " << this->g().str(v));
+ disconnector_(edges.front());
+ return true;
+ }
+ return false;
+ }
+
+ bool CheckSuspicious(VertexId v) {
+ if (this->g().IncomingEdgeCount(v) != 1 || this->g().OutgoingEdgeCount(v) != 2) {
+ return false;
+ }
+ return CheckUniqueness(this->g().GetUniqueIncomingEdge(v));
+ }
+
+protected:
+
+ bool Process(VertexId v) override {
+ if (CheckSuspicious(v)) {
+ return ProcessHiddenEC(v);
+ }
+ return false;
+ }
+
+public:
+ HiddenECRemover(Graph& g, size_t chunk_cnt,
+ const FlankingCoverage<Graph> &flanking_coverage,
+ size_t uniqueness_length,
+ double unreliability_coeff,
+ double ec_threshold, double relative_threshold,
+ EdgeRemovalHandlerF<Graph> removal_handler = 0)
+ : base(g, nullptr, /*canonical only*/ false, std::less<VertexId>(), /*track changes*/false),
+ flanking_coverage_(flanking_coverage),
+ uniqueness_length_(uniqueness_length),
+ unreliability_threshold_(unreliability_coeff * ec_threshold), ec_threshold_(ec_threshold),
+ relative_threshold_(relative_threshold),
+ disconnector_(g, removal_handler, g.k() + 1) {
+ VERIFY(math::gr(unreliability_coeff, 0.));
+ this->interest_el_finder_ = std::make_shared<ParallelInterestingElementFinder<Graph, VertexId>>(
+ [&](VertexId v) {
+ return CheckSuspicious(v);
+ }, chunk_cnt);
+ }
+
+private:
+ DECL_LOGGER("HiddenECRemover");
+};
+
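+//Breaks self-conjugate edges: an edge longer than 1 is split roughly in half and its second part is deleted,
+//otherwise the whole edge is removed.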
+template<class Graph>
+class SelfConjugateDisruptor: public EdgeProcessingAlgorithm<Graph> {
+ typedef EdgeProcessingAlgorithm<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ EdgeRemover<Graph> edge_remover_;
+protected:
+
+ bool ProcessEdge(EdgeId e) override {
+ if (e == this->g().conjugate(e)) {
+ TRACE("Disrupting self-conjugate edge " << this->g().str(e));
+ EdgeId to_del = e;
+ size_t len = this->g().length(e);
+ if (len > 1) {
+ to_del = this->g().SplitEdge(e, len / 2).second;
+ }
+ edge_remover_.DeleteEdge(to_del);
+ return true;
+ }
+ return false;
+ }
+
+public:
+ SelfConjugateDisruptor(Graph& g,
+ std::function<void(EdgeId)> removal_handler = 0)
+ : base(g, true), edge_remover_(g, removal_handler) {
+ }
+
+private:
+ DECL_LOGGER("SelfConjugateDisruptor");
+};
+}
diff --git a/src/modules/algorithms/simplification/mf_ec_remover.hpp b/src/common/modules/simplification/mf_ec_remover.hpp
similarity index 100%
rename from src/modules/algorithms/simplification/mf_ec_remover.hpp
rename to src/common/modules/simplification/mf_ec_remover.hpp
diff --git a/src/common/modules/simplification/parallel_simplification_algorithms.hpp b/src/common/modules/simplification/parallel_simplification_algorithms.hpp
new file mode 100644
index 0000000..f33075b
--- /dev/null
+++ b/src/common/modules/simplification/parallel_simplification_algorithms.hpp
@@ -0,0 +1,900 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "cleaner.hpp"
+#include "bulge_remover.hpp"
+#include "utils/standard_base.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/core/construction_helper.hpp"
+#include "assembly_graph/graph_support/marks_and_locks.hpp"
+#include "compressor.hpp"
+
+namespace debruijn {
+
+namespace simplification {
+
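+//Vertex-parallel tip clipping: for each vertex, incoming edges that look like tips (short, low-covered,
+//starting at a vertex of total degree one) are deleted under per-vertex locks; if all incoming edges are
+//tips, the longest one is kept.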
+template<class Graph>
+class ParallelTipClippingFunctor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ size_t length_bound_;
+ double coverage_bound_;
+ omnigraph::EdgeRemovalHandlerF<Graph> handler_f_;
+
+ size_t LockingIncomingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.IncomingEdgeCount(v);
+ }
+
+ size_t LockingOutgoingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.OutgoingEdgeCount(v);
+ }
+
+ bool IsIncomingTip(EdgeId e) const {
+ return g_.length(e) <= length_bound_ && math::le(g_.coverage(e), coverage_bound_)
+ && LockingIncomingCount(g_.EdgeStart(e)) + LockingOutgoingCount(g_.EdgeStart(e)) == 1;
+ }
+
+ void RemoveEdge(EdgeId e) {
+ //even full tip locking can't lead to deadlock
+ VertexLockT lock1(g_.EdgeStart(e));
+ VertexLockT lock2(g_.EdgeEnd(e));
+ g_.DeleteEdge(e);
+ }
+
+public:
+
+ ParallelTipClippingFunctor(Graph& g, size_t length_bound, double coverage_bound,
+ omnigraph::EdgeRemovalHandlerF<Graph> handler_f = nullptr)
+ : g_(g),
+ length_bound_(length_bound),
+ coverage_bound_(coverage_bound),
+ handler_f_(handler_f) {
+
+ }
+
+ bool Process(VertexId v) {
+ if (LockingOutgoingCount(v) == 0)
+ return false;
+
+ vector<EdgeId> tips;
+ //don't need lock here after the previous check
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (IsIncomingTip(e)) {
+ tips.push_back(e);
+ }
+ }
+
+        //if all of the edges are tips, keep the longest one
+ if (!tips.empty() && tips.size() == g_.IncomingEdgeCount(v)) {
+ sort(tips.begin(), tips.end(), omnigraph::LengthComparator<Graph>(g_));
+ tips.pop_back();
+ }
+
+ for (EdgeId e : tips) {
+ if (handler_f_) {
+ handler_f_(e);
+ }
+ //don't need any synchronization here!
+ RemoveEdge(e);
+ }
+ return false;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return false;
+ }
+};
+
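+//Simple bulge removal functor: among parallel edges between two vertices, a short low-covered edge is glued
+//onto an alternative of similar length whose coverage is sufficiently high.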
+template<class Graph>
+class ParallelSimpleBRFunctor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ size_t max_length_;
+ double max_coverage_;
+ double max_relative_coverage_;
+ size_t max_delta_;
+ double max_relative_delta_;
+ std::function<void(EdgeId)> handler_f_;
+
+ bool LengthDiffCheck(size_t l1, size_t l2, size_t delta) const {
+ return l1 <= l2 + delta && l2 <= l1 + delta;
+ }
+
+ EdgeId Alternative(EdgeId e, const vector<EdgeId>& edges) const {
+ size_t delta = omnigraph::CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
+ for (auto it = edges.rbegin(); it != edges.rend(); ++it) {
+ EdgeId candidate = *it;
+ if (g_.EdgeEnd(candidate) == g_.EdgeEnd(e) && candidate != e && candidate != g_.conjugate(e)
+ && LengthDiffCheck(g_.length(candidate), g_.length(e), delta)) {
+ return candidate;
+ }
+ }
+ return EdgeId(0);
+ }
+
+ bool ProcessEdges(const vector<EdgeId>& edges) {
+ for (EdgeId e : edges) {
+ if (g_.length(e) <= max_length_ && math::le(g_.coverage(e), max_coverage_)) {
+ EdgeId alt = Alternative(e, edges);
+ if (alt != EdgeId(0) && math::ge(g_.coverage(alt) * max_relative_coverage_, g_.coverage(e))) {
+ //does not work in multiple threads for now...
+ //Reasons: id distribution, kmer-mapping
+ handler_f_(e);
+ g_.GlueEdges(e, alt);
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ vector<VertexId> MultiEdgeDestinations(VertexId v) const {
+ vector<VertexId> answer;
+ set<VertexId> destinations;
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ VertexId end = g_.EdgeEnd(e);
+ if (destinations.count(end) > 0) {
+ answer.push_back(end);
+ }
+ destinations.insert(end);
+ }
+ return answer;
+ }
+
+ VertexId SingleMultiEdgeDestination(VertexId v) const {
+ vector<VertexId> dests = MultiEdgeDestinations(v);
+ if (dests.size() == 1) {
+ return dests.front();
+ } else {
+ return VertexId(0);
+ }
+ }
+
+ void RemoveBulges(VertexId v) {
+ bool flag = true;
+ while (flag) {
+ vector<EdgeId> edges(g_.out_begin(v), g_.out_end(v));
+ if (edges.size() == 1)
+ return;
+ sort(edges.begin(), edges.end(), omnigraph::CoverageComparator<Graph>(g_));
+ flag = ProcessEdges(edges);
+ }
+ }
+
+ bool CheckVertex(VertexId v) const {
+ VertexLockT lock(v);
+ return MultiEdgeDestinations(v).size() == 1 && MultiEdgeDestinations(g_.conjugate(v)).size() == 0;
+ }
+
+ size_t MinId(VertexId v) const {
+ return std::min(v.int_id(), g_.conjugate(v).int_id());
+ }
+
+ bool IsMinimal(VertexId v1, VertexId v2) const {
+ return MinId(v1) < MinId(v2);
+ }
+
+public:
+
+ ParallelSimpleBRFunctor(Graph& g, size_t max_length, double max_coverage, double max_relative_coverage, size_t max_delta, double max_relative_delta,
+ std::function<void(EdgeId)> handler_f = 0)
+ : g_(g),
+ max_length_(max_length),
+ max_coverage_(max_coverage),
+ max_relative_coverage_(max_relative_coverage),
+ max_delta_(max_delta),
+ max_relative_delta_(max_relative_delta),
+ handler_f_(handler_f) {
+
+ }
+
+ bool operator()(VertexId v/*, need number of vertex for stable id distribution*/) {
+ vector<VertexId> multi_dest;
+
+ {
+ VertexLockT lock(v);
+ multi_dest = MultiEdgeDestinations(v);
+ }
+
+ if (multi_dest.size() == 1 && IsMinimal(v, multi_dest.front())) {
+ VertexId dest = multi_dest.front();
+ if (CheckVertex(v) && CheckVertex(g_.conjugate(dest))) {
+ VertexLockT lock1(v);
+ VertexLockT lock2(dest);
+ RemoveBulges(v);
+ }
+ }
+ return false;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return false;
+ }
+};
+
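+//Marks the most covered outgoing edge of every vertex; these marks are later consulted by
+//ParallelLowCoverageFunctor, which skips marked edges during erroneous connection removal.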
+template<class Graph>
+class CriticalEdgeMarker {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ Graph& g_;
+ size_t chunk_cnt_;
+ omnigraph::GraphElementMarker<EdgeId> edge_marker_;
+
+ void ProcessVertex(VertexId v) {
+ if (g_.OutgoingEdgeCount(v) > 0) {
+ auto max_cov_it =
+ std::max_element(g_.out_begin(v), g_.out_end(v), omnigraph::CoverageComparator<Graph>(g_));
+ DEBUG("Marking edge " << g_.str(*max_cov_it));
+ edge_marker_.mark(*max_cov_it);
+ }
+ }
+
+ template<class It>
+ void ProcessVertices(It begin, It end) {
+ for (auto it = begin; !(it == end); ++it) {
+ ProcessVertex(*it);
+ }
+ }
+
+public:
+
+ CriticalEdgeMarker(Graph& g, size_t chunk_cnt) : g_(g), chunk_cnt_(chunk_cnt) {
+ }
+
+ void PutMarks() {
+ auto chunk_iterators = omnigraph::IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt_);
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ ProcessVertices(chunk_iterators[i], chunk_iterators[i + 1]);
+ }
+ }
+
+ void ClearMarks() {
+ auto chunk_iterators = omnigraph::IterationHelper<Graph, EdgeId>(g_).Chunks(chunk_cnt_);
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ for (auto it = chunk_iterators[i]; it != chunk_iterators[i + 1]; ++ it) {
+ edge_marker_.unmark(*it);
+ }
+ }
+ }
+private:
+ DECL_LOGGER("CriticalEdgeMarker");
+};
+
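+//Edge-parallel erroneous connection removal: short, low-covered edges that have alternatives and are not
+//marked as critical are unlinked from their endpoints (under vertex locks) and deleted.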
+template<class Graph>
+class ParallelLowCoverageFunctor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ typename Graph::HelperT helper_;
+ func::TypedPredicate<EdgeId> ec_condition_;
+ omnigraph::EdgeRemovalHandlerF<Graph> handler_f_;
+
+ omnigraph::GraphElementMarker<EdgeId> edge_marker_;
+ vector<EdgeId> edges_to_remove_;
+
+ void UnlinkEdgeFromStart(EdgeId e) {
+ VertexId start = g_.EdgeStart(e);
+ VertexLockT lock(start);
+ helper_.DeleteLink(start, e);
+ }
+
+ void UnlinkEdge(EdgeId e) {
+ UnlinkEdgeFromStart(e);
+ if (g_.conjugate(e) != e)
+ UnlinkEdgeFromStart(g_.conjugate(e));
+ }
+
+public:
+
+ //should be launched with conjugate copies filtered
+ ParallelLowCoverageFunctor(Graph& g, size_t max_length, double max_coverage,
+ omnigraph::EdgeRemovalHandlerF<Graph> handler_f = nullptr)
+ : g_(g),
+ helper_(g_.GetConstructionHelper()),
+ ec_condition_(func::And(func::And(omnigraph::LengthUpperBound<Graph>(g, max_length),
+ omnigraph::CoverageUpperBound<Graph>(g, max_coverage)),
+ omnigraph::AlternativesPresenceCondition<Graph>(g))),
+ handler_f_(handler_f) {}
+
+ bool IsOfInterest(EdgeId e) const {
+ return !edge_marker_.is_marked(e) && ec_condition_(e);
+ }
+
+ void PrepareForProcessing(size_t /*interesting_cnt*/) {
+ }
+
+ //no conjugate copies here!
+ bool Process(EdgeId e, size_t /*idx*/) {
+ if (handler_f_)
+ handler_f_(e);
+ DEBUG("Removing edge " << g_.str(e));
+ g_.FireDeleteEdge(e);
+ UnlinkEdge(e);
+ helper_.DeleteUnlinkedEdge(e);
+ return true;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return true;
+ }
+// bool operator()(EdgeId e) {
+// if (ec_condition_->Check(e)) {
+// edges_to_remove_.push_back(e);
+// }
+// return false;
+// }
+//
+// void RemoveCollectedEdges() {
+// omnigraph::SmartSetIterator<Graph, EdgeId> to_delete(g_, edges_to_remove_.begin(), edges_to_remove_.end());
+// while (!to_delete.IsEnd()) {
+// EdgeId e = *to_delete;
+// handler_f_(e);
+// g_.DeleteEdge(e);
+// ++to_delete;
+// }
+// }
+private:
+ DECL_LOGGER("ParallelLowCoverageFunctor");
+};
+
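+//Parallel compression of non-branching paths: each chain is discovered from its leftmost vertex, its edges
+//are merged into a single new edge (ids are taken from a pre-reserved segment), and the old edges are
+//unlinked and deleted (isolated vertices are cleaned up afterwards).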
+template<class Graph>
+class ParallelCompressor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::EdgeData EdgeData;
+ typedef typename Graph::VertexId VertexId;
+ typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
+
+ Graph& g_;
+ typename Graph::HelperT helper_;
+ restricted::IdSegmentStorage segment_storage_;
+
+ bool IsBranching(VertexId v) const {
+// VertexLockT lock(v);
+ return !g_.CheckUniqueOutgoingEdge(v) || !g_.CheckUniqueIncomingEdge(v);
+ }
+
+ size_t LockingIncomingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.IncomingEdgeCount(v);
+ }
+
+ size_t LockingOutgoingCount(VertexId v) const {
+ VertexLockT lock(v);
+ return g_.OutgoingEdgeCount(v);
+ }
+
+ vector<VertexId> LockingNextVertices(VertexId v) const {
+ VertexLockT lock(v);
+ vector<VertexId> answer;
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ answer.push_back(g_.EdgeEnd(e));
+ }
+ return answer;
+ }
+
+ vector<VertexId> FilterBranchingVertices(const vector<VertexId>& vertices) const {
+ vector<VertexId> answer;
+ for (VertexId v : vertices) {
+ VertexLockT lock(v);
+ if (!IsBranching(v)) {
+ answer.push_back(v);
+ }
+ }
+ return answer;
+ }
+
+ //correctly handles self-conjugate case
+ bool IsMinimal(VertexId v1, VertexId v2) const {
+ return !(g_.conjugate(v2) < v1);
+ }
+
+    //returns true if the traversal should go further, false if it should stop for any reason!
+ //to_compress is not empty only if compression needs to be done
+ //don't need additional checks for v == init | conjugate(init), because init is branching!
+ //fixme what about plasmids?! =)
+ bool ProcessNextAndGo(VertexId& v, VertexId init, vector<VertexId>& to_compress) {
+ VertexLockT lock(v);
+ if (!CheckConsistent(v)) {
+ to_compress.clear();
+ return false;
+ }
+ if (IsBranching(v)) {
+ if (!IsMinimal(init, v)) {
+ to_compress.clear();
+ }
+ return false;
+ } else {
+ to_compress.push_back(v);
+ v = g_.EdgeEnd(g_.GetUniqueOutgoingEdge(v));
+ return true;
+ }
+ }
+
+ void UnlinkEdge(VertexId v, EdgeId e) {
+ VertexLockT lock(v);
+ helper_.DeleteLink(v, e);
+ }
+
+ void UnlinkEdges(VertexId v) {
+ VertexLockT lock(v);
+ helper_.DeleteLink(v, g_.GetUniqueOutgoingEdge(v));
+ helper_.DeleteLink(g_.conjugate(v), g_.GetUniqueOutgoingEdge(g_.conjugate(v)));
+ }
+
+ //fixme duplication with abstract conj graph
+ //not locking!
+ vector<EdgeId> EdgesToDelete(const vector<EdgeId> &path) const {
+ set<EdgeId> edgesToDelete;
+ edgesToDelete.insert(path[0]);
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ if (edgesToDelete.find(g_.conjugate(e)) == edgesToDelete.end())
+ edgesToDelete.insert(e);
+ }
+ return vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
+ }
+
+ //not locking!
+ //fixme duplication with abstract conj graph
+ vector<VertexId> VerticesToDelete(const vector<EdgeId> &path) const {
+ set<VertexId> verticesToDelete;
+ for (size_t i = 0; i + 1 < path.size(); i++) {
+ EdgeId e = path[i + 1];
+ VertexId v = g_.EdgeStart(e);
+ if (verticesToDelete.find(g_.conjugate(v)) == verticesToDelete.end())
+ verticesToDelete.insert(v);
+ }
+ return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
+ }
+ //todo end duplication with abstract conj graph
+
+ //not locking!
+ vector<EdgeId> CollectEdges(const vector<VertexId>& to_compress) const {
+ vector<EdgeId> answer;
+ answer.push_back(g_.GetUniqueIncomingEdge(to_compress.front()));
+ for (VertexId v : to_compress) {
+ answer.push_back(g_.GetUniqueOutgoingEdge(v));
+ }
+ return answer;
+ }
+
+ void CallHandlers(const vector<EdgeId>& edges, EdgeId new_edge) const {
+ g_.FireMerge(edges, new_edge);
+ g_.FireDeletePath(EdgesToDelete(edges), VerticesToDelete(edges));
+ g_.FireAddEdge(new_edge);
+ }
+
+ EdgeData MergedData(const vector<EdgeId>& edges) const {
+ vector<const EdgeData*> to_merge;
+ for (EdgeId e : edges) {
+ to_merge.push_back(&(g_.data(e)));
+ }
+ return g_.master().MergeData(to_merge);
+ }
+
+ EdgeId SyncAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
+ EdgeId new_edge = helper_.AddEdge(data, id_distributor);
+ {
+ VertexLockT lock(v1);
+ helper_.LinkOutgoingEdge(v1, new_edge);
+ }
+ if (g_.conjugate(new_edge) != new_edge) {
+ VertexLockT lock(v2);
+ helper_.LinkIncomingEdge(v2, new_edge);
+ }
+ return new_edge;
+ }
+
+ void ProcessBranching(VertexId next, VertexId init, size_t idx) {
+ vector<VertexId> to_compress;
+ while (ProcessNextAndGo(next, init, to_compress)) {
+ }
+
+ if (!to_compress.empty()) {
+ //here we are sure that we are the ones to process the path
+ //so we can collect edges without any troubles (and actually without locks todo check!)
+ vector<EdgeId> edges = CollectEdges(to_compress);
+
+ restricted::ListIdDistributor<restricted::SegmentIterator> id_distributor = segment_storage_.GetSegmentIdDistributor(2 * idx, 2 * idx + 1);
+
+ EdgeId new_edge = SyncAddEdge(g_.EdgeStart(edges.front()), g_.EdgeEnd(edges.back()), MergeSequences(g_, edges), id_distributor);
+
+ CallHandlers(edges, new_edge);
+
+ VertexId final = g_.EdgeEnd(edges.back());
+ UnlinkEdge(init, edges.front());
+ for (VertexId v : VerticesToDelete(edges/*to_compress*/)) {
+ UnlinkEdges(v);
+ }
+
+ if (g_.conjugate(new_edge) != new_edge) {
+ UnlinkEdge(g_.conjugate(final), g_.conjugate(edges.back()));
+ }
+
+ for (EdgeId e : EdgesToDelete(edges)) {
+ helper_.DeleteUnlinkedEdge(e);
+ }
+ }
+ }
+
+    //a vertex is not consistent if the path has already been compressed or is being compressed right now
+ //not needed here, but could check if vertex is fully isolated
+ bool CheckConsistent(VertexId v) const {
+ //todo change to incoming edge count
+ return g_.OutgoingEdgeCount(g_.conjugate(v)) > 0;
+ }
+
+ //long, but safe way to get left neighbour
+ //heavily relies on the current graph structure!
+ VertexId LockingGetInit(VertexId v) {
+ VertexLockT lock(v);
+ if (!CheckConsistent(v))
+ return VertexId(0);
+
+ //works even if this edge is already unlinked from the vertex =)
+ VERIFY(g_.CheckUniqueIncomingEdge(v));
+ return g_.EdgeStart(g_.GetUniqueIncomingEdge(v));
+ }
+
+public:
+
+ ParallelCompressor(Graph& g)
+ : g_(g),
+ helper_(g_.GetConstructionHelper()) {
+
+ }
+
+ //returns true iff v is the "leftmost" vertex to compress in the chain
+ bool IsOfInterest(VertexId v) const {
+ return !IsBranching(v) && IsBranching(g_.EdgeStart(g_.GetUniqueIncomingEdge(v)));
+ }
+
+ void PrepareForProcessing(size_t interesting_cnt) {
+ segment_storage_ = g_.GetGraphIdDistributor().Reserve(interesting_cnt * 2);
+ }
+
+ bool Process(VertexId v, size_t idx) {
+ VertexId init = LockingGetInit(v);
+ if (init != VertexId(0))
+ ProcessBranching(v, init, idx);
+ return false;
+ }
+
+ bool ShouldFilterConjugate() const {
+ return false;
+ }
+
+};
+
+
+//todo add conjugate filtration
+template<class Graph, class ElementType>
+class AlgorithmRunner {
+ const Graph& g_;
+
+ template<class Algo, class It>
+ bool ProcessBucket(Algo& algo, It begin, It end) {
+ bool changed = false;
+ for (auto it = begin; it != end; ++it) {
+ changed |= algo.Process(*it);
+ }
+ return changed;
+ }
+
+public:
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ AlgorithmRunner(Graph& g)
+ : g_(g) {
+
+ }
+
+ template<class Algo, class ItVec>
+ bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
+ DEBUG("Running from " << chunk_iterators.size() - 1 << "chunks");
+ VERIFY(chunk_iterators.size() > 1);
+ bool changed = false;
+ #pragma omp parallel for schedule(guided) reduction(|:changed)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ changed |= ProcessBucket(algo, chunk_iterators[i], chunk_iterators[i + 1]);
+ }
+ DEBUG("Finished");
+ return changed;
+ }
+private:
+ DECL_LOGGER("AlgorithmRunner")
+ ;
+};
+
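+//Two-phase runner: interesting elements are first collected per chunk (optionally filtering one element of
+//each conjugate pair by id), then processed in parallel with stable per-element indices.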
+template<class Graph, class ElementType>
+class TwoStepAlgorithmRunner {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+ const bool filter_conjugate_;
+ std::vector<std::vector<ElementType>> elements_of_interest_;
+
+ template<class Algo>
+ bool ProcessBucket(Algo& algo, const std::vector<ElementType>& bucket, size_t idx_offset) const {
+ bool changed = false;
+ for (ElementType el : bucket) {
+ changed |= algo.Process(el, idx_offset++);
+ }
+ return changed;
+ }
+
+ template<class Algo>
+ bool Process(Algo& algo) const {
+ std::vector<size_t> cumulative_bucket_sizes;
+ cumulative_bucket_sizes.push_back(0);
+ for (const auto& bucket : elements_of_interest_) {
+ cumulative_bucket_sizes.push_back(cumulative_bucket_sizes.back() + bucket.size());
+ }
+ DEBUG("Preparing for processing");
+ algo.PrepareForProcessing(cumulative_bucket_sizes.back());
+ bool changed = false;
+ DEBUG("Processing buckets");
+ #pragma omp parallel for schedule(guided) reduction(|:changed)
+ for (size_t i = 0; i < elements_of_interest_.size(); ++i) {
+ changed |= ProcessBucket(algo, elements_of_interest_[i], cumulative_bucket_sizes[i]);
+ }
+ return changed;
+ }
+
+ template<class Algo>
+ void CountElement(Algo& algo, ElementType el, size_t bucket) {
+ if (filter_conjugate_ && g_.conjugate(el) < el)
+ return;
+ if (algo.IsOfInterest(el)) {
+ TRACE("Element " << g_.str(el) << " is of interest");
+ elements_of_interest_[bucket].push_back(el);
+ } else {
+ TRACE("Element " << g_.str(el) << " is not interesting");
+ }
+ }
+
+ template<class Algo, class It>
+ void CountAll(Algo& algo, It begin, It end, size_t bucket) {
+ for (auto it = begin; !(it == end); ++it) {
+ CountElement(algo, *it, bucket);
+ }
+ }
+
+public:
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ //conjugate elements are filtered based on ids
+    //should be used only when both conjugate elements are simultaneously either interesting or uninteresting
+ //fixme filter_conjugate is redundant
+ TwoStepAlgorithmRunner(Graph& g, bool filter_conjugate)
+ : g_(g),
+ filter_conjugate_(filter_conjugate) {
+
+ }
+
+ template<class Algo, class ItVec>
+ bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
+ DEBUG("Started running from " << chunk_iterators.size() - 1 << " chunks");
+ VERIFY(algo.ShouldFilterConjugate() == filter_conjugate_);
+ VERIFY(chunk_iterators.size() > 1);
+ elements_of_interest_.clear();
+ elements_of_interest_.resize(chunk_iterators.size() - 1);
+ DEBUG("Searching elements of interest");
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
+ CountAll(algo, chunk_iterators[i], chunk_iterators[i + 1], i);
+ }
+ DEBUG("Processing");
+ return Process(algo);
+ }
+
+// template<class Algo, class It>
+// void RunFromIterator(Algo& algo, It begin, It end) {
+// RunFromChunkIterators(algo, std::vector<It> { begin, end });
+// }
+private:
+ DECL_LOGGER("TwoStepAlgorithmRunner")
+ ;
+};
+
+template<class Graph, class ElementType>
+class SemiParallelAlgorithmRunner {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph& g_;
+
+public:
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ SemiParallelAlgorithmRunner(Graph& g)
+ : g_(g) {
+
+ }
+
+ template<class Algo, class ItVec, class Comparator = std::less<ElementType>>
+ bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators,
+ const Comparator& comp = Comparator()) {
+ VERIFY(chunk_iterators.size() > 1);
+ omnigraph::SmartSetIterator<Graph, ElementType, Comparator> it(g_, false, comp);
+
+ omnigraph::FindInterestingFromChunkIterators(chunk_iterators,
+ [&](ElementType el) {return algo.IsOfInterest(el);},
+ [&](ElementType el) {it.push(el);});
+
+ bool changed = false;
+ for (; !it.IsEnd(); ++it) {
+ changed |= algo.Process(*it);
+ }
+ return changed;
+ }
+
+private:
+ DECL_LOGGER("SemiParallelAlgorithmRunner");
+};
+
+template<class Graph>
+class SemiParallelEdgeRemovingAlgorithm {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ Graph& g_;
+ func::TypedPredicate<EdgeId> condition_;
+ omnigraph::EdgeRemover<Graph> edge_remover_;
+
+public:
+ SemiParallelEdgeRemovingAlgorithm(Graph& g,
+ func::TypedPredicate<EdgeId> condition,
+ std::function<void(EdgeId)> removal_handler = 0) :
+ g_(g), condition_(condition), edge_remover_(g, removal_handler) {
+ }
+
+ bool IsOfInterest(EdgeId e) const {
+ return condition_(e);
+ }
+
+ bool Process(EdgeId e) {
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+};
+
+template<class Graph, class AlgoRunner, class Algo>
+bool RunVertexAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
+ return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper<Graph, typename Graph::VertexId>(g).Chunks(chunk_cnt));
+}
+
+template<class Graph, class AlgoRunner, class Algo>
+bool RunEdgeAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
+ return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper<Graph, typename Graph::EdgeId>(g).Chunks(chunk_cnt));
+}
+
+template<class Graph>
+void ParallelCompress(Graph &g, size_t chunk_cnt, bool loop_post_compression = true) {
+ INFO("Parallel compression");
+ debruijn::simplification::ParallelCompressor<Graph> compressor(g);
+ TwoStepAlgorithmRunner<Graph, typename Graph::VertexId> runner(g, false);
+ RunVertexAlgorithm(g, runner, compressor, chunk_cnt);
+
+ //have to call cleaner to get rid of new isolated vertices
+ omnigraph::Cleaner<Graph>(g, chunk_cnt).Run();
+
+ if (loop_post_compression) {
+ INFO("Launching post-compression to compress loops");
+ omnigraph::CompressAllVertices(g, chunk_cnt);
+ }
+}
+
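+//Usage sketch (illustrative only; the parameter values below are placeholders, not recommended settings):
+//  bool changed = ParallelClipTips(g, /*max_length*/ 2 * g.k(), /*max_coverage*/ 2.0, /*chunk_cnt*/ 16);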
+template<class Graph>
+bool ParallelClipTips(Graph &g,
+ size_t max_length,
+ double max_coverage,
+ size_t chunk_cnt,
+ omnigraph::EdgeRemovalHandlerF<Graph> removal_handler = nullptr) {
+ INFO("Parallel tip clipping");
+
+ debruijn::simplification::ParallelTipClippingFunctor<Graph> tip_clipper(g,
+ max_length, max_coverage, removal_handler);
+
+ AlgorithmRunner<Graph, typename Graph::VertexId> runner(g);
+
+ RunVertexAlgorithm(g, runner, tip_clipper, chunk_cnt);
+
+ ParallelCompress(g, chunk_cnt);
+ //Cleaner is launched inside ParallelCompression
+ //CleanGraph(g, info.chunk_cnt());
+
+ return true;
+}
+
+//template<class Graph>
+//bool ParallelRemoveBulges(Graph &g,
+// const config::debruijn_config::simplification::bulge_remover &br_config,
+// size_t /*read_length*/,
+// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
+// INFO("Parallel bulge remover");
+//
+// size_t max_length = LengthThresholdFinder::MaxBulgeLength(
+// g.k(), br_config.max_bulge_length_coefficient,
+// br_config.max_additive_length_coefficient);
+//
+// DEBUG("Max bulge length " << max_length);
+//
+// debruijn::simplification::ParallelSimpleBRFunctor<Graph> bulge_remover(g,
+// max_length,
+// br_config.max_coverage,
+// br_config.max_relative_coverage,
+// br_config.max_delta,
+// br_config.max_relative_delta,
+// removal_handler);
+// for (VertexId v : g) {
+// bulge_remover(v);
+// }
+//
+// Compress(g);
+// return true;
+//}
+
+template<class Graph>
+bool ParallelEC(Graph &g,
+ size_t max_length,
+ double max_coverage,
+ size_t chunk_cnt,
+ omnigraph::EdgeRemovalHandlerF<Graph> removal_handler = nullptr) {
+ INFO("Parallel ec remover");
+
+ debruijn::simplification::CriticalEdgeMarker<Graph> critical_marker(g, chunk_cnt);
+ critical_marker.PutMarks();
+
+ debruijn::simplification::ParallelLowCoverageFunctor<Graph> ec_remover(g,
+ max_length,
+ max_coverage,
+ removal_handler);
+
+ TwoStepAlgorithmRunner<Graph, typename Graph::EdgeId> runner(g, true);
+
+ RunEdgeAlgorithm(g, runner, ec_remover, chunk_cnt);
+
+ critical_marker.ClearMarks();
+
+ ParallelCompress(g, chunk_cnt);
+ //called in parallel compress
+ //CleanGraph(g, info.chunk_cnt());
+ return true;
+}
+
+}
+
+}
diff --git a/src/common/modules/simplification/relative_coverage_remover.hpp b/src/common/modules/simplification/relative_coverage_remover.hpp
new file mode 100644
index 0000000..177f5b6
--- /dev/null
+++ b/src/common/modules/simplification/relative_coverage_remover.hpp
@@ -0,0 +1,690 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/standard_base.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "visualization/graph_colorer.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "assembly_graph/graph_support/comparators.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/components/splitters.hpp"
+
+namespace omnigraph {
+
+namespace simplification {
+
+namespace relative_coverage {
+
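+//A growing subgraph around a seed edge: tracks its edges, inner vertices, current border, terminating
+//vertices, cumulative length, and whether any dead ends have been absorbed.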
+template<class Graph>
+class Component {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ set<EdgeId> edges_;
+ set<VertexId> inner_vertices_;
+ set<VertexId> border_;
+ set<VertexId> terminating_vertices_;
+    //maybe use something more sophisticated in the future
+ size_t cumm_length_;
+ bool contains_deadends_;
+
+    //if edge start == edge end == v, returns v
+ VertexId OppositeEnd(EdgeId e, VertexId v) const {
+ VERIFY(g_.EdgeStart(e) == v
+ || g_.EdgeEnd(e) == v);
+// VERIFY(remover_.g.EdgeStart(e) != remover_.g.EdgeEnd(e));
+ if (g_.EdgeStart(e) == v) {
+ return g_.EdgeEnd(e);
+ } else {
+ return g_.EdgeStart(e);
+ }
+ }
+
+ void RemoveFromBorder(VertexId v) {
+ size_t cnt = border_.erase(v);
+ VERIFY(cnt);
+ }
+
+public:
+
+ Component(const Graph& g, EdgeId e) : g_(g), cumm_length_(0), contains_deadends_(false) {
+ edges_.insert(e);
+ cumm_length_ += g_.length(e);
+ border_.insert(g.EdgeStart(e));
+ border_.insert(g.EdgeEnd(e));
+ }
+
+ void MakeInner(VertexId v) {
+ VERIFY(border_.count(v) > 0);
+ if (g_.IsDeadEnd(v) || g_.IsDeadStart(v)) {
+ contains_deadends_ = true;
+ }
+ inner_vertices_.insert(v);
+ for (EdgeId e : g_.IncidentEdges(v)) {
+ //seems to correctly handle loops
+ if (edges_.count(e) == 0) {
+ edges_.insert(e);
+ cumm_length_ += g_.length(e);
+ VertexId other_end = OppositeEnd(e, v);
+ if (inner_vertices_.count(other_end) == 0) {
+ border_.insert(other_end);
+ }
+ }
+ }
+ RemoveFromBorder(v);
+ }
+
+ void TerminateOnVertex(VertexId v) {
+ terminating_vertices_.insert(v);
+ RemoveFromBorder(v);
+ }
+
+ VertexId NextBorderVertex() const {
+ return *border_.begin();
+ }
+
+ bool IsBorderEmpty() const {
+ return border_.empty();
+ }
+
+ const set<EdgeId>& edges() const {
+ return edges_;
+ }
+
+ bool contains(EdgeId e) const {
+ return edges_.count(e) > 0;
+ }
+
+ const set<VertexId>& terminating_vertices() const {
+ return terminating_vertices_;
+ }
+
+ set<EdgeId> terminating_edges() const {
+ set<EdgeId> answer;
+ for (VertexId v : terminating_vertices()) {
+ for (EdgeId e : g_.IncidentEdges(v)) {
+ if (contains(e)) {
+ answer.insert(e);
+ }
+ }
+ }
+ return answer;
+ }
+
+ //terminating edges, going into the component
+ set<EdgeId> terminating_in_edges() const {
+ set<EdgeId> answer;
+ for (VertexId v : terminating_vertices()) {
+ for (EdgeId e : g_.OutgoingEdges(v)) {
+ if (contains(e)) {
+ answer.insert(e);
+ }
+ }
+ }
+ return answer;
+ }
+
+ //terminating edges, going out of the component
+ set<EdgeId> terminating_out_edges() const {
+ set<EdgeId> answer;
+ for (VertexId v : terminating_vertices()) {
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ if (contains(e)) {
+ answer.insert(e);
+ }
+ }
+ }
+ return answer;
+ }
+
+ const Graph& g() const {
+ return g_;
+ }
+
+ size_t inner_vertex_cnt() const {
+ return inner_vertices_.size();
+ }
+
+ size_t length() const {
+ return cumm_length_;
+ }
+
+ bool contains_deadends() const {
+ return contains_deadends_;
+ }
+};
+
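+//Compares the local (flanking) coverage of an edge around a vertex with that of the neighbouring edges;
+//a neighbour counts as highly covered if its local coverage exceeds base_coverage * min_coverage_gap_.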
+template<class Graph>
+class RelativeCoverageHelper {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ const FlankingCoverage<Graph>& flanking_cov_;
+ double min_coverage_gap_;
+
+public:
+ RelativeCoverageHelper(const Graph& g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ double min_coverage_gap)
+ : g_(g),
+ flanking_cov_(flanking_cov),
+ min_coverage_gap_(min_coverage_gap) {
+ VERIFY(math::gr(min_coverage_gap, 1.));
+ }
+
+ double LocalCoverage(EdgeId e, VertexId v) const {
+ double ans = flanking_cov_.LocalCoverage(e, v);
+ DEBUG("Local coverage of edge " << g_.str(e) << " around vertex " << g_.str(v) << " was " << ans);
+ return ans;
+ }
+
+ template<class EdgeContainer>
+ double MaxLocalCoverage(const EdgeContainer& edges, VertexId v) const {
+ double answer = 0.0;
+ for (EdgeId e : edges) {
+ answer = max(answer, LocalCoverage(e, v));
+ }
+ return answer;
+ }
+
+ template<class EdgeContainer>
+ bool CheckAnyHighlyCovered(const EdgeContainer& edges, VertexId v,
+ double base_coverage) const {
+ return math::gr(MaxLocalCoverage(edges, v),
+ base_coverage * min_coverage_gap_);
+ }
+
+ bool AnyHighlyCoveredOnBothSides(VertexId v, double base_coverage) const {
+ return CheckAnyHighlyCovered(g_.IncomingEdges(v), v, base_coverage) &&
+ CheckAnyHighlyCovered(g_.OutgoingEdges(v), v, base_coverage);
+ }
+
+ bool AnyHighlyCoveredOnFourSides(EdgeId e) const {
+ return AnyHighlyCoveredOnBothSides(g_.EdgeStart(e), LocalCoverage(e, g_.EdgeStart(e))) &&
+ AnyHighlyCoveredOnBothSides(g_.EdgeEnd(e), LocalCoverage(e, g_.EdgeEnd(e)));
+ }
+
+ double RelativeCoverageToReport(VertexId v, double base_coverage) const {
+ return std::min(MaxLocalCoverage(g_.OutgoingEdges(v), v),
+ MaxLocalCoverage(g_.IncomingEdges(v), v))
+ / base_coverage;
+ }
+
+private:
+ DECL_LOGGER("RelativeCoverageHelper");
+};
+
+template<class Graph>
+class RelativeCovDisconnectionCondition : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const RelativeCoverageHelper<Graph> rel_helper_;
+ const double diff_mult_;
+
+    //Total length of the highly-covered neighbourhood
+    //We believe that if a highly-covered component is small, it is likely to be a repeat or a loop
+ const size_t min_neighbourhood_size_;
+public:
+ RelativeCovDisconnectionCondition(const Graph& g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ double diff_mult,
+ size_t min_neighbourhood_size) :
+ base(g),
+ rel_helper_(g, flanking_cov, diff_mult),
+ diff_mult_(diff_mult),
+ min_neighbourhood_size_(min_neighbourhood_size) {
+ }
+
+ bool Check(EdgeId e) const override {
+ VertexId v = this->g().EdgeStart(e);
+ double coverage_edge_around_v = rel_helper_.LocalCoverage(e, v);
+ DEBUG("Local flanking coverage - " << coverage_edge_around_v);
+ DEBUG("Max local coverage incoming - " << rel_helper_.MaxLocalCoverage(this->g().IncomingEdges(v), v));
+ DEBUG("Max local coverage outgoing - " << rel_helper_.MaxLocalCoverage(this->g().OutgoingEdges(v), v));
+ return rel_helper_.AnyHighlyCoveredOnBothSides(v, coverage_edge_around_v) &&
+ HighCoverageComponentFinder<Graph>(this->g(), this->g().coverage(e) * diff_mult_)
+ .EdgeSummaryLength(v) >= min_neighbourhood_size_;
+ }
+
+private:
+ DECL_LOGGER("RelativeCovDisconnectionCondition");
+};
+
+namespace component_remover {
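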
+template<class Graph>
+class LongestPathFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Component<Graph>& component_;
+ const Graph& g_;
+ map<VertexId, int> max_distance_;
+ vector<VertexId> vertex_stack_;
+ bool cycle_detected_;
+
+    //note: distance is modified by this method!
+ bool TryGetMaxDistance(VertexId v, int& distance) {
+ if (max_distance_.count(v) > 0) {
+ distance = max_distance_[v];
+ return true;
+ }
+
+ //minus infinity for incoming tips
+ distance = std::numeric_limits<int>::min();
+ for (EdgeId e : g_.IncomingEdges(v)) {
+ VertexId start = g_.EdgeStart(e);
+ if (component_.contains(e)) {
+ if (max_distance_.count(start) == 0) {
+ if (std::find(vertex_stack_.begin(), vertex_stack_.end(), start) != vertex_stack_.end()) {
+ cycle_detected_ = true;
+ }
+ vertex_stack_.push_back(start);
+ return false;
+ } else {
+ distance = std::max(distance, max_distance_[start] + int(g_.length(e)));
+ }
+ }
+ }
+        //todo think...
+        //currently the whole length of a zig-zag path
+        //through several terminating vertices is counted
+ if (component_.terminating_vertices().count(v) > 0) {
+ distance = std::max(distance, 0);
+ }
+ return true;
+ }
+
+ void ProcessVertex(VertexId init_v) {
+ vertex_stack_.push_back(init_v);
+ while (!vertex_stack_.empty()) {
+ if (cycle_detected_)
+ return;
+
+ VertexId v = vertex_stack_.back();
+ int max_dist = 0;
+ if (TryGetMaxDistance(v, max_dist)) {
+ max_distance_[v] = max_dist;
+ vertex_stack_.pop_back();
+ }
+ }
+ }
+
+public:
+ LongestPathFinder(const Component<Graph>& component)
+ : component_(component), g_(component.g()), cycle_detected_(false) {
+ }
+
+    //returns -1u if the component contains a cycle or there is no path between terminating vertices
+ size_t Find() {
+ int answer = 0;
+ for (VertexId v : component_.terminating_vertices()) {
+ ProcessVertex(v);
+ if (cycle_detected_)
+ return -1u;
+ VERIFY(max_distance_.count(v) > 0);
+ answer = std::max(answer, get(max_distance_, v));
+ }
+ VERIFY(answer >= 0);
+ if (answer == 0)
+ return -1u;
+ return size_t(answer);
+ }
+};
+
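+//Sanity checks for a candidate component: number of inner vertices, total length (a separate tip-allowing
+//bound is used when the component contains dead ends), longest path between terminating vertices, and
+//maximum edge coverage.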
+template<class Graph>
+class ComponentChecker {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ size_t vertex_count_limit_;
+ size_t length_bound_;
+ size_t tip_allowing_length_bound_;
+ size_t longest_connecting_path_bound_;
+ double max_coverage_;
+
+ bool CoverageCheck(const Component<Graph>& component) const {
+ for (EdgeId e : component.edges()) {
+ if (math::gr(g_.coverage(e), max_coverage_)) {
+ TRACE("Too high coverage! Component contains highly covered edge " << g_.str(e)
+ << " of coverage " << g_.coverage(e)
+ << " while threshold was "
+ << max_coverage_);
+ return false;
+ }
+ }
+ return true;
+ }
+
+public:
+ ComponentChecker(const Graph& g, size_t vertex_count_limit, size_t length_bound,
+ size_t tip_allowing_length_bound,
+ size_t longest_connecting_path_bound,
+ double max_coverage)
+ : g_(g), vertex_count_limit_(vertex_count_limit),
+ length_bound_(length_bound),
+ tip_allowing_length_bound_(tip_allowing_length_bound),
+ longest_connecting_path_bound_(longest_connecting_path_bound),
+ max_coverage_(max_coverage) {
+ }
+
+ bool SizeCheck(const Component<Graph>& component) const {
+ if (component.inner_vertex_cnt() > vertex_count_limit_) {
+ TRACE("Too many vertices : " << component.inner_vertex_cnt() << " ! More than " << vertex_count_limit_);
+ return false;
+ }
+ return true;
+ }
+
+ bool FullCheck(const Component<Graph>& component) const {
+ TRACE("Performing full check of the component");
+ size_t longest_connecting_path = LongestPathFinder<Graph>(component).Find();
+ if (longest_connecting_path != -1u) {
+ if (longest_connecting_path >= longest_connecting_path_bound_) {
+ TRACE("Length of longest path: " << longest_connecting_path << "; threshold: "
+ << longest_connecting_path_bound_);
+ return false;
+ }
+ } else {
+ TRACE("Failed to find longest connecting path (check for cycles)");
+ }
+ if (!component.contains_deadends()
+ && component.length() > length_bound_) {
+ TRACE("Too long component of length " << component.length() << "! Longer than length bound "
+ << length_bound_);
+ return false;
+ } else if (component.length() > tip_allowing_length_bound_) {
+ TRACE("Too long component of length " << component.length() << "! Longer than tip allowing length bound "
+ << tip_allowing_length_bound_);
+ return false;
+ }
+
+ return SizeCheck(component) && CoverageCheck(component);
+ }
+
+private:
+ DECL_LOGGER("RelativelyLowCoveredComponentChecker");
+};
+
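+//Expands the component from the seed edge: border vertices with a sufficient coverage gap become terminating,
+//the rest are made inner (absorbing their incident edges), until the border is empty or one of the checks fails.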
+template<class Graph>
+class InnerComponentSearcher {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ const Graph& g_;
+ const RelativeCoverageHelper<Graph>& rel_helper_;
+ const ComponentChecker<Graph>& checker_;
+ Component<Graph> component_;
+
+public:
+ InnerComponentSearcher(const Graph& g,
+ const RelativeCoverageHelper<Graph>& rel_helper,
+ const ComponentChecker<Graph>& checker,
+ EdgeId first_edge)
+ : g_(g), rel_helper_(rel_helper), checker_(checker),
+ component_(g_, first_edge) {
+ }
+
+ bool FindComponent() {
+ while (!component_.IsBorderEmpty()) {
+ if (!checker_.SizeCheck(component_))
+ return false;
+
+ VertexId v = component_.NextBorderVertex();
+
+ TRACE("Checking if vertex " << g_.str(v) << " is terminating.");
+ //checking if there is a sufficient coverage gap
+ if (!IsTerminateVertex(v)) {
+ TRACE("Not terminating, adding neighbourhood");
+ component_.MakeInner(v);
+ if (component_.terminating_vertices().count(v) > 0) {
+ TRACE("Terminating vertex classified as non-terminating");
+ return false;
+ }
+ } else {
+ TRACE("Terminating");
+ component_.TerminateOnVertex(v);
+ }
+ }
+
+ return checker_.FullCheck(component_);
+ }
+
+ const Component<Graph>& component() const {
+ return component_;
+ }
+
+private:
+
+ bool IsTerminateVertex(VertexId v) const {
+ double base_coverage = rel_helper_.MaxLocalCoverage(
+ RetainEdgesFromComponent(g_.IncidentEdges(v)), v);
+ return CheckAnyFilteredHighlyCovered(g_.OutgoingEdges(v),
+ v, base_coverage)
+ && CheckAnyFilteredHighlyCovered(
+ g_.IncomingEdges(v), v, base_coverage);
+ }
+
+ template<class EdgeContainer>
+ bool CheckAnyFilteredHighlyCovered(const EdgeContainer& edges,
+ VertexId v,
+ double base_coverage) const {
+ return rel_helper_.CheckAnyHighlyCovered(
+ FilterEdgesFromComponent(edges), v, base_coverage);
+ }
+
+ template<class EdgeContainer>
+ vector<EdgeId> FilterEdgesFromComponent(
+ const EdgeContainer& edges) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : edges) {
+ if (!component_.contains(e)) {
+ answer.push_back(e);
+ }
+ }
+ return answer;
+ }
+
+ template<class EdgeContainer>
+ vector<EdgeId> RetainEdgesFromComponent(
+ const EdgeContainer& edges) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : edges) {
+ if (component_.contains(e)) {
+ answer.push_back(e);
+ }
+ }
+ return answer;
+ }
+
+ DECL_LOGGER("InnerComponentSearcher");
+};
+
+template<class Graph>
+class RelativeCovComponentFinder {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph& g_;
+ RelativeCoverageHelper<Graph> rel_helper_;
+ size_t length_bound_;
+ size_t tip_allowing_length_bound_;
+ size_t longest_connecting_path_bound_;
+ double max_coverage_;
+ //bound on the number of inner vertices
+ size_t vertex_count_limit_;
+ std::string vis_dir_;
+
+ mutable std::atomic_uint fail_cnt_;
+ mutable std::atomic_uint succ_cnt_;
+
+ void VisualizeNontrivialComponent(const set<typename Graph::EdgeId>& edges, bool success) const {
+ auto colorer = visualization::graph_colorer::DefaultColorer(g_);
+ auto edge_colorer = make_shared<visualization::graph_colorer::CompositeEdgeColorer<Graph>>("black");
+ edge_colorer->AddColorer(colorer);
+ edge_colorer->AddColorer(make_shared<visualization::graph_colorer::SetColorer<Graph>>(g_, edges, "green"));
+ // shared_ptr<visualization::graph_colorer::GraphColorer<Graph>>
+ auto resulting_colorer = make_shared<visualization::graph_colorer::CompositeGraphColorer<Graph>>(colorer, edge_colorer);
+
+ visualization::graph_labeler::StrGraphLabeler<Graph> str_labeler(g_);
+ visualization::graph_labeler::CoverageGraphLabeler<Graph> cov_labler(g_);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(str_labeler, cov_labler);
+
+ if (edges.size() > 1) {
+ set<typename Graph::VertexId> vertices;
+ for (auto e : edges) {
+ vertices.insert(g_.EdgeStart(e));
+ vertices.insert(g_.EdgeEnd(e));
+ }
+
+ auto filename = success ? vis_dir_ + "/success/" + ToString(succ_cnt_++) : vis_dir_ + "/fail/" + ToString(fail_cnt_++);
+ visualization::visualization_utils::WriteComponent(
+ ComponentCloser<Graph>(g_, 0).CloseComponent(
+ GraphComponent<Graph>::FromVertices(g_, vertices)),
+ filename + ".dot", colorer, labeler);
+ }
+ }
+
+public:
+ RelativeCovComponentFinder(Graph& g,
+ const FlankingCoverage<Graph>& flanking_cov,
+ double min_coverage_gap,
+ size_t length_bound,
+ size_t tip_allowing_length_bound,
+ size_t longest_connecting_path_bound,
+ double max_coverage,
+ size_t vertex_count_limit,
+ const std::string& vis_dir)
+ : g_(g),
+ rel_helper_(g, flanking_cov, min_coverage_gap),
+ length_bound_(length_bound),
+ tip_allowing_length_bound_(tip_allowing_length_bound),
+ longest_connecting_path_bound_(longest_connecting_path_bound),
+ max_coverage_(max_coverage),
+ vertex_count_limit_(vertex_count_limit),
+ vis_dir_(vis_dir),
+ fail_cnt_(0),
+ succ_cnt_(0) {
+ VERIFY(math::gr(min_coverage_gap, 1.));
+ VERIFY(tip_allowing_length_bound >= length_bound);
+ TRACE("Coverage gap " << min_coverage_gap);
+ if (!vis_dir_.empty()) {
+ path::make_dirs(vis_dir_);
+ path::make_dirs(vis_dir_ + "/success/");
+ path::make_dirs(vis_dir_ + "/fail/");
+ }
+ }
+
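+ // Per-edge entry point: obvious tips are rejected, the local coverage around the start
+ // vertex of e is computed, and only if relatively highly covered edges are present on
+ // both sides is the bounded component search above launched. On success the detected
+ // component is returned for removal, otherwise boost::none (optionally dumping a picture
+ // of the failed candidate).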
+ boost::optional<Component<Graph>> operator()(EdgeId e) const {
+ TRACE("Processing edge " << g_.str(e));
+
+ //here we use that the graph is conjugate!
+ VertexId v = g_.EdgeStart(e);
+ if (g_.IncomingEdgeCount(v) == 0 || g_.OutgoingEdgeCount(v) < 2/*==1*/) {
+ TRACE("Tip");
+ return boost::none;
+ }
+
+ double local_cov = rel_helper_.LocalCoverage(e, v);
+
+ TRACE("Local coverage around start " << g_.str(v) << " is " << local_cov);
+
+ //since min_coverage_gap_ > 1, we don't need to think about e here
+ TRACE("Checking presence of highly covered edges around start")
+ if (rel_helper_.AnyHighlyCoveredOnBothSides(v, local_cov)) {
+ TRACE("Looking for component");
+ ComponentChecker<Graph> checker(g_, vertex_count_limit_, length_bound_,
+ tip_allowing_length_bound_,
+ longest_connecting_path_bound_, max_coverage_);
+ //case of e being loop is handled implicitly!
+ InnerComponentSearcher<Graph> component_searcher(
+ g_, rel_helper_, checker, e);
+
+ if (component_searcher.FindComponent()) {
+ TRACE("Deleting component");
+ return boost::optional<Component<Graph>>(component_searcher.component());
+ } else {
+ TRACE("Failed to find component");
+ if (!vis_dir_.empty()) {
+ TRACE("Outputting image");
+ VisualizeNontrivialComponent(component_searcher.component().edges(), false);
+ }
+ }
+ } else {
+ TRACE("No highly covered edges around");
+ }
+ return boost::none;
+ }
+
+private:
+ DECL_LOGGER("RelativeCovComponentFinder")
+};
+} //namespace component_remover
+
+//currently works with conjugate graphs only (due to the assumption in the outer cycle)
+template<class Graph>
+class RelativeCoverageComponentRemover : public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId, CoverageComparator<Graph>> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId, CoverageComparator<Graph>> base;
+ typedef typename ComponentRemover<Graph>::HandlerF HandlerF;
+
+ component_remover::RelativeCovComponentFinder<Graph> finder_;
+ ComponentRemover<Graph> component_remover_;
+
+public:
+ RelativeCoverageComponentRemover(
+ Graph& g,
+ size_t chunk_cnt,
+ const FlankingCoverage<Graph>& flanking_cov,
+ double min_coverage_gap,
+ size_t length_bound,
+ size_t tip_allowing_length_bound,
+ size_t longest_connecting_path_bound,
+ double max_coverage = std::numeric_limits<double>::max(),
+ HandlerF handler_function = nullptr, size_t vertex_count_limit = 10,
+ std::string vis_dir = "")
+ : base(g, nullptr, /*canonical only*/ false,
+ CoverageComparator<Graph>(g), /*track changes*/ false),
+ finder_(g, flanking_cov,
+ min_coverage_gap, length_bound,
+ tip_allowing_length_bound, longest_connecting_path_bound,
+ max_coverage, vertex_count_limit, vis_dir),
+ component_remover_(g, handler_function) {
+ this->interest_el_finder_ = std::make_shared<ParallelInterestingElementFinder<Graph, EdgeId>>(
+ [&](EdgeId e) { return finder_(e); }, chunk_cnt);
+ }
+
+protected:
+
+ bool Process(EdgeId e) override {
+ DEBUG("Processing edge " << this->g().str(e));
+ auto opt_component = finder_(e);
+ if (!opt_component) {
+ DEBUG("Failed to detect component starting with edge " << this->g().str(e));
+ return false;
+ }
+ VERIFY(opt_component->edges().size());
+ DEBUG("Detected component edge cnt: " << opt_component->edges().size());
+ component_remover_.DeleteComponent(opt_component->edges());
+ DEBUG("Relatively low coverage component removed");
+ return true;
+ }
+
+private:
+ DECL_LOGGER("RelativeCoverageComponentRemover");
+};
+
+}
+}
+}
diff --git a/src/common/modules/simplification/tip_clipper.hpp b/src/common/modules/simplification/tip_clipper.hpp
new file mode 100644
index 0000000..7f87d66
--- /dev/null
+++ b/src/common/modules/simplification/tip_clipper.hpp
@@ -0,0 +1,248 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "math/xmath.h"
+#include "func/func.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "sequence/sequence.hpp"
+
+#include <algorithm>
+#include <array>
+#include <set>
+
+namespace omnigraph {
+
+template<class Graph>
+class RelativeCoverageTipCondition: public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ const double max_relative_coverage_;
+
+ template<class IteratorType>
+ double MaxCompetitorCoverage(EdgeId tip, IteratorType begin, IteratorType end) const {
+ const Graph &g = this->g();
+ double result = 0;
+ for (auto it = begin; it != end; ++it) {
+ EdgeId e = *it;
+ //update if competitor edge is not loop
+ if (e != tip && g.EdgeStart(e) != g.EdgeEnd(e))
+ result = std::max(result, g.coverage(*it));
+ }
+ return result;
+ }
+
+ double MaxCompetitorCoverage(EdgeId tip) const {
+ const Graph &g = this->g();
+ VertexId start = g.EdgeStart(tip), end = g.EdgeEnd(tip);
+ auto out = g.OutgoingEdges(start);
+ auto in = g.IncomingEdges(end);
+ return std::max(
+ MaxCompetitorCoverage(tip, out.begin(), out.end()),
+ MaxCompetitorCoverage(tip, in.begin(), in.end()));
+// return std::max(
+// MaxCompetitorCoverage(tip, g.out_begin(start),
+// g.out_end(start)),
+// MaxCompetitorCoverage(tip, g.in_begin(end), g.in_end(end)));
+ }
+
+public:
+
+ RelativeCoverageTipCondition(const Graph& g, double max_relative_coverage) :
+ base(g), max_relative_coverage_(max_relative_coverage) {
+ }
+
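+ // Worked example (illustrative values): with max_relative_coverage_ = 2.0, a candidate
+ // tip of coverage 3 whose strongest non-loop competitor has coverage 10 satisfies
+ // 3 <= 2.0 * (10 + 1) and therefore passes this condition.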
+ bool Check(EdgeId e) const override {
+ //+1 is a trick to deal with edges of 0 coverage from iterative run
+ double max_coverage = MaxCompetitorCoverage(e) + 1;
+ return math::le(this->g().coverage(e),
+ max_relative_coverage_ * max_coverage);
+ }
+};
+
+template<class Graph>
+class TipCondition : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ /**
+ * This method checks if the given vertex topologically looks like the end of a tip.
+ * @param v vertex to be checked
+ * @return true if the vertex is judged to be a tip end, false otherwise.
+ */
+ bool IsTip(VertexId v) const {
+ return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
+ }
+
+public:
+ TipCondition(const Graph& g) : base(g) {
+ }
+
+ /**
+ * This method checks if the given edge topologically looks like a tip.
+ * @param e edge to be checked
+ * @return true if the edge is judged to be a tip, false otherwise.
+ */
+ bool Check(EdgeId e) const override {
+ return (IsTip(this->g().EdgeEnd(e)) || IsTip(this->g().EdgeStart(e)))
+ && (this->g().OutgoingEdgeCount(this->g().EdgeStart(e))
+ + this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) > 2);
+ }
+
+};
+
+
+template<class Graph>
+class MismatchTipCondition : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ size_t max_diff_;
+
+ size_t Hamming(EdgeId edge1, EdgeId edge2) const {
+ size_t cnt = 0;
+ Sequence seq1 = this->g().EdgeNucls(edge1);
+ Sequence seq2 = this->g().EdgeNucls(edge2);
+ size_t len = std::min(seq1.size(), seq2.size());
+ for(size_t i = this->g().k(); i < len; i++) {
+ if(seq1[i] != seq2[i])
+ cnt++;
+ }
+ return cnt;
+ }
+
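+ // An edge qualifies as a "mismatch tip" if some strictly longer alternative leaving the
+ // same start vertex differs from it in at most max_diff_ positions beyond the shared
+ // k-prefix; Check() applies this test to the edge and to its conjugate.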
+ bool InnerCheck(EdgeId e) const {
+ size_t len = this->g().length(e);
+ for (auto alt : this->g().OutgoingEdges(this->g().EdgeStart(e))) {
+ if (e != alt && len < this->g().length(alt) && Hamming(e, alt) <= max_diff_) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+public:
+ MismatchTipCondition(const Graph& g, size_t max_diff) :
+ base(g), max_diff_(max_diff) {
+ }
+
+ bool Check(EdgeId e) const override {
+ return InnerCheck(e) || InnerCheck(this->g().conjugate(e));
+ }
+
+};
+
+template<class Graph>
+class ATCondition: public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+ const double max_AT_percentage_;
+ const size_t max_tip_length_;
+ const bool check_tip_;
+
+public:
+
+ ATCondition(const Graph& g, double max_AT_percentage, size_t max_tip_length, bool check_tip) :
+ base(g), max_AT_percentage_(max_AT_percentage), max_tip_length_(max_tip_length), check_tip_(check_tip) {
+ DEBUG("check_tip: " << check_tip_);
+ }
+
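+ // Despite the name, the condition flags edges whose single most frequent nucleotide
+ // dominates the inspected region: for sufficiently short edges (and, in tip mode, only
+ // the part of the tip not shared with the rest of the graph) nucleotide occurrences are
+ // counted and the edge is reported if the top count exceeds max_AT_percentage_ of the
+ // region length.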
+ bool Check(EdgeId e) const {
+ size_t start = 0;
+ //TODO: Do we need this check?
+ if(this->g().length(e) > max_tip_length_)
+ return false;
+ size_t end = this->g().length(e) + this->g().k();
+ if (check_tip_) {
+ if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) == 0)
+ start = this->g().k();
+ else if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 0)
+ end = this->g().length(e);
+ else return false;
+ }
+ std::array<size_t, 4> counts = std::array<size_t, 4>();
+ const Sequence &s_edge = this->g().EdgeNucls(e);
+
+ for (size_t position = start; position < end; position ++) {
+ counts[s_edge[position]] ++;
+ }
+ size_t curm = *std::max_element(counts.begin(), counts.end());
+ if (curm > max_AT_percentage_ * double(end - start)) {
+ DEBUG("deleting edge" << s_edge.str());;
+ DEBUG("curm: " << curm);
+
+ DEBUG("start end cutoff" << start << " " << end << " " << max_AT_percentage_ * double(this->g().length(e)));
+
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+private:
+ DECL_LOGGER("ATCondition")
+};
+
+template<class Graph>
+func::TypedPredicate<typename Graph::EdgeId> AddTipCondition(const Graph& g,
+ func::TypedPredicate<typename Graph::EdgeId> condition) {
+ return func::And(TipCondition<Graph>(g), condition);
+}
+
+template<class Graph>
+class DeadEndCondition : public EdgeCondition<Graph> {
+ typedef EdgeCondition<Graph> base;
+
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ /**
+ * This method checks if the given vertex topologically looks like a dead end
+ * (i.e. has no incoming or no outgoing edges).
+ * @param v vertex to be checked
+ * @return true if the vertex is judged to be a dead end, false otherwise.
+ */
+ bool IsDeadEnd(VertexId v) const {
+ return this->g().IncomingEdgeCount(v) * this->g().OutgoingEdgeCount(v) == 0;
+ }
+
+public:
+ DeadEndCondition(const Graph& g) : base(g) {
+ }
+
+ /**
+ * This method checks if the given edge topologically looks like a dead end
+ * (i.e. starts or ends in a dead-end vertex).
+ * @param e edge to be checked
+ * @return true if the edge is judged to be a dead end, false otherwise.
+ */
+ /*virtual*/
+
+ //Careful - no alternative path check!
+ bool Check(EdgeId e) const {
+ return (IsDeadEnd(this->g().EdgeEnd(e)) || IsDeadEnd(this->g().EdgeStart(e)))
+ && (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e))
+ + this->g().IncomingEdgeCount(this->g().EdgeStart(e)) >= 1);
+ }
+
+ private:
+ DECL_LOGGER("DeadEndCondition");
+
+};
+
+template<class Graph>
+func::TypedPredicate<typename Graph::EdgeId> AddDeadEndCondition(const Graph& g,
+ func::TypedPredicate<typename Graph::EdgeId> condition) {
+ return func::And(DeadEndCondition<Graph>(g), condition);
+}
+
+} // namespace omnigraph
diff --git a/src/common/modules/simplification/topological_edge_conditions.hpp b/src/common/modules/simplification/topological_edge_conditions.hpp
new file mode 100644
index 0000000..88164a9
--- /dev/null
+++ b/src/common/modules/simplification/topological_edge_conditions.hpp
@@ -0,0 +1,286 @@
+#pragma once
+
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/core/directions.hpp"
+
+namespace omnigraph {
+
+template<class Graph, class PathFinder>
+class PathLengthLowerBound : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ PathFinder path_finder_;
+ size_t min_length_;
+
+ ForwardDirection<Graph> forward_;
+ BackwardDirection<Graph> backward_;
+
+ size_t CumulativePathLength(EdgeId e, const AbstractDirection<Graph> &direction) const {
+ return CumulativeLength(this->g(), path_finder_(e, direction));
+ }
+
+public:
+ PathLengthLowerBound(const Graph &g, const PathFinder &path_finder,
+ size_t min_length)
+ : base(g),
+ path_finder_(path_finder),
+ min_length_(min_length),
+ forward_(g),
+ backward_(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ size_t forward = CumulativePathLength(e, forward_);
+ size_t backward = CumulativePathLength(e, backward_);
+ //check that the path is trivial in at least one of the two directions
+ VERIFY(forward == this->g().length(e) || backward == this->g().length(e));
+ return std::max(forward, backward) >= min_length_;
+ }
+};
+
+template<class Graph, class PathFinder>
+EdgePredicate<Graph>
+MakePathLengthLowerBound(const Graph &g, const PathFinder &path_finder, size_t min_length) {
+ return PathLengthLowerBound<Graph, PathFinder>(g, path_finder, min_length);
+}
+
+template<class Graph>
+EdgePredicate<Graph>
+UniquePathLengthLowerBound(const Graph &g, size_t min_length) {
+ return MakePathLengthLowerBound(g, UniquePathFinder<Graph>(g), min_length);
+}
+
+template<class Graph>
+EdgePredicate<Graph>
+UniqueIncomingPathLengthLowerBound(const Graph &g, size_t min_length) {
+ return [&] (typename Graph::EdgeId e) {
+ typename Graph::VertexId v = g.EdgeStart(e);
+ return g.CheckUniqueIncomingEdge(v) &&
+ UniquePathLengthLowerBound(g, min_length)(g.GetUniqueIncomingEdge(v));
+ };
+}
+
+//todo can disconnect uniqueness and plausibility conditions, since graph is always conjugate!
+template<class Graph>
+class UniquenessPlausabilityCondition : public EdgeCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef EdgeCondition<Graph> base;
+
+ virtual bool CheckUniqueness(EdgeId e, bool forward) const = 0;
+
+ virtual bool CheckPlausibility(EdgeId e, bool forward) const = 0;
+
+ bool SingleUnique(const vector<EdgeId> &edges, bool forward) const {
+ return edges.size() == 1 && CheckUniqueness(*edges.begin(), forward);
+ }
+
+ bool ExistPlausible(EdgeId init_e, const vector<EdgeId> &edges,
+ bool forward) const {
+ for (EdgeId e : edges) {
+ if (e == init_e)
+ continue;
+ if (CheckPlausibility(e, forward)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool Check(EdgeId e, const AbstractDirection<Graph> &direction) const {
+ return SingleUnique(direction.IncomingEdges(direction.EdgeStart(e)),
+ !direction.IsForward())
+ && ExistPlausible(
+ e, direction.OutgoingEdges(direction.EdgeStart(e)),
+ direction.IsForward());
+ }
+
+public:
+
+ UniquenessPlausabilityCondition(const Graph &g)
+ : base(g) {
+
+ }
+
+ bool Check(EdgeId e) const {
+ return Check(e, ForwardDirection<Graph>(this->g()))
+ || Check(e, BackwardDirection<Graph>(this->g()));
+ }
+
+};
+
+template<class Graph>
+class PredicateUniquenessPlausabilityCondition :
+ public UniquenessPlausabilityCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef UniquenessPlausabilityCondition<Graph> base;
+
+ EdgePredicate<Graph> uniqueness_condition_;
+ EdgePredicate<Graph> plausiblity_condition_;
+
+ bool CheckUniqueness(EdgeId e, bool) const {
+ return uniqueness_condition_(e);
+ }
+
+ bool CheckPlausibility(EdgeId e, bool) const {
+ return plausiblity_condition_(e);
+ }
+
+public:
+
+ PredicateUniquenessPlausabilityCondition(
+ const Graph &g, EdgePredicate<Graph> uniqueness_condition,
+ EdgePredicate<Graph> plausiblity_condition)
+ : base(g),
+ uniqueness_condition_(uniqueness_condition),
+ plausiblity_condition_(plausiblity_condition) {
+ }
+
+};
+
+template<class Graph>
+class DefaultUniquenessPlausabilityCondition :
+ public PredicateUniquenessPlausabilityCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef PredicateUniquenessPlausabilityCondition<Graph> base;
+
+public:
+
+ DefaultUniquenessPlausabilityCondition(const Graph &g,
+ size_t uniqueness_length,
+ size_t plausibility_length)
+ : base(g,
+ UniquePathLengthLowerBound(g, uniqueness_length),
+ MakePathLengthLowerBound(g,
+ PlausiblePathFinder<Graph>(g, 2 * plausibility_length),
+ plausibility_length)) {
+ }
+
+};
+
+template<class Graph>
+class MultiplicityCounter {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+ size_t uniqueness_length_;
+ size_t max_depth_;
+
+ bool search(VertexId a, VertexId start, EdgeId e, size_t depth,
+ std::set<VertexId> &was, pair<size_t, size_t> &result) const {
+ if (depth > max_depth_)
+ return false;
+ if (was.count(a) == 1)
+ return true;
+ was.insert(a);
+ if (graph_.OutgoingEdgeCount(a) == 0
+ || graph_.IncomingEdgeCount(a) == 0)
+ return false;
+ for (auto I = graph_.out_begin(a), E = graph_.out_end(a); I != E; ++I) {
+ if (*I == e) {
+ if (a != start) {
+ return false;
+ }
+ } else {
+ if (graph_.length(*I) >= uniqueness_length_) {
+ result.second++;
+ } else {
+ if (!search(graph_.EdgeEnd(*I), start, e,
+ depth + 1 /*graph_.length(*it)*/, was, result))
+ return false;
+ }
+ }
+ }
+ for (EdgeId in_e : graph_.IncomingEdges(a)) {
+ if (in_e == e) {
+ if (a != start) {
+ return false;
+ }
+ } else {
+ if (graph_.length(in_e) >= uniqueness_length_) {
+ result.first++;
+ } else {
+ if (!search(graph_.EdgeStart(in_e), start, e,
+ depth + 1 /*graph_.length(*it)*/, was, result))
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+public:
+ MultiplicityCounter(const Graph &graph, size_t uniqueness_length,
+ size_t max_depth)
+ : graph_(graph),
+ uniqueness_length_(uniqueness_length),
+ max_depth_(max_depth) {
+ }
+
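+ // count() estimates the multiplicity of edge e as seen from vertex `start`: a
+ // depth-bounded search counts how many long (>= uniqueness_length_) edges enter and
+ // leave the surrounding subgraph, and the difference of the two counts is taken as the
+ // estimate; (size_t)(-1) is returned if the search fails or the counts are inconsistent.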
+ size_t count(EdgeId e, VertexId start) const {
+ std::pair<size_t, size_t> result;
+ std::set<VertexId> was;
+ bool valid = search(start, start, e, 0, was, result);
+ if (!valid) {
+ return (size_t) (-1);
+ }
+ if (graph_.EdgeStart(e) == start) {
+ if (result.first < result.second) {
+ return (size_t) (-1);
+ }
+ return result.first - result.second;
+ } else {
+ if (result.first > result.second) {
+ return (size_t) (-1);
+ }
+ return -result.first + result.second;
+ }
+ }
+};
+
+template<class Graph>
+class MultiplicityCountingCondition : public UniquenessPlausabilityCondition<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef UniquenessPlausabilityCondition<Graph> base;
+
+ MultiplicityCounter<Graph> multiplicity_counter_;
+ EdgePredicate<Graph> plausiblity_condition_;
+
+public:
+ bool CheckUniqueness(EdgeId e, bool forward) const {
+ TRACE( "Checking " << this->g().int_id(e) << " for uniqueness in " << (forward ? "forward" : "backward") << " direction");
+ VertexId start =
+ forward ? this->g().EdgeEnd(e) : this->g().EdgeStart(e);
+ bool result = multiplicity_counter_.count(e, start) <= 1;
+ TRACE( "Edge " << this->g().int_id(e) << " is" << (result ? "" : " not") << " unique");
+ return result;
+ }
+
+ bool CheckPlausibility(EdgeId e, bool) const {
+ return plausiblity_condition_(e);
+ }
+
+ MultiplicityCountingCondition(const Graph& g, size_t uniqueness_length,
+ EdgePredicate<Graph> plausiblity_condition)
+ :
+ //todo why 8???
+ base(g),
+ multiplicity_counter_(g, uniqueness_length, 8),
+ plausiblity_condition_(plausiblity_condition) {
+
+ }
+
+private:
+
+ DECL_LOGGER("MultiplicityCountingCondition");
+};
+
+
+}
diff --git a/src/common/paired_info/concurrent_pair_info_buffer.hpp b/src/common/paired_info/concurrent_pair_info_buffer.hpp
new file mode 100644
index 0000000..5662a32
--- /dev/null
+++ b/src/common/paired_info/concurrent_pair_info_buffer.hpp
@@ -0,0 +1,120 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "histogram.hpp"
+#include "histptr.hpp"
+
+#include <btree/btree_map.h>
+#include <cuckoo/cuckoohash_map.hh>
+
+namespace omnigraph {
+
+namespace de {
+
+template<typename G, typename Traits, template<typename, typename> class Container>
+class ConcurrentPairedBuffer : public PairedBufferBase<ConcurrentPairedBuffer<G, Traits, Container>,
+ G, Traits> {
+ typedef ConcurrentPairedBuffer<G, Traits, Container> self;
+ typedef PairedBufferBase<self, G, Traits> base;
+
+ friend class PairedBufferBase<self, G, Traits>;
+
+ protected:
+ using typename base::InnerPoint;
+ typedef omnigraph::de::Histogram<InnerPoint> InnerHistogram;
+ typedef omnigraph::de::StrongWeakPtr<InnerHistogram> InnerHistPtr;
+
+
+ public:
+ using typename base::Graph;
+ using typename base::EdgeId;
+ using typename base::EdgePair;
+ using typename base::Point;
+
+ typedef Container<EdgeId, InnerHistPtr> InnerMap;
+ typedef cuckoohash_map<EdgeId, InnerMap> StorageMap;
+
+ public:
+ ConcurrentPairedBuffer(const Graph &g)
+ : base(g) {
+ clear();
+ }
+
+ //---------------- Miscellaneous ----------------
+
+ /**
+ * @brief Clears the whole index. Used in merging.
+ */
+ void clear() {
+ storage_.clear();
+ this->size_ = 0;
+ }
+
+ typename StorageMap::locked_table lock_table() {
+ return storage_.lock_table();
+ }
+
+ private:
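+ // Insertion pattern used below: the outer insert of an empty InnerMap is best-effort
+ // (a concurrent inserter may win the race, which is fine), and the actual mutation of
+ // the inner map happens inside update_fn, which holds the lock on the e1 entry for the
+ // duration of the lambda.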
+ std::pair<typename InnerHistPtr::pointer, size_t> InsertOne(EdgeId e1, EdgeId e2, InnerPoint p) {
+ if (!storage_.contains(e1))
+ storage_.insert(e1, InnerMap()); // We can fail to insert here, it's ok
+
+ size_t added = 0;
+ typename InnerHistPtr::pointer inserted = nullptr;
+ storage_.update_fn(e1,
+ [&](InnerMap &second) { // Now we will hold lock to the whole "subtree" starting from e1
+ if (!second.count(e2)) {
+ inserted = new InnerHistogram();
+ second.insert(std::make_pair(e2, InnerHistPtr(inserted, /* owning */ true)));
+ }
+ added = second[e2]->merge_point(p);
+ });
+
+ return { inserted, added };
+ }
+
+ template<class OtherHist>
+ std::pair<typename InnerHistPtr::pointer, size_t> InsertHist(EdgeId e1, EdgeId e2, const OtherHist &h) {
+ if (!storage_.contains(e1))
+ storage_.insert(e1, InnerMap()); // We can fail to insert here, it's ok
+
+ size_t added = 0;
+ typename InnerHistPtr::pointer inserted = nullptr;
+ storage_.update_fn(e1,
+ [&](InnerMap &second) { // Now we will hold lock to the whole "subtree" starting from e1
+ if (!second.count(e2)) {
+ inserted = new InnerHistogram();
+ second.insert(std::make_pair(e2, InnerHistPtr(inserted, /* owning */ true)));
+ }
+ added = second[e2]->merge(h);
+ });
+
+ return { inserted, added };
+ }
+
+ void InsertHistView(EdgeId e1, EdgeId e2, typename InnerHistPtr::pointer p) {
+ if (!storage_.contains(e1))
+ storage_.insert(e1, InnerMap()); // We can fail to insert here, it's ok
+
+ storage_.update_fn(e1,
+ [&](InnerMap &second) { // Now we will hold lock to the whole "subtree" starting from e1
+ auto res = second.insert(std::make_pair(e2, InnerHistPtr(p, /* owning */ false)));
+ VERIFY_MSG(res.second, "Index insertion inconsistency");
+ });
+ }
+
+ protected:
+ StorageMap storage_;
+};
+
+template<class Graph>
+using ConcurrentPairedInfoBuffer = ConcurrentPairedBuffer<Graph, RawPointTraits, btree_map>;
+
+} // namespace de
+
+} // namespace omnigraph
diff --git a/src/common/paired_info/data_divider.hpp b/src/common/paired_info/data_divider.hpp
new file mode 100644
index 0000000..c124470
--- /dev/null
+++ b/src/common/paired_info/data_divider.hpp
@@ -0,0 +1,137 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+* data_divider.hpp
+*
+* Created on: Aug 16, 2011
+* Author: alexeyka
+*/
+
+
+#ifndef DATA_DIVIDER_HPP_
+#define DATA_DIVIDER_HPP_
+
+#include <iostream>
+#include <math.h>
+#include "utils/verify.hpp"
+#include <vector>
+#include <utility>
+#include <cstdlib>
+#include <cstdio>
+#include "index_point.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class EdgeId>
+class DataDivider {
+ typedef pair<size_t, size_t> Interval;
+ typedef vector<PairInfo<EdgeId> > PairInfos;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<Point> PointArray;
+ typedef std::function<double(int)> WeightFunction;
+
+ // double LeftDerivative(int index, vector<int> x, vector<int> y) {
+ // return outf[dist - min_value_ + 1][0] - outf[dist - min][0];
+ // }
+ //
+ // double RightDerivative(index, std::vector<int> x, std::vector<int> y) {
+ // return outf[dist - min_value_][0] - outf[dist - min - 1][0];
+ // }
+ //
+ // double MiddleDerivative(int index, std::vector<int> x, std::vector<int> y) {
+ // return 0.5f * (outf[dist - min_value_ + 1][0] - outf[dist - min - 1][0]);
+ // }
+
+public:
+ DataDivider(size_t threshold, const PointArray &points) :
+ threshold_(threshold), points_(points) {
+ }
+
+ vector<Interval> DivideData() {
+ VERIFY(points_.size() > 0);
+ vector<Interval> answer;
+ min_value_ = rounded_d(points_.front());
+ max_value_ = rounded_d(points_.back());
+ size_t begin = 0;
+ for (size_t i = 0; i < points_.size() - 1; ++i) {
+ if (IsANewCluster(i)) {
+ answer.push_back(make_pair(begin, i + 1));
+ begin = i + 1;
+ }
+ }
+ answer.push_back(make_pair(begin, points_.size()));
+
+ return answer;
+ }
+
+ vector<Interval> DivideAndSmoothData(const EdgePair &ep,
+ PairInfos &new_data,
+ WeightFunction weight_f) {
+ VERIFY(points_.size() > 0);
+ vector<Interval> answer;
+
+ TRACE("Data");
+ //Print();
+ const Point &point = points_.front();
+ min_value_ = rounded_d(point);
+ max_value_ = rounded_d(points_.back());
+ size_t begin = 0;
+ for (size_t i = 0; i < points_.size(); ++i) {
+ if (i == points_.size() - 1 || IsANewCluster(i)) {
+ int low_val = rounded_d(points_[begin]);
+ int high_val = rounded_d(points_[i]);
+ size_t new_begin = new_data.size();
+ VERIFY(low_val <= high_val);
+ for (int j = low_val; j <= high_val; ++j) {
+ double val = 0.;
+ for (size_t k = begin; k <= i; ++k) {
+ val += points_[k].weight * weight_f(j - rounded_d(points_[k]));
+ }
+ new_data.push_back(PairInfo<EdgeId>(ep.first, ep.second, j, val, 0.));
+ }
+ size_t new_end = new_data.size();
+ answer.push_back(make_pair(new_begin, new_end));
+
+ begin = i + 1;
+ }
+ }
+ //answer.push_back(make_pair(beginc, new_data.size()));
+ TRACE("New_data ");
+ Print();
+
+ return answer;
+ }
+
+private:
+ int min_value_;
+ int max_value_;
+ size_t threshold_;
+ PointArray points_;
+
+ void Print() const {
+ for (size_t i = 0; i < points_.size(); ++i) {
+ TRACE(points_[i].d << " " << points_[i].weight);
+ }
+ }
+
+ bool IsANewCluster(size_t index) {
+ VERIFY(index < points_.size() - 1);
+ return (math::gr(abs(points_[index + 1].d - points_[index].d), (DEDistance) threshold_));
+ }
+
+ DECL_LOGGER("DataDivider");
+};
+
+}
+
+
+}
+
+#endif /* DATA_DIVIDER_HPP_ */
diff --git a/src/common/paired_info/distance_estimation.hpp b/src/common/paired_info/distance_estimation.hpp
new file mode 100644
index 0000000..97663a4
--- /dev/null
+++ b/src/common/paired_info/distance_estimation.hpp
@@ -0,0 +1,300 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef DISTANCE_ESTIMATION_HPP_
+#define DISTANCE_ESTIMATION_HPP_
+
+#include "math/xmath.h"
+#include "utils/openmp_wrapper.h"
+
+#include "paired_info.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "paired_info/pair_info_bounds.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+//todo move to some more common place
+template<class Graph>
+class GraphDistanceFinder {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef std::vector<EdgeId> Path;
+ typedef std::vector<size_t> GraphLengths;
+ typedef std::map<EdgeId, GraphLengths> LengthMap;
+
+public:
+ GraphDistanceFinder(const Graph &graph, size_t insert_size, size_t read_length, size_t delta) :
+ graph_(graph), insert_size_(insert_size), gap_((int) (insert_size - 2 * read_length)),
+ delta_((double) delta) { }
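+ // Note: gap_ = insert_size - 2 * read_length is the expected inner distance between the
+ // two reads of a pair; together with delta_ it bounds the admissible path lengths in
+ // FillGraphDistancesLengths below.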
+
+ std::vector<size_t> GetGraphDistancesLengths(EdgeId e1, EdgeId e2) const {
+ LengthMap m;
+ m.insert({e2, {}});
+
+ FillGraphDistancesLengths(e1, m);
+
+ return m[e2];
+ }
+
+ // finds all distances from a current edge to a set of edges
+ void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const {
+ vector<size_t> path_lower_bounds;
+
+ size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), insert_size_, delta_);
+
+ PathProcessor<Graph> paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound);
+
+ for (auto &entry : second_edges) {
+ EdgeId e2 = entry.first;
+ size_t path_lower_bound = PairInfoPathLengthLowerBound(graph_.k(), graph_.length(e1),
+ graph_.length(e2), gap_, delta_);
+
+ TRACE("Bounds for paths are " << path_lower_bound << " " << path_upper_bound);
+
+ DistancesLengthsCallback<Graph> callback(graph_);
+ paths_proc.Process(graph_.EdgeStart(e2), path_lower_bound, path_upper_bound, callback);
+ GraphLengths lengths = callback.distances();
+ for (size_t j = 0; j < lengths.size(); ++j) {
+ lengths[j] += graph_.length(e1);
+ TRACE("Resulting distance set for " <<
+ " edge " << graph_.int_id(e2) <<
+ " #" << j << " length " << lengths[j]);
+ }
+
+ if (e1 == e2)
+ lengths.push_back(0);
+
+ std::sort(lengths.begin(), lengths.end());
+ entry.second = lengths;
+ }
+ }
+
+private:
+ DECL_LOGGER("GraphDistanceFinder");
+
+ const Graph &graph_;
+ const size_t insert_size_;
+ const int gap_;
+ const double delta_;
+};
+
+template<class Graph>
+class AbstractDistanceEstimator {
+protected:
+ typedef UnclusteredPairedInfoIndexT<Graph> InPairedIndex;
+ typedef PairedInfoIndexT<Graph> OutPairedIndex;
+ typedef typename InPairedIndex::HistProxy InHistogram;
+ typedef typename OutPairedIndex::Histogram OutHistogram;
+
+public:
+ AbstractDistanceEstimator(const Graph &graph,
+ const InPairedIndex &index,
+ const GraphDistanceFinder<Graph> &distance_finder,
+ size_t linkage_distance = 0)
+ : graph_(graph), index_(index),
+ distance_finder_(distance_finder), linkage_distance_(linkage_distance) { }
+
+ virtual void Estimate(PairedInfoIndexT<Graph> &result, size_t nthreads) const = 0;
+
+ virtual ~AbstractDistanceEstimator() { }
+
+protected:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<pair<int, double> > EstimHist;
+ typedef vector<size_t> GraphLengths;
+ typedef std::map<EdgeId, GraphLengths> LengthMap;
+
+ const Graph &graph() const { return graph_; }
+
+ const InPairedIndex &index() const { return index_; }
+
+ void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const {
+ distance_finder_.FillGraphDistancesLengths(e1, second_edges);
+ }
+
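+ // ClusterResult greedily merges consecutive estimated distances lying within
+ // linkage_distance_ of each other into a single point whose position is the midpoint of
+ // the merged range, whose weight is the sum of the merged weights and whose variance is
+ // half of the range span.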
+ OutHistogram ClusterResult(EdgePair /*ep*/, const EstimHist &estimated) const {
+ OutHistogram result;
+ for (size_t i = 0; i < estimated.size(); ++i) {
+ size_t left = i;
+ DEWeight weight = DEWeight(estimated[i].second);
+ while (i + 1 < estimated.size() &&
+ (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) {
+ ++i;
+ weight += estimated[i].second;
+ }
+ DEDistance center = DEDistance((estimated[left].first + estimated[i].first) * 0.5);
+ DEVariance var = DEVariance((estimated[i].first - estimated[left].first) * 0.5);
+ result.insert(Point(center, weight, var));
+ }
+ return result;
+ }
+
+ void AddToResult(const OutHistogram &clustered, EdgePair ep, PairedInfoBuffer<Graph> &result) const {
+ result.AddMany(ep.first, ep.second, clustered);
+ }
+
+private:
+ const Graph &graph_;
+ const InPairedIndex &index_;
+ const GraphDistanceFinder<Graph> &distance_finder_;
+ const size_t linkage_distance_;
+
+ virtual const string Name() const = 0;
+};
+
+template<class Graph>
+class DistanceEstimator : public AbstractDistanceEstimator<Graph> {
+ typedef AbstractDistanceEstimator<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef vector<size_t> GraphLengths;
+ typedef vector<pair<int, double> > EstimHist;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+
+protected:
+ typedef typename base::InPairedIndex InPairedIndex;
+ typedef typename base::OutPairedIndex OutPairedIndex;
+ typedef typename base::InHistogram InHistogram;
+ typedef typename base::OutHistogram OutHistogram;
+
+public:
+ DistanceEstimator(const Graph &graph,
+ const InPairedIndex &index,
+ const GraphDistanceFinder<Graph> &distance_finder,
+ size_t linkage_distance, size_t max_distance)
+ : base(graph, index, distance_finder, linkage_distance), max_distance_(max_distance) { }
+
+ virtual ~DistanceEstimator() { }
+
+ void Init() const {
+ INFO("Using " << this->Name() << " distance estimator");
+ }
+
+ virtual void Estimate(OutPairedIndex &result, size_t nthreads) const {
+ this->Init();
+ const auto &index = this->index();
+
+ DEBUG("Collecting edge infos");
+ std::vector<EdgeId> edges;
+ for (auto it = this->graph().ConstEdgeBegin(); !it.IsEnd(); ++it)
+ edges.push_back(*it);
+
+ DEBUG("Processing");
+ PairedInfoBuffersT<Graph> buffer(this->graph(), nthreads);
+# pragma omp parallel for num_threads(nthreads) schedule(guided, 10)
+ for (size_t i = 0; i < edges.size(); ++i) {
+ EdgeId edge = edges[i];
+ ProcessEdge(edge, index, buffer[omp_get_thread_num()]);
+ }
+
+ for (size_t i = 0; i < nthreads; ++i) {
+ result.Merge(buffer[i]);
+ buffer[i].clear();
+ }
+ }
+
+protected:
+ const DEDistance max_distance_;
+
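+ // Each paired-info point contributes its weight to the nearest admissible graph distance
+ // from raw_forward (a tie between two equally close candidates is split 50/50); points
+ // farther than max_distance_ from the chosen candidate are dropped. The result pairs
+ // every candidate distance from the filtered window with its accumulated weight.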
+ virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
+ const InHistogram &histogram,
+ const GraphLengths &raw_forward) const {
+ using std::abs;
+ using namespace math;
+ EdgeId e1 = ep.first, e2 = ep.second;
+ size_t first_len = this->graph().length(e1), second_len = this->graph().length(e2);
+ int minD = rounded_d(histogram.min()), maxD = rounded_d(histogram.max());
+
+ TRACE("Bounds are " << minD << " " << maxD);
+ EstimHist result;
+ vector<DEDistance> forward;
+ forward.reserve(raw_forward.size());
+ for (auto raw_length : raw_forward) {
+ int length = int(raw_length);
+ if (minD - int(max_distance_) <= length && length <= maxD + int(max_distance_))
+ forward.push_back(DEDistance(length));
+ }
+ if (forward.size() == 0)
+ return result;
+
+ size_t cur_dist = 0;
+ vector<DEWeight> weights(forward.size(), 0);
+ for (auto point : histogram) {
+ if (ls(2 * point.d + DEDistance(second_len), DEDistance(first_len)))
+ continue;
+ while (cur_dist + 1 < forward.size() && forward[cur_dist + 1] < point.d)
+ ++cur_dist;
+
+ if (cur_dist + 1 < forward.size() &&
+ ls(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
+ ++cur_dist;
+
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight;
+ } else if (cur_dist + 1 < forward.size() &&
+ eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight * 0.5;
+ ++cur_dist;
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight * 0.5;
+ } else {
+ if (le(abs(forward[cur_dist] - point.d), max_distance_))
+ weights[cur_dist] += point.weight;
+ }
+ }
+
+ for (size_t i = 0; i < forward.size(); ++i)
+ if (ge(weights[i], DEWeight(0)))
+ result.push_back(make_pair(forward[i], weights[i]));
+
+ VERIFY(result.size() == forward.size());
+ return result;
+ }
+
+private:
+ virtual void ProcessEdge(EdgeId e1,
+ const InPairedIndex &pi,
+ PairedInfoBuffer<Graph> &result) const {
+ typename base::LengthMap second_edges;
+ auto inner_map = pi.GetHalf(e1);
+ for (auto i : inner_map)
+ second_edges[i.first];
+
+ this->FillGraphDistancesLengths(e1, second_edges);
+
+ for (const auto &entry: second_edges) {
+ EdgeId e2 = entry.first;
+ EdgePair ep(e1, e2);
+
+ VERIFY(ep <= pi.ConjugatePair(ep));
+
+ const GraphLengths &forward = entry.second;
+ TRACE("Edge pair is " << this->graph().int_id(ep.first)
+ << " " << this->graph().int_id(ep.second));
+ auto hist = pi.Get(e1, e2);
+ const EstimHist &estimated = this->EstimateEdgePairDistances(ep, hist, forward);
+ OutHistogram res = this->ClusterResult(ep, estimated);
+ this->AddToResult(res, ep, result);
+ }
+ }
+
+ virtual const string Name() const {
+ static const string my_name = "SIMPLE";
+ return my_name;
+ }
+
+ DECL_LOGGER("DistanceEstimator");
+};
+
+}
+
+}
+
+#endif /* DISTANCE_ESTIMATION_HPP_ */
diff --git a/src/common/paired_info/histogram.hpp b/src/common/paired_info/histogram.hpp
new file mode 100644
index 0000000..d8983fc
--- /dev/null
+++ b/src/common/paired_info/histogram.hpp
@@ -0,0 +1,199 @@
+//***************************************************************************
+//* Copyright (c) 2015-2016 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <btree/btree_set.h>
+#include "common/adt/flat_set.hpp"
+#include "common/adt/small_pod_vector.hpp"
+#include "index_point.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class Point>
+class Histogram {
+ typedef Histogram<Point> self_type;
+ typedef typename std::less<Point> key_compare;
+ typedef typename std::allocator<Point> allocator_type;
+ typedef typename adt::flat_set<Point, key_compare, adt::SmallPODVector> Tree;
+
+public:
+ typedef typename Tree::key_type key_type;
+ typedef typename Tree::value_type value_type;
+ typedef typename Tree::pointer pointer;
+ typedef typename Tree::const_pointer const_pointer;
+ typedef typename Tree::reference reference;
+ typedef typename Tree::const_reference const_reference;
+ typedef typename Tree::size_type size_type;
+ typedef typename Tree::difference_type difference_type;
+ typedef typename Tree::iterator iterator;
+ typedef typename Tree::const_iterator const_iterator;
+ typedef typename Tree::reverse_iterator reverse_iterator;
+ typedef typename Tree::const_reverse_iterator const_reverse_iterator;
+
+ enum {
+ kValueSize = sizeof(Point)
+ };
+
+public:
+ // Default constructor.
+ Histogram() = default;
+
+ // Copy constructor.
+ Histogram(const self_type &x)
+ : tree_(x.tree_) {}
+
+ template <class InputIterator>
+ Histogram(InputIterator b, InputIterator e) {
+ insert(b, e);
+ }
+
+ Histogram(std::initializer_list<Point> l) {
+ insert(l.begin(), l.end());
+ }
+
+ // Iterator routines.
+ iterator begin() { return tree_.begin(); }
+ const_iterator begin() const { return tree_.begin(); }
+ iterator end() { return tree_.end(); }
+ const_iterator end() const { return tree_.end(); }
+ reverse_iterator rbegin() { return tree_.rbegin(); }
+ const_reverse_iterator rbegin() const { return tree_.rbegin(); }
+ reverse_iterator rend() { return tree_.rend(); }
+ const_reverse_iterator rend() const { return tree_.rend(); }
+
+ // Lookup routines.
+ iterator lower_bound(const key_type &key) { return tree_.lower_bound(key); }
+ const_iterator lower_bound(const key_type &key) const { return tree_.lower_bound(key); }
+ iterator upper_bound(const key_type &key) { return tree_.upper_bound(key); }
+ const_iterator upper_bound(const key_type &key) const { return tree_.upper_bound(key); }
+ std::pair<iterator,iterator> equal_range(const key_type &key) { return tree_.equal_range(key); }
+ std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const { return tree_.equal_range(key); }
+
+ // Utility routines.
+ void clear() { tree_.clear(); }
+ void swap(self_type &x) { tree_.swap(x.tree_); }
+
+ // Size routines.
+ size_type size() const { return tree_.size(); }
+ size_type max_size() const { return tree_.max_size(); }
+ bool empty() const { return tree_.empty(); }
+ size_type bytes_used() const { return tree_.bytes_used(); }
+
+ // Lookup routines.
+ iterator find(const key_type &key) { return tree_.find(key); }
+ const_iterator find(const key_type &key) const { return tree_.find(key); }
+ size_type count(const key_type &key) const { return tree_.count(key); }
+
+ // Insertion routines.
+ std::pair<iterator,bool> insert(const value_type &x) { return tree_.insert(x); }
+ iterator insert(iterator position, const value_type &x) { return tree_.insert(position, x); }
+ template <typename InputIterator>
+ void insert(InputIterator b, InputIterator e) { tree_.insert(b, e); }
+
+ // Deletion routines.
+ size_type erase(const key_type &key) { return tree_.erase(key); }
+ // Erase the specified iterator from the underlying container. The iterator must be valid
+ // (i.e. not equal to end()). Returns an iterator pointing to the element after
+ // the one that was erased (or end() if none exists).
+ iterator erase(const iterator &iter) { return tree_.erase(iter); }
+ void erase(const iterator &first, const iterator &last) { tree_.erase(first, last); }
+
+ bool operator==(const self_type& x) const {
+ if (size() != x.size())
+ return false;
+
+ for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi)
+ if (*i != *xi)
+ return false;
+
+ return true;
+ }
+
+ bool operator!=(const self_type& other) const {
+ return !operator==(other);
+ }
+
+protected:
+ Tree tree_;
+
+private:
+ // SFINAE helper that selects the appropriate merge_point overload depending on
+ // whether Point has a const operator+= or not.
+ template<class>
+ struct true_helper : std::true_type {};
+ template<class T = Point>
+ static auto test_can_merge(int) -> true_helper<decltype(std::declval<const T>().operator+=(std::declval<const T>()))>;
+ template<class>
+ static auto test_can_merge(long) -> std::false_type;
+ template<class T = Point>
+ struct can_merge : decltype(test_can_merge<T>(0)) {};
+
+public:
+ // This function overload is enabled only when Point has const operator+= (e.g. RawPoint)
+ // and therefore we can update it inplace.
+ template<class U = Point>
+ typename std::enable_if<can_merge<U>::value, size_t>::type
+ merge_point(const U &new_point) {
+ // First, try to insert a point
+ const auto &result = insert(new_point);
+ if (result.second)
+ return 1;
+ // There is already a point there; try to merge the new one in.
+ *result.first += new_point;
+ return 0;
+ }
+
+ // Otherwise this overload is used, which removes the point from set,
+ // updates it and re-inserts back.
+ template<class U = Point>
+ typename std::enable_if<!can_merge<U>::value, size_t>::type
+ merge_point(const U &new_point) {
+ auto result = insert(new_point);
+ if (result.second)
+ return 1;
+ Point updated = *result.first + new_point;
+ auto after_removed = erase(result.first);
+ insert(after_removed, updated);
+ return 0;
+ }
+
+ template<class OtherHist>
+ size_t merge(const OtherHist &other) {
+ // If the histogram is empty, we can simply insert everything
+ if (size() == 0) {
+ insert(other.begin(), other.end());
+ return size();
+ }
+
+ size_t old_size = size();
+ for (const auto &new_point : other)
+ merge_point(new_point);
+ return size() - old_size;
+ }
+};
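+
+// Usage sketch (illustrative; assumes RawPoint is constructible from a distance and a
+// weight):
+//   RawHistogram h;
+//   h.merge_point(RawPoint(10, 1));  // new distance: inserted, returns 1
+//   h.merge_point(RawPoint(10, 2));  // same distance: weights merged in place, returns 0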
+
+template<typename T>
+inline std::ostream &operator<<(std::ostream &os, const Histogram<T> &b) {
+ os << "{";
+ for (const auto& e : b)
+ os << e << "; ";
+ os << "}";
+ return os;
+}
+
+typedef Histogram<RawGapPoint> RawGapHistogram;
+typedef Histogram<GapPoint> GapHistogram;
+
+typedef Histogram<RawPoint> RawHistogram;
+typedef Histogram<Point> HistogramWithWeight;
+
+}
+
+}
diff --git a/src/common/paired_info/histptr.hpp b/src/common/paired_info/histptr.hpp
new file mode 100644
index 0000000..58f34c7
--- /dev/null
+++ b/src/common/paired_info/histptr.hpp
@@ -0,0 +1,156 @@
+//***************************************************************************
+//* Copyright (c) 2015-2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <llvm/ADT/PointerIntPair.h>  // provides llvm::PointerIntPair (path assumes the bundled LLVM ADT headers)
+
+#include <cstddef>
+#include <functional>
+#include <type_traits>
+#include <utility>
+
+namespace omnigraph {
+namespace de {
+
+template<class T>
+class StrongWeakPtr {
+ public:
+ typedef T element_type;
+ typedef T* pointer;
+
+ StrongWeakPtr() noexcept
+ : ptr_(pointer(), false) {}
+
+ StrongWeakPtr(std::nullptr_t) noexcept
+ : ptr_(pointer(), false) {}
+
+ StrongWeakPtr(pointer p, bool owning = true) noexcept
+ : ptr_(std::move(p), owning) { }
+
+ StrongWeakPtr(StrongWeakPtr &&p) noexcept
+ : ptr_(p.release(), p.owning()) {}
+
+ StrongWeakPtr& operator=(StrongWeakPtr &&p) noexcept {
+ reset(p.release(), p.owning());
+ return *this;
+ }
+
+ ~StrongWeakPtr() {
+ reset();
+ }
+
+ StrongWeakPtr &operator=(std::nullptr_t) noexcept {
+ reset();
+ return *this;
+ }
+
+ typename std::add_lvalue_reference<T>::type operator*() const {
+ return *ptr_.getPointer();
+ }
+
+ pointer operator->() const noexcept {
+ return ptr_.getPointer();
+ }
+
+ pointer get() const noexcept {
+ return ptr_.getPointer();
+ }
+
+ explicit operator bool() const noexcept {
+ return ptr_.getPointer() != nullptr;
+ }
+
+ bool owning() const noexcept {
+ return ptr_.getInt();
+ }
+
+ pointer release() noexcept {
+ pointer p = ptr_.getPointer();
+ ptr_ = raw_type();
+ return p;
+ }
+
+ void reset(pointer p = pointer(), bool own = true) {
+ pointer tmp = ptr_.getPointer(); bool is_owning = ptr_.getInt();
+ ptr_ = raw_type(p, own);
+ if (is_owning)
+ delete tmp;
+ }
+
+ void swap(StrongWeakPtr &p) noexcept {
+ std::swap(p.ptr_, ptr_);
+ }
+
+ private:
+ llvm::PointerIntPair<pointer, 1, bool> ptr_;
+ public:
+ typedef decltype(ptr_) raw_type;
+};
+
+
+template<class T>
+inline void swap(StrongWeakPtr<T> &x, StrongWeakPtr<T> &y) noexcept {
+ x.swap(y);
+}
+
+template<class T>
+inline bool operator==(const StrongWeakPtr<T> &x, const StrongWeakPtr<T> &y) noexcept {
+ return x.get() == y.get();
+}
+
+template<class T>
+inline bool operator!=(const StrongWeakPtr<T> &x, const StrongWeakPtr<T> &y) noexcept {
+ return !(x == y);
+}
+
+template<class T1, class T2>
+inline bool operator<(const StrongWeakPtr<T1> &x, const StrongWeakPtr<T2> &y) noexcept {
+ typedef typename StrongWeakPtr<T1>::pointer P1;
+ typedef typename StrongWeakPtr<T2>::pointer P2;
+ typedef typename std::common_type<P1, P2>::type Common;
+
+ using namespace std;
+ return less<Common>()(x.get(), y.get());
+}
+
+template<class T1, class T2>
+inline bool operator>(const StrongWeakPtr<T1> &x, const StrongWeakPtr<T2> &y) noexcept {
+ return y < x;
+}
+
+template<class T1, class T2>
+inline bool operator<=(const StrongWeakPtr<T1> &x, const StrongWeakPtr<T2> &y) noexcept {
+ return !(y < x);
+}
+
+template<class T1, class T2>
+inline bool operator>=(const StrongWeakPtr<T1> &x, const StrongWeakPtr<T2> &y) noexcept {
+ return !(x < y);
+}
+
+template<class T>
+inline bool operator==(const StrongWeakPtr<T> &x, std::nullptr_t) noexcept {
+ return !x;
+}
+
+template<class T>
+inline bool operator==(std::nullptr_t, const StrongWeakPtr<T> &x) noexcept {
+ return !x;
+}
+
+template<class T>
+inline bool operator!=(const StrongWeakPtr<T> &x, std::nullptr_t) noexcept {
+ return static_cast<bool>(x);
+}
+
+template<class T>
+inline bool operator!=(std::nullptr_t, const StrongWeakPtr<T> &x) noexcept {
+ return static_cast<bool>(x);
+}
+
+template<class T, class... Args>
+StrongWeakPtr<T>
+make_sw(Args&&... args) {
+ return StrongWeakPtr<T>(new T(std::forward<Args>(args)...));
+}
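+
+// Usage sketch (illustrative): an owning pointer deletes its target when reset or
+// destroyed, while a non-owning view over the same object never does:
+//   auto owner = make_sw<int>(42);                 // owning
+//   StrongWeakPtr<int> view(owner.get(), false);   // weak view, never deletes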
+
+}
+}
+
diff --git a/src/modules/paired_info/index_point.hpp b/src/common/paired_info/index_point.hpp
similarity index 100%
rename from src/modules/paired_info/index_point.hpp
rename to src/common/paired_info/index_point.hpp
diff --git a/src/common/paired_info/insert_size_refiner.hpp b/src/common/paired_info/insert_size_refiner.hpp
new file mode 100644
index 0000000..6910899
--- /dev/null
+++ b/src/common/paired_info/insert_size_refiner.hpp
@@ -0,0 +1,165 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/standard_base.hpp"
+#include "utils/cpp_utils.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+//#include "sequence_mapper.hpp"
+
+namespace omnigraph {
+
+typedef std::map<int, size_t> HistType;
+
+inline double get_median(const HistType &hist) {
+ double S = 0;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter)
+ S += (double) iter->second;
+
+ double sum = S;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ sum -= (double) iter->second;
+ if (sum <= S / 2) {
+ return iter->first;
+ }
+ }
+ assert(false);
+ return -1;
+}
+
+inline double get_mad(const HistType &hist, double median) { // median absolute deviation
+ std::map<int, size_t> hist2;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ int x = abs(iter->first - math::round_to_zero(median));
+ hist2[x] = iter->second;
+ }
+ return get_median(hist2);
+}
+
+inline void hist_crop(const HistType &hist, double low, double high, HistType &res) {
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first >= low && iter->first <= high) {
+ DEBUG("Cropped histogram " << iter->first << " " << iter->second);
+ res.insert(*iter);
+ }
+ }
+}
+
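+// Returns the [min, max] insert-size range bracketing the central `quantile` fraction of
+// the histogram mass, i.e. (1 - quantile) / 2 of the mass is cut from each side.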
+inline
+std::pair<double, double> GetISInterval(double quantile,
+ const HistType &is_hist) {
+ // First, obtain the sum of the values
+ double S = 0;
+ for (auto iter : is_hist)
+ S += (double) iter.second;
+
+ double lval = S * (1 - quantile) / 2, rval = S * (1 + quantile) / 2;
+ double is_min, is_max;
+
+ // Now, find the quantiles
+ double cS = 0;
+ is_min = is_hist.begin()->first;
+ is_max = is_hist.rbegin()->first;
+ for (auto iter : is_hist) {
+ if (cS <= lval)
+ is_min = iter.first;
+ else if (cS <= rval)
+ is_max = iter.first;
+ cS += (double) iter.second;
+ }
+
+ return std::make_pair(is_min, is_max);
+}
+
+inline void find_median(const HistType &hist, double &median, double &mad, HistType &cropped_hist) {
+ DEBUG("Counting median and MAD");
+ median = get_median(hist);
+ mad = get_mad(hist, median);
+ double low = median - 5. * 1.4826 * mad;
+ double high = median + 5. * 1.4826 * mad;
+ omnigraph::hist_crop(hist, low, high, cropped_hist);
+ median = get_median(cropped_hist);
+ mad = get_mad(cropped_hist, median);
+}
+
+//Moved from insert size counter.
+//TODO: Please explain constants like 1.4826.
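+// Note on the constant below: 1.4826 is approximately 1 / Phi^-1(3/4), the usual
+// consistency factor that rescales the median absolute deviation (MAD) to an estimate of
+// the standard deviation for normally distributed data, so median +/- 5 * 1.4826 * mad is
+// roughly a five-sigma window.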
+inline void find_mean(const HistType &hist, double &mean, double &delta, std::map<size_t, size_t> &percentiles) {
+ double median = get_median(hist);
+ double mad = get_mad(hist, median);
+ double low = median - 5. * 1.4826 * mad;
+ double high = median + 5. * 1.4826 * mad;
+
+ DEBUG("Median IS: " << median);
+ DEBUG("MAD: " << mad);
+ DEBUG("Thresholds set to: [" << low << ", " << high << "]");
+
+ size_t n = 0;
+ double sum = 0.;
+ double sum2 = 0.;
+ DEBUG("Counting average");
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first < low || iter->first > high) {
+ continue;
+ }
+ n += iter->second;
+ sum += (double) iter->second * 1. * (double) iter->first;
+ sum2 += (double) iter->second * 1. * (double) iter->first * (double) iter->first;
+ }
+ mean = sum / (double) n;
+ delta = sqrt(sum2 / (double) n - mean * mean);
+
+ low = mean - 5 * delta;
+ high = mean + 5 * delta;
+
+ DEBUG("Mean IS: " << mean);
+ DEBUG("sd: " << delta);
+ DEBUG("Thresholds set to: [" << low << ", " << high << "]");
+
+ n = 0;
+ sum = 0.;
+ sum2 = 0.;
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first < low || iter->first > high) {
+ continue;
+ }
+ n += iter->second;
+ sum += (double) iter->second * 1. * (double) iter->first;
+ sum2 += (double) iter->second * 1. * (double) iter->first * (double) iter->first;
+ }
+ mean = sum / (double) n;
+ delta = sqrt(sum2 / (double) n - mean * mean);
+
+ DEBUG("Mean IS: " << mean);
+ DEBUG("sd: " << delta);
+
+ size_t m = 0;
+
+ DEBUG("Counting percentiles");
+ //todo optimize
+ size_t q[19];
+ for (size_t i = 1; i < 20; ++i) {
+ q[i - 1] = 5 * i;
+ }
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
+ if (iter->first < low || iter->first > high) {
+ continue;
+ }
+ size_t mm = m + iter->second;
+ for (size_t i = 0; i < utils::array_size(q); i++) {
+ size_t scaled_q_i((size_t) ((double) q[i] / 100. * (double) n));
+ if (m < scaled_q_i && mm >= scaled_q_i) {
+ percentiles[q[i]] = (size_t) iter->first;
+ }
+ }
+ m = mm;
+ }
+}
+
+
+}
diff --git a/src/common/paired_info/is_counter.hpp b/src/common/paired_info/is_counter.hpp
new file mode 100644
index 0000000..bde7736
--- /dev/null
+++ b/src/common/paired_info/is_counter.hpp
@@ -0,0 +1,150 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef IS_COUNTER_HPP_
+#define IS_COUNTER_HPP_
+
+
+#include "paired_info/insert_size_refiner.hpp"
+#include "modules/alignment/sequence_mapper_notifier.hpp"
+
+namespace debruijn_graph {
+
+using namespace omnigraph;
+
+class InsertSizeCounter: public SequenceMapperListener {
+
+public:
+
+ InsertSizeCounter(const conj_graph_pack& gp,
+ size_t edge_length_threshold,
+ bool ignore_negative = false)
+ : gp_(gp),
+ edge_length_threshold_(edge_length_threshold),
+ ignore_negative_(ignore_negative) {
+ }
+
+ HistType hist() { return hist_; }
+ size_t total() const { return total_.total_; }
+ size_t mapped() const { return counted_.total_; }
+ size_t negative() const { return negative_.total_; }
+
+
+ void StartProcessLibrary(size_t threads_count) override {
+ hist_.clear();
+ tmp_hists_ = vector<HistType>(threads_count);
+
+ total_ = count_data(threads_count);
+ counted_ = count_data(threads_count);
+ negative_ = count_data(threads_count);
+ }
+
+ void StopProcessLibrary() override {
+ tmp_hists_.clear();
+ total_.merge();
+ counted_.merge();
+ negative_.merge();
+ }
+
+ void ProcessPairedRead(size_t thread_index,
+ const io::PairedRead& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
+ (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
+ }
+
+ void ProcessPairedRead(size_t thread_index,
+ const io::PairedReadSeq& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
+ (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
+ }
+
+ void MergeBuffer(size_t thread_index) override {
+ for (const auto& kv: tmp_hists_[thread_index])
+ hist_[kv.first] += kv.second;
+
+ tmp_hists_[thread_index].clear();
+ }
+
+ void FindMean(double& mean, double& delta, std::map<size_t, size_t>& percentiles) const {
+ find_mean(hist_, mean, delta, percentiles);
+ }
+
+ void FindMedian(double& median, double& mad, HistType& histogram) const {
+ find_median(hist_, median, mad, histogram);
+ }
+
+private:
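+ // The insert size is estimated only from pairs where each read has a single mapping
+ // range and both map to the same sufficiently long edge; it is then computed as the
+ // difference of the mapped start offsets plus the length of the second read and the
+ // clipping delta passed in by the caller.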
+ void ProcessPairedRead(size_t thread_index,
+ const MappingPath<EdgeId>& read1, const MappingPath<EdgeId>& read2,
+ int read2_size,
+ int is_delta) {
+
+ ++total_.arr_[thread_index];
+
+ if (read1.size() == 1 && read2.size() == 1 &&
+ read2.simple_path().front() == read1.simple_path().front() &&
+ gp_.g.length(read1.simple_path().front()) >= edge_length_threshold_) {
+
+ auto mapping_edge_1 = read1.front().second;
+ auto mapping_edge_2 = read2.front().second;
+
+ int read1_start = (int) mapping_edge_1.mapped_range.start_pos - (int) mapping_edge_1.initial_range.start_pos ;
+ TRACE("Read 1: " << (int) mapping_edge_1.mapped_range.start_pos << " - " << (int) mapping_edge_1.initial_range.start_pos << " = " << read1_start);
+ int read2_start = (int) mapping_edge_2.mapped_range.start_pos - (int) mapping_edge_2.initial_range.start_pos;
+ TRACE("Read 2: " << (int) mapping_edge_2.mapped_range.start_pos << " - " << (int) mapping_edge_2.initial_range.start_pos << " = " << read2_start);
+ int is = read2_start - read1_start + read2_size + is_delta;
+ TRACE("IS: " << read2_start << " - " << read1_start << " + " << (int) is_delta << " = " << is);
+
+ if (is > 0 || ignore_negative_) {
+ tmp_hists_[thread_index][is] += 1;
+ ++counted_.arr_[thread_index];
+ } else {
+ ++negative_.arr_[thread_index];
+ }
+
+ }
+
+ }
+ struct count_data {
+ size_t total_;
+ vector<size_t> arr_;
+ count_data()
+ : total_(0) {}
+
+ count_data(size_t nthreads)
+ : total_(0), arr_(nthreads, 0) {}
+
+ void inc(size_t i) { ++arr_[i]; }
+ void merge() {
+ for (size_t i = 0; i < arr_.size(); ++i) {
+ total_ += arr_[i];
+ }
+ }
+ };
+
+private:
+ const conj_graph_pack &gp_;
+
+ HistType hist_;
+ vector<HistType> tmp_hists_;
+
+ count_data total_;
+ count_data counted_;
+ count_data negative_;
+
+ size_t edge_length_threshold_;
+ bool ignore_negative_;
+};
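+
+// Illustrative usage sketch (not part of the upstream sources). It assumes a
+// SequenceMapperNotifier-style driver that issues the Start/Process/Stop callbacks
+// declared above; the exact notifier API is an assumption of this example.
+//
+//   InsertSizeCounter counter(gp, /*edge_length_threshold*/ 1000);
+//   counter.StartProcessLibrary(/*threads_count*/ 16);
+//   // ... ProcessPairedRead()/MergeBuffer() are invoked per read pair by the mapper driver ...
+//   counter.StopProcessLibrary();
+//   double mean = 0.0, delta = 0.0;
+//   std::map<size_t, size_t> percentiles;
+//   counter.FindMean(mean, delta, percentiles);   // aggregate insert-size statistics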
+
+}
+
+
+#endif /* IS_COUNTER_HPP_ */
diff --git a/src/common/paired_info/pair_info_bounds.hpp b/src/common/paired_info/pair_info_bounds.hpp
new file mode 100644
index 0000000..c6c4b0c
--- /dev/null
+++ b/src/common/paired_info/pair_info_bounds.hpp
@@ -0,0 +1,30 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef OMNI_UTILS_HPP_
+#define OMNI_UTILS_HPP_
+
+#include "utils/standard_base.hpp"
+
+namespace omnigraph {
+
+
+inline size_t PairInfoPathLengthUpperBound(size_t k, size_t insert_size,
+ double delta) {
+ double answer = 0. + (double) insert_size + delta - (double) k - 2.;
+ VERIFY(math::gr(answer, 0.));
+ return (size_t)std::floor(answer);
+}
+
+inline size_t PairInfoPathLengthLowerBound(size_t k, size_t l1, size_t l2,
+ int gap, double delta) {
+ double answer = 0. + (double) gap + (double) k + 2. - (double) l1 - (double) l2 - delta;
+ return math::gr(answer, 0.) ? (size_t)std::floor(answer) : 0;
+}
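+
+// Worked examples (illustrative only):
+//   PairInfoPathLengthUpperBound(55, 250, 10.)         == floor(250 + 10 - 55 - 2)            == 203
+//   PairInfoPathLengthLowerBound(55, 60, 70, 100, 10.) == floor(100 + 55 + 2 - 60 - 70 - 10)  == 17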
+
+}
+#endif /* OMNI_UTILS_HPP_ */
diff --git a/src/common/paired_info/pair_info_filler.hpp b/src/common/paired_info/pair_info_filler.hpp
new file mode 100644
index 0000000..e0633f0
--- /dev/null
+++ b/src/common/paired_info/pair_info_filler.hpp
@@ -0,0 +1,108 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef PAIR_INFO_FILLER_HPP_
+#define PAIR_INFO_FILLER_HPP_
+
+#include "paired_info/concurrent_pair_info_buffer.hpp"
+#include "modules/alignment/sequence_mapper_notifier.hpp"
+
+namespace debruijn_graph {
+
+/**
+ * For now, it ignores the sophisticated case of repeated consecutive
+ * occurrences of an edge in a path due to gaps in the mapping.
+ *
+ */
+class LatePairedIndexFiller : public SequenceMapperListener {
+ typedef std::pair<EdgeId, EdgeId> EdgePair;
+public:
+ typedef std::function<double(const EdgePair&, const MappingRange&, const MappingRange&)> WeightF;
+
+ LatePairedIndexFiller(const Graph &graph, WeightF weight_f,
+ unsigned round_distance,
+ omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index)
+ : weight_f_(std::move(weight_f)),
+ paired_index_(paired_index),
+ buffer_pi_(graph),
+ round_distance_(round_distance) {}
+
+ void StartProcessLibrary(size_t) override {
+ DEBUG("Start processing: start");
+ buffer_pi_.clear();
+ DEBUG("Start processing: end");
+ }
+
+ void StopProcessLibrary() override {
+ // paired_index_.Merge(buffer_pi_);
+ paired_index_.MoveAssign(buffer_pi_);
+ buffer_pi_.clear();
+ }
+
+ void ProcessPairedRead(size_t,
+ const io::PairedRead& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(read1, read2, r.distance());
+ }
+
+ void ProcessPairedRead(size_t,
+ const io::PairedReadSeq& r,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(read1, read2, r.distance());
+ }
+
+ virtual ~LatePairedIndexFiller() {}
+
+private:
+ void ProcessPairedRead(const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2, size_t read_distance) {
+ for (size_t i = 0; i < path1.size(); ++i) {
+ std::pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
+ for (size_t j = 0; j < path2.size(); ++j) {
+ std::pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
+
+ omnigraph::de::DEWeight weight =
+ weight_f_({mapping_edge_1.first, mapping_edge_2.first},
+ mapping_edge_1.second, mapping_edge_2.second);
+
+ // Add only if weight is non-zero
+ if (math::gr(weight, 0)) {
+ size_t kmer_distance = read_distance
+ + mapping_edge_2.second.initial_range.end_pos
+ - mapping_edge_1.second.initial_range.start_pos;
+ int edge_distance = (int) kmer_distance
+ + (int) mapping_edge_1.second.mapped_range.start_pos
+ - (int) mapping_edge_2.second.mapped_range.end_pos;
+
+ // Additionally round, if necessary
+ if (round_distance_ > 1)
+ edge_distance = int(std::round(edge_distance / double(round_distance_))) * round_distance_;
+
+ buffer_pi_.Add(mapping_edge_1.first, mapping_edge_2.first,
+ omnigraph::de::RawPoint(edge_distance, weight));
+
+ }
+ }
+ }
+ }
+
+private:
+ WeightF weight_f_;
+ omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index_;
+ omnigraph::de::ConcurrentPairedInfoBuffer<Graph> buffer_pi_;
+ unsigned round_distance_;
+
+ DECL_LOGGER("LatePairedIndexFiller");
+};
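+
+// Illustrative sketch (not part of the upstream sources): wiring the filler up with a
+// trivial constant weight function; `gp` and `paired_index` are assumed to exist.
+//
+//   LatePairedIndexFiller::WeightF unit_weight =
+//       [](const std::pair<EdgeId, EdgeId>&, const MappingRange&, const MappingRange&) {
+//           return 1.0;                          // every mapped read pair contributes weight 1
+//       };
+//   LatePairedIndexFiller filler(gp.g, unit_weight, /*round_distance*/ 1, paired_index);
+//   // With round_distance == 1 no rounding is applied; with e.g. 10, a raw edge distance
+//   // of 123 would be stored as round(123 / 10.) * 10 == 120.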
+
+
+}
+
+
+#endif /* PAIR_INFO_FILLER_HPP_ */
diff --git a/src/modules/paired_info/pair_info_filters.hpp b/src/common/paired_info/pair_info_filters.hpp
similarity index 100%
rename from src/modules/paired_info/pair_info_filters.hpp
rename to src/common/paired_info/pair_info_filters.hpp
diff --git a/src/common/paired_info/pair_info_improver.hpp b/src/common/paired_info/pair_info_improver.hpp
new file mode 100644
index 0000000..8b6ccfc
--- /dev/null
+++ b/src/common/paired_info/pair_info_improver.hpp
@@ -0,0 +1,280 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+#include "split_path_constructor.hpp"
+#include "paired_info/paired_info_helpers.hpp"
+#include "assembly_graph/paths/path_utils.hpp"
+#include <math.h>
+#include <io/reads/read_processor.hpp>
+
+namespace debruijn_graph {
+
+inline bool ClustersIntersect(omnigraph::de::Point p1, omnigraph::de::Point p2) {
+ return math::le(p1.d, p2.d + p1.var + p2.var) &&
+ math::le(p2.d, p1.d + p1.var + p2.var);
+}
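+
+// Example (illustrative only): p1 = {d: 100, var: 5} and p2 = {d: 108, var: 4} intersect,
+// since |100 - 108| = 8 <= 5 + 4 = 9; moving p2 to d = 112 would break the intersection.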
+
+
+//todo move out
+template<class Graph>
+class ParallelEdgeProcessor {
+ class ConstEdgeIteratorWrapper {
+ public:
+ typedef typename Graph::EdgeId ReadT;
+
+ ConstEdgeIteratorWrapper(const Graph &g)
+ : it_(g) {}
+
+ bool eof() const { return it_.IsEnd(); }
+
+ ConstEdgeIteratorWrapper& operator>>(typename Graph::EdgeId &val) {
+ val = *it_;
+ ++it_;
+ return *this;
+ }
+
+ private:
+ ConstEdgeIterator<Graph> it_;
+ };
+
+public:
+ ParallelEdgeProcessor(const Graph &g, unsigned nthreads)
+ : rp_(nthreads), it_(g) {}
+
+ template <class Processor>
+ bool Run(Processor &op) { return rp_.Run(it_, op); }
+
+ bool IsEnd() const { return it_.eof(); }
+ size_t processed() const { return rp_.processed(); }
+
+private:
+ hammer::ReadProcessor rp_;
+ ConstEdgeIteratorWrapper it_;
+};
+
+template<class Graph>
+static
+bool TryToAddPairInfo(omnigraph::de::PairedInfoIndexT<Graph>& clustered_index,
+ typename Graph::EdgeId e1, typename Graph::EdgeId e2,
+ const omnigraph::de::Point& point_to_add) {
+ auto histogram = clustered_index.Get(e1, e2);
+ for (auto i : histogram) {
+ if (ClustersIntersect(i, point_to_add))
+ return false;
+ }
+
+ clustered_index.Add(e1, e2, point_to_add);
+ return true;
+}
+
+template<class Graph>
+class PairInfoImprover {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef std::vector<omnigraph::de::PairInfo<EdgeId> > PairInfos;
+ typedef std::pair<EdgeId, EdgeId> EdgePair;
+ typedef omnigraph::de::PairedInfoIndexT<Graph> Index;
+
+ public:
+ PairInfoImprover(const Graph& g,
+ Index& clustered_index,
+ const io::SequencingLibrary<config::DataSetData> &lib, size_t max_repeat_length)
+ : graph_(g), index_(clustered_index), lib_(lib), max_repeat_length_(max_repeat_length) { }
+
+ void ImprovePairedInfo(unsigned num_threads = 1) {
+ CorrectPairedInfo(num_threads);
+ CorrectPairedInfo(num_threads);
+ }
+
+ private:
+ void CorrectPairedInfo(unsigned nthreads) {
+ size_t missing_paired_info_count = 0;
+ size_t extra_paired_info_count = 0;
+ extra_paired_info_count = RemoveContradictional(nthreads);
+ missing_paired_info_count = FillMissing(nthreads);
+
+ INFO("Paired info stats: missing = " << missing_paired_info_count
+ << "; contradictional = " << extra_paired_info_count);
+ }
+
+ class ContradictionalRemover {
+ public:
+ ContradictionalRemover(omnigraph::de::PairedInfoIndicesT<Graph> &to_remove,
+ const Graph &g,
+ omnigraph::de::PairedInfoIndexT<Graph>& index, size_t max_repeat_length)
+ : to_remove_(to_remove), graph_(g), index_(index), max_repeat_length_(max_repeat_length) {}
+
+ bool operator()(std::unique_ptr<EdgeId> e) {
+ omnigraph::de::PairedInfoIndexT<Graph> &to_remove = to_remove_[omp_get_thread_num()];
+
+ if (graph_.length(*e)>= max_repeat_length_ && index_.contains(*e))
+ FindInconsistent(*e, to_remove);
+
+ return false;
+ }
+
+ private:
+ bool IsConsistent(EdgeId /*e*/, EdgeId e1, EdgeId e2,
+ const omnigraph::de::Point& p1, const omnigraph::de::Point& p2) const {
+ if (math::le(p1.d, 0.f) || math::le(p2.d, 0.f) || math::gr(p1.d, p2.d))
+ return true;
+
+ double pi_dist = p2.d - p1.d;
+ int first_length = (int) graph_.length(e1);
+ double var = p1.var + p2.var;
+
+ TRACE(" PI " << p1 << " tr " << omp_get_thread_num());
+ TRACE("vs PI " << p2 << " tr " << omp_get_thread_num());
+
+ if (math::le(pi_dist, first_length + var) &&
+ math::le((double)first_length, pi_dist + var)) {
+ if (graph_.EdgeEnd(e1) == graph_.EdgeStart(e2))
+ return true;
+
+ auto paths = GetAllPathsBetweenEdges(graph_, e1, e2, 0, (size_t) ceil(pi_dist - first_length + var));
+ return (paths.size() > 0);
+ } else {
+ if (math::gr(p2.d, p1.d + omnigraph::de::DEDistance(first_length))) {
+ auto paths = GetAllPathsBetweenEdges(graph_, e1, e2,
+ (size_t) floor(pi_dist - first_length - var),
+ (size_t) ceil(pi_dist - first_length + var));
+ return (paths.size() > 0);
+ }
+ return false;
+ }
+ }
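+
+ // Illustrative numbers for the check above: with first_length = length(e1) = 1000,
+ // p1.d = 500, p2.d = 1600 and var = 50, pi_dist = 1100 exceeds first_length + var, so the
+ // pair is consistent only if some path of length in [1100 - 1000 - 50, 1100 - 1000 + 50] = [50, 150]
+ // connects e1 to e2.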
+
+ // Checks the consistency of edge pairs (base_edge, e1) and (base_edge, e2) over all neighbours of base_edge,
+ // collecting the weaker point of each inconsistent pair into `pi`
+ void FindInconsistent(EdgeId base_edge,
+ Index& pi) const {
+ for (auto i1 : index_.Get(base_edge)) {
+ auto e1 = i1.first;
+ for (auto i2 : index_.Get(base_edge)) {
+ auto e2 = i2.first;
+ if (e1 == e2)
+ continue;
+ for (auto p1 : i1.second) {
+ for (auto p2 : i2.second) {
+ if (!IsConsistent(base_edge, e1, e2, p1, p2)) {
+ if (p1.lt(p2))
+ pi.Add(base_edge, e1, p1);
+ else
+ pi.Add(base_edge, e2, p2);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ omnigraph::de::PairedInfoIndicesT<Graph> &to_remove_;
+ const Graph &graph_;
+ Index& index_;
+ size_t max_repeat_length_;
+ };
+
+ size_t RemoveContradictional(unsigned nthreads) {
+ size_t cnt = 0;
+
+ omnigraph::de::PairedInfoIndicesT<Graph> to_remove(graph_, nthreads);
+
+ // FIXME: Replace with lambda
+ ContradictionalRemover remover(to_remove, graph_, index_, max_repeat_length_);
+ ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(remover);
+
+ DEBUG("ParallelRemoveContraditional: Threads finished");
+
+ DEBUG("Merging maps");
+ for (size_t i = 1; i < nthreads; ++i) {
+ to_remove[0].Merge(to_remove[i]);
+ to_remove[i].clear();
+ }
+ DEBUG("Resulting size " << to_remove[0].size());
+
+ DEBUG("Deleting paired infos, liable to removing");
+ for (auto I = omnigraph::de::half_pair_begin(to_remove[0]);
+ I != omnigraph::de::half_pair_end(to_remove[0]); ++I) {
+ cnt += DeleteIfExist(I.first(), I.second(), *I);
+ }
+ to_remove[0].clear();
+
+ DEBUG("Size of index " << index_.size());
+ DEBUG("ParallelRemoveContraditional: Clean finished");
+ return cnt;
+
+ }
+
+ size_t FillMissing(unsigned nthreads) {
+ DEBUG("Fill missing: Creating indexes");
+ const size_t NUM_CHUNKS = nthreads * 16;
+ omnigraph::de::PairedInfoIndicesT<Graph> to_add(graph_, NUM_CHUNKS);
+
+ SplitPathConstructor<Graph> spc(graph_);
+ IterationHelper<Graph, EdgeId> edges(graph_);
+ auto iters = edges.Chunks(NUM_CHUNKS);
+
+ DEBUG("Fill missing: Start threads");
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < iters.size() - 1; ++i) {
+ TRACE("Processing chunk #" << i);
+ for (auto e = iters[i]; e != iters[i + 1]; ++e) {
+ TRACE("Checking for edge " << *e);
+ auto paths = spc.ConvertPIToSplitPaths(*e, index_,
+ lib_.data().mean_insert_size,
+ lib_.data().insert_size_deviation);
+ for (const auto &path : paths) {
+ TRACE("Path " << path.PrintPath(graph_));
+ for (const auto &pi : path)
+ TryToAddPairInfo(to_add[i], pi.first, pi.second, pi.point);
+ }
+ }
+ }
+ //ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(filler);
+ DEBUG("Fill missing: Threads finished");
+
+ size_t cnt = 0;
+ for (size_t i = 0; i < iters.size() - 1; ++i) {
+ DEBUG("Adding map #" << i);
+ for (auto I = omnigraph::de::half_pair_begin(to_add[i]);
+ I != omnigraph::de::half_pair_end(to_add[i]);
+ ++I) {
+ EdgeId e1 = I.first();
+ EdgeId e2 = I.second();
+ for (auto p : *I)
+ cnt += TryToAddPairInfo(index_, e1, e2, p);
+ }
+ to_add[i].clear();
+ }
+
+ DEBUG("Size of paired index " << index_.size());
+
+ DEBUG("Fill missing: Clean finished");
+ DEBUG("Added " << cnt);
+ return cnt;
+ }
+
+ private:
+ size_t DeleteIfExist(EdgeId e1, EdgeId e2, const typename Index::HistProxy& infos) {
+ size_t cnt = 0;
+ for (auto point : infos) {
+ cnt += index_.Remove(e1, e2, point);
+ TRACE("cnt += " << cnt);
+ }
+
+ return cnt;
+ }
+
+ const Graph& graph_;
+ Index& index_;
+ const io::SequencingLibrary<config::DataSetData>& lib_;
+ size_t max_repeat_length_;
+ DECL_LOGGER("PairInfoImprover")
+};
+
+}
diff --git a/src/common/paired_info/paired_info.hpp b/src/common/paired_info/paired_info.hpp
new file mode 100644
index 0000000..0bba662
--- /dev/null
+++ b/src/common/paired_info/paired_info.hpp
@@ -0,0 +1,630 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "common/adt/iterator_range.hpp"
+#include <boost/iterator/iterator_facade.hpp>
+#include <btree/safe_btree_map.h>
+
+#include "paired_info_buffer.hpp"
+
+#include <type_traits>
+
+namespace omnigraph {
+
+namespace de {
+
+template<typename G, typename Traits, template<typename, typename> class Container>
+class PairedIndex : public PairedBuffer<G, Traits, Container> {
+ typedef PairedIndex<G, Traits, Container> self;
+ typedef PairedBuffer<G, Traits, Container> base;
+
+ typedef typename base::InnerHistogram InnerHistogram;
+ typedef typename base::InnerHistPtr InnerHistPtr;
+ typedef typename base::InnerPoint InnerPoint;
+
+ using typename base::EdgePair;
+
+public:
+ using typename base::Graph;
+ using typename base::EdgeId;
+ typedef typename base::InnerMap InnerMap;
+ typedef typename base::StorageMap StorageMap;
+ using typename base::Point;
+
+ typedef omnigraph::de::Histogram<Point> Histogram;
+
+ //--Data access types--
+
+ typedef typename StorageMap::const_iterator ImplIterator;
+
+ //---------------- Data accessing methods ----------------
+
+ /**
+ * @brief Underlying raw implementation data (for custom iterator helpers).
+ */
+ ImplIterator data_begin() const {
+ return this->storage_.begin();
+ }
+
+ /**
+ * @brief Underlying raw implementation data (for custom iterator helpers).
+ */
+ ImplIterator data_end() const {
+ return this->storage_.end();
+ }
+
+ /**
+ * @brief Smart proxy set representing a composite histogram of points between two edges.
+ * @detail You can work with the proxy just like any constant set.
+ * The only major difference is that it returns all constituent points by value,
+ * because some of them don't exist in the underlying sets and are
+ * restored from the conjugate info on-the-fly.
+ */
+ class HistProxy {
+
+ public:
+ /**
+ * @brief Iterator over a proxy set of points.
+ */
+ class Iterator: public boost::iterator_facade<Iterator, Point, boost::bidirectional_traversal_tag, Point> {
+
+ typedef typename InnerHistogram::const_iterator InnerIterator;
+
+ public:
+ Iterator(InnerIterator iter, DEDistance offset, bool back = false)
+ : iter_(iter), offset_(offset), back_(back)
+ {}
+
+ private:
+ friend class boost::iterator_core_access;
+
+ Point dereference() const {
+ auto i = iter_;
+ if (back_) --i;
+ Point result = Traits::Expand(*i, offset_);
+ if (back_)
+ result.d = -result.d;
+ return result;
+ }
+
+ void increment() {
+ back_ ? --iter_ : ++iter_;
+ }
+
+ void decrement() {
+ back_ ? ++iter_ : --iter_;
+ }
+
+ inline bool equal(const Iterator &other) const {
+ return iter_ == other.iter_ && back_ == other.back_;
+ }
+
+ InnerIterator iter_; //current position
+ DEDistance offset_; //edge length
+ bool back_;
+ };
+
+ /**
+ * @brief Returns a wrapper for a histogram.
+ */
+ HistProxy(const InnerHistogram& hist, DEDistance offset = 0, bool back = false)
+ : hist_(hist), offset_(offset), back_(back)
+ {}
+
+ /**
+ * @brief Returns a reference to a shared empty histogram (effectively a Null Object).
+ */
+ static const InnerHistogram& empty_hist() {
+ static InnerHistogram res;
+ return res;
+ }
+
+ Iterator begin() const {
+ return Iterator(back_ ? hist_.end() : hist_.begin(), offset_, back_);
+ }
+
+ Iterator end() const {
+ return Iterator(back_ ? hist_.begin() : hist_.end(), offset_, back_);
+ }
+
+ /**
+ * @brief Finds the point with the minimal distance.
+ */
+ Point min() const {
+ VERIFY(!empty());
+ return *begin();
+ }
+
+ /**
+ * @brief Finds the point with the maximal distance.
+ */
+ Point max() const {
+ VERIFY(!empty());
+ return *--end();
+ }
+
+ /**
+ * @brief Returns the copy of all points in a simple flat histogram.
+ */
+ Histogram Unwrap() const {
+ return Histogram(begin(), end());
+ }
+
+ size_t size() const {
+ return hist_.size();
+ }
+
+ bool empty() const {
+ return hist_.empty();
+ }
+
+ private:
+ const InnerHistogram& hist_;
+ DEDistance offset_;
+ bool back_;
+ };
+
+ typedef typename HistProxy::Iterator HistIterator;
+
+ //---- Traversing edge neighbours ----
+
+ using EdgeHist = std::pair<EdgeId, HistProxy>;
+
+ /**
+ * @brief A proxy map representing neighbourhood of an edge,
+ * where `Key` is the graph edge ID and `Value` is the proxy histogram.
+ * @detail You can work with the proxy just like with any constant map.
+ * The only major difference is that it returns all constituent pairs by value,
+ * because proxies are constructed on-the-fly.
+ */
+ class EdgeProxy {
+ public:
+
+ /**
+ * @brief Iterator over a proxy map.
+ * @detail For a full proxy, traverses both straight and conjugate pairs.
+ * For a half proxy, traverses only the lesser pairs of edges (i.e., (a,b) where (a,b) <= (b',a')).
+ */
+ class Iterator: public boost::iterator_facade<Iterator, EdgeHist, boost::forward_traversal_tag, EdgeHist> {
+
+ typedef typename InnerMap::const_iterator InnerIterator;
+
+ void Skip() { //For a half iterator, skip conjugate pairs
+ while (half_ && iter_ != stop_ && index_.GreaterPair(edge_, iter_->first))
+ ++iter_;
+ }
+
+ public:
+ Iterator(const PairedIndex &index, InnerIterator iter, InnerIterator stop, EdgeId edge, bool half)
+ : index_ (index)
+ , iter_(iter)
+ , stop_(stop)
+ , edge_(edge)
+ , half_(half)
+ {
+ Skip();
+ }
+
+ void increment() {
+ ++iter_;
+ Skip();
+ }
+
+ void operator=(const Iterator &other) {
+ //TODO: is this risky without an assertion?
+ //VERIFY(index_ == other.index_);
+ //We shouldn't reassign iterators from one index onto another
+ iter_ = other.iter_;
+ stop_ = other.stop_;
+ edge_ = other.edge_;
+ half_ = other.half_;
+ }
+
+ private:
+ friend class boost::iterator_core_access;
+
+ bool equal(const Iterator &other) const {
+ return iter_ == other.iter_;
+ }
+
+ EdgeHist dereference() const {
+ const auto& hist = *iter_->second;
+ return std::make_pair(iter_->first, HistProxy(hist, index_.CalcOffset(edge_)));
+ }
+
+ private:
+ const PairedIndex &index_; //TODO: get rid of this somehow
+ InnerIterator iter_, stop_;
+ EdgeId edge_;
+ bool half_;
+ };
+
+ EdgeProxy(const PairedIndex &index, const InnerMap& map, EdgeId edge, bool half = false)
+ : index_(index), map_(map), edge_(edge), half_(half)
+ {}
+
+ Iterator begin() const {
+ return Iterator(index_, map_.begin(), map_.end(), edge_, half_);
+ }
+
+ Iterator end() const {
+ return Iterator(index_, map_.end(), map_.end(), edge_, half_);
+ }
+
+ HistProxy operator[](EdgeId e2) const {
+ if (half_ && index_.GreaterPair(edge_, e2))
+ return HistProxy::empty_hist();
+ return index_.Get(edge_, e2);
+ }
+
+ bool empty() const {
+ return map_.empty();
+ }
+
+ private:
+ const PairedIndex& index_;
+ const InnerMap& map_;
+ EdgeId edge_;
+ //When false, represents all neighbours (consisting of both directly added data and "restored" conjugates).
+ //When true, proxifies only half of the added edges.
+ bool half_;
+ };
+
+ typedef typename EdgeProxy::Iterator EdgeIterator;
+
+ //---------------- Constructor ----------------
+
+ PairedIndex(const Graph &graph)
+ : base(graph)
+ {}
+
+private:
+ bool GreaterPair(EdgeId e1, EdgeId e2) const {
+ auto ep = std::make_pair(e1, e2);
+ return ep > this->ConjugatePair(ep);
+ }
+
+public:
+ /**
+ * @brief Adds all points from another index, using a fast merging strategy.
+ * Should be used instead of a point-by-point index merge.
+ */
+ template<class Buffer>
+ void Merge(Buffer& index_to_add) {
+ if (index_to_add.size() == 0)
+ return;
+
+ auto locked_table = index_to_add.lock_table();
+ for (auto& kvpair : locked_table) {
+ EdgeId e1_to_add = kvpair.first; auto& map_to_add = kvpair.second;
+
+ for (auto& to_add : map_to_add) {
+ EdgePair ep(e1_to_add, to_add.first), conj = this->ConjugatePair(e1_to_add, to_add.first);
+ if (ep > conj)
+ continue;
+
+ base::Merge(ep.first, ep.second, *to_add.second);
+ }
+ }
+ VERIFY(this->size() >= index_to_add.size());
+ }
+
+ template<class Buffer>
+ typename std::enable_if<std::is_convertible<typename Buffer::InnerMap, InnerMap>::value,
+ void>::type MoveAssign(Buffer& from) {
+ auto& base_index = this->storage_;
+ base_index.clear();
+ auto locked_table = from.lock_table();
+ for (auto& kvpair : locked_table) {
+ base_index[kvpair.first] = std::move(kvpair.second);
+ }
+ this->size_ = from.size();
+ }
+
+public:
+ //---------------- Data deleting methods ----------------
+
+ /**
+ * @brief Removes the specific entry from the index, and its conjugate.
+ * @warning Don't use it on an unclustered index, because hashmaps require set_deleted_item
+ * @return The number of deleted entries (0 if there wasn't such entry)
+ */
+ size_t Remove(EdgeId e1, EdgeId e2, Point p) {
+ InnerPoint point = Traits::Shrink(p, this->graph_.length(e1));
+
+ // First we remove the "non-owning" part
+ EdgePair minep, maxep;
+ std::tie(minep, maxep) = this->MinMaxConjugatePair({ e1, e2 });
+
+ size_t res = RemoveSingle(minep.first, minep.second, point);
+ size_t removed = (this->IsSelfConj(e1, e2) ? res : 2 * res);
+ this->size_ -= removed;
+
+ Prune(maxep.first, maxep.second);
+ Prune(minep.first, minep.second);
+
+ return removed;
+ }
+
+ /**
+ * @brief Removes the whole histogram from the index, and its conjugate.
+ * @warning Don't use it on an unclustered index, because hashmaps require set_deleted_item
+ * @return The number of deleted entries
+ */
+ size_t Remove(EdgeId e1, EdgeId e2) {
+ EdgePair minep, maxep;
+ std::tie(minep, maxep) = this->MinMaxConjugatePair({ e1, e2 });
+
+ size_t removed = RemoveAll(maxep.first, maxep.second);
+ removed += RemoveAll(minep.first, minep.second);
+ this->size_ -= removed;
+
+ return removed;
+ }
+
+ private:
+ void Prune(EdgeId e1, EdgeId e2) {
+ auto i1 = this->storage_.find(e1);
+ if (i1 == this->storage_.end())
+ return;
+
+ auto& map = i1->second;
+ auto i2 = map.find(e2);
+ if (i2 == map.end())
+ return;
+
+ if (!i2->second->empty())
+ return;
+
+ map.erase(e2);
+ if (map.empty())
+ this->storage_.erase(e1);
+ }
+
+ size_t RemoveSingle(EdgeId e1, EdgeId e2, InnerPoint point) {
+ auto i1 = this->storage_.find(e1);
+ if (i1 == this->storage_.end())
+ return 0;
+
+ auto& map = i1->second;
+ auto i2 = map.find(e2);
+ if (i2 == map.end())
+ return 0;
+
+ if (!i2->second->erase(point))
+ return 0;
+
+ return 1;
+ }
+
+ size_t RemoveAll(EdgeId e1, EdgeId e2) {
+ auto i1 = this->storage_.find(e1);
+ if (i1 == this->storage_.end())
+ return 0;
+ auto& map = i1->second;
+ auto i2 = map.find(e2);
+ if (i2 == map.end())
+ return 0;
+
+ size_t size_decrease = i2->second->size();
+ map.erase(i2);
+ if (map.empty()) //Prune empty maps
+ this->storage_.erase(i1);
+ return size_decrease;
+ }
+
+public:
+
+ /**
+ * @brief Removes all neighbourhood of an edge (all edges referring to it, and their histograms)
+ * @warning To preserve symmetry, it also deletes all conjugates, so the actual complexity is O(size).
+ * @return The number of deleted entries
+ */
+ size_t Remove(EdgeId edge) {
+ InnerMap &inner_map = this->storage_[edge];
+ std::vector<EdgeId> to_remove;
+ to_remove.reserve(inner_map.size());
+ size_t old_size = this->size();
+ for (const auto& ep : inner_map)
+ to_remove.push_back(ep.first);
+ for (auto e2 : to_remove)
+ this->Remove(edge, e2);
+ return old_size - this->size();
+ }
+
+private:
+ //When there is no such edge, returns a fake empty map for safety
+ const InnerMap& GetImpl(EdgeId e) const {
+ auto i = this->storage_.find(e);
+ if (i != this->storage_.end())
+ return i->second;
+ return empty_map_;
+ }
+
+ //When there is no such histogram, returns a fake empty histogram for safety
+ const InnerHistogram& GetImpl(EdgeId e1, EdgeId e2) const {
+ auto i = this->storage_.find(e1);
+ if (i != this->storage_.end()) {
+ auto j = i->second.find(e2);
+ if (j != i->second.end())
+ return *j->second;
+ }
+ return HistProxy::empty_hist();
+ }
+
+public:
+
+ /**
+ * @brief Returns a whole proxy map to the neighbourhood of some edge.
+ * @param e ID of starting edge
+ */
+ EdgeProxy Get(EdgeId e) const {
+ return EdgeProxy(*this, GetImpl(e), e);
+ }
+
+ /**
+ * @brief Returns a half proxy map to the neighbourhood of some edge.
+ * @param e ID of starting edge
+ */
+ EdgeProxy GetHalf(EdgeId e) const {
+ return EdgeProxy(*this, GetImpl(e), e, true);
+ }
+
+ /**
+ * @brief Operator alias of Get(id).
+ */
+ EdgeProxy operator[](EdgeId e) const {
+ return Get(e);
+ }
+
+ /**
+ * @brief Returns a histogram proxy for all points between two edges.
+ */
+ HistProxy Get(EdgeId e1, EdgeId e2) const {
+ return HistProxy(GetImpl(e1, e2), this->CalcOffset(e1));
+ }
+
+ /**
+ * @brief Operator alias of Get(e1, e2).
+ */
+ HistProxy operator[](EdgePair p) const {
+ return Get(p.first, p.second);
+ }
+
+ /**
+ * @brief Checks whether an edge (or its conjugate twin) is contained in the index.
+ */
+ bool contains(EdgeId edge) const {
+ return this->storage_.count(edge) + this->storage_.count(this->graph_.conjugate(edge)) > 0;
+ }
+
+ /**
+ * @brief Checks whether there is a histogram for a pair of edges (or their conjugate pair).
+ */
+ bool contains(EdgeId e1, EdgeId e2) const {
+ auto i1 = this->storage_.find(e1);
+ if (i1 != this->storage_.end() && i1->second.count(e2))
+ return true;
+ return false;
+ }
+
+ /**
+ * @brief Initializes the index with graph data. For each edge, adds a self-loop point with zero weight.
+ * @warning Do not call this on non-empty indexes.
+ */
+ void Init() {
+ //VERIFY(size() == 0);
+ for (auto it = this->graph_.ConstEdgeBegin(); !it.IsEnd(); ++it)
+ this->Add(*it, *it, Point());
+ }
+
+private:
+ InnerMap empty_map_; //null object
+};
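+
+// Illustrative sketch (not part of the upstream sources) of the accessor API above;
+// `g`, `e1`, `e2` and the point `p` are assumed to exist.
+//
+//   PairedInfoIndexT<Graph> index(g);
+//   index.Add(e1, e2, p);                         // the conjugate point is inserted automatically
+//   for (const auto& neighbour : index.Get(e1)) { // EdgeProxy yields (EdgeId, HistProxy) pairs
+//       for (Point point : neighbour.second) { /* ... */ }
+//   }
+//   if (index.contains(e1, e2))
+//       index.Remove(e1, e2);                     // also removes the conjugate histogram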
+
+template<class T>
+class NoLockingAdapter : public T {
+ public:
+ class locked_table {
+ public:
+ using iterator = typename T::iterator;
+ using const_iterator = typename T::const_iterator;
+
+ locked_table(T& table)
+ : table_(table) {}
+
+ iterator begin() { return table_.begin(); }
+ const_iterator begin() const { return table_.begin(); }
+ const_iterator cbegin() const { return table_.begin(); }
+
+ iterator end() { return table_.end(); }
+ const_iterator end() const { return table_.end(); }
+ const_iterator cend() const { return table_.end(); }
+
+ size_t size() const { return table_.size(); }
+
+ private:
+ T& table_;
+ };
+
+ // Nothing to lock here
+ locked_table lock_table() {
+ return locked_table(*this);
+ }
+};
+
+//Aliases for common graphs
+template<typename K, typename V>
+using safe_btree_map = NoLockingAdapter<btree::safe_btree_map<K, V>>; //Two-parameters wrapper
+template<typename Graph>
+using PairedInfoIndexT = PairedIndex<Graph, PointTraits, safe_btree_map>;
+
+template<typename K, typename V>
+using btree_map = NoLockingAdapter<btree::btree_map<K, V>>; //Two-parameters wrapper
+
+template<typename Graph>
+using UnclusteredPairedInfoIndexT = PairedIndex<Graph, RawPointTraits, btree_map>;
+
+/**
+ * @brief A collection of paired indexes which can be manipulated as one.
+ * Used as a convenient wrapper in parallel index processing.
+ */
+template<class Index>
+class PairedIndices {
+ typedef std::vector<Index> Storage;
+ Storage data_;
+
+public:
+ PairedIndices() {}
+
+ PairedIndices(const typename Index::Graph& graph, size_t lib_num) {
+ for (size_t i = 0; i < lib_num; ++i)
+ data_.emplace_back(graph);
+ }
+
+ /**
+ * @brief Initializes all indexes with zero points.
+ */
+ void Init() { for (auto& it : data_) it.Init(); }
+
+ /**
+ * @brief Clears all indexes.
+ */
+ void Clear() { for (auto& it : data_) it.clear(); }
+
+ Index& operator[](size_t i) { return data_[i]; }
+
+ const Index& operator[](size_t i) const { return data_[i]; }
+
+ size_t size() const { return data_.size(); }
+
+ typename Storage::iterator begin() { return data_.begin(); }
+ typename Storage::iterator end() { return data_.end(); }
+
+ typename Storage::const_iterator begin() const { return data_.begin(); }
+ typename Storage::const_iterator end() const { return data_.end(); }
+};
+
+template<class Graph>
+using PairedInfoIndicesT = PairedIndices<PairedInfoIndexT<Graph>>;
+
+template<class Graph>
+using UnclusteredPairedInfoIndicesT = PairedIndices<UnclusteredPairedInfoIndexT<Graph>>;
+
+template<typename K, typename V>
+using unordered_map = NoLockingAdapter<std::unordered_map<K, V>>; //Two-parameters wrapper
+template<class Graph>
+using PairedInfoBuffer = PairedBuffer<Graph, RawPointTraits, unordered_map>;
+
+template<class Graph>
+using PairedInfoBuffersT = PairedIndices<PairedInfoBuffer<Graph>>;
+
+}
+
+}
diff --git a/src/common/paired_info/paired_info_buffer.hpp b/src/common/paired_info/paired_info_buffer.hpp
new file mode 100644
index 0000000..2c26c7d
--- /dev/null
+++ b/src/common/paired_info/paired_info_buffer.hpp
@@ -0,0 +1,227 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "histogram.hpp"
+#include "histptr.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+/**
+ * @brief Index of paired-read information. For each pair of edges, we store a so-called histogram, which is a set
+ * of points with the distances between those edges. The index is internally arranged as a map of maps of histograms:
+ * edge1 -> (edge2 -> histogram)
+ * When we add a point (a,b)->p into the index, we automatically insert a conjugate point (b',a')->p',
+ * (self-conjugate edge pairs are the sole exception), so the index is always conjugate-symmetrical.
+ * The index provides access to several kinds of information:
+ * - if you need to have a histogram between two edges, use Get(edge1, edge2);
+ * - if you need to get a neighbourhood of some edge (second edges with corresponding histograms), use Get(edge1);
+ * - if you need to skip a symmetrical half of that neighbourhood, use GetHalf(edge1);
+ * Backward information (e.g., (b,a)->-p) is currently inaccessible.
+ * @param G graph type
+ * @param Traits Policy-like structure with associated types of inner and resulting points, and how to convert between them
+ * @param C map-like container type (parameterized by key and value type)
+ */
+
+template<class Derived, class G, class Traits>
+class PairedBufferBase {
+ protected:
+ typedef typename Traits::Gapped InnerPoint;
+
+ public:
+ typedef G Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef std::pair<EdgeId, EdgeId> EdgePair;
+ typedef typename Traits::Expanded Point;
+
+ public:
+ PairedBufferBase(const Graph &g)
+ : size_(0), graph_(g) {}
+
+ //---------------- Data inserting methods ----------------
+ /**
+ * @brief Adds a point between two edges to the index,
+ * merging weights if there's already one with the same distance.
+ */
+ void Add(EdgeId e1, EdgeId e2, Point p) {
+ InnerPoint sp = Traits::Shrink(p, CalcOffset(e1));
+ InsertWithConj(e1, e2, sp);
+ }
+
+ /**
+ * @brief Adds a whole set of points between two edges to the index.
+ */
+ template<typename TH>
+ void AddMany(EdgeId e1, EdgeId e2, const TH& hist) {
+ for (auto p : hist) {
+ InnerPoint sp = Traits::Shrink(p, CalcOffset(e1));
+ InsertWithConj(e1, e2, sp);
+ }
+ }
+ //---------------- Miscellaneous ----------------
+
+ /**
+ * Returns the graph the index is based on. Needed for custom iterators.
+ */
+ const Graph &graph() const { return graph_; }
+
+ /**
+ * @brief Returns the physical index size (total count of all histograms).
+ */
+ size_t size() const { return size_; }
+
+ public:
+ /**
+ * @brief Returns a conjugate pair for two edges.
+ */
+ EdgePair ConjugatePair(EdgeId e1, EdgeId e2) const {
+ return std::make_pair(this->graph_.conjugate(e2), this->graph_.conjugate(e1));
+ }
+ /**
+ * @brief Returns a conjugate pair for a pair of edges.
+ */
+ EdgePair ConjugatePair(EdgePair ep) const {
+ return ConjugatePair(ep.first, ep.second);
+ }
+
+ private:
+ void InsertWithConj(EdgeId e1, EdgeId e2, InnerPoint p) {
+ EdgePair minep, maxep;
+ std::tie(minep, maxep) = this->MinMaxConjugatePair({ e1, e2 });
+ bool selfconj = this->IsSelfConj(e1, e2);
+
+ auto res = static_cast<Derived*>(this)->InsertOne(minep.first, minep.second, p);
+ size_t added = (selfconj ? res.second : 2 * res.second);
+# pragma omp atomic
+ size_ += added;
+ if (res.first && !selfconj)
+ static_cast<Derived*>(this)->InsertHistView(maxep.first, maxep.second, res.first);
+ else if (selfconj) // This doubles the weight of self-conjugate pairs
+ static_cast<Derived*>(this)->InsertOne(minep.first, minep.second, p);
+ }
+
+ protected:
+ template<class OtherHist>
+ void Merge(EdgeId e1, EdgeId e2, const OtherHist &h) {
+ EdgePair minep, maxep;
+ std::tie(minep, maxep) = this->MinMaxConjugatePair({ e1, e2 });
+ bool selfconj = this->IsSelfConj(e1, e2);
+
+ auto res = static_cast<Derived*>(this)->InsertHist(minep.first, minep.second, h);
+ size_t added = (selfconj ? res.second : 2 * res.second);
+# pragma omp atomic
+ size_ += added;
+ if (res.first && !selfconj)
+ static_cast<Derived*>(this)->InsertHistView(maxep.first, maxep.second, res.first);
+ else if (selfconj) // This doubles the weight of self-conjugate pairs
+ static_cast<Derived*>(this)->InsertHist(minep.first, minep.second, h);
+ }
+
+ std::pair<EdgePair, EdgePair> MinMaxConjugatePair(EdgePair ep) const {
+ EdgePair conj = ConjugatePair(ep);
+
+ return (ep < conj ? std::make_pair(ep, conj) : std::make_pair(conj, ep));
+ }
+
+ bool IsSelfConj(EdgeId e1, EdgeId e2) const {
+ return e1 == this->graph_.conjugate(e2);
+ }
+
+ size_t CalcOffset(EdgeId e) const {
+ return this->graph().length(e);
+ }
+
+ protected:
+ size_t size_;
+ const Graph& graph_;
+};
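+
+// Note on the insertion logic above: Add(a, b, p) stores the point once under the smaller
+// (by EdgePair ordering) of (a, b) and its conjugate (b', a') and gives the other pair a
+// non-owning view of the same histogram, so size() grows by 2 per newly added point.
+// Self-conjugate pairs are the exception: the point is merged twice into a single histogram
+// (doubling its weight) and size() grows by 1.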
+
+
+template<typename G, typename Traits, template<typename, typename> class Container>
+class PairedBuffer : public PairedBufferBase<PairedBuffer<G, Traits, Container>,
+ G, Traits> {
+ typedef PairedBuffer<G, Traits, Container> self;
+ typedef PairedBufferBase<self, G, Traits> base;
+
+ friend class PairedBufferBase<self, G, Traits>;
+
+ protected:
+ using typename base::InnerPoint;
+ typedef omnigraph::de::Histogram<InnerPoint> InnerHistogram;
+ typedef omnigraph::de::StrongWeakPtr<InnerHistogram> InnerHistPtr;
+
+ public:
+ using typename base::Graph;
+ using typename base::EdgeId;
+ using typename base::EdgePair;
+ using typename base::Point;
+
+ typedef Container<EdgeId, InnerHistPtr> InnerMap;
+ typedef Container<EdgeId, InnerMap> StorageMap;
+
+ public:
+ PairedBuffer(const Graph &g)
+ : base(g) {
+ clear();
+ }
+
+ //---------------- Miscellaneous ----------------
+
+ /**
+ * @brief Clears the whole index. Used in merging.
+ */
+ void clear() {
+ storage_.clear();
+ this->size_ = 0;
+ }
+
+ typename StorageMap::locked_table lock_table() {
+ return storage_.lock_table();
+ }
+
+ private:
+ std::pair<typename InnerHistPtr::pointer, size_t> InsertOne(EdgeId e1, EdgeId e2, InnerPoint p) {
+ InnerMap& second = storage_[e1];
+ typename InnerHistPtr::pointer inserted = nullptr;
+ if (!second.count(e2)) {
+ inserted = new InnerHistogram();
+ second.insert(std::make_pair(e2, InnerHistPtr(inserted, /* owning */ true)));
+ }
+
+ size_t added = second[e2]->merge_point(p);
+
+ return { inserted, added };
+ }
+
+ template<class OtherHist>
+ std::pair<typename InnerHistPtr::pointer, size_t> InsertHist(EdgeId e1, EdgeId e2, const OtherHist &h) {
+ InnerMap& second = storage_[e1];
+ typename InnerHistPtr::pointer inserted = nullptr;
+ if (!second.count(e2)) {
+ inserted = new InnerHistogram();
+ second.insert(std::make_pair(e2, InnerHistPtr(inserted, /* owning */ true)));
+ }
+
+ size_t added = second[e2]->merge(h);
+
+ return { inserted, added };
+ }
+
+ void InsertHistView(EdgeId e1, EdgeId e2, typename InnerHistPtr::pointer p) {
+ auto res = storage_[e1].insert(std::make_pair(e2, InnerHistPtr(p, /* owning */ false)));
+ VERIFY_MSG(res.second, "Index insertion inconsistency");
+ }
+
+ protected:
+ StorageMap storage_;
+};
+
+} // namespace de
+
+} // namespace omnigraph
diff --git a/src/modules/paired_info/paired_info_helpers.hpp b/src/common/paired_info/paired_info_helpers.hpp
similarity index 100%
rename from src/modules/paired_info/paired_info_helpers.hpp
rename to src/common/paired_info/paired_info_helpers.hpp
diff --git a/src/common/paired_info/peak_finder.hpp b/src/common/paired_info/peak_finder.hpp
new file mode 100644
index 0000000..059c5ea
--- /dev/null
+++ b/src/common/paired_info/peak_finder.hpp
@@ -0,0 +1,385 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * PeakFinder.hpp
+ *
+ * Created on: Aug 15, 2011
+ * Author: alexeyka
+ */
+
+#ifndef PEAKFINDER_HPP_
+#define PEAKFINDER_HPP_
+
+#include "utils/verify.hpp"
+#include "data_divider.hpp"
+#include "paired_info.hpp"
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <complex>
+#include <cmath>
+
+namespace omnigraph{
+
+namespace de {
+
+template <class EdgeId>
+class PeakFinder {
+
+ typedef std::complex<double> complex_t;
+
+ public:
+ PeakFinder(const vector<PairInfo<EdgeId> >& data,
+ size_t begin,
+ size_t end,
+ size_t /*range*/,
+ size_t delta,
+ double percentage,
+ double der_thr) :
+ delta_(delta),
+ percentage_(percentage),
+ der_thr_(der_thr)
+ {
+ for (size_t i = begin; i < end; ++i) {
+ x_.push_back(rounded_d(data[i]));
+ y_.push_back(data[i].weight());
+ }
+ Init();
+ }
+
+ double weight() const {
+ return weight_;
+ }
+
+ double GetNormalizedWeight() const {
+ return weight_;
+ }
+
+ void PrintStats(string host) const {
+ for (size_t i = 0; i < data_len_; ++i)
+ DEBUG(host << (x_left_ + (int) i) << " " << hist_[i]);
+ }
+
+ void FFTSmoothing(double cutoff) {
+ VERIFY(data_len_ > 0);
+ if (data_len_ == 1) {
+ hist_[0] = y_[0];
+ return;
+ }
+ InitBaseline();
+ SubtractBaseline();
+ FFTForward(hist_);
+ size_t Ncrit = (size_t) (cutoff);
+
+ // cutting off - standard parabolic filter
+ for (size_t i = 0; i < data_len_ && i < Ncrit; ++i)
+ hist_[i] *= 1. - ((double) i * (double) i * 1.) / (double) (Ncrit * Ncrit);
+
+ for (size_t i = Ncrit; i < hist_.size(); ++i)
+ hist_[i] = 0.;
+
+ FFTBackward(hist_);
+ AddBaseline();
+ }
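+
+ // Illustrative numbers for the parabolic filter above: with cutoff = 16, harmonic 0 is kept
+ // unchanged (factor 1), harmonic 8 is damped by 1 - 64/256 = 0.75, and every harmonic from
+ // 16 upwards is zeroed before the inverse FFT.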
+
+ bool IsPeak(int dist, size_t range) const {
+ return IsLocalMaximum(dist, range);
+ }
+
+ bool IsPeak(int dist) const {
+ return IsLocalMaximum(dist, 10);
+ }
+
+ // looking for local maxima (peaks) in the smoothed histogram
+ vector<pair<int, double> > ListPeaks(/*int delta = 3*/) const {
+ TRACE("Smoothed data");
+ //size_t index_max = 0;
+ //for (size_t i = 0; i < data_len_; ++i) {
+ //TRACE(x_left_ + (int) i << " " << hist_[i]);
+ //if (hist_[i].real() > hist_[index_max].real())
+ //index_max = i;
+ //}
+ //vector<pair<int, double> > result;
+ //result.push_back(make_pair(x_left_ + index_max, hist_[index_max].real()));
+ //return result;
+ DEBUG("Listing peaks");
+ map<int, double> peaks_;
+ //another data_len_
+ size_t data_len_ = (size_t) (x_right_ - x_left_);
+ vector<bool> was;
+ srand((unsigned) time(NULL));
+ for (size_t i = 0; i < data_len_; ++i)
+ was.push_back(false);
+
+ size_t iteration = 0;
+ for (size_t l = 0; l < data_len_; ++l) {
+ //for (size_t k = 0; k < 4; ++k) {
+ //size_t v = std::rand() % data_len_;
+ size_t v = l;
+ if (was[v])
+ continue;
+
+ was[v] = true;
+ int index = (int) v + x_left_;
+ while (index < (x_right_ - 1) && index > x_left_ && iteration < 5) {
+ // if @index is local maximum, then leave it
+ double right_derivative = RightDerivative(index);
+ double left_derivative = LeftDerivative(index);
+
+ if (math::gr(right_derivative, 0.) && math::gr(right_derivative, -left_derivative)) {
+ index++;
+ if ((iteration & 1) == 0)
+ ++iteration;
+ }
+ else if (math::le(left_derivative, 0.)) {
+ index--;
+ if ((iteration & 1) == 1)
+ ++iteration;
+ }
+ else
+ break;
+ }
+
+ TRACE("FOUND " << index);
+
+ //double right_derivative = RightDerivative(index);
+ //double left_derivative = LeftDerivative(index);
+
+ if (index < 0)
+ continue;
+
+ //if (index >= x_right_ - delta || index < x_left_ + delta)
+ //continue;
+
+ TRACE("Is in range");
+
+ if (IsPeak(index, 5)) {
+ TRACE("Is local maximum " << index);
+ double weight_ = 0.;
+ int left_bound = (x_left_ > (index - 20) ? x_left_ : (index - 20));
+ int right_bound = (x_right_ < (index + 1 + 20) ? x_right_ : (index + 1 + 20));
+ for (int i = left_bound; i < right_bound; ++i)
+ weight_ += hist_[i - x_left_].real();
+ TRACE("WEIGHT counted");
+ pair<int, double> tmp_pair = make_pair(index, 100. * weight_);
+ if (!peaks_.count(index)) {
+ TRACE("Peaks size " << peaks_.size() << ", inserting " << tmp_pair);
+ peaks_.insert(tmp_pair);
+ } else {
+ TRACE("NON UNIQUE");
+ }
+ }
+ }
+ TRACE("FINISHED " << peaks_.size());
+ vector<pair<int, double> > peaks;
+ for (auto iter = peaks_.begin(); iter != peaks_.end(); ++iter) {
+ const pair<int, double>& tmp_pair = *iter;
+ TRACE("next peak " << tmp_pair);
+ peaks.push_back(tmp_pair);
+ //for (int i = -10; i <= 10; ++i) {
+ //peaks.push_back(make_pair(tmp_pair.first + i, tmp_pair.second / 21.));
+ //}
+ }
+ return peaks;
+ }
+
+ vector<complex_t> getIn() const {
+ return hist_;
+ }
+
+ vector<complex_t> getOut() const {
+ return hist_;
+ }
+
+private:
+ double x1, x2, y1, y2;
+ size_t delta_;
+ double percentage_;
+ double der_thr_;
+ double weight_;
+ vector<int> x_;
+ vector<double> y_;
+ size_t data_size_, data_len_;
+ int x_left_, x_right_;
+ vector<complex_t> hist_;
+
+ size_t Rev(size_t num, size_t lg_n) {
+ size_t res = 0;
+ for (size_t i = 0; i < lg_n; ++i)
+ if (num & (1 << i))
+ res |= 1 << (lg_n - 1 - i);
+ return res;
+ }
+
+ void FFT(vector<complex_t>& vect, bool invert) {
+ size_t n = vect.size();
+ size_t lg_n = 0;
+ while ( (1u << lg_n) < n)
+ ++lg_n;
+
+ while (n < (1u << lg_n)) {
+ vect.push_back(0.);
+ ++n;
+ }
+
+ for (size_t i = 0; i < n; ++i)
+ if (i < Rev(i, lg_n))
+ swap(vect[i], vect[Rev(i, lg_n)]);
+
+ for (size_t len = 2; len < 1 + n; len <<= 1) {
+ double ang = 2 * M_PI / (double) len * (invert ? -1 : 1);
+ complex_t wlen(cos(ang), sin(ang));
+ for (size_t i = 0; i < n; i += len) {
+ complex_t w(1.);
+ for (size_t j = 0; j < (len >> 1); ++j) {
+ complex_t u = vect[i + j];
+ complex_t v = vect[i + j + (len >> 1)] * w;
+ vect[i + j] = u + v;
+ vect[i + j + (len >> 1)] = u - v;
+ w *= wlen;
+ }
+ }
+ }
+
+ if (invert)
+ for (size_t i = 0; i < n; ++i)
+ vect[i] /= (double) n;
+ }
+
+
+ void FFTForward(vector<complex_t>& vect) {
+ FFT(vect, false);
+ }
+
+ void FFTBackward(vector<complex_t>& vect) {
+ FFT(vect, true);
+ }
+
+ void ExtendLinear(vector<complex_t>& hist) {
+ size_t ind = 0;
+ weight_ = 0.;
+ for (size_t i = 0; i < data_len_; ++i) {
+ if (ind == data_size_ - 1)
+ hist.push_back((double) x_right_);
+ else {
+ VERIFY(x_[ind + 1] > x_[ind]);
+ hist.push_back(((double) (i + x_left_ - x_[ind]) *
+ y_[ind + 1] + y_[ind] *
+ (double) (x_[ind + 1] - i - x_left_)) /
+ (double) (1 * (x_[ind + 1] - x_[ind])));
+ }
+ weight_ += hist[i].real(); // filling the array on the fly
+
+ if (ind < data_size_ && ((int) i == x_[ind + 1] - x_left_))
+ ++ind;
+ }
+
+ }
+
+
+ void InitBaseline() {
+ size_t Np = (size_t) ((double) data_len_ * percentage_);
+ if (Np == 0) Np++; // ensure Np is non-zero
+
+ double mean_beg = 0.;
+ double mean_end = 0.;
+ for (size_t i = 0; i < Np; ++i) {
+ mean_beg += hist_[i].real();
+ mean_end += hist_[data_len_ - i - 1].real();
+ }
+ mean_beg /= 1. * (double) Np;
+ mean_end /= 1. * (double) Np;
+
+ // two points defining the line
+ x1 = (double) Np / 2.;
+ x2 = (double) data_len_ - (double) Np / 2.;
+ y1 = mean_beg;
+ y2 = mean_end;
+ }
+
+ void SubtractBaseline() {
+ // subtracting a baseline
+ // it is constructed as the line through two points: (Np/2; mean of the first percentage of the data)
+ // and (data_len_ - Np/2; mean of the last percentage of the data)
+ for (size_t i = 0; i < data_len_; ++i) {
+ hist_[i] -= (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
+ }
+ }
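+
+ // Example (illustrative only): with percentage_ = 0.04 and data_len_ = 100, Np = 4, so the
+ // baseline is the line through (x1, y1) = (2, mean of the first 4 bins) and
+ // (x2, y2) = (98, mean of the last 4 bins).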
+
+ void AddBaseline() {
+ for (size_t i = 0; i < data_len_; ++i) {
+ hist_[i] += (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
+ }
+ }
+
+ void Init() {
+ data_size_ = x_.size();
+ x_left_ = x_[0];
+ x_right_ = x_[data_size_ - 1] + 1;
+ data_len_ = x_right_ - x_left_;
+ ExtendLinear(hist_);
+ }
+
+ bool IsInRange(int peak) const {
+ return peak < x_right_ && peak >= x_left_;
+ }
+
+ double LeftDerivative(int dist) const {
+ VERIFY(dist > x_left_);
+ return hist_[dist - x_left_].real() - hist_[dist - x_left_ - 1].real();
+ }
+
+ double RightDerivative(int dist) const {
+ VERIFY(dist < x_right_ - 1);
+ return hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_].real();
+ }
+
+ double MiddleDerivative(int dist) const {
+ VERIFY(dist > x_left_ && dist < x_right_ - 1);
+ return .5 * (hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_ - 1].real());
+ }
+
+ double Derivative(int dist) const {
+ if (dist == x_right_ - 1)
+ return LeftDerivative(dist);
+ else if (dist == x_left_)
+ return RightDerivative(dist);
+ else
+ return MiddleDerivative(dist);
+ }
+
+ bool IsLocalMaximum(int peak, size_t range, int left_bound, int right_bound, size_t delta) const {
+
+ DEBUG("Is local maximum : peak " << peak << " range " << range
+ << " bounds " << left_bound << " " << right_bound << " delta " << delta);
+ int index_max = peak;
+ TRACE("Looking for the maximum");
+ for (int j = left_bound; j < right_bound; ++j)
+ if (math::ls(hist_[index_max - x_left_].real(), hist_[j - x_left_].real())) {
+ index_max = j;
+ }// else if (j < i && hist_[index_max - x_left_][0] == hist_[j - x_left][0] ) index_max = j;
+ TRACE("Maximum is " << index_max);
+
+ if ((size_t)abs(index_max - peak) <= delta)
+ return true;
+
+ return false;
+ }
+
+ bool IsLocalMaximum(int peak, size_t range) const {
+ return IsLocalMaximum(peak, range, x_left_, x_right_, delta_);
+ }
+
+ DECL_LOGGER("PeakFinder");
+};
+
+}
+
+}
+
+#endif /* PEAKFINDER_HPP_ */
diff --git a/src/common/paired_info/smoothing_distance_estimation.hpp b/src/common/paired_info/smoothing_distance_estimation.hpp
new file mode 100644
index 0000000..c605e00
--- /dev/null
+++ b/src/common/paired_info/smoothing_distance_estimation.hpp
@@ -0,0 +1,283 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SMOOTHING_DISTANCE_ESTIMATION_HPP_
+#define SMOOTHING_DISTANCE_ESTIMATION_HPP_
+
+#include "paired_info.hpp"
+#include "data_divider.hpp"
+#include "peak_finder.hpp"
+#include "weighted_distance_estimation.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class Graph>
+class SmoothingDistanceEstimator : public WeightedDistanceEstimator<Graph> {
+ //FIXME configure
+ static const size_t OVERLAP_TOLERANCE = 1000;
+protected:
+ typedef WeightedDistanceEstimator<Graph> base;
+ typedef typename base::InPairedIndex InPairedIndex;
+ typedef typename base::OutPairedIndex OutPairedIndex;
+ typedef typename base::InHistogram InHistogram;
+ typedef typename base::OutHistogram OutHistogram;
+ typedef typename InPairedIndex::Histogram TempHistogram;
+
+public:
+ SmoothingDistanceEstimator(const Graph &graph,
+ const InPairedIndex &histogram,
+ const GraphDistanceFinder<Graph> &dist_finder,
+ std::function<double(int)> weight_f,
+ size_t linkage_distance, size_t max_distance, size_t threshold,
+ double range_coeff, double delta_coeff,
+ size_t cutoff,
+ size_t min_peak_points,
+ double inv_density,
+ double percentage,
+ double derivative_threshold,
+ bool only_scaffolding = false) :
+ base(graph, histogram, dist_finder, weight_f, linkage_distance, max_distance),
+ threshold_(threshold),
+ range_coeff_(range_coeff),
+ delta_coeff_(delta_coeff),
+ cutoff_((int) cutoff),
+ min_peak_points_(min_peak_points),
+ inv_density_(inv_density),
+ percentage_(percentage),
+ deriv_thr(derivative_threshold),
+ only_scaffolding_(only_scaffolding),
+ gap_distances(0) { }
+
+ virtual ~SmoothingDistanceEstimator() { }
+
+protected:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<pair<int, double> > EstimHist;
+ typedef vector<PairInfo<EdgeId> > PairInfos;
+ typedef vector<size_t> GraphLengths;
+
+ EstimHist EstimateEdgePairDistances(EdgePair /*ep*/,
+ const InHistogram & /*raw_data*/,
+ const vector<size_t> & /*forward*/) const override {
+ VERIFY_MSG(false, "Sorry, the SMOOOOTHING estimator is not available anymore." <<
+ "SPAdes is going to terminate");
+
+ return EstimHist();
+ }
+
+private:
+ typedef pair<size_t, size_t> Interval;
+
+ size_t threshold_;
+ double range_coeff_;
+ double delta_coeff_;
+ int cutoff_;
+ size_t min_peak_points_;
+ double inv_density_;
+ double percentage_;
+ double deriv_thr;
+ bool only_scaffolding_;
+ mutable size_t gap_distances;
+
+ EstimHist FindEdgePairDistances(EdgePair ep,
+ const TempHistogram &raw_hist) const {
+ size_t first_len = this->graph().length(ep.first);
+ size_t second_len = this->graph().length(ep.second);
+ TRACE("Lengths are " << first_len << " " << second_len);
+ TempHistogram data;
+ for (auto I = raw_hist.begin(), E = raw_hist.end(); I != E; ++I) {
+ Point p = *I;
+ if (math::ge(2 * (long) rounded_d(p) + (long) second_len, (long) first_len) &&
+ (long) rounded_d(p) + (long) OVERLAP_TOLERANCE >= (long) first_len)
+ data.insert(p);
+ }
+ EstimHist result;
+ double picture_weight = 0.;
+ for (auto I = data.begin(), E = data.end(); I != E; ++I)
+ picture_weight += I->weight;
+ if (math::ls(picture_weight, 3.))
+ return result;
+
+ DataDivider<EdgeId> data_divider(threshold_,
+ vector<Point>(data.begin(), data.end()));
+
+ PairInfos infos;
+ infos.reserve(data.size());
+ const vector<Interval> &clusters =
+ data_divider.DivideAndSmoothData(ep, infos, this->weight_f_);
+ DEBUG("Seeking for distances");
+ TRACE("size " << infos.size());
+
+ for (size_t i = 0; i < clusters.size(); ++i) {
+ size_t begin = clusters[i].first;
+ size_t end = clusters[i].second;
+ TRACE("begin " << begin << " at " << rounded_d(infos[begin])
+ << ", " << " end " << end << " at " << rounded_d(infos[end - 1]));
+ size_t data_length = rounded_d(infos[end - 1]) - rounded_d(infos[begin]) + 1;
+ TRACE("data length " << data_length);
+ if (end - begin > min_peak_points_) {
+ size_t range = (size_t) math::round((double) data_length * range_coeff_);
+ size_t delta = (size_t) math::round((double) data_length * delta_coeff_);
+ PeakFinder<EdgeId> peakfinder(infos, begin, end, range, delta, percentage_, deriv_thr);
+ DEBUG("Processing window : " << rounded_d(infos[begin])
+ << " " << rounded_d(infos[end - 1]));
+ peakfinder.FFTSmoothing(cutoff_);
+ TRACE("Listing peaks");
+ const EstimHist &peaks = peakfinder.ListPeaks();
+ //for (auto iter = peaks.begin(); iter != peaks.end(); ++iter) {
+ //TRACE("PEAKS " << iter->first << " " << iter->second);
+ //}
+ if (peaks.size() == 0)
+ continue;
+ size_t index_of_max_weight = 0;
+ for (size_t i = 0; i < peaks.size(); ++i)
+ if (math::ls(peaks[index_of_max_weight].second, peaks[i].second))
+ index_of_max_weight = i;
+ result.push_back(peaks[index_of_max_weight]);
+ }
+ }
+
+ if (result.size() == 0)
+ return result;
+ size_t index_of_max_weight = 0;
+ for (size_t i = 0; i < result.size(); ++i)
+ if (math::ls(result[index_of_max_weight].second, result[i].second))
+ index_of_max_weight = i;
+
+ EstimHist new_result;
+ for (size_t i = 0; i < result.size(); ++i)
+ if (result[i].second > .5 * result[index_of_max_weight].second)
+ new_result.push_back(result[i]);
+ return new_result;
+ }
+
+ void ProcessEdge(EdgeId e1,
+ const InPairedIndex &pi,
+ PairedInfoBuffer<Graph> &result) const override {
+ typename base::LengthMap second_edges;
+ auto inner_map = pi.GetHalf(e1);
+ for (auto I : inner_map)
+ second_edges[I.first];
+
+ this->FillGraphDistancesLengths(e1, second_edges);
+
+ for (const auto &entry: second_edges) {
+ EdgeId e2 = entry.first;
+ EdgePair ep(e1, e2);
+
+ VERIFY(ep <= pi.ConjugatePair(ep));
+
+ TRACE("Processing edge pair " << this->graph().int_id(e1)
+ << " " << this->graph().int_id(e2));
+ const GraphLengths &forward = entry.second;
+
+ auto hist = pi.Get(e1, e2).Unwrap();
+ EstimHist estimated;
+ //DEBUG("Extending paired information");
+ //DEBUG("Extend left");
+ //this->base::ExtendInfoLeft(e1, e2, hist, 1000);
+ DEBUG("Extend right");
+ this->ExtendInfoRight(e1, e2, hist, 1000);
+ if (forward.size() == 0) {
+ estimated = FindEdgePairDistances(ep, hist);
+ ++gap_distances;
+ } else if (forward.size() > 0 && (!only_scaffolding_)) {
+ //TODO: remove THIS
+ InPairedIndex temp_index(this->graph());
+ temp_index.AddMany(e1, e2, hist);
+ auto hist = temp_index.Get(e1, e2);
+ estimated = this->base::EstimateEdgePairDistances(ep, hist, forward);
+ }
+ DEBUG(gap_distances << " distances between gap edge pairs have been found");
+ OutHistogram res = this->ClusterResult(ep, estimated);
+ this->AddToResult(res, ep, result);
+ }
+ }
+
+ bool IsTipTip(EdgeId e1, EdgeId e2) const {
+ return (this->graph().OutgoingEdgeCount(this->graph().EdgeEnd(e1)) == 0 &&
+ this->graph().IncomingEdgeCount(this->graph().EdgeEnd(e1)) == 1 &&
+ this->graph().IncomingEdgeCount(this->graph().EdgeStart(e2)) == 0 &&
+ this->graph().OutgoingEdgeCount(this->graph().EdgeStart(e2)) == 1);
+ }
+
+ void ExtendInfoRight(EdgeId e1, EdgeId e2, TempHistogram &data, size_t max_shift) const {
+ ExtendRightDFS(e1, e2, data, 0, max_shift);
+ }
+
+ void MergeInto(const InHistogram &what, TempHistogram &where, int shift) const {
+ // assuming they are sorted already
+ if (what.size() == 0)
+ return;
+
+ if (where.size() == 0) {
+ for (auto to_be_added : what) {
+ to_be_added.d += shift;
+ where.insert(to_be_added);
+ }
+
+ return;
+ }
+
+ // Check, whether two histograms intersect. If not, we can just merge them
+ // straightforwardly.
+ if (math::ls(where.rbegin()->d, what.min().d + float(shift)) ||
+ math::gr(where.begin()->d, what.max().d + float(shift))) {
+ for (auto to_be_added : what) {
+ to_be_added.d += shift;
+ where.insert(to_be_added);
+ }
+ } else {
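+ // The histograms overlap: merge point by point, summing weights of entries that land on the same shifted distance.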
+ for (auto to_be_added : what) {
+ to_be_added.d += shift;
+ auto low_bound = std::lower_bound(where.begin(), where.end(), to_be_added);
+ if (low_bound != where.end() && to_be_added == *low_bound) {
+ to_be_added.weight += low_bound->weight;
+ where.erase(to_be_added);
+ where.insert(to_be_added);
+ } else
+ where.insert(low_bound, to_be_added);
+ }
+ }
+ }
+
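+ // DFS to the right of 'current': stop at the starting edge or at vertices with extra incoming edges;
+ // otherwise recurse while the accumulated length stays below max_shift and merge the paired info
+ // of (first, next) into 'data' with the corresponding (negative) shift.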
+ void ExtendRightDFS(const EdgeId &first, EdgeId current, TempHistogram &data, int shift,
+ size_t max_shift) const {
+ auto end = this->graph().EdgeEnd(current);
+ if (current == first)
+ return;
+ if (this->graph().IncomingEdgeCount(end) > 1)
+ return;
+
+ for (EdgeId next : this->graph().OutgoingEdges(end)) {
+ auto hist = this->index().Get(first, next);
+ if (-shift < (int) max_shift)
+ ExtendRightDFS(first, next, data, shift - (int) this->graph().length(current), max_shift);
+
+ //auto filtered_infos = FilterPositive(hist, this->graph().length(first), this->graph().length(next));
+ //if (filtered_infos.size() > 0)
+ // MergeInto(filtered_infos, data, shift - (int) this->graph().length(current));
+ MergeInto(hist, data, shift - (int) this->graph().length(current));
+ }
+ }
+
+ const string Name() const override {
+ static const string my_name = "SMOOTHING";
+ return my_name;
+ }
+
+ DECL_LOGGER("SmoothingDistanceEstimator")
+};
+
+}
+
+}
+
+#endif /* SMOOTHING_DISTANCE_ESTIMATION_HPP_ */
diff --git a/src/common/paired_info/split_path_constructor.hpp b/src/common/paired_info/split_path_constructor.hpp
new file mode 100644
index 0000000..d2d23b2
--- /dev/null
+++ b/src/common/paired_info/split_path_constructor.hpp
@@ -0,0 +1,142 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+* split_path_constructor.hpp
+*
+* Created on: Jun 14, 2012
+* Author: avsirotkin
+*/
+
+#pragma once
+
+#include <common/assembly_graph/paths/path_utils.hpp>
+#include "utils/logger/logger.hpp"
+#include "paired_info/paired_info.hpp"
+#include "assembly_graph/paths/path_processor.hpp"
+#include "paired_info/pair_info_bounds.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+class PathInfoClass {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
+
+ EdgeId base_edge;
+ vector<PairInfo> path;
+
+ PathInfoClass() : base_edge(NULL) {}
+
+ PathInfoClass(const EdgeId Edge) : base_edge(Edge) {}
+
+ std::pair<EdgeId, double> operator[](const size_t i) const {
+ if (i == 0)
+ return std::make_pair(base_edge, 0.0);
+
+ VERIFY(i < path.size() + 1);
+ return std::make_pair(path[i - 1].second, path[i - 1].d());
+ }
+
+ size_t size() const { return path.size() + 1; }
+
+ void push_back(const PairInfo &pi) { path.push_back(pi); }
+
+ typename std::vector<PairInfo>::const_iterator begin() const { return path.begin(); }
+
+ typename std::vector<PairInfo>::const_iterator end() const { return path.end(); }
+
+ std::string PrintPath(const Graph &graph) const {
+ std::ostringstream ss;
+ ss << " " << graph.int_id(base_edge) << ": ";
+ for (size_t j = 0; j < path.size(); j++) {
+ ss << "(" << graph.int_id(path[j].second) << ", " << path[j].d() << "), ";
+ }
+ return ss.str();
+ }
+};
+
+template<class Graph>
+class SplitPathConstructor {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PathInfoClass<Graph> PathInfo;
+ typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
+ static const size_t MAX_DIJKSTRA_DEPTH = 3000;
+public:
+ SplitPathConstructor(const Graph &graph) : graph_(graph) { }
+
+ vector<PathInfo> ConvertPIToSplitPaths(EdgeId cur_edge, const omnigraph::de::PairedInfoIndexT<Graph> &pi,
+ double is, double is_var) const {
+ vector<PairInfo> pair_infos; //TODO: this is an adaptor for the old implementation
+ for (auto i : pi.Get(cur_edge))
+ for (auto j : i.second)
+ pair_infos.emplace_back(cur_edge, i.first, j);
+ std::sort(pair_infos.begin(), pair_infos.end(), [](const PairInfo &p1, const PairInfo &p2) {
+ return (p1.point.d > p2.point.d || ((p1.point.d == p2.point.d) && (p1.second < p2.second)));
+ });
+ vector<PathInfo> result;
+ if (pair_infos.empty())
+ return result;
+
+ vector<bool> pair_info_used(pair_infos.size());
+ TRACE("Preparing path_processor for this base edge");
+ size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), (size_t) is, is_var);
+
+ //FIXME is path_upper_bound enough?
+
+ typename omnigraph::DijkstraHelper<Graph>::BoundedDijkstra dijkstra(
+ omnigraph::DijkstraHelper<Graph>::CreateBoundedDijkstra(graph_, path_upper_bound, MAX_DIJKSTRA_DEPTH));
+ dijkstra.Run(graph_.EdgeEnd(cur_edge));
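+ // Pair infos are sorted by decreasing distance, so iterating from the back processes the closest ones first;
+ // infos already explained by the common path of a previously processed info are skipped.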
+ for (size_t i = pair_infos.size(); i > 0; --i) {
+ const PairInfo &cur_info = pair_infos[i - 1];
+ if (math::le(cur_info.d(), 0.))
+ continue;
+ if (pair_info_used[i - 1])
+ continue;
+ DEBUG("SPC: pi " << cur_info);
+
+ vector<EdgeId> common_part = GetCommonPathsEnd(graph_, cur_edge, cur_info.second,
+ (size_t) (cur_info.d() - cur_info.var()),
+ (size_t) (cur_info.d() + cur_info.var()),
+ dijkstra);
+ DEBUG("Found common part of size " << common_part.size());
+ PathInfoClass<Graph> sub_res(cur_edge);
+ if (common_part.size() > 0) {
+ size_t total_length = 0;
+ for (size_t j = 0; j < common_part.size(); ++j)
+ total_length += graph_.length(common_part[j]);
+
+ DEBUG("Common part " << ToString(common_part));
+ for (size_t j = 0; j < common_part.size(); ++j) {
+ PairInfo cur_pi(cur_edge, common_part[j],
+ cur_info.d() - (double) total_length,
+ cur_info.weight(),
+ cur_info.var());
+
+ sub_res.push_back(cur_pi);
+ total_length -= graph_.length(common_part[j]);
+ for (size_t ind = 0; ind + 1 < i; ++ind) {
+ if (cur_pi == pair_infos[ind])
+ pair_info_used[ind] = true;
+ }
+ }
+ }
+
+ sub_res.push_back(cur_info);
+ result.push_back(sub_res);
+ DEBUG(sub_res.PrintPath(graph_));
+ }
+ return result;
+ }
+
+private:
+ const Graph &graph_;
+};
+
+
+}
diff --git a/src/common/paired_info/weighted_distance_estimation.hpp b/src/common/paired_info/weighted_distance_estimation.hpp
new file mode 100644
index 0000000..486a608
--- /dev/null
+++ b/src/common/paired_info/weighted_distance_estimation.hpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef WEIGHTED_DISTANCE_ESTIMATION_HPP_
+#define WEIGHTED_DISTANCE_ESTIMATION_HPP_
+
+#include "math/xmath.h"
+#include "paired_info.hpp"
+#include "distance_estimation.hpp"
+
+namespace omnigraph {
+
+namespace de {
+
+template<class Graph>
+class WeightedDistanceEstimator : public DistanceEstimator<Graph> {
+protected:
+ typedef DistanceEstimator<Graph> base;
+ typedef typename base::InPairedIndex InPairedIndex;
+ typedef typename base::OutPairedIndex OutPairedIndex;
+ typedef typename base::InHistogram InHistogram;
+ typedef typename base::OutHistogram OutHistogram;
+
+public:
+ WeightedDistanceEstimator(const Graph &graph,
+ const InPairedIndex &histogram,
+ const GraphDistanceFinder<Graph> &distance_finder,
+ std::function<double(int)> weight_f,
+ size_t linkage_distance, size_t max_distance) :
+ base(graph, histogram, distance_finder, linkage_distance, max_distance), weight_f_(weight_f) { }
+
+ virtual ~WeightedDistanceEstimator() { }
+
+protected:
+ typedef typename Graph::EdgeId EdgeId;
+
+ typedef vector<pair<int, double> > EstimHist;
+ typedef pair<EdgeId, EdgeId> EdgePair;
+ typedef vector<size_t> GraphLengths;
+
+ std::function<double(int)> weight_f_;
+
+ virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
+ const InHistogram &histogram,
+ const GraphLengths &raw_forward) const override {
+ using std::abs;
+ using namespace math;
+ TRACE("Estimating with weight function");
+ size_t first_len = this->graph().length(ep.first);
+ size_t second_len = this->graph().length(ep.second);
+
+ EstimHist result;
+ int maxD = rounded_d(histogram.max()), minD = rounded_d(histogram.min());
+ vector<int> forward;
+ for (auto len : raw_forward) {
+ int length = (int) len;
+ if (minD - (int) this->max_distance_ <= length && length <= maxD + (int) this->max_distance_) {
+ forward.push_back(length);
+ }
+ }
+ if (forward.size() == 0)
+ return result;
+
+ DEDistance max_dist = this->max_distance_;
+ size_t i = 0;
+ vector<double> weights(forward.size());
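+ // Assign each histogram point's weight to the closest candidate graph distance, scaled by weight_f_ of the offset; exact ties are split between the two neighbouring candidates.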
+ for (auto point : histogram) {
+ if (le(2 * point.d + DEDistance(second_len), DEDistance(first_len)))
+ continue;
+ while (i + 1 < forward.size() && DEDistance(forward[i + 1]) < point.d)
+ ++i;
+ DEDistance cur_dist(forward[i]);
+ if (i + 1 < forward.size() && ls(DEDistance(forward[i + 1]) - point.d, point.d - cur_dist)) {
+ ++i;
+ cur_dist = DEDistance(forward[i]);
+ if (le(abs(cur_dist - point.d), max_dist))
+ weights[i] += point.weight * weight_f_(forward[i] - rounded_d(point));
+ }
+ else if (i + 1 < forward.size() && eq(DEDistance(forward[i + 1]) - point.d, point.d - cur_dist)) {
+ if (le(abs(cur_dist - point.d), max_dist))
+ weights[i] += point.weight * 0.5 * weight_f_(forward[i] - rounded_d(point));
+
+ ++i;
+ cur_dist = DEDistance(forward[i]);
+
+ if (le(abs(cur_dist - point.d), max_dist))
+ weights[i] += point.weight * 0.5 * weight_f_(forward[i] - rounded_d(point));
+ } else if (le(abs(cur_dist - point.d), max_dist))
+ weights[i] += point.weight * weight_f_(forward[i] - rounded_d(point));
+ }
+
+ for (size_t i = 0; i < forward.size(); ++i)
+ if (gr(weights[i], 0.))
+ result.push_back(make_pair(forward[i], weights[i]));
+
+ return result;
+ }
+
+ const string Name() const override {
+ static const string my_name = "WEIGHTED";
+ return my_name;
+ }
+
+};
+
+}
+
+}
+#endif
diff --git a/src/common/paired_info/weights.hpp b/src/common/paired_info/weights.hpp
new file mode 100644
index 0000000..c0e8b43
--- /dev/null
+++ b/src/common/paired_info/weights.hpp
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "math/xmath.h"
+#include "utils/verify.hpp"
+#include "utils/logger/logger.hpp"
+
+#include <map>
+#include <vector>
+
+using omnigraph::Path;
+using omnigraph::MappingPath;
+using omnigraph::Range;
+using omnigraph::MappingRange;
+
+namespace debruijn_graph {
+inline double PairedReadCountWeight(const std::pair<EdgeId, EdgeId>&,
+ const MappingRange&, const MappingRange&) {
+ return 1.;
+}
+
+inline double KmerCountProductWeight(const std::pair<EdgeId, EdgeId>&,
+ const MappingRange& mr1, const MappingRange& mr2) {
+ return (double)(mr1.initial_range.size() * mr2.initial_range.size());
+}
+
+class WeightDEWrapper {
+private:
+
+ vector<double> new_hist;
+ int left_x;
+ int insert_size;
+
+ void ExtendLinear(const std::map<int, size_t> & hist) {
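+ // Turn the empirical insert size histogram into a dense curve by linear interpolation between observed points, normalized by the total weight.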
+ size_t sum_weight = 0;
+
+ for (auto iter = hist.begin(); iter != hist.end(); ++iter)
+ sum_weight += iter->second;
+ DEBUG(sum_weight);
+
+ VERIFY(hist.size() > 0);
+ auto iter = hist.begin();
+
+ left_x = iter->first;
+
+ int prev = iter->first;
+ size_t prev_val = iter->second;
+
+ new_hist.push_back((double)prev_val / (double)sum_weight);
+ ++iter;
+
+ for (; iter != hist.end(); ++iter) {
+ int x = iter->first;
+ size_t y = iter->second;
+ VERIFY(prev < x);
+ double slope = ((double)y - (double)prev_val) / (x - prev);
+
+ for (int i = prev + 1; i <= x; ++i) {
+ new_hist.push_back(((double)prev_val + slope * (i - prev)) / (double)sum_weight);
+ }
+ prev = x;
+ prev_val = y;
+ DEBUG("hist " << x << " " << y);
+ }
+ }
+
+public:
+ WeightDEWrapper(const map<int, size_t>& hist, double IS) {
+ DEBUG("WeightDEWrapper " << IS);
+ insert_size = (int) IS;
+ DEBUG("Extending linear");
+ ExtendLinear(hist);
+ }
+
+ ~WeightDEWrapper() {
+ }
+
+
+ double CountWeight(int x) const {
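+ // x is the offset between a candidate distance and an observed point; look up the interpolated
+ // insert size density at that offset (new_hist[0] corresponds to distance left_x), scaled by 1000.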
+ int xx = insert_size - left_x + x - 1;
+
+ if (!(xx >= 0 && xx < (int) new_hist.size())) return 0.;
+ VERIFY(math::le(new_hist[xx], 1.));
+ return 1000. * new_hist[xx];
+ }
+};
+
+inline double UnityFunction(int /*x*/) {
+ return 1.;
+}
+}
diff --git a/src/modules/pipeline/CMakeLists.txt b/src/common/pipeline/CMakeLists.txt
similarity index 100%
rename from src/modules/pipeline/CMakeLists.txt
rename to src/common/pipeline/CMakeLists.txt
diff --git a/src/common/pipeline/config_common.hpp b/src/common/pipeline/config_common.hpp
new file mode 100755
index 0000000..0f38490
--- /dev/null
+++ b/src/common/pipeline/config_common.hpp
@@ -0,0 +1,140 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/simple_tools.hpp"
+#include "utils/path_helper.hpp"
+#include "utils/verify.hpp"
+
+// todo: undo dirty fix
+
+#include <boost/property_tree/ptree.hpp>
+#include <boost/property_tree/info_parser.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/enable_if.hpp>
+#include <boost/type_traits/is_arithmetic.hpp>
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <map>
+
+namespace config_common {
+// for enable_if/disable_if
+namespace details {
+template<class T, class S>
+struct is_equal_type {
+ static const bool value = false;
+};
+
+template<class T>
+struct is_equal_type<T, T> {
+ static const bool value = true;
+};
+}
+
+template<class T>
+typename boost::enable_if_c<details::is_equal_type<T, std::string>::value ||
+ boost::is_arithmetic<T>::value>::type
+load(T &value,
+ boost::property_tree::ptree const &pt, std::string const &key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found())
+ value = pt.get<T>(key);
+}
+
+template<class T>
+typename boost::disable_if_c<details::is_equal_type<T,
+ std::string>::value ||
+ boost::is_arithmetic<T>::value>::type
+load(T &value,
+ boost::property_tree::ptree const &pt, std::string const &key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found())
+ load(value, pt.get_child(key), complete);
+}
+
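+// Loads a counted list: the number of items is stored under "<key>.count" and the items themselves under "<key>.item_<i>".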
+template<class T>
+void load_items(std::vector <T> &vec, boost::property_tree::ptree const &pt,
+ std::string const &key, bool complete) {
+ std::string vector_key = key + std::string(".count");
+ if (complete || pt.find(vector_key) != pt.not_found()) {
+ size_t count = pt.get<size_t>(vector_key);
+
+ for (size_t i = 0; i != count; ++i) {
+ T t;
+ load(t, pt.get_child(fmt::format("{:s}.item_{:d}", key, i)),
+ complete);
+ vec.push_back(t);
+ }
+ }
+}
+
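+// Loads a list given either as a single "<key>" value or as numbered "<key>#<i>" / "<key>.<i>" entries starting from 1.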
+template<class T>
+void load(std::vector <T> &vec, boost::property_tree::ptree const &pt, std::string const &key,
+ bool /*complete*/) {
+ boost::optional<T> value = pt.get_optional<T>(key);
+ if (value) {
+ vec.push_back(*value);
+ return;
+ }
+ for (size_t i = 1; ; i++) {
+ value = pt.get_optional<std::string>(key + "#" + ToString(i));
+ if (value) {
+ vec.push_back(*value);
+ continue;
+ }
+ value = pt.get_optional<std::string>(key + "." + ToString(i));
+ if (value) {
+ vec.push_back(*value);
+ continue;
+ }
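+ // Neither numbered form exists for this index - the list ends here.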
+ return;
+ }
+}
+
+template<class T>
+void load(T &value, boost::property_tree::ptree const &pt, std::string const &key) {
+ load(value, pt, key, true);
+}
+
+template<class T>
+void load(T &value, boost::property_tree::ptree const &pt, const char *key) {
+ load(value, pt, std::string(key), true);
+}
+
+template<class T>
+void load(T &value, boost::property_tree::ptree const &pt) {
+ load(value, pt, true);
+}
+
+template<class T>
+void load_param(const std::string &filename, const std::string &key,
+ boost::optional<T> &value) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(filename, pt);
+ value = pt.get_optional<T>(key);
+}
+
+template<class T>
+void write_param(const std::string &filename, const std::string &key,
+ const boost::optional<T> &value) {
+ if (value) {
+ std::ofstream params_stream(filename.c_str(), std::ios_base::app);
+ params_stream << key << "\t" << value << std::endl;
+ }
+}
+
+template<class T>
+void write_param(const std::string &filename, const std::string &key,
+ const T &value) {
+ std::ofstream params_stream(filename.c_str(), std::ios_base::app);
+ params_stream << key << "\t" << value << std::endl;
+}
+
+}
diff --git a/src/common/pipeline/config_singl.hpp b/src/common/pipeline/config_singl.hpp
new file mode 100644
index 0000000..93f1767
--- /dev/null
+++ b/src/common/pipeline/config_singl.hpp
@@ -0,0 +1,57 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __CONFIG_SINGL_HPP__
+#define __CONFIG_SINGL_HPP__
+
+
+#include "utils/verify.hpp"
+
+#include <string>
+
+namespace config_common {
+
+// config singleton-wrap
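+// Usage sketch: config_struct.hpp wraps debruijn_config in this template as the cfg typedef;
+// cfg::create_instance() is called once at startup, after which settings are read via cfg::get()
+// (or modified via cfg::get_writable(), as in load_lib_data in config_struct.cpp).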
+template<class Config>
+struct config {
+ static std::string dirnameOf(const std::string &fname) {
+ size_t pos = fname.find_last_of("\\/");
+ return (std::string::npos == pos) ? "" : fname.substr(0, pos);
+ }
+
+ template<class Source>
+ static void create_instance(Source const &source) {
+ load(inner_cfg(), source);
+ is_initialized() = true;
+ }
+
+ static Config const &get() {
+ VERIFY_MSG(is_initialized(), "Config not initialized");
+ return inner_cfg();
+ }
+
+ static Config &get_writable() {
+ VERIFY_MSG(is_initialized(), "Config not initialized");
+ return inner_cfg();
+ }
+
+private:
+ static Config &inner_cfg() {
+ static Config config;
+ return config;
+ }
+
+ static bool &is_initialized() {
+ static bool is_initialized = false;
+ return is_initialized;
+ }
+};
+
+}
+
+
+#endif // __CONFIG_SINGL_HPP__
diff --git a/src/common/pipeline/config_struct.cpp b/src/common/pipeline/config_struct.cpp
new file mode 100644
index 0000000..ad5795c
--- /dev/null
+++ b/src/common/pipeline/config_struct.cpp
@@ -0,0 +1,858 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pipeline/config_struct.hpp"
+
+#include "pipeline/config_common.hpp"
+#include "utils/openmp_wrapper.h"
+
+#include "utils/logger/logger.hpp"
+#include "utils/verify.hpp"
+
+#include "io/reads/file_reader.hpp"
+
+#include <string>
+#include <vector>
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+
+using namespace llvm;
+
+namespace io {
+template<>
+void SequencingLibrary<debruijn_graph::config::DataSetData>::yamlize(llvm::yaml::IO &io) {
+ // First, load the "common stuff"
+ SequencingLibraryBase::yamlize(io);
+
+ // Now load the remaining stuff
+ io.mapOptional("read length" , data_.read_length);
+ io.mapOptional("average read length" , data_.avg_read_length);
+ io.mapOptional("insert size mean" , data_.mean_insert_size);
+ io.mapOptional("insert size deviation" , data_.insert_size_deviation);
+ io.mapOptional("insert size left quantile" , data_.insert_size_left_quantile);
+ io.mapOptional("insert size right quantile" , data_.insert_size_right_quantile);
+ io.mapOptional("insert size median" , data_.median_insert_size);
+ io.mapOptional("insert size mad" , data_.insert_size_mad);
+ io.mapOptional("insert size distribution" , data_.insert_size_distribution);
+ io.mapOptional("average coverage" , data_.average_coverage);
+ io.mapOptional("pi threshold" , data_.pi_threshold);
+ io.mapOptional("binary converted" , data_.binary_reads_info.binary_coverted);
+ io.mapOptional("single reads mapped" , data_.single_reads_mapped);
+ io.mapOptional("library index" , data_.lib_index);
+ io.mapOptional("number of reads" , data_.read_count);
+ io.mapOptional("total nucleotides" , data_.total_nucls);
+}
+
+template<>
+void SequencingLibrary<debruijn_graph::config::DataSetData>::validate(llvm::yaml::IO &io, llvm::StringRef &res) {
+ // Simply ask base class to validate for us
+ SequencingLibraryBase::validate(io, res);
+}
+}
+
+#include "pipeline/library.inl"
+
+template class io::DataSet<debruijn_graph::config::DataSetData>;
+
+namespace debruijn_graph {
+namespace config {
+
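+// Sanity check for the mode name tables below: every enum value up to 'total' must be present, in enum order.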
+template<typename mode_t>
+vector<string> CheckedNames(const vector<pair<string, mode_t>>& mapping, mode_t total) {
+ VERIFY_MSG(size_t(total) == mapping.size(), "Names for some modes missing");
+ vector<string> answer;
+ for (size_t i = 0; i < mapping.size(); ++i) {
+ VERIFY_MSG(size_t(mapping[i].second) == i, "Id/name mapping error");
+ answer.push_back(mapping[i].first);
+ }
+ return answer;
+}
+
+vector<string> InfoPrinterPosNames() {
+ return CheckedNames<info_printer_pos>({
+ {"default", info_printer_pos::default_pos},
+ {"before_first_gap_closer", info_printer_pos::before_first_gap_closer},
+ {"before_simplification", info_printer_pos::before_simplification},
+ {"before_post_simplification", info_printer_pos::before_post_simplification},
+ {"final_simplified", info_printer_pos::final_simplified},
+ {"final_gap_closed", info_printer_pos::final_gap_closed},
+ {"before_repeat_resolution", info_printer_pos::before_repeat_resolution}}, info_printer_pos::total);
+}
+
+vector<string> PipelineTypeNames() {
+ return CheckedNames<pipeline_type>({
+ {"base", pipeline_type::base},
+ {"isolate", pipeline_type::isolate},
+ {"mda", pipeline_type::mda},
+ {"meta", pipeline_type::meta},
+ {"moleculo", pipeline_type::moleculo},
+ {"diploid", pipeline_type::diploid},
+ {"rna", pipeline_type::rna},
+ {"plasmid", pipeline_type::plasmid},
+ {"large_genome", pipeline_type::large_genome}
+ }, pipeline_type::total);
+}
+
+vector<string> ConstructionModeNames() {
+ return CheckedNames<construction_mode>({
+ {"old", construction_mode::old},
+ {"extension", construction_mode::extention}}, construction_mode::total);
+}
+
+vector<string> EstimationModeNames() {
+ return CheckedNames<estimation_mode>({
+ {"simple", estimation_mode::simple},
+ {"weighted", estimation_mode::weighted},
+ {"smoothing", estimation_mode::smoothing}}, estimation_mode::total);
+}
+
+
+vector<string> ResolveModeNames() {
+ return CheckedNames<resolving_mode>({
+ {"none", resolving_mode::none},
+ {"path_extend", resolving_mode::path_extend}}, resolving_mode::total);
+}
+
+vector<string> SingleReadResolveModeNames() {
+ return CheckedNames<single_read_resolving_mode>({
+ {"none", single_read_resolving_mode::none},
+ {"only_single_libs", single_read_resolving_mode::only_single_libs},
+ {"all", single_read_resolving_mode::all}}, single_read_resolving_mode::total);
+}
+
+vector<string> BrokenScaffoldsModeNames() {
+ return CheckedNames<output_broken_scaffolds>({
+ {"none", output_broken_scaffolds::none},
+ {"break_gaps", output_broken_scaffolds::break_gaps},
+ {"break_all", output_broken_scaffolds::break_all}}, output_broken_scaffolds::total);
+}
+
+
+void load_lib_data(const std::string& prefix) {
+ // First, load the data into separate libs
+ cfg::get_writable().ds.reads.load(prefix + ".lib_data");
+
+ // Now, infer the common parameters
+ size_t max_rl = 0;
+ double avg_cov = 0.0;
+ double avg_rl = 0.0;
+ for (const auto& lib : cfg::get().ds.reads.libraries()) {
+ auto const& data = lib.data();
+ if (lib.is_graph_contructable())
+ max_rl = std::max(max_rl, data.read_length);
+ if (data.average_coverage > 0)
+ avg_cov = data.average_coverage;
+ if (data.avg_read_length > 0)
+ avg_rl = data.avg_read_length;
+ }
+
+ cfg::get_writable().ds.set_RL(max_rl);
+ cfg::get_writable().ds.set_aRL(avg_rl);
+ cfg::get_writable().ds.set_avg_coverage(avg_cov);
+}
+
+void write_lib_data(const std::string& prefix) {
+ cfg::get_writable().ds.reads.save(prefix + ".lib_data");
+}
+
+void load(debruijn_config::simplification::tip_clipper &tc,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+ load(tc.condition, pt, "condition");
+}
+
+void load(debruijn_config::simplification::dead_end_clipper& dead_end,
+ boost::property_tree::ptree const &pt,
+ bool /* complete */) {
+ using config_common::load;
+ load(dead_end.condition, pt, "condition");
+ load(dead_end.enabled, pt, "enabled");
+}
+
+void load(resolving_mode &rm, boost::property_tree::ptree const &pt,
+ std::string const &key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ rm = ModeByName<resolving_mode>(pt.get<std::string>(key), ResolveModeNames());
+ }
+}
+
+void load(single_read_resolving_mode &rm, boost::property_tree::ptree const &pt,
+ std::string const &key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ std::string ep = pt.get<std::string>(key);
+ rm = ModeByName<single_read_resolving_mode>(ep, SingleReadResolveModeNames());
+ }
+}
+
+void load(output_broken_scaffolds &obs, boost::property_tree::ptree const &pt,
+ std::string const &key, bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ obs = ModeByName<output_broken_scaffolds>(pt.get<std::string>(key), BrokenScaffoldsModeNames());
+ }
+}
+
+void load(construction_mode& con_mode,
+ boost::property_tree::ptree const& pt, std::string const& key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ con_mode = ModeByName<construction_mode>(pt.get<std::string>(key), ConstructionModeNames());
+ }
+}
+
+void load(debruijn_config::construction::early_tip_clipper& etc,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(etc.enable, pt, "enable");
+ etc.length_bound = pt.get_optional<size_t>("length_bound");
+}
+
+void load(debruijn_config::construction& con,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(con.con_mode, pt, "mode", complete);
+ load(con.keep_perfect_loops, pt, "keep_perfect_loops", complete);
+ load(con.read_buffer_size, pt, "read_buffer_size", complete);
+ con.read_buffer_size *= 1024 * 1024;
+ load(con.early_tc, pt, "early_tip_clipper", complete);
+}
+
+void load(debruijn_config::sensitive_mapper& sensitive_map,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(sensitive_map.k, pt, "k", complete);
+}
+
+void load(estimation_mode &est_mode,
+ boost::property_tree::ptree const &pt, std::string const &key,
+ bool complete) {
+ if (complete || pt.find(key) != pt.not_found()) {
+ est_mode = ModeByName<estimation_mode>(pt.get<std::string>(key), EstimationModeNames());
+ }
+}
+
+void load(debruijn_config::simplification::bulge_remover& br,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(br.enabled , pt, "enabled" , complete);
+ load(br.main_iteration_only , pt, "main_iteration_only" , complete);
+ load(br.max_bulge_length_coefficient , pt, "max_bulge_length_coefficient", complete);
+ load(br.max_additive_length_coefficient , pt,
+ "max_additive_length_coefficient", complete);
+ load(br.max_coverage, pt, "max_coverage", complete);
+ load(br.max_relative_coverage, pt, "max_relative_coverage", complete);
+ load(br.max_delta, pt, "max_delta", complete);
+ load(br.max_relative_delta, pt, "max_relative_delta", complete);
+ load(br.max_number_edges, pt, "max_number_edges", complete);
+ load(br.parallel, pt, "parallel", complete);
+ load(br.buff_size, pt, "buff_size", complete);
+ load(br.buff_cov_diff, pt, "buff_cov_diff", complete);
+ load(br.buff_cov_rel_diff, pt, "buff_cov_rel_diff", complete);
+}
+
+void load(debruijn_config::simplification::topology_tip_clipper& ttc,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(ttc.length_coeff, pt, "length_coeff");
+ load(ttc.plausibility_length, pt, "plausibility_length");
+ load(ttc.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::complex_tip_clipper &ctc,
+ boost::property_tree::ptree const &pt, bool complete) {
+ using config_common::load;
+ load(ctc.enabled, pt, "enabled", complete);
+ load(ctc.max_relative_coverage, pt, "max_relative_coverage", complete);
+ load(ctc.max_edge_len, pt, "max_edge_len", complete);
+ load(ctc.condition, pt, "condition", complete);
+}
+
+void load(debruijn_config::simplification::relative_coverage_edge_disconnector& relative_ed,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(relative_ed.enabled, pt, "enabled", complete);
+ load(relative_ed.diff_mult, pt, "diff_mult", complete);
+ load(relative_ed.edge_sum, pt, "edge_sum", complete);
+}
+
+void load(debruijn_config::simplification::relative_coverage_comp_remover& rcc,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(rcc.enabled, pt, "enabled", complete);
+ load(rcc.coverage_gap, pt, "coverage_gap", complete);
+ load(rcc.length_coeff, pt, "max_length_coeff", complete);
+ load(rcc.tip_allowing_length_coeff, pt, "max_length_with_tips_coeff", complete);
+ load(rcc.vertex_count_limit, pt, "max_vertex_cnt", complete);
+ load(rcc.max_ec_length_coefficient, pt, "max_ec_length_coefficient", complete);
+ load(rcc.max_coverage_coeff, pt, "max_coverage_coeff", complete);
+}
+
+void load(debruijn_config::simplification::isolated_edges_remover& ier,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(ier.enabled, pt, "enabled", complete);
+ load(ier.max_length, pt, "max_length", complete);
+ load(ier.max_coverage, pt, "max_coverage", complete);
+ load(ier.max_length_any_cov, pt, "max_length_any_cov", complete);
+}
+
+void load(debruijn_config::simplification::init_cleaning& init_clean,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(init_clean.self_conj_condition, pt, "self_conj_condition", complete);
+ load(init_clean.early_it_only, pt, "early_it_only", complete);
+ load(init_clean.activation_cov, pt, "activation_cov", complete);
+ load(init_clean.ier, pt, "ier", complete);
+ load(init_clean.tip_condition, pt, "tip_condition", complete);
+ load(init_clean.ec_condition, pt, "ec_condition", complete);
+ load(init_clean.disconnect_flank_cov, pt, "disconnect_flank_cov", complete);
+}
+
+void load(debruijn_config::simplification::complex_bulge_remover& cbr,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(cbr.enabled, pt, "enabled");
+ load(cbr.max_relative_length, pt, "max_relative_length", complete);
+ load(cbr.max_length_difference, pt, "max_length_difference", complete);
+}
+
+void load(debruijn_config::simplification::erroneous_connections_remover& ec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(ec.condition, pt, "condition");
+}
+
+void load(debruijn_config::simplification::relative_coverage_ec_remover& rcec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(rcec.enabled, pt, "enabled");
+ load(rcec.max_ec_length, pt, "rcec_lb");
+ load(rcec.rcec_ratio, pt, "rcec_cb");
+}
+
+void load(debruijn_config::simplification::topology_based_ec_remover& tec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(tec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(tec.plausibility_length, pt, "plausibility_length");
+ load(tec.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::interstrand_ec_remover &isec,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+ load(isec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(isec.uniqueness_length, pt, "uniqueness_length");
+ load(isec.span_distance, pt, "span_distance");
+}
+
+void load(debruijn_config::simplification::tr_based_ec_remover &trec,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+ load(trec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(trec.unreliable_coverage, pt, "unreliable_coverage");
+ load(trec.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::max_flow_ec_remover& mfec,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(mfec.enabled, pt, "enabled");
+ load(mfec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
+ load(mfec.plausibility_length, pt, "plausibility_length");
+ load(mfec.uniqueness_length, pt, "uniqueness_length");
+}
+
+void load(debruijn_config::simplification::hidden_ec_remover& her,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(her.enabled, pt, "enabled");
+ load(her.uniqueness_length, pt, "uniqueness_length");
+ load(her.unreliability_threshold, pt, "unreliability_threshold");
+ load(her.relative_threshold, pt, "relative_threshold");
+}
+
+void load(debruijn_config::distance_estimator& de,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(de.linkage_distance_coeff, pt, "linkage_distance_coeff", complete);
+ load(de.max_distance_coeff, pt, "max_distance_coeff", complete);
+ load(de.max_distance_coeff_scaff, pt, "max_distance_coeff_scaff", complete);
+ load(de.clustered_filter_threshold, pt, "clustered_filter_threshold", complete);
+ load(de.raw_filter_threshold, pt, "raw_filter_threshold", complete);
+ load(de.rounding_coeff, pt, "rounding_coeff", complete);
+ load(de.rounding_thr, pt, "rounding_threshold", complete);
+}
+
+void load(debruijn_config::smoothing_distance_estimator& ade,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(ade.threshold, pt, "threshold");
+ load(ade.range_coeff, pt, "range_coeff");
+ load(ade.delta_coeff, pt, "delta_coeff");
+ load(ade.percentage, pt, "percentage");
+ load(ade.cutoff, pt, "cutoff");
+ load(ade.min_peak_points, pt, "min_peak_points");
+ load(ade.inv_density, pt, "inv_density");
+ load(ade.derivative_threshold, pt, "derivative_threshold");
+}
+
+//FIXME make amb_de optional field
+void load(debruijn_config::ambiguous_distance_estimator &amde,
+ boost::property_tree::ptree const &pt, bool complete) {
+ using config_common::load;
+
+ load(amde.enabled, pt, "enabled", complete);
+ load(amde.haplom_threshold, pt, "haplom_threshold", complete);
+ load(amde.relative_length_threshold, pt, "relative_length_threshold", complete);
+ load(amde.relative_seq_threshold, pt, "relative_seq_threshold", complete);
+}
+
+void load(debruijn_config::scaffold_correction& sc_corr,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(sc_corr.scaffolds_file, pt, "scaffolds_file");
+ load(sc_corr.output_unfilled, pt, "output_unfilled");
+ load(sc_corr.max_insert, pt, "max_insert");
+ load(sc_corr.max_cut_length, pt, "max_cut_length");
+}
+
+void load(debruijn_config::truseq_analysis& tsa,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(tsa.scaffolds_file, pt, "scaffolds_file");
+ load(tsa.genome_file, pt, "genome_file");
+}
+
+void load(debruijn_config::bwa_aligner& bwa,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(bwa.bwa_enable, pt, "bwa_enable");
+ load(bwa.debug, pt, "debug");
+ load(bwa.path_to_bwa, pt, "path_to_bwa");
+ load(bwa.min_contig_len, pt, "min_contig_len");
+}
+
+void load(debruijn_config::pacbio_processor& pb,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(pb.pacbio_k, pt, "pacbio_k");
+ load(pb.additional_debug_info, pt, "additional_debug_info");
+ load(pb.compression_cutoff, pt, "compression_cutoff");
+ load(pb.domination_cutoff, pt, "domination_cutoff");
+ load(pb.path_limit_stretching, pt, "path_limit_stretching");
+ load(pb.path_limit_pressing, pt, "path_limit_pressing");
+ load(pb.max_path_in_dijkstra, pt, "max_path_in_dijkstra");
+ load(pb.max_vertex_in_dijkstra, pt, "max_vertex_in_dijkstra");
+ load(pb.ignore_middle_alignment, pt, "ignore_middle_alignment");
+ load(pb.long_seq_limit, pt, "long_seq_limit");
+ load(pb.pacbio_min_gap_quantity, pt, "pacbio_min_gap_quantity");
+ load(pb.contigs_min_gap_quantity, pt, "contigs_min_gap_quantity");
+ load(pb.max_contigs_gap_length, pt, "max_contigs_gap_length");
+
+}
+
+
+void load(debruijn_config::position_handler& pos,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(pos.max_mapping_gap, pt, "max_mapping_gap");
+ load(pos.max_gap_diff, pt, "max_gap_diff");
+ load(pos.contigs_for_threading, pt, "contigs_for_threading");
+ load(pos.contigs_to_analyze, pt, "contigs_to_analyze");
+ load(pos.late_threading, pt, "late_threading");
+ load(pos.careful_labeling, pt, "careful_labeling");
+}
+void load(debruijn_config::plasmid& pd,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(pd.long_edge_length, pt, "long_edge_length");
+ load(pd.edge_length_for_median, pt, "edge_length_for_median");
+
+ load(pd.relative_coverage, pt, "relative_coverage");
+ load(pd.small_component_size, pt, "small_component_size");
+ load(pd.small_component_relative_coverage, pt, "small_component_relative_coverage");
+ load(pd.min_component_length, pt, "min_component_length");
+ load(pd.min_isolated_length, pt, "min_isolated_length");
+
+}
+
+
+void load(debruijn_config::gap_closer& gc,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(gc.minimal_intersection, pt, "minimal_intersection");
+ load(gc.before_simplify, pt, "before_simplify");
+ load(gc.in_simplify, pt, "in_simplify");
+ load(gc.after_simplify, pt, "after_simplify");
+ load(gc.weight_threshold, pt, "weight_threshold");
+}
+
+void load(debruijn_config::contig_output& co,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(co.contigs_name, pt, "contigs_name", complete);
+ load(co.scaffolds_name, pt, "scaffolds_name", complete);
+ load(co.obs_mode, pt, "output_broken_scaffolds", complete);
+}
+
+void load(debruijn_config::graph_read_corr_cfg& graph_read_corr,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(graph_read_corr.enable, pt, "enable");
+ load(graph_read_corr.output_dir, pt, "output_dir");
+ load(graph_read_corr.binary, pt, "binary");
+}
+
+void load(debruijn_config::kmer_coverage_model& kcm,
+ boost::property_tree::ptree const& pt, bool /*complete*/) {
+ using config_common::load;
+ load(kcm.probability_threshold, pt, "probability_threshold");
+ load(kcm.strong_probability_threshold, pt, "strong_probability_threshold");
+ load(kcm.coverage_threshold, pt, "coverage_threshold");
+ load(kcm.use_coverage_threshold, pt, "use_coverage_threshold");
+}
+
+void load(dataset &ds,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+
+ load(ds.reads_filename, pt, "reads");
+
+ ds.reference_genome_filename = "";
+ boost::optional<std::string> refgen =
+ pt.get_optional<std::string>("reference_genome");
+ if (refgen && *refgen != "N/A") {
+ ds.reference_genome_filename = *refgen;
+ }
+}
+
+void load_reads(dataset &ds,
+ std::string input_dir) {
+ if (ds.reads_filename[0] != '/')
+ ds.reads_filename = input_dir + ds.reads_filename;
+ path::CheckFileExistenceFATAL(ds.reads_filename);
+ ds.reads.load(ds.reads_filename);
+}
+
+void load_reference_genome(dataset &ds,
+ std::string input_dir) {
+ if (ds.reference_genome_filename == "") {
+ ds.reference_genome = "";
+ return;
+ }
+ if (ds.reference_genome_filename[0] != '/')
+ ds.reference_genome_filename = input_dir + ds.reference_genome_filename;
+ path::CheckFileExistenceFATAL(ds.reference_genome_filename);
+ io::FileReadStream genome_stream(ds.reference_genome_filename);
+ io::SingleRead genome;
+ genome_stream >> genome;
+ ds.reference_genome = genome.GetSequenceString();
+}
+
+void load(debruijn_config::simplification& simp,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+
+ load(simp.cycle_iter_count, pt, "cycle_iter_count", complete);
+
+ load(simp.post_simplif_enabled, pt, "post_simplif_enabled", complete);
+ load(simp.topology_simplif_enabled, pt, "topology_simplif_enabled", complete);
+ load(simp.tc, pt, "tc", complete); // tip clipper:
+
+ load(simp.dead_end, pt, "dead_end", complete); // dead end:
+ load(simp.ttc, pt, "ttc", complete); // topology tip clipper:
+ load(simp.complex_tc, pt, "complex_tc", complete); // complex tip clipper:
+ load(simp.br, pt, "br", complete); // bulge remover:
+ load(simp.ec, pt, "ec", complete); // erroneous connections remover:
+ load(simp.rcec, pt, "rcec", complete); // relative coverage erroneous connections remover
+ load(simp.rcc, pt, "rcc", complete); // relative coverage component remover:
+ load(simp.relative_ed, pt, "relative_ed", complete); // relative edge disconnector:
+ load(simp.tec, pt, "tec", complete); // topology aware erroneous connections remover:
+ load(simp.trec, pt, "trec", complete); // topology and reliability based erroneous connections remover:
+ load(simp.isec, pt, "isec", complete); // interstrand erroneous connections remover (thorn remover):
+ load(simp.mfec, pt, "mfec", complete); // max flow erroneous connections remover:
+ load(simp.ier, pt, "ier", complete); // isolated edges remover
+ load(simp.cbr, pt, "cbr", complete); // complex bulge remover
+ load(simp.her, pt, "her", complete); // hidden ec remover
+ load(simp.init_clean, pt, "init_clean", complete); // presimplification
+ load(simp.final_tc, pt, "final_tc", complete);
+ load(simp.final_br, pt, "final_br", complete);
+ simp.second_final_br = simp.final_br;
+ load(simp.second_final_br, pt, "second_final_br", false);
+}
+
+void load(debruijn_config::info_printer& printer,
+ boost::property_tree::ptree const& pt, bool complete) {
+ using config_common::load;
+ load(printer.basic_stats, pt, "basic_stats", complete);
+ load(printer.lib_info, pt, "lib_info", complete);
+ load(printer.extended_stats, pt, "extended_stats", complete);
+ load(printer.write_components, pt, "write_components", complete);
+ load(printer.components_for_kmer, pt, "components_for_kmer", complete);
+ load(printer.components_for_genome_pos, pt, "components_for_genome_pos",
+ complete);
+ load(printer.write_components_along_genome, pt,
+ "write_components_along_genome", complete);
+ load(printer.write_components_along_contigs, pt,
+ "write_components_along_contigs", complete);
+ load(printer.save_full_graph, pt, "save_full_graph", complete);
+ load(printer.save_all, pt, "save_all", complete);
+ load(printer.save_graph_pack, pt, "save_graph_pack", complete);
+ load(printer.write_full_graph, pt, "write_full_graph", complete);
+ load(printer.write_full_nc_graph, pt, "write_full_nc_graph", complete);
+ load(printer.write_error_loc, pt, "write_error_loc", complete);
+}
+
+//void clear(debruijn_config::info_printer& printer) {
+// printer.print_stats = false;
+// printer.write_components = false;
+// printer.components_for_kmer = "";
+// printer.components_for_genome_pos = "";
+// printer.write_components_along_genome = false;
+// printer.save_full_graph = false;
+// printer.write_full_graph = false;
+// printer.write_full_nc_graph = false;
+// printer.write_error_loc = false;
+//}
+
+void load(debruijn_config::info_printers_t &printers,
+ boost::property_tree::ptree const &pt, bool /*complete*/) {
+ using config_common::load;
+
+ debruijn_config::info_printer def;
+ load(def, pt, ModeName(info_printer_pos::default_pos, InfoPrinterPosNames()), true);
+
+ for (size_t pos = size_t(info_printer_pos::default_pos) + 1; pos != size_t(info_printer_pos::total); ++pos) {
+ debruijn_config::info_printer printer(def);
+ load(printer, pt, ModeName(pos, InfoPrinterPosNames()), false);
+
+ printers[info_printer_pos(pos)] = printer;
+ }
+}
+
+void load_launch_info(debruijn_config &cfg, boost::property_tree::ptree const &pt) {
+ using config_common::load;
+ load(cfg.K, pt, "K");
+ // input options:
+ load(cfg.dataset_file, pt, "dataset");
+ // input dir is based on dataset file location (all paths in datasets are relative to its location)
+ cfg.input_dir = path::parent_path(cfg.dataset_file);
+ if (cfg.input_dir[cfg.input_dir.length() - 1] != '/')
+ cfg.input_dir += '/';
+
+ load(cfg.output_base, pt, "output_base");
+ if (cfg.output_base[cfg.output_base.length() - 1] != '/')
+ cfg.output_base += '/';
+
+ load(cfg.log_filename, pt, "log_filename");
+
+ load(cfg.developer_mode, pt, "developer_mode");
+ if (cfg.developer_mode) {
+ load(cfg.output_pictures, pt, "output_pictures");
+ load(cfg.output_nonfinal_contigs, pt, "output_nonfinal_contigs");
+ load(cfg.compute_paths_number, pt, "compute_paths_number");
+ } else {
+ cfg.output_pictures = false;
+ cfg.output_nonfinal_contigs = false;
+ cfg.compute_paths_number = false;
+ }
+
+ load(cfg.load_from, pt, "load_from");
+ if (cfg.load_from[0] != '/') { // relative path
+ cfg.load_from = cfg.output_dir + cfg.load_from;
+ }
+
+ load(cfg.tmp_dir, pt, "tmp_dir");
+ load(cfg.main_iteration, pt, "main_iteration");
+
+ load(cfg.entry_point, pt, "entry_point");
+
+ load(cfg.use_additional_contigs, pt, "use_additional_contigs");
+ load(cfg.additional_contigs, pt, "additional_contigs");
+ INFO("Additional contigs is " << cfg.additional_contigs);
+
+ load(cfg.rr_enable, pt, "rr_enable");
+
+ load(cfg.buffer_size, pt, "buffer_size");
+ cfg.buffer_size <<= 20; //turn MB to bytes
+
+ load(cfg.temp_bin_reads_dir, pt, "temp_bin_reads_dir");
+ if (cfg.temp_bin_reads_dir[cfg.temp_bin_reads_dir.length() - 1] != '/')
+ cfg.temp_bin_reads_dir += '/';
+
+ load(cfg.max_threads, pt, "max_threads");
+ // Fix number of threads according to OMP capabilities.
+ cfg.max_threads = std::min(cfg.max_threads, (size_t) omp_get_max_threads());
+ // Inform OpenMP runtime about this :)
+ omp_set_num_threads((int) cfg.max_threads);
+
+ load(cfg.max_memory, pt, "max_memory");
+
+ path::CheckFileExistenceFATAL(cfg.dataset_file);
+ boost::property_tree::ptree ds_pt;
+ boost::property_tree::read_info(cfg.dataset_file, ds_pt);
+ load(cfg.ds, ds_pt, true);
+ load_reads(cfg.ds, cfg.input_dir);
+ load_reference_genome(cfg.ds, cfg.input_dir);
+}
+
+// main debruijn config load function
+void load_cfg(debruijn_config &cfg, boost::property_tree::ptree const &pt,
+ bool complete) {
+ using config_common::load;
+
+ string mode_str = pt.get("mode", "");
+ if (!mode_str.empty()) {
+ cfg.mode = ModeByName<pipeline_type>(mode_str, PipelineTypeNames());
+ }
+
+ //FIXME
+ load(cfg.tsa, pt, "tsa", complete);
+
+ load(cfg.co, pt, "contig_output", complete);
+
+ load(cfg.use_unipaths, pt, "use_unipaths", complete);
+
+ load(cfg.pb, pt, "pacbio_processor", complete);
+
+ load(cfg.two_step_rr, pt, "two_step_rr", complete);
+ load(cfg.use_intermediate_contigs, pt, "use_intermediate_contigs", complete);
+ load(cfg.single_reads_rr, pt, "single_reads_rr", complete);
+
+ load(cfg.preserve_raw_paired_index, pt, "preserve_raw_paired_index", complete);
+
+ load(cfg.correct_mismatches, pt, "correct_mismatches", complete);
+ load(cfg.paired_info_statistics, pt, "paired_info_statistics", complete);
+ load(cfg.paired_info_scaffolder, pt, "paired_info_scaffolder", complete);
+ load(cfg.gap_closer_enable, pt, "gap_closer_enable", complete);
+
+ load(cfg.max_repeat_length, pt, "max_repeat_length", complete);
+
+ load(cfg.est_mode, pt, "estimation_mode", complete);
+ load(cfg.de, pt, "de", complete);
+ load(cfg.ade, pt, "ade", complete); // advanced distance estimator:
+ load(cfg.amb_de, pt, "amb_de", complete);
+
+ load(cfg.con, pt, "construction", complete);
+ load(cfg.gc, pt, "gap_closer", complete);
+ load(cfg.simp, pt, "simp", complete);
+ load(cfg.flanking_range, pt, "flanking_range", complete);
+ load(cfg.graph_read_corr, pt, "graph_read_corr", complete);
+ load(cfg.kcm, pt, "kmer_coverage_model", complete);
+ load(cfg.pos, pt, "pos", complete); // position handler:
+
+ load(cfg.rm, pt, "resolving_mode", complete);
+ load(cfg.pe_params, pt, "pe", complete);
+
+ load(cfg.use_scaffolder, pt, "use_scaffolder", complete);
+ load(cfg.avoid_rc_connections, pt, "avoid_rc_connections", complete);
+
+ load(cfg.sensitive_map, pt, "sensitive_mapper", complete);
+
+ load(cfg.info_printers, pt, "info_printers", complete);
+
+ load(cfg.bwa, pt, "bwa_aligner", complete);
+
+ load(cfg.series_analysis, pt, "series_analysis", complete);
+
+ if (pt.count("plasmid")) {
+ VERIFY_MSG(!cfg.pd, "Option plasmid can be loaded only once");
+ cfg.pd.reset(debruijn_config::plasmid());
+ load(*cfg.pd, pt, "plasmid");
+ }
+
+ if (pt.count("sc_cor")) {
+ VERIFY_MSG(!cfg.sc_cor, "Option sc_cor can be loaded only once");
+ cfg.sc_cor.reset(debruijn_config::scaffold_correction());
+ load(*cfg.sc_cor, pt, "sc_cor");
+ }
+
+ if (pt.count("preliminary_simp")) {
+ VERIFY_MSG(!cfg.preliminary_simp, "Option preliminary can be loaded only once");
+ cfg.preliminary_simp.reset(cfg.simp);
+ load(*cfg.preliminary_simp, pt, "preliminary_simp", false);
+ }
+ if (pt.count("prelim_pe")) {
+ VERIFY_MSG(!cfg.prelim_pe_params, "Option prelim_pe can be loaded only once");
+ cfg.prelim_pe_params.reset(cfg.pe_params);
+ load(*cfg.prelim_pe_params, pt, "prelim_pe", false);
+ }
+}
+
+void load(debruijn_config &cfg, const std::string &cfg_fns) {
+ load(cfg, std::vector<std::string>({ cfg_fns }));
+}
+
+void load(debruijn_config &cfg, const std::vector<std::string> &cfg_fns) {
+ VERIFY_MSG(cfg_fns.size() > 0, "Should provide at least one config file");
+ boost::property_tree::ptree base_pt;
+ boost::property_tree::read_info(cfg_fns[0], base_pt);
+
+ load_launch_info(cfg, base_pt);
+ load_cfg(cfg, base_pt, true);
+
+ for (size_t i = 1 ; i < cfg_fns.size(); ++i) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(cfg_fns[i], pt);
+
+ //FIXME add logging of loading configs
+ load_cfg(cfg, pt, false);
+ }
+
+ //some post-loading processing
+ using config::pipeline_type;
+ cfg.uneven_depth = set<pipeline_type>{pipeline_type::mda, pipeline_type::rna, pipeline_type::meta}.count(cfg.mode);
+ if (!cfg.developer_mode) {
+ cfg.pe_params.debug_output = false;
+ cfg.pe_params.viz.DisableAll();
+ cfg.pe_params.output.DisableAll();
+ }
+
+ if (!cfg.use_scaffolder) {
+ cfg.pe_params.param_set.scaffolder_options.enabled = false;
+ }
+ cfg.need_mapping = cfg.developer_mode || cfg.correct_mismatches
+ || cfg.gap_closer_enable || cfg.rr_enable;
+
+ cfg.output_dir = cfg.output_base + "/K" + std::to_string(cfg.K) + "/";
+
+ cfg.output_saves = cfg.output_dir + "saves/";
+
+ if (cfg.tmp_dir[0] != '/') { // relative path
+ cfg.tmp_dir = cfg.output_dir + cfg.tmp_dir;
+ }
+
+ cfg.temp_bin_reads_path =
+ cfg.project_name.empty() ?
+ (cfg.output_base + "/" + cfg.temp_bin_reads_dir) :
+ (cfg.output_base + cfg.project_name + "/"
+ + cfg.temp_bin_reads_dir);
+ //cfg.temp_bin_reads_info = cfg.temp_bin_reads_path + "INFO";
+
+ for (size_t i = 0; i < cfg.ds.reads.lib_count(); ++i) {
+ auto& lib = cfg.ds.reads[i];
+ lib.data().lib_index = i;
+ lib.data().binary_reads_info.chunk_num = cfg.max_threads;
+ lib.data().binary_reads_info.bin_reads_info_file = cfg.temp_bin_reads_path + "INFO_" + std::to_string(i);
+ lib.data().binary_reads_info.buffer_size = cfg.buffer_size;
+ lib.data().binary_reads_info.paired_read_prefix = cfg.temp_bin_reads_path + "paired_" + std::to_string(i);
+ lib.data().binary_reads_info.single_read_prefix = cfg.temp_bin_reads_path + "single_" + std::to_string(i);
+ }
+}
+
+}
+}
diff --git a/src/common/pipeline/config_struct.hpp b/src/common/pipeline/config_struct.hpp
new file mode 100644
index 0000000..deddf72
--- /dev/null
+++ b/src/common/pipeline/config_struct.hpp
@@ -0,0 +1,608 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "pipeline/config_singl.hpp"
+#include "modules/path_extend/pe_config_struct.hpp"
+#include "pipeline/library.hpp"
+
+#include <boost/optional.hpp>
+#include "math/xmath.h"
+
+namespace debruijn_graph {
+namespace config {
+
+enum class info_printer_pos : char {
+ default_pos = 0,
+ before_first_gap_closer,
+ before_simplification,
+ before_post_simplification,
+ final_simplified,
+ final_gap_closed,
+ before_repeat_resolution,
+
+ total
+};
+
+std::vector<std::string> InfoPrinterPosNames();
+
+enum class pipeline_type : char {
+ base = 0,
+ isolate,
+ mda,
+ meta,
+ moleculo,
+ diploid,
+ rna,
+ plasmid,
+ large_genome,
+
+ total
+};
+
+std::vector<std::string> PipelineTypeNames();
+
+enum class construction_mode : char {
+ old = 0,
+ extention,
+
+ total
+};
+
+std::vector<std::string> ConstructionModeNames();
+
+enum class estimation_mode : char {
+ simple = 0,
+ weighted,
+ smoothing,
+
+ total
+};
+
+std::vector<std::string> EstimationModeNames();
+
+enum class resolving_mode : char {
+ none = 0,
+ path_extend,
+
+ total
+};
+
+std::vector<std::string> ResolveModeNames();
+
+enum class single_read_resolving_mode : char {
+ none = 0,
+ only_single_libs,
+ all,
+
+ total
+};
+
+enum class output_broken_scaffolds: char {
+ none = 0,
+ break_gaps,
+ break_all,
+
+ total
+};
+
+std::vector<std::string> SingleReadResolveModeNames();
+
+template<typename mode_t>
+mode_t ModeByName(const std::string& name, const std::vector<std::string>& names) {
+ auto it = std::find(names.begin(), names.end(), name);
+ VERIFY_MSG(it != names.end(), "Unrecognized mode name");
+ return mode_t(it - names.begin());
+}
+
+template<typename mode_t>
+std::string ModeName(const mode_t& mode, const std::vector<std::string>& names) {
+ VERIFY_MSG(size_t(mode) < names.size(), "Unrecognized mode id");
+ return names[size_t(mode)];
+}
+
+struct DataSetData {
+ size_t read_length;
+ double avg_read_length;
+ double mean_insert_size;
+ double insert_size_deviation;
+ double insert_size_left_quantile;
+ double insert_size_right_quantile;
+ double median_insert_size;
+ double insert_size_mad;
+ std::map<int, size_t> insert_size_distribution;
+
+ size_t lib_index;
+ bool single_reads_mapped;
+ uint64_t total_nucls;
+ size_t read_count;
+
+ double average_coverage;
+ double pi_threshold;
+
+ struct BinaryReadsInfo {
+ BinaryReadsInfo(): binary_coverted(false), chunk_num(0), buffer_size(0) {}
+
+ bool binary_coverted;
+ std::string bin_reads_info_file;
+ std::string paired_read_prefix;
+ std::string single_read_prefix;
+ size_t chunk_num;
+ size_t buffer_size;
+ } binary_reads_info;
+
+
+ DataSetData(): read_length(0), avg_read_length(0.0),
+ mean_insert_size(0.0),
+ insert_size_deviation(0.0),
+ insert_size_left_quantile(0.0),
+ insert_size_right_quantile(0.0),
+ median_insert_size(0.0),
+ insert_size_mad(0.0),
+ lib_index(0),
+ single_reads_mapped(false),
+ total_nucls(0),
+ read_count(0),
+ average_coverage(0.0),
+ pi_threshold(0.0),
+ binary_reads_info() {}
+};
+
+struct dataset {
+ typedef io::DataSet<DataSetData>::Library Library;
+
+ io::DataSet<DataSetData> reads;
+
+ size_t max_read_length;
+ double average_coverage;
+ double average_read_length;
+
+ size_t RL() const { return max_read_length; }
+ void set_RL(size_t RL) {
+ max_read_length = RL;
+ }
+
+ double aRL() const { return average_read_length; }
+ void set_aRL(double aRL) {
+ average_read_length = aRL;
+ for (size_t i = 0; i < reads.lib_count(); ++i) {
+ reads[i].data().avg_read_length = aRL;
+ }
+ }
+
+ double avg_coverage() const { return average_coverage; }
+ void set_avg_coverage(double avg_coverage) {
+ average_coverage = avg_coverage;
+ for (size_t i = 0; i < reads.lib_count(); ++i) {
+ reads[i].data().average_coverage = avg_coverage;
+ }
+ }
+
+ std::string reference_genome_filename;
+ std::string reads_filename;
+
+ std::string reference_genome;
+
+ dataset(): max_read_length(0), average_coverage(0.0), average_read_length(0.0) {
+ }
+};
+
+// struct for debruijn project's configuration file
+struct debruijn_config {
+
+ pipeline_type mode;
+ bool uneven_depth;
+
+ bool developer_mode;
+
+ bool preserve_raw_paired_index;
+
+ struct simplification {
+ struct tip_clipper {
+ std::string condition;
+ tip_clipper() {}
+ tip_clipper(std::string condition_) : condition(condition_) {}
+ };
+
+ struct dead_end_clipper {
+ std::string condition;
+ bool enabled;
+ };
+
+ struct topology_tip_clipper {
+ double length_coeff;
+ size_t uniqueness_length;
+ size_t plausibility_length;
+ };
+
+ struct complex_tip_clipper {
+ bool enabled;
+ double max_relative_coverage;
+ size_t max_edge_len;
+ std::string condition;
+ };
+
+ struct bulge_remover {
+ bool enabled;
+ bool main_iteration_only;
+ double max_bulge_length_coefficient;
+ size_t max_additive_length_coefficient;
+ double max_coverage;
+ double max_relative_coverage;
+ size_t max_delta;
+ double max_relative_delta;
+ size_t max_number_edges;
+ bool parallel;
+ size_t buff_size;
+ double buff_cov_diff;
+ double buff_cov_rel_diff;
+ };
+
+ struct erroneous_connections_remover {
+ std::string condition;
+ erroneous_connections_remover() {}
+ erroneous_connections_remover(std::string condition_) : condition(condition_) {}
+ };
+
+ struct relative_coverage_ec_remover {
+ bool enabled;
+ size_t max_ec_length;
+ double rcec_ratio;
+ };
+
+ struct topology_based_ec_remover {
+ size_t max_ec_length_coefficient;
+ size_t uniqueness_length;
+ size_t plausibility_length;
+ };
+
+ struct tr_based_ec_remover {
+ size_t max_ec_length_coefficient;
+ size_t uniqueness_length;
+ double unreliable_coverage;
+ };
+
+ struct interstrand_ec_remover {
+ size_t max_ec_length_coefficient;
+ size_t uniqueness_length;
+ size_t span_distance;
+ };
+
+ struct max_flow_ec_remover {
+ bool enabled;
+ double max_ec_length_coefficient;
+ size_t uniqueness_length;
+ size_t plausibility_length;
+ };
+
+ struct isolated_edges_remover {
+ bool enabled;
+ size_t max_length;
+ double max_coverage;
+ size_t max_length_any_cov;
+ };
+
+ struct complex_bulge_remover {
+ bool enabled;
+ double max_relative_length;
+ size_t max_length_difference;
+ };
+
+ struct hidden_ec_remover {
+ bool enabled;
+ size_t uniqueness_length;
+ double unreliability_threshold;
+ double relative_threshold;
+ };
+
+ struct relative_coverage_edge_disconnector {
+ bool enabled;
+ double diff_mult;
+ size_t edge_sum;
+ };
+
+ struct relative_coverage_comp_remover {
+ bool enabled;
+ double coverage_gap;
+ double length_coeff;
+ double tip_allowing_length_coeff;
+ size_t max_ec_length_coefficient;
+ double max_coverage_coeff;
+ size_t vertex_count_limit;
+ };
+
+ struct init_cleaning {
+ std::string self_conj_condition;
+
+ bool early_it_only;
+ double activation_cov;
+ isolated_edges_remover ier;
+ std::string tip_condition;
+ std::string ec_condition;
+ double disconnect_flank_cov;
+ };
+
+ size_t cycle_iter_count;
+
+ bool post_simplif_enabled;
+ bool topology_simplif_enabled;
+ tip_clipper tc;
+ dead_end_clipper dead_end;
+ complex_tip_clipper complex_tc;
+ topology_tip_clipper ttc;
+ bulge_remover br;
+ erroneous_connections_remover ec;
+ relative_coverage_ec_remover rcec;
+ relative_coverage_comp_remover rcc;
+ relative_coverage_edge_disconnector relative_ed;
+ topology_based_ec_remover tec;
+ tr_based_ec_remover trec;
+ interstrand_ec_remover isec;
+ max_flow_ec_remover mfec;
+ isolated_edges_remover ier;
+ complex_bulge_remover cbr;
+ hidden_ec_remover her;
+
+ tip_clipper final_tc;
+ bulge_remover final_br;
+ bulge_remover second_final_br;
+
+ init_cleaning init_clean;
+ };
+
+ struct construction {
+ struct early_tip_clipper {
+ bool enable;
+ boost::optional<size_t> length_bound;
+ early_tip_clipper() : enable(false) {}
+ };
+
+ construction_mode con_mode;
+ early_tip_clipper early_tc;
+ bool keep_perfect_loops;
+ size_t read_buffer_size;
+ construction() :
+ con_mode(construction_mode::extention),
+ keep_perfect_loops(true),
+ read_buffer_size(0) {}
+ };
+
+ simplification simp;
+ boost::optional<simplification> preliminary_simp;
+
+ struct sensitive_mapper {
+ size_t k;
+ };
+
+ struct distance_estimator {
+ double linkage_distance_coeff;
+ double max_distance_coeff;
+ double max_distance_coeff_scaff;
+ double clustered_filter_threshold;
+ unsigned raw_filter_threshold;
+ double rounding_thr;
+ double rounding_coeff;
+ };
+
+ struct smoothing_distance_estimator {
+ size_t threshold;
+ double range_coeff;
+ double delta_coeff;
+ double percentage;
+ size_t cutoff;
+ size_t min_peak_points;
+ double inv_density;
+ double derivative_threshold;
+ };
+
+ struct ambiguous_distance_estimator {
+ bool enabled;
+ double haplom_threshold;
+ double relative_length_threshold;
+ double relative_seq_threshold;
+ };
+
+ struct plasmid {
+ size_t long_edge_length;
+ size_t edge_length_for_median;
+ double relative_coverage;
+ size_t small_component_size;
+ double small_component_relative_coverage;
+ size_t min_component_length;
+ size_t min_isolated_length;
+ };
+
+ struct pacbio_processor {
+ // Alignment and traversal parameters.
+ size_t pacbio_k; //13
+ bool additional_debug_info; //false
+ double compression_cutoff;// 0.6
+ double domination_cutoff; //1.5
+ double path_limit_stretching; //1.3
+ double path_limit_pressing;//0.7
+ bool ignore_middle_alignment; // true; set to false for stats and mate-pair processing
+ size_t max_path_in_dijkstra; //15000
+ size_t max_vertex_in_dijkstra; //2000
+ // Gap closer parameters.
+ size_t long_seq_limit; //400
+ size_t pacbio_min_gap_quantity; //2
+ size_t contigs_min_gap_quantity; //1
+ size_t max_contigs_gap_length; // 10000
+ };
+
+ struct position_handler {
+ size_t max_mapping_gap;
+ size_t max_gap_diff;
+ std::string contigs_for_threading;
+ std::string contigs_to_analyze;
+ bool late_threading;
+ bool careful_labeling;
+ };
+
+ struct gap_closer {
+ int minimal_intersection;
+ bool before_simplify;
+ bool in_simplify;
+ bool after_simplify;
+ double weight_threshold;
+ };
+
+ struct info_printer {
+ bool basic_stats;
+ bool lib_info;
+ bool extended_stats;
+ bool write_components;
+ std::string components_for_kmer;
+ std::string components_for_genome_pos;
+ bool write_components_along_genome;
+ bool write_components_along_contigs;
+ bool save_full_graph;
+ bool save_all;
+ bool save_graph_pack;
+ bool write_error_loc;
+ bool write_full_graph;
+ bool write_full_nc_graph;
+ };
+
+ struct graph_read_corr_cfg {
+ bool enable;
+ std::string output_dir;
+ bool binary;
+ };
+
+ struct kmer_coverage_model {
+ double probability_threshold;
+ double strong_probability_threshold;
+ double coverage_threshold;
+ bool use_coverage_threshold;
+ };
+
+ struct bwa_aligner {
+ bool bwa_enable;
+ bool debug;
+ std::string path_to_bwa;
+ size_t min_contig_len;
+ };
+
+ typedef std::map<info_printer_pos, info_printer> info_printers_t;
+
+ std::string dataset_file;
+ std::string project_name;
+ std::string input_dir;
+ std::string output_base;
+ std::string output_dir;
+ std::string tmp_dir;
+ std::string output_suffix;
+ std::string output_saves;
+ std::string final_contigs_file;
+ std::string log_filename;
+ std::string series_analysis;
+
+ bool output_pictures;
+ bool output_nonfinal_contigs;
+ bool compute_paths_number;
+
+ bool use_additional_contigs;
+ bool use_unipaths;
+ std::string additional_contigs;
+
+ struct scaffold_correction {
+ std::string scaffolds_file;
+ bool output_unfilled;
+ size_t max_insert;
+ size_t max_cut_length;
+ };
+
+ struct truseq_analysis {
+ std::string scaffolds_file;
+ std::string genome_file;
+ };
+
+ struct contig_output {
+ std::string contigs_name;
+ std::string scaffolds_name;
+ output_broken_scaffolds obs_mode;
+ };
+
+ contig_output co;
+
+ boost::optional<scaffold_correction> sc_cor;
+ truseq_analysis tsa;
+ std::string load_from;
+
+ std::string entry_point;
+
+ bool rr_enable;
+ bool two_step_rr;
+ bool use_intermediate_contigs;
+
+ single_read_resolving_mode single_reads_rr;
+ bool use_single_reads;
+
+ bool correct_mismatches;
+ bool paired_info_statistics;
+ bool paired_info_scaffolder;
+ bool gap_closer_enable;
+
+ size_t max_repeat_length;
+
+ // Conversion options
+ size_t buffer_size;
+ std::string temp_bin_reads_dir;
+ std::string temp_bin_reads_path;
+ std::string temp_bin_reads_info;
+ std::string paired_read_prefix;
+ std::string single_read_prefix;
+
+ size_t K;
+
+ bool main_iteration;
+
+ size_t max_threads;
+ size_t max_memory;
+
+ estimation_mode est_mode;
+ resolving_mode rm;
+ path_extend::pe_config::MainPEParamsT pe_params;
+ boost::optional<path_extend::pe_config::MainPEParamsT> prelim_pe_params;
+ bool avoid_rc_connections;
+
+ construction con;
+ sensitive_mapper sensitive_map;
+ distance_estimator de;
+ smoothing_distance_estimator ade;
+ ambiguous_distance_estimator amb_de;
+ pacbio_processor pb;
+ bool use_scaffolder;
+ dataset ds;
+ position_handler pos;
+ gap_closer gc;
+ graph_read_corr_cfg graph_read_corr;
+ info_printers_t info_printers;
+ kmer_coverage_model kcm;
+ bwa_aligner bwa;
+ boost::optional<plasmid> pd;
+ size_t flanking_range;
+
+ bool need_mapping;
+
+ debruijn_config() :
+ use_single_reads(false) {
+
+ }
+};
+
+void load(debruijn_config& cfg, const std::vector<std::string> &filenames);
+void load(debruijn_config& cfg, const std::string &filename);
+void load_lib_data(const std::string& prefix);
+void write_lib_data(const std::string& prefix);
+
+} // config
+} // debruijn_graph
+
+
+typedef config_common::config<debruijn_graph::config::debruijn_config> cfg;
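A minimal usage sketch (not part of the upstream tree) of the ModeByName/ModeName helpers declared above: a mode value is looked up by its configuration-file name and converted back for logging. It assumes the usual pipeline/config_struct.hpp include path and that ResolveModeNames() lists the names in enum order, with "path_extend" being the textual name of resolving_mode::path_extend.

    #include <string>
    #include <vector>
    #include "pipeline/config_struct.hpp"

    using namespace debruijn_graph::config;

    void resolving_mode_roundtrip() {
        const std::vector<std::string> names = ResolveModeNames();
        // Look the mode up by its config-file name (VERIFY fails on unknown names)...
        resolving_mode rm = ModeByName<resolving_mode>("path_extend", names);
        // ...and turn it back into a string, e.g. for log output.
        std::string printed = ModeName(rm, names);
        (void)printed;
    }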
diff --git a/src/modules/pipeline/genomic_info.hpp b/src/common/pipeline/genomic_info.hpp
similarity index 100%
rename from src/modules/pipeline/genomic_info.hpp
rename to src/common/pipeline/genomic_info.hpp
diff --git a/src/common/pipeline/genomic_info_filler.cpp b/src/common/pipeline/genomic_info_filler.cpp
new file mode 100644
index 0000000..67a91aa
--- /dev/null
+++ b/src/common/pipeline/genomic_info_filler.cpp
@@ -0,0 +1,149 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "genomic_info_filler.hpp"
+
+#include "utils/coverage_model/kmer_coverage_model.hpp"
+#include "modules/simplification/ec_threshold_finder.hpp"
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+
+#include <string>
+
+#include <map>
+#include <vector>
+
+using namespace llvm;
+using namespace debruijn_graph;
+
+static std::vector<size_t> extract(const std::map<size_t, size_t> &hist) {
+ std::map<size_t, size_t> tmp = hist;
+
+ size_t maxcov = 0;
+ for (auto it = tmp.cbegin(), et = tmp.cend(); it != et; ++it)
+ maxcov = std::max(maxcov, it->first);
+
+ // Touch every coverage value up to maxcov so that all keys exist in the map
+ for (size_t i = 0; i <= maxcov; ++i)
+ tmp[i];
+
+ // Extract the values
+ std::vector<size_t> res(maxcov);
+ for (size_t i = 0; i < maxcov; ++i)
+ res[i] = tmp[i + 1];
+
+ return res;
+}
+
+namespace llvm { namespace yaml {
+template <>
+struct MappingTraits<GenomicInfo> {
+ static void mapping(yaml::IO &io, GenomicInfo &info) {
+ info.yamlize(io);
+ }
+};
+
+
+template <>
+struct SequenceTraits<std::vector<std::size_t>> {
+ static size_t size(IO &, std::vector<std::size_t> &seq) {
+ return seq.size();
+ }
+ static size_t&
+ element(IO &, std::vector<std::size_t> &seq, size_t index) {
+ if (index >= seq.size())
+ seq.resize(index+1);
+ return seq[index];
+ }
+ static const bool flow = true;
+};
+}}
+
+void GenomicInfo::yamlize(yaml::IO &io) {
+ io.mapOptional("ec bound", ec_bound_, 0.0);
+ io.mapOptional("estimated mean", estimated_mean_, 0.0);
+ io.mapOptional("trusted bound", trusted_bound_, 0.0);
+ io.mapOptional("genome size", genome_size_, size_t(0));
+ io.mapOptional("coverage histogram", cov_histogram_);
+}
+
+
+bool GenomicInfo::Load(const std::string &filename) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(filename);
+ if (!Buf)
+ return false;
+
+ yaml::Input yin(*Buf.get());
+ yin >> *this;
+
+ if (yin.error())
+ return false;
+
+ return true;
+}
+
+void GenomicInfo::Save(const std::string &filename) const {
+ std::error_code EC;
+ llvm::raw_fd_ostream ofs(filename, EC, llvm::sys::fs::OpenFlags::F_Text);
+ llvm::yaml::Output yout(ofs);
+ yout << const_cast<GenomicInfo&>(*this);
+}
+
+void GenomicInfoFiller::run(conj_graph_pack &gp, const char*) {
+ if (cfg::get().uneven_depth) {
+ ErroneousConnectionThresholdFinder<decltype(gp.g)> finder(gp.g);
+ std::map<size_t, size_t> hist = finder.ConstructHistogram();
+ double avg = finder.AvgCoverage();
+ double gthr = finder.FindThreshold(hist);
+ INFO("Average edge coverage: " << avg);
+ INFO("Graph threshold: " << gthr);
+
+ gp.ginfo.set_cov_histogram(extract(hist));
+ gp.ginfo.set_ec_bound(std::min(avg, gthr));
+ } else {
+ // First, get k-mer coverage histogram
+ std::map<size_t, size_t> tmp;
+ size_t maxcov = 0;
+ size_t kmer_per_record = 1;
+ if (conj_graph_pack::index_t::InnerIndex::storing_type::IsInvertable())
+ kmer_per_record = 2;
+
+ for (auto I = gp.index.inner_index().value_cbegin(), E = gp.index.inner_index().value_cend(); I != E; ++I) {
+ size_t ccov = I->count;
+ maxcov = std::max(ccov, maxcov);
+ tmp[ccov] += kmer_per_record;
+ }
+
+ gp.ginfo.set_cov_histogram(extract(tmp));
+
+ // Fit the coverage model and get the threshold
+ utils::coverage_model::KMerCoverageModel CovModel(gp.ginfo.cov_histogram(), cfg::get().kcm.probability_threshold, cfg::get().kcm.strong_probability_threshold);
+ CovModel.Fit();
+
+ gp.ginfo.set_genome_size(CovModel.GetGenomeSize());
+ gp.ginfo.set_ec_bound((double)CovModel.GetErrorThreshold());
+ if (CovModel.converged()) {
+ gp.ginfo.set_estimated_mean((double)CovModel.GetMeanCoverage());
+ INFO("Mean coverage was calculated as " << gp.ginfo.estimated_mean());
+ } else
+ INFO("Failed to estimate mean coverage");
+
+ if (cfg::get().kcm.use_coverage_threshold) {
+ double coef = (cfg::get().ds.aRL() - double(cfg::get().K) + 1) / cfg::get().ds.aRL();
+ if (coef < 0)
+ coef = double(cfg::get().ds.RL() - cfg::get().K + 1) / double(cfg::get().ds.RL());
+ gp.ginfo.set_trusted_bound(CovModel.converged() && cfg::get().kcm.coverage_threshold == 0.0 ?
+ double(CovModel.GetLowThreshold()) :
+ cfg::get().kcm.coverage_threshold * coef);
+ }
+ }
+
+ INFO("EC coverage threshold value was calculated as " << gp.ginfo.ec_bound());
+ INFO("Trusted kmer low bound: " << gp.ginfo.trusted_bound());
+}
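A standalone sketch (not upstream code) of the read-length correction used just above when kcm.use_coverage_threshold is set: a user-supplied per-base coverage threshold is rescaled to k-mer coverage, since a read of average length L contributes only L - K + 1 k-mers. The function name and parameters are illustrative only.

    #include <cstddef>

    // Rescale a per-base coverage threshold to k-mer coverage.
    double kmer_coverage_threshold(double base_coverage_threshold,
                                   double avg_read_length, std::size_t K) {
        // Fraction of read positions that start a k-mer: (L - K + 1) / L.
        double coef = (avg_read_length - double(K) + 1.0) / avg_read_length;
        // The filler above falls back to the maximum read length when this
        // coefficient would come out negative.
        return base_coverage_threshold * coef;
    }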
diff --git a/src/modules/pipeline/genomic_info_filler.hpp b/src/common/pipeline/genomic_info_filler.hpp
similarity index 100%
rename from src/modules/pipeline/genomic_info_filler.hpp
rename to src/common/pipeline/genomic_info_filler.hpp
diff --git a/src/common/pipeline/graph_pack.hpp b/src/common/pipeline/graph_pack.hpp
new file mode 100644
index 0000000..e05a243
--- /dev/null
+++ b/src/common/pipeline/graph_pack.hpp
@@ -0,0 +1,170 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/indices/edge_position_index.hpp"
+#include "utils/indices/storing_traits.hpp"
+#include "sequence/genome_storage.hpp"
+#include "assembly_graph/handlers/id_track_handler.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/core/graph.hpp"
+#include "paired_info/paired_info.hpp"
+#include "pipeline/config_struct.hpp"
+#include "modules/alignment/edge_index.hpp"
+#include "assembly_graph/graph_support/genomic_quality.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
+#include "genomic_info.hpp"
+#include "modules/alignment/long_read_storage.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "assembly_graph/components/connected_component.hpp"
+#include "modules/alignment/kmer_mapper.hpp"
+#include "common/visualization/position_filler.hpp"
+#include "common/assembly_graph/paths/bidirectional_path.hpp"
+
+namespace debruijn_graph {
+
+template<class Graph>
+struct graph_pack: private boost::noncopyable {
+ typedef Graph graph_t;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef RtSeq seq_t;
+ typedef EdgeIndex<graph_t> index_t;
+ using PairedInfoIndicesT = omnigraph::de::PairedInfoIndicesT<Graph>;
+ //typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
+ typedef omnigraph::de::UnclusteredPairedInfoIndicesT<Graph> UnclusteredPairedInfoIndicesT;
+ typedef LongReadContainer<Graph> LongReadContainerT;
+
+ size_t k_value;
+
+ graph_t g;
+ index_t index;
+ KmerMapper<graph_t> kmer_mapper;
+ FlankingCoverage<graph_t> flanking_cov;
+ UnclusteredPairedInfoIndicesT paired_indices;
+ PairedInfoIndicesT clustered_indices;
+ PairedInfoIndicesT scaffolding_indices;
+ LongReadContainerT single_long_reads;
+ GenomicInfo ginfo;
+
+ GenomeStorage genome;
+ EdgeQuality<Graph> edge_qual;
+ mutable EdgesPositionHandler<graph_t> edge_pos;
+ ConnectedComponentCounter components;
+ path_extend::PathContainer contig_paths;
+
+ graph_pack(size_t k, const std::string &workdir, size_t lib_count,
+ const std::string &genome = "",
+ size_t flanking_range = 50,
+ size_t max_mapping_gap = 0,
+ size_t max_gap_diff = 0,
+ bool detach_indices = true)
+ : k_value(k), g(k), index(g, workdir),
+ kmer_mapper(g),
+ flanking_cov(g, flanking_range),
+ paired_indices(g, lib_count),
+ clustered_indices(g, lib_count),
+ scaffolding_indices(g, lib_count),
+ single_long_reads(g, lib_count),
+ genome(genome),
+ edge_qual(g),
+ edge_pos(g, max_mapping_gap + k, max_gap_diff),
+ components(g),
+ contig_paths()
+ {
+ if (detach_indices) {
+ DetachAll();
+ }
+ }
+
+ void FillQuality() {
+ edge_qual.Fill(index, kmer_mapper, genome.GetSequence());
+ }
+
+ // TODO: remove together with its usages after checking
+ void ClearQuality() {
+ edge_qual.clear();
+ }
+
+ void EnsureIndex() {
+ if (!index.IsAttached()) {
+ INFO("Index refill");
+ index.Refill();
+ index.Attach();
+ }
+ }
+
+ void EnsureBasicMapping() {
+ VERIFY(kmer_mapper.IsAttached());
+ EnsureIndex();
+ INFO("Normalizing k-mer map. Total " << kmer_mapper.size() << " kmers to process");
+ kmer_mapper.Normalize();
+ INFO("Normalizing done");
+ }
+
+ void EnsureQuality() {
+ if (!edge_qual.IsAttached()) {
+ ClearQuality();
+ FillQuality();
+ edge_qual.Attach();
+ }
+ }
+
+ //positions are refilled every time
+ void EnsurePos() {
+ if (!edge_pos.IsAttached()) {
+ edge_pos.Attach();
+ }
+ edge_pos.clear();
+ visualization::position_filler::FillPos(*this, genome.GetSequence(), "ref0");
+ visualization::position_filler::FillPos(*this, !genome.GetSequence(), "ref1");
+ }
+
+ void EnsureDebugInfo() {
+ EnsureBasicMapping();
+ EnsureQuality();
+ EnsurePos();
+ }
+
+ void InitRRIndices() {
+ clustered_indices.Init();
+ scaffolding_indices.Init();
+ }
+
+ void ClearRRIndices() {
+ for (auto& pi : paired_indices) {
+ pi.clear();
+ }
+ clustered_indices.Clear();
+ scaffolding_indices.Clear();
+ single_long_reads.Clear();
+ }
+
+ void ClearPaths() {
+ contig_paths.DeleteAllPaths();
+ }
+
+ void DetachAll() {
+ index.Detach();
+ kmer_mapper.Detach();
+ edge_pos.Detach();
+ edge_qual.Detach();
+ }
+
+};
+
+typedef graph_pack<ConjugateDeBruijnGraph> conj_graph_pack;
+typedef conj_graph_pack::index_t Index;
+
+typedef conj_graph_pack::PairedInfoIndicesT PairedIndicesT;
+typedef conj_graph_pack::UnclusteredPairedInfoIndicesT UnclusteredPairedIndicesT;
+typedef conj_graph_pack::LongReadContainerT LongReadContainerT;
+typedef omnigraph::de::PairedInfoIndexT<ConjugateDeBruijnGraph> PairedIndexT;
+typedef omnigraph::de::UnclusteredPairedInfoIndexT<ConjugateDeBruijnGraph> UnclusteredPairedIndexT;
+
+} // namespace debruijn_graph
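A short sketch (not upstream code) of how a conj_graph_pack defined above is typically set up before running pipeline stages; k = 55, the working directory name and the single library count are illustrative assumptions.

    #include "pipeline/graph_pack.hpp"

    void graph_pack_usage_sketch() {
        using namespace debruijn_graph;
        // k-mer size, working directory for the index, number of read libraries.
        conj_graph_pack gp(55, "tmp_workdir", 1);
        gp.EnsureIndex();     // refills and attaches the edge index on demand
        gp.InitRRIndices();   // prepares clustered/scaffolding paired-info indices
        // ... run stages that fill and use the pack ...
        gp.ClearRRIndices();  // release repeat-resolution indices when done
    }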
diff --git a/src/common/pipeline/graphio.hpp b/src/common/pipeline/graphio.hpp
new file mode 100644
index 0000000..118b484
--- /dev/null
+++ b/src/common/pipeline/graphio.hpp
@@ -0,0 +1,1047 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/standard_base.hpp"
+
+#include "assembly_graph/handlers/id_track_handler.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+
+#include "paired_info/paired_info.hpp"
+
+#include "assembly_graph/core/graph.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "modules/alignment/long_read_storage.hpp"
+
+#include "assembly_graph/core/order_and_law.hpp"
+
+#include <cmath>
+#include <set>
+#include <map>
+#include <algorithm>
+#include <fstream>
+#include <cstdio>
+
+namespace debruijn_graph {
+
+namespace graphio {
+
+using namespace omnigraph;
+using namespace omnigraph::de;
+//todo think of inner namespace
+
+template<class KmerMapper>
+void SaveKmerMapper(const string& file_name,
+ const KmerMapper& mapper) {
+ std::ofstream file;
+ file.open((file_name + ".kmm").c_str(),
+ std::ios_base::binary | std::ios_base::out);
+ DEBUG("Saving kmer mapper, " << file_name <<" created");
+ VERIFY(file.is_open());
+
+ uint32_t k = (uint32_t) mapper.k();
+ file.write((char *) &k, sizeof(uint32_t));
+ mapper.BinWrite(file);
+
+ file.close();
+ DEBUG("kmer mapper saved ")
+}
+
+template<class KmerMapper>
+bool LoadKmerMapper(const string& file_name,
+ KmerMapper& kmer_mapper) {
+ kmer_mapper.clear();
+ std::ifstream file;
+ file.open((file_name + ".kmm").c_str(),
+ std::ios_base::binary | std::ios_base::in);
+ if (!file.is_open()) {
+ return false;
+ }
+ INFO("Reading kmer mapper, " << file_name <<" started");
+
+ uint32_t k_;
+ file.read((char *) &k_, sizeof(uint32_t));
+
+ VERIFY_MSG(k_ == kmer_mapper.k(), "Cannot read kmer mapper, different Ks");
+ kmer_mapper.BinRead(file);
+
+ file.close();
+ return true;
+}
+
+template<class EdgeIndex>
+void SaveEdgeIndex(const std::string& file_name,
+ const EdgeIndex& index) {
+ std::ofstream file;
+ file.open((file_name + ".kmidx").c_str(),
+ std::ios_base::binary | std::ios_base::out);
+ DEBUG("Saving kmer index, " << file_name <<" created");
+ VERIFY(file.is_open());
+
+ uint32_t k_ = index.k();
+ file.write((char *) &k_, sizeof(uint32_t));
+ index.BinWrite(file);
+
+ file.close();
+ DEBUG("index saved ")
+}
+
+template<class EdgeIndex>
+bool LoadEdgeIndex(const std::string& file_name,
+ EdgeIndex& index) {
+ std::ifstream file;
+ file.open((file_name + ".kmidx").c_str(),
+ std::ios_base::binary | std::ios_base::in);
+ INFO("Reading kmer index, " << file_name <<" started");
+ if (!file.is_open())
+ return false;
+
+ uint32_t k_;
+ file.read((char *) &k_, sizeof(uint32_t));
+ VERIFY_MSG(k_ == index.k(), "Cannot read edge index, different Ks:");
+
+ index.BinRead(file, file_name + ".kmidx");
+
+ file.close();
+
+ return true;
+}
+
+inline
+void SaveMapCoverage(const std::string& path, const std::map<int, int>& data ) {
+ std::ofstream outFile;
+ outFile.open(path.c_str());
+
+ INFO("Saving detailed coverage in file " << path <<" started");
+ outFile << data.size() << "\n";
+ for (auto dataIterator = data.begin(); dataIterator != data.end(); ++dataIterator){
+ outFile << dataIterator->first << " " << dataIterator->second << " .\n";
+ }
+}
+
+template<class KmerIndex>
+void SaveDetailCoverage(const std::string& pathInCov, const std::string& pathOutCov, const KmerIndex& index ) {
+ SaveMapCoverage(pathInCov, index.inCoverage);
+ SaveMapCoverage(pathOutCov, index.outCoverage);
+}
+
+inline void SerializePoint(FILE* file, size_t e1, size_t e2, const RawPoint &p) {
+ fprintf(file, "%zu %zu %.2f %.2f 0.00 .\n", e1, e2, (double)p.d, (double)p.weight);
+}
+
+inline void SerializePoint(FILE* file, size_t e1, size_t e2, const Point &p) {
+ fprintf(file, "%zu %zu %.2f %.2f %.2f .\n", e1, e2, (double)p.d, (double)p.weight, (double)p.var);
+}
+
+inline void DeserializePoint(FILE* file, size_t& e1, size_t& e2, RawPoint &p) {
+ float unused;
+ size_t read_count = fscanf(file, "%zu %zu %f %f %f .\n", &e1, &e2,
+ (float *)&p.d, (float *)&p.weight, (float *)&unused);
+ VERIFY(read_count == 5);
+
+}
+
+inline void DeserializePoint(FILE* file, size_t& e1, size_t& e2, Point &p) {
+ size_t read_count = fscanf(file, "%zu %zu %f %f %f .\n", &e1, &e2,
+ (float *)&p.d, (float *)&p.weight, (float *)&p.var);
+ VERIFY(read_count == 5);
+}
+
+
+template<class Graph>
+class DataPrinter {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ //todo reduce duplication
+ template<class T>
+ void SaveEdgeAssociatedInfo(std::function<T (EdgeId)> access_f, ostream& out) const {
+ out << component_.e_size() << endl;
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ EdgeId e = *iter;
+ // TODO/FIXME: currently matches the old .cvr format
+ out << e.int_id()/* << endl*/;
+ out << " " << access_f(e) << " ." << endl;
+ }
+ }
+
+// template<class C>
+// void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
+// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
+// }
+
+ template<class C>
+ void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
+ out << component_.e_size() << endl;
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ EdgeId e = *iter;
+ // TODO/FIXME: currently matches the old .cvr format
+ out << e.int_id()/* << endl*/;
+ out << " ";
+ c.Save(e, out);
+ out << " ." << endl;
+ }
+ }
+
+ public:
+
+ void SaveGraph(const string& file_name) const {
+ FILE* gid_file = fopen((file_name + ".gid").c_str(), "w");
+ size_t max_id = this->component().g().GetGraphIdDistributor().GetMax();
+ fprintf(gid_file, "%zu\n", max_id);
+ fclose(gid_file);
+ FILE* file = fopen((file_name + ".grp").c_str(), "w");
+ DEBUG("Graph saving to " << file_name << " started");
+ VERIFY_MSG(file != NULL,
+ "Couldn't open file " << (file_name + ".grp") << " on write");
+ size_t vertex_count = component_.v_size();
+ size_t edge_count = component_.e_size();
+ fprintf(file, "%zu %zu \n", vertex_count, edge_count);
+ for (auto iter = component_.v_begin(); iter != component_.v_end(); ++iter) {
+ Save(file, *iter);
+ }
+
+ fprintf(file, "\n");
+
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ Save(file, *iter);
+ }
+ DEBUG("Graph saving to " << file_name << " finished");
+
+ fclose(file);
+ }
+
+ void SaveEdgeSequences(const string& file_name) const {
+ ofstream out(file_name + ".sqn");
+ //todo switch to general function after its switching to fasta
+ //SaveEdgeAssociatedInfo<Sequence>(boost::bind(&Graph::EdgeNucls, component_.g(), _1), out);
+ DEBUG("Saving sequences, " << file_name <<" created");
+ for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
+ EdgeId e = *iter;
+ out << ">" << e.int_id() << endl;
+ out << component_.g().EdgeNucls(e) << endl;
+ }
+ }
+
+ void SaveCoverage(const string& file_name) const {
+ ofstream out(file_name + ".cvr");
+ DEBUG("Saving coverage, " << file_name <<" created");
+ SaveEdgeAssociatedInfo(component_.g().coverage_index(), out);
+ }
+
+ void SaveFlankingCoverage(const string& file_name, const FlankingCoverage<Graph>& flanking_cov) const {
+ ofstream out(file_name + ".flcvr");
+ DEBUG("Saving flanking coverage, " << file_name <<" created");
+ SaveEdgeAssociatedInfo(flanking_cov, out);
+ }
+
+ template<class Index>
+ void SavePaired(const string& file_name,
+ Index const& paired_index) const {
+ FILE* file = fopen((file_name + ".prd").c_str(), "w");
+ DEBUG("Saving paired info, " << file_name <<" created");
+ VERIFY(file != NULL);
+
+ size_t comp_size = 0;
+ for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
+ EdgeId e1 = *I;
+ auto inner_map = paired_index.GetHalf(e1);
+ for (auto entry : inner_map) {
+ if (component_.contains(entry.first)) { // if the second edge also lies in the same component
+ comp_size += entry.second.size();
+ continue;
+ }
+ }
+ }
+
+ fprintf(file, "%zu\n", comp_size);
+
+ for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
+ EdgeId e1 = *I;
+ const auto& inner_map = paired_index.GetHalf(e1);
+ std::map<typename Graph::EdgeId, typename Index::HistProxy> ordermap(inner_map.begin(), inner_map.end());
+ for (auto entry : ordermap) {
+ EdgeId e2 = entry.first;
+ if (component_.contains(e2))
+ for (auto point : entry.second)
+ SerializePoint(file, e1.int_id(), e2.int_id(), point);
+ }
+ }
+
+ fclose(file);
+ }
+
+ void SavePositions(const string& file_name,
+ EdgesPositionHandler<Graph> const& ref_pos) const {
+ ofstream file((file_name + ".pos").c_str());
+ DEBUG("Saving edges positions, " << file_name << " created");
+ VERIFY(file.is_open());
+ file << component_.e_size() << endl;
+ for (auto it = component_.e_begin(); it != component_.e_end(); ++it) {
+ vector<omnigraph::EdgePosition> pos_it = ref_pos.GetEdgePositions(*it);
+ file << it->int_id() << " " << pos_it.size() << endl;
+ for (size_t i = 0; i < pos_it.size(); i++) {
+ file << " " << pos_it[i].contigId << " " << pos_it[i].mr << endl;
+ }
+ }
+ }
+
+ private:
+ void Save(FILE* file, EdgeId eid) const {
+ fprintf(file, "%s\n", ToPrint(eid).c_str());
+ }
+
+ void Save(FILE* file, VertexId vid) const {
+ fprintf(file, "%s\n", ToPrint(vid).c_str());
+ }
+
+ const GraphComponent<Graph> component_;
+
+ virtual std::string ToPrint(VertexId v) const = 0;
+ virtual std::string ToPrint(EdgeId e) const = 0;
+
+ protected:
+
+ //todo optimize component copy
+// DataPrinter(const GraphComponent<Graph>& component) :
+// component_(component) {
+// }
+
+ DataPrinter(GraphComponent<Graph>&& component) :
+ component_(std::move(component)) {
+ }
+
+ const GraphComponent<Graph>& component() const {
+ return component_;
+ }
+
+ public:
+ virtual ~DataPrinter() {
+ }
+};
+
+template<class Graph>
+class ConjugateDataPrinter: public DataPrinter<Graph> {
+ typedef DataPrinter<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ public:
+ ConjugateDataPrinter(Graph const& g) :
+ base(GraphComponent<Graph>::WholeGraph(g)) {
+ }
+
+ ConjugateDataPrinter(const GraphComponent<Graph>& graph_component) :
+ base(GraphComponent<Graph>(graph_component, true)) {
+ }
+
+ template<class VertexIt>
+ ConjugateDataPrinter(const Graph& g, VertexIt begin, VertexIt end) :
+ base(GraphComponent<Graph>::FromVertices(g, begin, end, true)) {
+ }
+
+ std::string ToPrint(VertexId v) const {
+ stringstream ss;
+ ss
+ << "Vertex "
+ << v.int_id()
+ << " ~ "
+ << this->component().g().conjugate(v).int_id() << " .";
+ return ss.str();
+ }
+
+ std::string ToPrint(EdgeId e) const {
+ stringstream ss;
+ ss
+ << "Edge "
+ << e.int_id()
+ << " : "
+ << this->component().g().EdgeStart(e).int_id()
+ << " -> "
+ << this->component().g().EdgeEnd(e).int_id()
+ << ", l = "
+ << this->component().g().length(e)
+ << " ~ "
+ << this->component().g().conjugate(e).int_id() << " .";
+ return ss.str();
+ }
+
+};
+
+template<class Graph>
+class DataScanner {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ template<class T>
+ void LoadEdgeAssociatedInfo(std::function<void (EdgeId, T)> setting_f, istream& in) const {
+ size_t cnt;
+ in >> cnt;
+ for (size_t i = 0 ; i < cnt; ++i) {
+ size_t edge_id;
+ T t;
+ string delim;
+ in >> edge_id;
+ in >> t;
+ in >> delim;
+ VERIFY(delim == ".");
+ VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
+ setting_f(this->edge_id_map()[edge_id], t);
+ }
+ }
+
+ template<class T>
+ void LoadEdgeAssociatedInfo(T& t, istream& in) const {
+ size_t cnt;
+ in >> cnt;
+ for (size_t i = 0 ; i < cnt; ++i) {
+ size_t edge_id;
+ in >> edge_id;
+ VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
+ EdgeId eid = this->edge_id_map().find(edge_id)->second;
+ t.Load(eid, in);
+ string delim;
+ in >> delim;
+ VERIFY(delim == ".");
+ }
+ }
+
+// template<class C>
+// void LoadEdgeAssociatedInfo(const C& c, ostream& out) const {
+// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
+// }
+
+ public:
+ virtual void LoadGraph(const string& file_name) = 0;
+
+ void LoadCoverage(const string& file_name) {
+ INFO("Reading coverage from " << file_name);
+ ifstream in(file_name + ".cvr");
+ LoadEdgeAssociatedInfo(g_.coverage_index(), in);
+ }
+
+ bool LoadFlankingCoverage(const string& file_name, FlankingCoverage<Graph>& flanking_cov) {
+ if (!path::FileExists(file_name + ".flcvr")) {
+ INFO("Flanking coverage saves are absent");
+ return false;
+ }
+ INFO("Reading flanking coverage from " << file_name);
+ ifstream in(file_name + ".flcvr");
+ LoadEdgeAssociatedInfo(flanking_cov, in);
+ return true;
+ }
+
+ template<typename Index>
+ void LoadPaired(const string& file_name,
+ Index& paired_index,
+ bool force_exists = true) {
+ typedef typename Graph::EdgeId EdgeId;
+ FILE* file = fopen((file_name + ".prd").c_str(), "r");
+ INFO((file_name + ".prd"));
+ if (force_exists) {
+ VERIFY(file != NULL);
+ } else if (file == NULL) {
+ INFO("Paired info not found, skipping");
+ return;
+ }
+ INFO("Reading paired info from " << file_name << " started");
+
+ size_t paired_count;
+ int read_count = fscanf(file, "%zu \n", &paired_count);
+ VERIFY(read_count == 1);
+ while (!feof(file)) {
+ size_t first_real_id, second_real_id;
+
+ typename Index::Point point;
+ DeserializePoint(file, first_real_id, second_real_id, point);
+
+ TRACE(first_real_id << " " << second_real_id << " " << point);
+ VERIFY(this->edge_id_map().find(first_real_id) != this->edge_id_map().end())
+ EdgeId e1 = this->edge_id_map()[first_real_id];
+ EdgeId e2 = this->edge_id_map()[second_real_id];
+ if (e1 == EdgeId(NULL) || e2 == EdgeId(NULL))
+ continue;
+ TRACE(e1 << " " << e2 << " " << point);
+ // Prevent doubling of self-conjugate edge pairs.
+ // Their weight is always even, so halving it does not lose precision.
+ auto ep = std::make_pair(e1, e2);
+ if (ep == paired_index.ConjugatePair(ep))
+ point.weight = math::round(point.weight / 2);
+ paired_index.Add(e1, e2, point);
+ }
+ DEBUG("PII SIZE " << paired_index.size());
+ fclose(file);
+ }
+
+ bool LoadPositions(const string& file_name,
+ EdgesPositionHandler<Graph>& edge_pos) {
+ FILE* file = fopen((file_name + ".pos").c_str(), "r");
+ if (file == NULL) {
+ INFO("No positions were saved");
+ return false;
+ }
+ VERIFY(!edge_pos.IsAttached());
+ edge_pos.Attach();
+ INFO("Reading edges positions, " << file_name <<" started");
+ VERIFY(file != NULL);
+ size_t pos_count;
+ int read_count = fscanf(file, "%zu\n", &pos_count);
+ VERIFY(read_count == 1);
+ for (size_t i = 0; i < pos_count; i++) {
+ size_t edge_real_id, pos_info_count;
+ char contigId[500];
+ char cur_str[500];
+ read_count = fscanf(file, "%zu %zu\n", &edge_real_id, &pos_info_count);
+ VERIFY(read_count == 2);
+ // INFO( edge_real_id);
+ for (size_t j = 0; j < pos_info_count; j++) {
+ int start_pos, end_pos;
+ int m_start_pos, m_end_pos;
+ read_count = fscanf(file, "%[^\n]s", cur_str);
+ read_count = fscanf(file, "\n");
+ read_count = sscanf(cur_str, "%s [%d - %d] --> [%d - %d]", contigId,
+ &start_pos, &end_pos, &m_start_pos, &m_end_pos);
+ // INFO(cur_str);
+ // INFO (contigId<<" "<< start_pos<<" "<<end_pos);
+ // VERIFY(read_count == 3);
+ VERIFY(read_count == 5);
+ VERIFY(this->edge_id_map().find(edge_real_id) != this->edge_id_map().end());
+ EdgeId eid = this->edge_id_map()[edge_real_id];
+ edge_pos.AddEdgePosition(eid, string(contigId), start_pos - 1, end_pos, m_start_pos - 1, m_end_pos);
+ }
+ }
+ fclose(file);
+ return true;
+ }
+
+ private:
+ Graph& g_;
+ // int edge_count_;
+ map<size_t, EdgeId> edge_id_map_;
+ map<size_t, VertexId> vertex_id_map_;
+
+ protected:
+ DataScanner(Graph &g) : g_(g) {
+ INFO("Creating of scanner started");
+ // edge_count_ = 0;
+ }
+
+ Graph& g() {
+ return g_;
+ }
+
+ map<size_t, EdgeId> &edge_id_map() {
+ return edge_id_map_;
+ }
+
+ map<size_t, VertexId> &vertex_id_map() {
+ return vertex_id_map_;
+ }
+
+ const map<size_t, EdgeId> &edge_id_map() const {
+ return edge_id_map_;
+ }
+
+ const map<size_t, VertexId> &vertex_id_map() const {
+ return vertex_id_map_;
+ }
+
+ public:
+ virtual ~DataScanner() {
+
+ }
+};
+
+template<class Graph>
+class ConjugateDataScanner: public DataScanner<Graph> {
+ typedef DataScanner<Graph> base;
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+private:
+ restricted::IdSegmentStorage CreateIdStorage(const string& file_name) {
+ FILE* file = fopen((file_name + ".gid").c_str(), "r");
+ // This is to support compatibility with old saves. It will be removed soon.
+ if(file == NULL) {
+ return this->g().GetGraphIdDistributor().ReserveUpTo(1000000000);
+ }
+ VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".gid"));
+ size_t max;
+ int flag = fscanf(file, "%zu\n", &max);
+ VERIFY(flag == 1);
+ fclose(file);
+ return this->g().GetGraphIdDistributor().ReserveUpTo(max);
+ }
+
+ public:
+ /*virtual*/
+ void LoadGraph(const string& file_name) {
+ restricted::IdSegmentStorage id_storage = CreateIdStorage(file_name);
+ INFO("Trying to read conjugate de bruijn graph from " << file_name << ".grp");
+ FILE* file = fopen((file_name + ".grp").c_str(), "r");
+ VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".grp"));
+ FILE* sequence_file = fopen((file_name + ".sqn").c_str(), "r");
+ VERIFY_MSG(sequence_file != NULL, "Couldn't find file " << (file_name + ".sqn"));
+ INFO("Reading conjugate de bruijn graph from " << file_name << " started");
+ size_t vertex_count;
+ size_t edge_count;
+ int flag = fscanf(file, "%zu %zu \n", &vertex_count, &edge_count);
+ VERIFY(flag == 2);
+ for (size_t i = 0; i < vertex_count; i++) {
+ size_t vertex_real_id, conjugate_id;
+ flag = fscanf(file, "Vertex %zu ~ %zu .\n", &vertex_real_id, &conjugate_id);
+ TRACE("Vertex "<<vertex_real_id<<" ~ "<<conjugate_id<<" .");
+ VERIFY(flag == 2);
+
+ if (this->vertex_id_map().find((int) vertex_real_id) == this->vertex_id_map().end()) {
+ size_t ids[2] = {vertex_real_id, conjugate_id};
+ auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
+ VertexId vid = this->g().AddVertex(typename Graph::VertexData(), id_distributor);
+ VertexId conj_vid = this->g().conjugate(vid);
+
+ this->vertex_id_map()[vertex_real_id] = vid;
+ this->vertex_id_map()[conjugate_id] = conj_vid;
+ }
+ }
+
+ char first_char = (char) getc(sequence_file);
+ VERIFY(!ferror(sequence_file));
+ ungetc(first_char, sequence_file);
+ bool fasta = (first_char == '>'); // if it's not fasta, then it's old .sqn
+
+
+ if (!fasta) {
+ size_t tmp_edge_count;
+ flag = fscanf(sequence_file, "%zu", &tmp_edge_count);
+ VERIFY(flag == 1);
+ VERIFY(edge_count == tmp_edge_count);
+ }
+
+ const size_t longstring_size = 1000500; // TODO: magic constant; edges of 1 Mbp or longer cannot be loaded
+ char longstring[longstring_size];
+ for (size_t i = 0; i < edge_count; i++) {
+ size_t e_real_id, start_id, fin_id, length, conjugate_edge_id;
+ flag = fscanf(file, "Edge %zu : %zu -> %zu, l = %zu ~ %zu .\n",
+ &e_real_id, &start_id, &fin_id, &length, &conjugate_edge_id);
+ VERIFY(flag == 5);
+ VERIFY(length < longstring_size);
+ if (fasta) {
+ flag = fscanf(sequence_file, ">%zu\n%s\n", &e_real_id, longstring);
+ }
+ else {
+ flag = fscanf(sequence_file, "%zu %s .", &e_real_id, longstring);
+ }
+ VERIFY(flag == 2);
+ TRACE("Edge " << e_real_id << " : " << start_id << " -> "
+ << fin_id << " l = " << length << " ~ " << conjugate_edge_id);
+ if (this->edge_id_map().find((int) e_real_id) == this->edge_id_map().end()) {
+ size_t ids[2] = {e_real_id, conjugate_edge_id};
+ auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
+ Sequence tmp(longstring);
+ EdgeId eid = this->g().AddEdge(this->vertex_id_map()[start_id], this->vertex_id_map()[fin_id], tmp, id_distributor);
+ this->edge_id_map()[e_real_id] = eid;
+ this->edge_id_map()[conjugate_edge_id] = this->g().conjugate(eid);
+ }
+ }
+ fclose(file);
+ fclose(sequence_file);
+ }
+ public:
+ ConjugateDataScanner(Graph& g) :
+ base(g) {
+ }
+};
+
+inline std::string MakeSingleReadsFileName(const std::string& file_name,
+ size_t index) {
+ return file_name + "_paths_" + ToString(index) + ".mpr";
+}
+
+// Helper methods
+// TODO: think about how to organize them most naturally
+
+template<class Graph>
+void PrintBasicGraph(const string& file_name, DataPrinter<Graph>& printer) {
+ printer.SaveGraph(file_name);
+ printer.SaveEdgeSequences(file_name);
+ printer.SaveCoverage(file_name);
+}
+
+template<class graph_pack>
+void PrintGraphPack(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp) {
+ PrintBasicGraph(file_name, printer);
+ // printer.SavePaired(file_name + "_et", gp.etalon_paired_index);
+ if (gp.edge_pos.IsAttached())
+ printer.SavePositions(file_name, gp.edge_pos);
+ if (gp.index.IsAttached())
+ SaveEdgeIndex(file_name, gp.index.inner_index());
+ if (gp.kmer_mapper.IsAttached())
+ SaveKmerMapper(file_name, gp.kmer_mapper);
+ if (gp.flanking_cov.IsAttached())
+ printer.SaveFlankingCoverage(file_name, gp.flanking_cov);
+}
+
+template<class graph_pack>
+void PrintGraphPack(const string& file_name, const graph_pack& gp) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g);
+ PrintGraphPack(file_name, printer, gp);
+}
+
+template<class Graph>
+void PrintPairedIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& paired_index) {
+ printer.SavePaired(file_name, paired_index);
+}
+
+template<class Graph>
+void PrintUnclusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const UnclusteredPairedInfoIndexT<Graph>& paired_index) {
+ printer.SavePaired(file_name, paired_index);
+}
+
+template<class Graph>
+void PrintClusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& clustered_index) {
+ PrintPairedIndex(file_name + "_cl", printer, clustered_index);
+}
+
+template<class Graph>
+void PrintScaffoldingIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& clustered_index) {
+ PrintPairedIndex(file_name + "_scf", printer, clustered_index);
+}
+
+template<class Graph>
+void PrintScaffoldIndex(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndexT<Graph>& scaffold_index) {
+ PrintPairedIndex(file_name + "_scf", printer, scaffold_index);
+}
+
+template<class Graph>
+void PrintUnclusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
+ const UnclusteredPairedInfoIndicesT<Graph>& paired_indices) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ PrintUnclusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
+}
+
+template<class Graph>
+void PrintClusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndicesT<Graph>& paired_indices) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ PrintClusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
+}
+
+template<class Graph>
+void PrintScaffoldingIndices(const string& file_name, DataPrinter<Graph>& printer,
+ const PairedInfoIndicesT<Graph>& paired_indices) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ PrintScaffoldingIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndex(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+
+ PrintGraphPack(file_name, printer, gp);
+ if (!clustered_index) {
+ PrintPairedIndex(file_name, printer, paired_index);
+ } else {
+ PrintClusteredIndex(file_name, printer, paired_index);
+ }
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndex(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index) {
+ PrintWithPairedIndex(file_name, printer, gp, paired_index, true);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndices(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+ PrintGraphPack(file_name, printer, gp);
+ if (!clustered_index)
+ PrintPairedIndices(file_name, printer, paired_indices);
+ else
+ PrintClusteredIndices(file_name, printer, paired_indices);
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndices(const string& file_name,
+ DataPrinter<typename graph_pack::graph_t>& printer,
+ const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ PrintWithPairedIndices(file_name, printer, gp, paired_indices, true);
+}
+
+template<class Graph>
+void PrintSingleLongReads(const string& file_name, const LongReadContainer<Graph>& single_long_reads) {
+ for (size_t i = 0; i < single_long_reads.size(); ++i){
+ single_long_reads[i].DumpToFile(MakeSingleReadsFileName(file_name, i));
+ }
+}
+
+template<class graph_pack>
+void PrintAll(const string& file_name, const graph_pack& gp) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
+ PrintGraphPack(file_name, printer, gp);
+ PrintUnclusteredIndices(file_name, printer, gp.paired_indices);
+ PrintClusteredIndices(file_name, printer, gp.clustered_indices);
+ PrintScaffoldingIndices(file_name, printer, gp.scaffolding_indices);
+ PrintSingleLongReads(file_name, gp.single_long_reads);
+ gp.ginfo.Save(file_name + ".ginfo");
+}
+
+template<class graph_pack, class VertexIt>
+void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
+ VertexIt begin, VertexIt end,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
+ begin, end);
+ PrintWithPairedIndex(file_name, printer, gp, paired_index, clustered_index);
+}
+
+template<class graph_pack, class VertexIt>
+void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
+ VertexIt begin, VertexIt end,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
+ begin, end);
+ PrintWithPairedIndex(file_name, printer, gp, clustered_index, true);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ PrintWithPairedIndex(file_name, gp, gp.g.begin(), gp.g.end(), paired_index,
+ clustered_index);
+}
+
+template<class graph_pack, class VertexIt>
+void PrinGraphPack(const string& file_name, const graph_pack& gp,
+ VertexIt begin, VertexIt end) {
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
+ begin, end);
+ PrintGraphPack(file_name, printer, gp);
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
+ PrintWithPairedIndex(file_name, gp, clustered_index, true);
+}
+
+template<class graph_pack>
+void PrintWithPairedIndices(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+
+ ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
+
+ PrintWithPairedIndices(file_name, printer, gp, paired_indices, clustered_index);
+}
+
+template<class graph_pack>
+void PrintWithClusteredIndices(const string& file_name, const graph_pack& gp,
+ const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ PrintWithPairedIndices(file_name, gp, paired_indices, true);
+}
+
+template<class Graph>
+void ScanBasicGraph(const string& file_name, DataScanner<Graph>& scanner) {
+ scanner.LoadGraph(file_name);
+ scanner.LoadCoverage(file_name);
+}
+
+template<class graph_pack>
+void ScanGraphPack(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp) {
+ ScanBasicGraph(file_name, scanner);
+ gp.index.Attach();
+ if (LoadEdgeIndex(file_name, gp.index.inner_index())) {
+ gp.index.Update();
+ } else {
+ WARN("Cannot load edge index, kmer coverages will be missed");
+ gp.index.Refill();
+ }
+ // scanner.LoadPaired(file_name + "_et", gp.etalon_paired_index);
+ scanner.LoadPositions(file_name, gp.edge_pos);
+ //load kmer_mapper only if needed
+ if (gp.kmer_mapper.IsAttached())
+ if (!LoadKmerMapper(file_name, gp.kmer_mapper)) {
+ WARN("Cannot load kmer_mapper, information on projected kmers will be missed");
+ }
+ if (!scanner.LoadFlankingCoverage(file_name, gp.flanking_cov)) {
+ WARN("Cannot load flanking coverage, flanking coverage will be recovered from index");
+ gp.flanking_cov.Fill(gp.index.inner_index());
+ }
+}
+
+template<class Graph>
+void ScanPairedIndex(const string& file_name, DataScanner<Graph>& scanner,
+ UnclusteredPairedInfoIndexT<Graph>& paired_index,
+ bool force_exists = true) {
+ scanner.LoadPaired(file_name, paired_index, force_exists);
+}
+
+template<class Graph>
+void ScanClusteredIndex(const string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndexT<Graph>& clustered_index,
+ bool force_exists = true) {
+ scanner.LoadPaired(file_name + "_cl", clustered_index, force_exists);
+}
+
+template<class Graph>
+void ScanScaffoldingIndex(const string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndexT<Graph>& clustered_index,
+ bool force_exists = true) {
+ scanner.LoadPaired(file_name + "_scf", clustered_index, force_exists);
+}
+
+template<class Graph>
+void ScanPairedIndices(const std::string& file_name, DataScanner<Graph>& scanner,
+ UnclusteredPairedInfoIndicesT<Graph>& paired_indices,
+ bool force_exists = true) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ ScanPairedIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
+}
+
+template<class Graph>
+void ScanClusteredIndices(const std::string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndicesT<Graph>& paired_indices,
+ bool force_exists = true) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ ScanClusteredIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
+}
+
+template<class Graph>
+void ScanScaffoldingIndices(const std::string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndicesT<Graph>& paired_indices,
+ bool force_exists = true) {
+ for (size_t i = 0; i < paired_indices.size(); ++i)
+ ScanScaffoldingIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
+}
+
+template<class Graph>
+void ScanScaffoldIndices(const string& file_name, DataScanner<Graph>& scanner,
+ PairedInfoIndicesT<Graph>& scaffold_indices) {
+
+ for (size_t i = 0; i < scaffold_indices.size(); ++i) {
+ ScanScaffoldIndex(file_name + "_" + ToString(i), scanner, scaffold_indices[i]);
+ }
+}
+
+template<class graph_pack>
+void ScanWithPairedIndex(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
+ PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ ScanGraphPack(file_name, scanner, gp);
+ if (!clustered_index) {
+ ScanPairedIndex(file_name, scanner, paired_index);
+ } else {
+ ScanClusteredIndex(file_name, scanner, paired_index);
+ }
+}
+
+template<class graph_pack>
+void ScanWithPairedIndices(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+
+ ScanGraphPack(file_name, scanner, gp);
+ if (!clustered_index) {
+ ScanPairedIndices(file_name, scanner, paired_indices);
+ } else {
+ ScanClusteredIndices(file_name, scanner, paired_indices);
+ }
+}
+
+template<class graph_pack>
+void ScanWithPairedIndex(const string& file_name, graph_pack& gp,
+ PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
+ bool clustered_index = false) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanWithPairedIndex(file_name, scanner, gp, paired_index, clustered_index);
+}
+
+template<class graph_pack>
+void ScanWithClusteredIndex(const string& file_name, graph_pack& gp,
+ PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
+ ScanWithPairedIndex(file_name, gp, clustered_index, true);
+}
+
+template<class graph_pack>
+void ScanWithClusteredIndices(const string& file_name,
+ DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ ScanWithPairedIndices(file_name, scanner, gp, paired_indices, true);
+}
+
+template<class graph_pack>
+void ScanWithPairedIndices(const string& file_name, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
+ bool clustered_index = false) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanWithPairedIndices(file_name, scanner, gp, paired_indices, clustered_index);
+}
+
+
+template<class graph_pack>
+void ScanWithClusteredIndices(const string& file_name, graph_pack& gp,
+ PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanGraphPack(file_name, scanner, gp);
+ ScanClusteredIndices(file_name, scanner, paired_indices, false);
+}
+
+template<class Graph>
+void ScanBasicGraph(const string& file_name, Graph& g) {
+ ConjugateDataScanner<Graph> scanner(g);
+ ScanBasicGraph<Graph>(file_name, scanner);
+}
+
+template<class Graph>
+void ScanSingleLongReads(const string& file_name, LongReadContainer<Graph>& single_long_reads) {
+ for (size_t i = 0; i < single_long_reads.size(); ++i){
+ single_long_reads[i].LoadFromFile(MakeSingleReadsFileName(file_name, i), false);
+ }
+}
+
+template<class graph_pack>
+void ScanGraphPack(const string& file_name, graph_pack& gp) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanGraphPack(file_name, scanner, gp);
+}
+
+template<class graph_pack>
+void ScanAll(const std::string& file_name, graph_pack& gp,
+ bool force_exists = true) {
+ ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
+ ScanGraphPack(file_name, scanner, gp);
+ ScanPairedIndices(file_name, scanner, gp.paired_indices, force_exists);
+ ScanClusteredIndices(file_name, scanner, gp.clustered_indices, force_exists);
+ ScanScaffoldingIndices(file_name, scanner, gp.scaffolding_indices, force_exists);
+ ScanSingleLongReads(file_name, gp.single_long_reads);
+ gp.ginfo.Load(file_name + ".ginfo");
+}
+} // namespace graphio
+} // namespace debruijn_graph
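A sketch (not upstream code) of round-tripping a graph pack through the text-based save format defined above with PrintAll/ScanAll. The "saves/graph_pack" prefix is hypothetical; the individual files get suffixes such as .grp, .sqn, .cvr, .prd and .ginfo as written by the printers above.

    #include <string>
    #include "pipeline/graph_pack.hpp"
    #include "pipeline/graphio.hpp"

    void save_then_restore(const debruijn_graph::conj_graph_pack &built,
                           debruijn_graph::conj_graph_pack &fresh) {
        const std::string prefix = "saves/graph_pack";  // hypothetical file prefix
        // Write graph, sequences, coverage, paired indices, long reads and genomic info.
        debruijn_graph::graphio::PrintAll(prefix, built);
        // force_exists = false: silently skip paired-info files that were not written.
        debruijn_graph::graphio::ScanAll(prefix, fresh, /*force_exists=*/false);
    }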
diff --git a/src/common/pipeline/library.cpp b/src/common/pipeline/library.cpp
new file mode 100644
index 0000000..6ed907d
--- /dev/null
+++ b/src/common/pipeline/library.cpp
@@ -0,0 +1,139 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pipeline/library.hpp"
+#include "utils/path_helper.hpp"
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+
+#include <string>
+#include <fstream>
+#include <iostream>
+
+using namespace llvm;
+using namespace io;
+
+namespace llvm { namespace yaml {
+template <>
+struct ScalarEnumerationTraits<LibraryOrientation> {
+ static void enumeration(yaml::IO &io, LibraryOrientation &value) {
+ io.enumCase(value, "fr", LibraryOrientation::FR);
+ io.enumCase(value, "rf", LibraryOrientation::RF);
+ io.enumCase(value, "ff", LibraryOrientation::FF);
+ io.enumCase(value, "rr", LibraryOrientation::RR);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<LibraryType> {
+ static void enumeration(yaml::IO &io, LibraryType &value) {
+ io.enumCase(value, "paired-end", LibraryType::PairedEnd);
+ io.enumCase(value, "mate-pairs", LibraryType::MatePairs);
+ io.enumCase(value, "hq-mate-pairs", LibraryType::HQMatePairs);
+ io.enumCase(value, "pacbio", LibraryType::PacBioReads);
+ io.enumCase(value, "single", LibraryType::SingleReads);
+ io.enumCase(value, "sanger", LibraryType::SangerReads);
+ io.enumCase(value, "nanopore", LibraryType::NanoporeReads);
+ io.enumCase(value, "tslr", LibraryType::TSLReads);
+ io.enumCase(value, "trusted-contigs", LibraryType::TrustedContigs);
+ io.enumCase(value, "untrusted-contigs", LibraryType::UntrustedContigs);
+ io.enumCase(value, "path-extend-contigs", LibraryType::PathExtendContigs);
+ }
+};
+
+template <>
+struct SequenceTraits<std::vector<std::string>> {
+ static size_t size(IO &, std::vector<std::string> &seq) {
+ return seq.size();
+ }
+ static std::string&
+ element(IO &, std::vector<std::string> &seq, size_t index) {
+ if (index >= seq.size())
+ seq.resize(index+1);
+ return seq[index];
+ }
+};
+}}
+
+namespace io {
+template<>
+void SequencingLibrary<io::NoData>::yamlize(llvm::yaml::IO &io) {
+ SequencingLibraryBase::yamlize(io);
+}
+template<>
+void SequencingLibrary<io::NoData>::validate(llvm::yaml::IO &io, llvm::StringRef &res) {
+ SequencingLibraryBase::validate(io, res);
+}
+}
+
+void SequencingLibraryBase::yamlize(llvm::yaml::IO &io) {
+ io.mapRequired("type", type_);
+ io.mapOptional("orientation", orientation_, LibraryOrientation::Undefined);
+ io.mapOptional("left reads", left_paired_reads_);
+ io.mapOptional("right reads", right_paired_reads_);
+ io.mapOptional("single reads", single_reads_);
+}
+
+void SequencingLibraryBase::validate(llvm::yaml::IO &, llvm::StringRef &res) {
+ switch (type_) {
+ case LibraryType::PairedEnd:
+ case LibraryType::MatePairs:
+ case LibraryType::HQMatePairs:
+ if (left_paired_reads_.size() != right_paired_reads_.size()) {
+ res = "Left and right reads lists should have equal length";
+ return;
+ }
+
+ if (orientation_ == LibraryOrientation::Undefined) {
+ res = "Orientation for paired reads should be specified";
+ return;
+ }
+ break;
+ case LibraryType::SingleReads:
+ case LibraryType::PacBioReads:
+ case LibraryType::SangerReads:
+ case LibraryType::NanoporeReads:
+ case LibraryType::TSLReads:
+ case LibraryType::TrustedContigs:
+ case LibraryType::UntrustedContigs:
+ case LibraryType::PathExtendContigs:
+ if (left_paired_reads_.size() || right_paired_reads_.size()) {
+ res = "Paired reads should not be set for this library type";
+ return;
+ }
+ break;
+ default:
+ // Impossible
+ res = "Unsupported library type";
+ return;
+ }
+}
+
+// FIXME: Lambda
+struct update_relative_filename : public std::binary_function<std::string, std::string, std::string> {
+ std::string operator() (const std::string &filename, const std::string &input_dir) const {
+ if (filename[0] == '/')
+ return filename;
+ return input_dir + filename;
+ }
+};
+
+void SequencingLibraryBase::update_relative_reads_filenames(const std::string &input_dir) {
+ std::transform(left_paired_reads_.begin(), left_paired_reads_.end(), left_paired_reads_.begin(),
+ std::bind2nd(update_relative_filename(), input_dir));
+ std::transform(right_paired_reads_.begin(), right_paired_reads_.end(), right_paired_reads_.begin(),
+ std::bind2nd(update_relative_filename(), input_dir));
+ std::transform(single_reads_.begin(), single_reads_.end(), single_reads_.begin(),
+ std::bind2nd(update_relative_filename(), input_dir));
+}
+
+#include "pipeline/library.inl"
+
+// Provide default implementation here (e.g. in case of Data == io::NoData)
+template class io::DataSet<>;
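
The traits above define how one library entry of the dataset description is read: the accepted strings for "type" and "orientation" come from the two ScalarEnumerationTraits specializations, and the read files are mapped through the keys "left reads", "right reads" and "single reads". A minimal usage sketch, not part of the upstream sources (the YAML file name and input directory are invented, and the top-level file layout is handled by DataSet<>::load() in library.inl):

    #include "pipeline/library.hpp"
    #include <iostream>

    int main() {
        io::DataSet<> dataset("dataset.yaml");        // parsed through the YAML traits above
        for (auto &lib : dataset.libraries())         // turn relative read paths into absolute ones
            lib.update_relative_reads_filenames("/data/run1/");
        std::cout << dataset.lib_count() << " libraries loaded" << std::endl;
        return 0;
    }
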
diff --git a/src/common/pipeline/library.hpp b/src/common/pipeline/library.hpp
new file mode 100644
index 0000000..8036d32
--- /dev/null
+++ b/src/common/pipeline/library.hpp
@@ -0,0 +1,367 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef __IO_LIBRARY_HPP__
+#define __IO_LIBRARY_HPP__
+
+#include "common/adt/chained_iterator.hpp"
+#include "common/adt/iterator_range.hpp"
+
+#include <boost/iterator/iterator_facade.hpp>
+
+#include <string>
+#include <vector>
+
+// Forward decls for YAML API
+namespace llvm { namespace yaml { class IO; template<typename T> struct MappingTraits; } }
+namespace llvm { class StringRef; }
+
+namespace io {
+
+enum class LibraryType {
+ SingleReads,
+ PairedEnd,
+ MatePairs,
+ HQMatePairs,
+ PacBioReads,
+ SangerReads,
+ NanoporeReads,
+ TSLReads,
+ TrustedContigs,
+ UntrustedContigs,
+ PathExtendContigs
+};
+
+static std::vector<LibraryType> LibraryPriotity = {
+ LibraryType::SingleReads,
+ LibraryType::SangerReads,
+ LibraryType::PacBioReads,
+ LibraryType::NanoporeReads,
+ LibraryType::PairedEnd,
+ LibraryType::HQMatePairs,
+ LibraryType::MatePairs,
+ LibraryType::TrustedContigs,
+ LibraryType::TSLReads,
+ LibraryType::PathExtendContigs,
+ LibraryType::UntrustedContigs
+};
+
+enum class LibraryOrientation {
+ FR,
+ FF,
+ RF,
+ RR,
+ Undefined
+};
+
+class SequencingLibraryBase {
+public:
+ class paired_reads_iterator :
+ public boost::iterator_facade<paired_reads_iterator,
+ std::pair<std::string, std::string>,
+ boost::forward_traversal_tag,
+ std::pair<std::string, std::string> > {
+
+ typedef std::vector<std::string>::const_iterator inner_iterator;
+
+ public:
+ paired_reads_iterator(inner_iterator left, inner_iterator right)
+ : left_(left), right_(right){}
+
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() { ++left_; ++right_; }
+ bool equal(const paired_reads_iterator &other) const {
+ return this->left_ == other.left_ && this->right_ == other.right_;
+ }
+ std::pair<std::string, std::string> dereference() const {
+ return std::make_pair(*left_, *right_);
+ }
+
+ inner_iterator left_;
+ inner_iterator right_;
+ };
+
+ typedef chained_iterator<std::vector<std::string>::const_iterator> single_reads_iterator;
+
+ SequencingLibraryBase()
+ : type_(LibraryType::PairedEnd), orientation_(LibraryOrientation::FR) {}
+
+ // YAML API. Public because we cannot have a template friend class.
+ void yamlize(llvm::yaml::IO &io);
+ void validate(llvm::yaml::IO &io, llvm::StringRef &res);
+
+ LibraryType type() const { return type_; }
+ void set_type(LibraryType type) { type_ = type; }
+ LibraryOrientation orientation() const { return orientation_; }
+ void set_orientation(LibraryOrientation orientation) { orientation_ = orientation; }
+
+ void clear() {
+ left_paired_reads_.clear();
+ right_paired_reads_.clear();
+ single_reads_.clear();
+ }
+
+ void update_relative_reads_filenames(const std::string &input_dir);
+
+ void push_back_single(const std::string &reads) {
+ single_reads_.push_back(reads);
+ }
+
+ void push_back_paired(const std::string &left, const std::string &right) {
+ left_paired_reads_.push_back(left);
+ right_paired_reads_.push_back(right);
+ }
+
+ paired_reads_iterator paired_begin() const {
+ return paired_reads_iterator(left_paired_reads_.begin(), right_paired_reads_.begin());
+ }
+ paired_reads_iterator paired_end() const {
+ return paired_reads_iterator(left_paired_reads_.end(), right_paired_reads_.end());
+ }
+
+ adt::iterator_range<paired_reads_iterator> paired_reads() const {
+ return adt::make_range(paired_begin(), paired_end());
+ }
+
+ single_reads_iterator reads_begin() const {
+ // NOTE: We have a contract with single_end here. Single reads always go last!
+ single_reads_iterator res(left_paired_reads_.begin(), left_paired_reads_.end());
+ res.join(right_paired_reads_.begin(), right_paired_reads_.end());
+ res.join(single_reads_.begin(), single_reads_.end());
+
+ return res;
+ }
+ single_reads_iterator reads_end() const {
+ // NOTE: Do not forget about the contract with single_begin here!
+ return single_reads_iterator(single_reads_.end(), single_reads_.end());
+ }
+
+ adt::iterator_range<single_reads_iterator> reads() const {
+ return adt::make_range(reads_begin(), reads_end());
+ }
+
+ single_reads_iterator single_begin() const {
+ return single_reads_iterator(single_reads_.begin(), single_reads_.end());
+ }
+ single_reads_iterator single_end() const {
+ // NOTE: Do not forget about the contract with single_begin here!
+ return single_reads_iterator(single_reads_.end(), single_reads_.end());
+ }
+
+ adt::iterator_range<single_reads_iterator> single_reads() const {
+ return adt::make_range(single_begin(), single_end());
+ }
+
+ bool is_graph_contructable() const {
+ return type_ == io::LibraryType::PairedEnd ||
+ type_ == io::LibraryType::SingleReads ||
+ type_ == io::LibraryType::HQMatePairs;
+ }
+
+ bool is_bwa_alignable() const {
+ return type_ == io::LibraryType::MatePairs;
+ }
+
+ bool is_mismatch_correctable() const {
+ return is_graph_contructable();
+ }
+
+// bool is_binary_covertable() {
+// return is_graph_contructable() || is_mismatch_correctable() || is_paired();
+// }
+
+ bool is_paired() const {
+ return type_ == io::LibraryType::PairedEnd ||
+ type_ == io::LibraryType::MatePairs ||
+ type_ == io::LibraryType::HQMatePairs;
+ }
+
+ bool is_mate_pair() const {
+ return type_ == io::LibraryType::MatePairs ||
+ type_ == io::LibraryType::HQMatePairs;
+ }
+
+ static bool is_contig_lib(LibraryType type) {
+ return type == io::LibraryType::TrustedContigs ||
+ type == io::LibraryType::UntrustedContigs ||
+ type == io::LibraryType::PathExtendContigs;
+ }
+
+ static bool is_long_read_lib(LibraryType type) {
+ return type == io::LibraryType::PacBioReads ||
+ type == io::LibraryType::SangerReads ||
+ type == io::LibraryType::NanoporeReads ||
+ type == io::LibraryType::TSLReads;
+ }
+
+ bool is_contig_lib() const {
+ return is_contig_lib(type_);
+ }
+
+ bool is_long_read_lib() const {
+ return is_long_read_lib(type_);
+ }
+
+ bool is_repeat_resolvable() const {
+ return is_paired() ||
+ is_long_read_lib() ||
+ is_contig_lib();
+ }
+
+ //hybrid libraries are used to close gaps in the graph during their alignment
+ bool is_hybrid_lib() const {
+ return is_long_read_lib() ||
+ //comment next line to switch alignment method for trusted contigs
+ type_ == io::LibraryType::TrustedContigs ||
+ type_ == io::LibraryType::UntrustedContigs;
+ }
+
+private:
+ LibraryType type_;
+ LibraryOrientation orientation_;
+
+ std::vector<std::string> left_paired_reads_;
+ std::vector<std::string> right_paired_reads_;
+ std::vector<std::string> single_reads_;
+};
+
+struct NoData {};
+
+template<class Data = NoData>
+class SequencingLibrary: public SequencingLibraryBase {
+public:
+ const Data& data() const {
+ return data_;
+ }
+ Data& data() {
+ return data_;
+ }
+
+ void yamlize(llvm::yaml::IO &io);
+ void validate(llvm::yaml::IO &io, llvm::StringRef &res);
+
+private:
+ Data data_;
+};
+
+// Just a convenient wrapper to "unwrap" the iterators over libraries.
+template<class Data = NoData>
+class DataSet {
+public:
+ typedef SequencingLibrary<Data> Library;
+ typedef std::vector<Library> LibraryStorage;
+
+public:
+ typedef typename LibraryStorage::iterator iterator;
+ typedef typename LibraryStorage::const_iterator const_iterator;
+ typedef chained_iterator<typename Library::single_reads_iterator> single_reads_iterator;
+ typedef chained_iterator<typename Library::paired_reads_iterator> paired_reads_iterator;
+
+ DataSet() {}
+ explicit DataSet(const std::string &path) { load(path); }
+
+ void load(const std::string &filename);
+ void save(const std::string &filename);
+
+ void clear() { libraries_.clear(); }
+ void push_back(const Library &lib) {
+ libraries_.push_back(lib);
+ }
+ Library& operator[](size_t n) { return libraries_[n]; }
+ const Library& operator[](size_t n) const { return libraries_[n]; }
+ size_t lib_count() const { return libraries_.size(); }
+
+ iterator library_begin() { return libraries_.begin(); }
+ const_iterator library_begin() const { return libraries_.begin(); }
+ iterator begin() { return libraries_.begin(); }
+ const_iterator begin() const { return libraries_.begin(); }
+
+ iterator library_end() { return libraries_.end(); }
+ const_iterator library_end() const { return libraries_.end(); }
+ iterator end() { return libraries_.end(); }
+ const_iterator end() const { return libraries_.end(); }
+
+ adt::iterator_range<iterator> libraries() {
+ return adt::make_range(library_begin(), library_end());
+ }
+ adt::iterator_range<const_iterator> libraries() const {
+ return adt::make_range(library_begin(), library_end());
+ }
+
+ single_reads_iterator reads_begin() const {
+ auto it = libraries_.begin();
+ single_reads_iterator res(it->reads_begin(), it->reads_end());
+ ++it;
+ for (auto end = libraries_.end(); it != end; ++it)
+ res.join(it->reads_begin(), it->reads_end());
+
+ return res;
+ }
+ single_reads_iterator reads_end() const {
+ return single_reads_iterator(libraries_.back().reads_end(), libraries_.back().reads_end());
+ }
+ adt::iterator_range<single_reads_iterator> reads() const {
+ return adt::make_range(reads_begin(), reads_end());
+ }
+
+ single_reads_iterator single_begin() const {
+ auto it = libraries_.begin();
+ single_reads_iterator res(it->single_begin(), it->single_end());
+ ++it;
+ for (auto end = libraries_.end(); it != end; ++it)
+ res.join(it->single_begin(), it->single_end());
+
+ return res;
+ }
+ single_reads_iterator single_end() const {
+ return single_reads_iterator(libraries_.back().single_end(), libraries_.back().single_end());
+ }
+ adt::iterator_range<single_reads_iterator> single_reads() const {
+ return adt::make_range(single_begin(), single_end());
+ }
+
+ paired_reads_iterator paired_begin() const {
+ auto it = libraries_.begin();
+ paired_reads_iterator res(it->paired_begin(), it->paired_end());
+ ++it;
+ for (auto end = libraries_.end(); it != end; ++it)
+ res.join(it->paired_begin(), it->paired_end());
+
+ return res;
+ }
+ paired_reads_iterator paired_end() const {
+ return paired_reads_iterator(libraries_.back().paired_end(), libraries_.back().paired_end());
+ }
+
+ adt::iterator_range<paired_reads_iterator> paired_reads() const {
+ return adt::make_range(paired_begin(), paired_end());
+ }
+
+private:
+ LibraryStorage libraries_;
+};
+
+}
+
+namespace llvm { namespace yaml {
+template <>
+struct MappingTraits<io::SequencingLibraryBase> {
+ static void mapping(llvm::yaml::IO &io, io::SequencingLibraryBase &lib);
+ static StringRef validate(llvm::yaml::IO &io, io::SequencingLibraryBase &lib);
+};
+
+template <class Data>
+struct MappingTraits<io::SequencingLibrary<Data> > {
+ static void mapping(llvm::yaml::IO &io, io::SequencingLibrary<Data> &lib);
+ static StringRef validate(llvm::yaml::IO &io, io::SequencingLibrary<Data> &lib);
+};
+
+}}
+
+#endif // __IO_LIBRARY_HPP__
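
The iterator accessors are how the rest of the pipeline consumes a library: paired_begin()/paired_end() walk the left and right file lists in lock-step, while reads_begin() chains left, right and single files, honouring the "single reads go last" contract noted in the comments. A short illustrative sketch with invented file names (assumes the SPAdes include paths):

    #include "pipeline/library.hpp"
    #include <iostream>

    int main() {
        io::SequencingLibrary<> lib;
        lib.set_type(io::LibraryType::PairedEnd);
        lib.set_orientation(io::LibraryOrientation::FR);
        lib.push_back_paired("lib1_R1.fastq", "lib1_R2.fastq");
        lib.push_back_single("lib1_unpaired.fastq");

        for (const auto &pr : lib.paired_reads())     // (left, right) file pairs
            std::cout << pr.first << " + " << pr.second << std::endl;
        for (const auto &f : lib.reads())             // left files, then right, then single
            std::cout << f << std::endl;
        return 0;
    }
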
diff --git a/src/modules/pipeline/library.inl b/src/common/pipeline/library.inl
similarity index 100%
rename from src/modules/pipeline/library.inl
rename to src/common/pipeline/library.inl
diff --git a/src/common/pipeline/stage.cpp b/src/common/pipeline/stage.cpp
new file mode 100644
index 0000000..3119b0a
--- /dev/null
+++ b/src/common/pipeline/stage.cpp
@@ -0,0 +1,133 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "pipeline/stage.hpp"
+#include "pipeline/graphio.hpp"
+
+#include "utils/logger/log_writers.hpp"
+
+#include <algorithm>
+#include <cstring>
+
+namespace spades {
+
+void AssemblyStage::load(debruijn_graph::conj_graph_pack& gp,
+ const std::string &load_from,
+ const char* prefix) {
+ std::string p = path::append_path(load_from, prefix == NULL ? id_ : prefix);
+ INFO("Loading current state from " << p);
+
+ debruijn_graph::graphio::ScanAll(p, gp, false);
+ debruijn_graph::config::load_lib_data(p);
+}
+
+void AssemblyStage::save(const debruijn_graph::conj_graph_pack& gp,
+ const std::string &save_to,
+ const char* prefix) const {
+ std::string p = path::append_path(save_to, prefix == NULL ? id_ : prefix);
+ INFO("Saving current state to " << p);
+
+ debruijn_graph::graphio::PrintAll(p, gp);
+ debruijn_graph::config::write_lib_data(p);
+}
+
+class StageIdComparator {
+ public:
+ StageIdComparator(const char* id)
+ : id_(id) {
+ const char* pos = strstr(id, ":");
+ len_ = (pos != NULL ? pos - id : strlen(id));
+ }
+
+ bool operator()(const std::unique_ptr<AssemblyStage> &stage) const {
+ const char* sid = stage->id();
+ return (0 == strncmp(id_, sid, len_) && sid[len_] == 0);
+ }
+
+ private:
+ const char* id_;
+ size_t len_;
+};
+
+class PhaseIdComparator {
+ public:
+ PhaseIdComparator(const char* id) {
+ const char* pos = strstr(id, ":");
+ VERIFY(pos != NULL);
+ id_ = pos + 1;
+ }
+
+ bool operator()(const std::unique_ptr<CompositeStageBase::PhaseBase> &phase) const {
+ return 0 == strcmp(id_, phase->id());
+ }
+
+ private:
+ const char* id_;
+};
+
+void CompositeStageBase::run(debruijn_graph::conj_graph_pack& gp,
+ const char* started_from) {
+ VERIFY(parent_);
+ auto start_phase = phases_.begin();
+ if (started_from &&
+ strstr(started_from, ":") &&
+ started_from == strstr(started_from, id())) {
+ start_phase = std::find_if(phases_.begin(), phases_.end(), PhaseIdComparator(started_from));
+ if (start_phase == phases_.end()) {
+ ERROR("Invalid start stage / phase combination specified: " << started_from);
+ exit(-1);
+ }
+ if (start_phase != phases_.begin()) {
+ PhaseBase * prev_phase = std::prev(start_phase)->get();
+ std::string composite_id(id());
+ composite_id += ":";
+ composite_id += prev_phase->id();
+ prev_phase->load(gp, parent_->saves_policy().load_from_, composite_id.c_str());
+ }
+ }
+
+ for (auto et = phases_.end(); start_phase != et; ++start_phase) {
+ PhaseBase *phase = start_phase->get();
+
+ INFO("PROCEDURE == " << phase->name());
+ phase->run(gp, started_from);
+
+ if (parent_->saves_policy().make_saves_) {
+ std::string composite_id(id());
+ composite_id += ":";
+ composite_id += phase->id();
+
+ phase->save(gp, parent_->saves_policy().save_to_, composite_id.c_str());
+ }
+
+ }
+}
+
+void StageManager::run(debruijn_graph::conj_graph_pack& g,
+ const char* start_from) {
+ auto start_stage = stages_.begin();
+ if (start_from) {
+ start_stage = std::find_if(stages_.begin(), stages_.end(), StageIdComparator(start_from));
+ if (start_stage == stages_.end()) {
+ ERROR("Invalid start stage specified: " << start_from);
+ exit(-1);
+ }
+ if (start_stage != stages_.begin())
+ (*std::prev(start_stage))->load(g, saves_policy_.load_from_);
+ }
+
+ for (; start_stage != stages_.end(); ++start_stage) {
+ AssemblyStage *stage = start_stage->get();
+
+ INFO("STAGE == " << stage->name());
+ stage->run(g, start_from);
+ if (saves_policy_.make_saves_)
+ stage->save(g, saves_policy_.save_to_);
+ }
+}
+
+}
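
The two comparators implement the "<stage>[:<phase>]" addressing used when a run is resumed: the part before the colon selects an AssemblyStage by id, the part after it selects a phase inside a composite stage, and CompositeStageBase::run() writes intermediate saves under the combined "stage:phase" prefix that a later load() expects. A minimal sketch of that string convention (the stage and phase ids below are invented for illustration):

    #include <cassert>
    #include <cstring>
    #include <string>

    int main() {
        const char *start_from = "my_stage:my_phase";           // hypothetical ids
        const char *colon = std::strstr(start_from, ":");
        std::string stage_id(start_from, colon - start_from);   // what StageIdComparator matches
        std::string phase_id(colon + 1);                        // what PhaseIdComparator matches
        assert(stage_id == "my_stage" && phase_id == "my_phase");
        return 0;
    }
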
diff --git a/src/modules/pipeline/stage.hpp b/src/common/pipeline/stage.hpp
similarity index 100%
rename from src/modules/pipeline/stage.hpp
rename to src/common/pipeline/stage.hpp
diff --git a/src/common/sequence/genome_storage.hpp b/src/common/sequence/genome_storage.hpp
new file mode 100644
index 0000000..d790386
--- /dev/null
+++ b/src/common/sequence/genome_storage.hpp
@@ -0,0 +1,55 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <sstream>
+#include "sequence.hpp"
+#include "nucl.hpp"
+
+class GenomeStorage {
+ std::string s_;
+public:
+ GenomeStorage() {
+ }
+
+ GenomeStorage(const std::string &s): s_(s) {
+ }
+
+ //TODO exterminate this where possible
+ Sequence GetSequence() const {
+ std::stringstream ss;
+ size_t l = 0, r = 0;
+ for(size_t i = 0; i < s_.size(); i++) {
+ if (!is_nucl(s_[i]) ) {
+ if (r > l) {
+ ss << s_.substr(l, r - l);
+ }
+ r = i + 1;
+ l = i + 1;
+ } else {
+ r++;
+ }
+ }
+ if (r > l) {
+ ss << s_.substr(l, r - l);
+ }
+ return Sequence(ss.str());
+ }
+
+ void SetSequence(const Sequence &s) {
+ s_ = s.str();
+ }
+
+ std::string str() const {
+ return s_;
+ }
+
+ size_t size() const {
+ return s_.size();
+ }
+};
+
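
GetSequence() does a bit more than a plain conversion: it drops every character that is not A, C, G or T and concatenates the remaining stretches before building the Sequence. A hedged sanity sketch (the include path is an assumption based on the directory layout in this diff):

    #include "sequence/genome_storage.hpp"
    #include <cassert>

    int main() {
        GenomeStorage gs("ACGTNNNACGT");
        assert(gs.size() == 11);                        // raw length, 'N's included
        assert(gs.GetSequence().str() == "ACGTACGT");   // non-ACGT symbols are stripped
        return 0;
    }
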
diff --git a/src/common/sequence/nucl.hpp b/src/common/sequence/nucl.hpp
new file mode 100755
index 0000000..3170593
--- /dev/null
+++ b/src/common/sequence/nucl.hpp
@@ -0,0 +1,123 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file nucl.hpp
+ * @author vyahhi
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * Simple operations and checks for nucleotide-letters
+ *
+ */
+
+
+#ifndef NUCL_HPP_
+#define NUCL_HPP_
+
+#include "utils/verify.hpp"
+#include <iostream>
+
+const char dignucl_map['T' + 1] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3};
+
+const bool isnucl_map[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+const char nucl_map[4] = {'A', 'C', 'G', 'T'};
+
+const char nucl_complement_map['T' + 1] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 'T', 0, 'G', 0, 0, 0, 'C', 0, 0, 0, 0, 0, 0, 'N', 0, 0, 0, 0, 0, 'A'};
+
+/**
+ * ACGT -> true
+ * @param char c
+ * @return true if c is 'A', 'C', 'G' or 'T'.
+ */
+inline bool is_nucl(char c) { // is ACGT
+ return isnucl_map[(unsigned)c];
+}
+
+/**
+ * 0123 -> true
+ * @param char c
+ * @return true if c is 0, 1, 2 or 3.
+ */
+inline bool is_dignucl(char c) { // is 0123
+ return (c < 4);
+}
+
+/**
+ * 0123 -> 3210
+ * @param char c
+ * @return c ^ 3
+ */
+inline char complement(char c) {
+ // VERIFY(is_dignucl(c));
+ return c ^ 3;
+}
+
+/**
+ * ACGT -> TGCA
+ * @param char c is 'A', 'C', 'G', 'T' or 'N'
+ * @return complement symbol, i.e. 'A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A', 'N' => 'N'
+ */
+
+struct nucl_complement_functor { // still unused
+ inline char operator() (char c) const {
+ char cc = nucl_complement_map[(unsigned)c];
+ return cc ? cc : 'N';
+ }
+};
+
+inline char nucl_complement(char c){
+ // TODO: deal with 'N' case
+ //VERIFY(is_nucl(c));
+ char cc = nucl_complement_map[(unsigned)c];
+ return cc ? cc : 'N';
+}
+
+/**
+ * 0123 -> ACGT
+ * @param char c is 0, 1, 2 or 3
+ * @return 0 => 'A', 1 => 'C', 2 => 'G', 3 => 'T'
+ */
+inline char nucl(char c) {
+ return nucl_map[(unsigned)c];
+}
+
+/**
+ * ACGT -> 0123
+ * @param char c is 'A', 'C', 'G' or 'T'
+ * @return A => 0, C => 1, G => 2, T => 3
+ */
+
+/*
+struct dignucl : public unary_function<int,bool> {
+ bool operator()(signed char c) const {
+ return dignucl_map[c];
+ }
+};*/
+
+inline char dignucl(char c) {
+ // VERIFY(is_nucl(c));
+ return dignucl_map[(unsigned)c];
+}
+
+
+#endif /* NUCL_HPP_ */
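
The lookup tables give O(1) conversions between the letter and 0123 representations. A small sanity sketch of the helpers (assumes the SPAdes include directories; the header location follows this diff's layout):

    #include "sequence/nucl.hpp"
    #include <cassert>

    int main() {
        assert(is_nucl('G') && !is_nucl('N'));
        assert(dignucl('C') == 1 && nucl(1) == 'C');    // ACGT <-> 0123 round trip
        assert(complement(0) == 3);                     // digit complement: A(0) <-> T(3)
        assert(nucl_complement('G') == 'C');            // letter complement
        assert(nucl_complement('N') == 'N');            // 'N' is preserved
        return 0;
    }
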
diff --git a/src/modules/data_structures/sequence/quality.hpp b/src/common/sequence/quality.hpp
similarity index 100%
rename from src/modules/data_structures/sequence/quality.hpp
rename to src/common/sequence/quality.hpp
diff --git a/src/common/sequence/rtseq.hpp b/src/common/sequence/rtseq.hpp
new file mode 100644
index 0000000..5bc27e7
--- /dev/null
+++ b/src/common/sequence/rtseq.hpp
@@ -0,0 +1,751 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * rtseq.hpp
+ *
+ * Created on: Jun 28, 2012
+ * Author: andrey
+ */
+
+#ifndef RTSEQ_HPP_
+#define RTSEQ_HPP_
+
+#include <string>
+#include "utils/verify.hpp"
+#include <array>
+#include <algorithm>
+#include "nucl.hpp"
+#include "utils/log.hpp"
+#include "seq_common.hpp"
+#include "seq.hpp"
+#include "simple_seq.hpp"
+
+#include <cstring>
+#include <iostream>
+
+template<size_t max_size_, typename T = seq_element_type>
+class RuntimeSeq {
+public:
+ /**
+ * @variable Number of bits in type T (e.g. 8 for char)
+ * @example 8 for char, 16 for a 16-bit T
+ */
+ const static size_t TBits = sizeof(T) << 3;
+
+ /**
+ * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
+ * TNucl MUST be a power of two
+ * @example 4: 8/2 = 4 or 16/2 = 8
+ */
+ const static size_t TNucl = TBits >> 1;
+
+ /**
+ * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
+ */
+ const static size_t TNuclBits = log_<TNucl, 2>::value;
+
+ const static size_t Iterations = log_<TBits, 2>::value;
+
+ static const std::array<T, Iterations> ConstructLeftMasks() {
+ std::array<T, Iterations> result;
+ for (size_t i = 0; i < Iterations; i++) {
+ size_t shift = 1 << i;
+ T mask = T(T(1) << shift) - T(1);
+ result[i] = T(mask << shift);
+ for (size_t j = 0; j < i; j++) {
+ result[j] += T(result[j] << shift);
+ }
+ }
+ return result;
+ }
+
+ static const std::array<T, Iterations> ConstructRightMasks() {
+ std::array<T, Iterations> result(ConstructLeftMasks());
+ for (size_t i = 0; i < Iterations; i++) {
+ result[i] = T(~result[i]);
+ }
+ return result;
+ }
+
+
+ RuntimeSeq<max_size_, T> FastRC() const {
+ const static std::array<T, Iterations> LeftMasks(ConstructLeftMasks());
+ const static std::array<T, Iterations> RightMasks(ConstructRightMasks());
+ const static size_t LogTSize = log_<sizeof(T), 2>::value + 3;
+
+ RuntimeSeq<max_size_, T> res(this->size());
+
+ const size_t bit_size = size_ << 1;
+ const size_t extra = bit_size & ((1 << LogTSize) - 1);
+ const size_t to_extra = TBits - extra;
+ const size_t filled = bit_size >> LogTSize;
+ size_t real_length = filled;
+ if (extra == 0) {
+ for (size_t i = 0, j = filled - 1; i < filled; i++, j--) {
+ res.data_[i] = data_[j];
+ }
+ } else {
+ for (size_t i = 0, j = filled; i < filled && j > 0; i++, j--) {
+ res.data_[i] = (data_[j] << to_extra) + (data_[j - 1] >> extra);
+ }
+ res.data_[filled] = (data_[0] << to_extra);
+ real_length++;
+ }
+
+ for (size_t i = 0; i < real_length; i++) {
+ res.data_[i] = res.data_[i] ^ T(-1);
+ for (size_t it = 1; it < Iterations; it++) {
+ size_t shift = 1 << it;
+ res.data_[i] = T((res.data_[i] & LeftMasks[it]) >> shift) ^ T((res.data_[i] & RightMasks[it]) << shift);
+ }
+ }
+
+ if (extra != 0) {
+ res.data_[real_length - 1] = (res.data_[real_length - 1] & ((T(1) << extra) - 1));
+ }
+ return res;
+ }
+
+ /**
+ * @variable Number of Ts required to store the whole sequence.
+ */
+ const static size_t DataSize = (max_size_ + TNucl - 1) >> TNuclBits;
+
+ /**
+ * @variable Number of meaningful bytes in which the seq is stored
+ */
+ const static size_t TotalBytes = sizeof(T) * DataSize;
+
+ typedef T DataType;
+
+ static size_t GetDataSize(size_t size) {
+ return (size + TNucl - 1) >> TNuclBits;
+ }
+
+private:
+ /* *
+ * @variable Just some prime number used when computing the hash function of the kmer
+ * */
+ const static size_t PrimeNum = 239;
+
+
+ // number of nucleotides in the last data_ bucket
+ static size_t NuclsRemain(size_t size) {
+ return size & (TNucl - 1);
+ }
+
+ // useful mask to fill the last element of the data_ array
+ static size_t MaskForLastBucket(size_t size) {
+ size_t nr = NuclsRemain(size);
+ return nr != 0 ? (((T) 1) << (nr << 1)) - 1 : -1ul;
+ }
+
+
+ /**
+ * @variable Inner representation of sequence: array of Ts with length = DataSize.
+ *
+ * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
+ */
+ std::array<T, DataSize> data_;
+
+ size_t size_;
+
+ /**
+ * Initialize data_ array of this object with C-string
+ *
+ * @param s C-string (ACGT chars only), strlen(s) = size_
+ */
+ void init(const char *s) {
+ T data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+ for (size_t pos = 0; pos < size_; ++pos, ++s) { // unsafe!
+ // VERIFY(is_nucl(*s)); // for performance
+ data = data | ((T) dignucl(*s) << cnt);
+ cnt += 2;
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+
+ for (; cur < DataSize; ++cur)
+ this->data_[cur] = 0;
+
+ VERIFY(*s == 0); // C-string always ends on 0
+ }
+
+ /**
+ * Sets i-th symbol of Seq with 0123-char
+ */
+ inline void set(const size_t i, char c) {
+ data_[i >> TNuclBits] =
+ (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
+ }
+
+ // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_class<S>::value, T>::type * = 0) {
+ return t.size();
+ }
+
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_same<S, const char *>::value, T>::type * = 0) {
+ return strlen(t);
+ }
+
+
+public:
+
+ const static size_t max_size = max_size_;
+
+ RuntimeSeq() : size_(0) {
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ /**
+ * Default constructor, fills Seq with A's
+ */
+
+ explicit RuntimeSeq(size_t k) : size_(k) {
+ VERIFY(k <= max_size_);
+ //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ RuntimeSeq(size_t k, const char *s) : size_(k) {
+ VERIFY(k <= max_size_);
+ //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
+ init(s);
+ }
+
+
+ explicit RuntimeSeq(size_t k, const T *data_array) : size_(k) {
+ VERIFY(k <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+
+ size_t data_size = GetDataSize(size_);
+ memcpy(data_.data(), data_array, data_size * sizeof(T));
+
+ if (NuclsRemain(size_)) {
+ data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
+ }
+ }
+
+ explicit RuntimeSeq(size_t k, T *data_array) : size_(k) {
+ VERIFY(k <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+
+ size_t data_size = GetDataSize(size_);
+ memcpy(data_.data(), data_array, data_size * sizeof(T));
+
+ if (NuclsRemain(size_)) {
+ data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
+ }
+ }
+
+ template<size_t size2_, typename T2 = T>
+ explicit RuntimeSeq(const Seq<size2_, T2> &seq, bool) : size_(size2_) {
+ VERIFY(size_ <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+ seq.copy_data(data_.data());
+ }
+
+ template<size_t size2_, typename T2 = T>
+ explicit RuntimeSeq(const SimpleSeq<size2_, T2> &seq, size_t k) : size_(k) {
+ VERIFY(size_ <= max_size_);
+ VERIFY(size2_ <= max_size_);
+ std::fill(data_.begin(), data_.end(), 0);
+ seq.copy_data(data_.data());
+ }
+
+
+ /**
+ * Ultimate constructor from ACGT0123-string.
+ *
+ * @param k Number of nucleotides to fetch from s
+ * @param s Any object with operator[], which returns 0123 or ACGT chars
+ * @param offset Offset at which this sequence starts within s
+ * @warning assumes that s is a correct string filled with ACGT _OR_ 0123;
+ * there is no init method, filling happens right here
+ */
+ template<typename S>
+ explicit RuntimeSeq(size_t k, const S &s, size_t offset = 0) : size_(k) {
+ VERIFY(size_ <= max_size_);
+ //TRACE("New Constructor for seq " << s[0] << " is first symbol");
+ VERIFY(size_ == 0 || is_dignucl(s[0]) || is_nucl(s[0]));
+ VERIFY(offset + size_ <= this->size(s));
+
+ // which symbols does our string contain : 0123 or ACGT?
+ bool digit_str = size_ == 0 || is_dignucl(s[0]);
+
+ // we fill everything with zeros (As) by default.
+ std::fill(data_.begin(), data_.end(), 0);
+
+ // data -- one temporary variable corresponding to the i-th array element
+ // and some counters
+ T data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+
+ for (size_t i = 0; i < size_; ++i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+
+ // we fill everything with zeros (As) by default.
+ char c = (char) (digit_str ? s[offset + i] : dignucl(s[offset + i]));
+
+ data = data | (T(c) << cnt);
+ cnt += 2;
+
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+
+ for (; cur < DataSize; ++cur)
+ this->data_[cur] = 0;
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinRead(std::istream &file) {
+ file.read((char *) data_.data(), sizeof(T) * GetDataSize(size_));
+ return !file.fail();
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinWrite(std::ostream &file) const {
+ file.write((const char *) data_.data(), sizeof(T) * GetDataSize(size_));
+ return !file.fail();
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinRead(std::istream &file, RuntimeSeq<max_size_, T> *seq) {
+ return seq->BinRead(file);
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinWrite(std::ostream &file, const RuntimeSeq<max_size_, T> &seq) {
+ return seq.BinWrite(file);
+ }
+
+
+ /**
+ * Get i-th symbol of Seq.
+ *
+ * @param i Index of the symbol (0 <= i < size_)
+ * @return 0123-char on position i
+ */
+ char operator[](const size_t i) const {
+ VERIFY(i < size_);
+ return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
+ }
+
+ /**
+ * Reverse complement.
+ *
+ * @return Reverse complement Seq.
+ */
+ RuntimeSeq<max_size_, T> operator!() const {
+// RuntimeSeq<max_size_, T> res(*this);
+// for (size_t i = 0; i < (size_ >> 1); ++i) {
+// auto front = complement(res[i]);
+// auto end = complement(res[size_ - 1 - i]);
+// res.set(i, end);
+// res.set(size_ - 1 - i, front);
+// }
+// if ((size_ & 1) == 1) {
+// res.set(size_ >> 1, complement(res[size_ >> 1]));
+// }
+ return FastRC();
+// return res;
+ }
+
+ /**
+ * Is the kmer minimal among this and !this.
+ *
+ * @return True if kmer < !kmer and false otherwise.
+ */
+ bool IsMinimal() const {
+ for (size_t i = 0; (i << 1) + 1 <= size_; ++i) {
+ auto front = this->operator[](i);
+ auto end = complement(this->operator[](size_ - 1 - i));
+ if (front != end)
+ return front < end;
+ }
+ return true;
+ }
+
+ /**
+ * Shift left
+ *
+ * @param c New 0123 char which should be added to the right.
+ * @return Shifted (to the left) sequence with 'c' char on the right.
+ */
+ RuntimeSeq<max_size_, T> operator<<(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ RuntimeSeq<max_size_, T> res(*this);
+ std::array<T, DataSize> &data = res.data_;
+
+ size_t data_size = GetDataSize(size_);
+
+ if (data_size != 0) { // unless empty sequence
+ T rm = data[data_size - 1] & 3;
+ T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
+ data[data_size - 1] = (data[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
+
+ if (data_size >= 2) { // if we have at least 2 elements in data
+ for (int i = (int) data_size - 2; i >= 0; --i) {
+ T new_rm = data[i] & 3;
+ data[i] = (data[i] >> 2) |
+ (rm << (TBits - 2)); // we need & here because if we shift a negative value, it fills with ones :(
+ rm = new_rm;
+ }
+ }
+ }
+ return res;
+ }
+
+ void operator<<=(char c) {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ size_t data_size = GetDataSize(size_);
+
+ if (data_size == 0) {
+ return;
+ }
+
+ for (size_t i = 0; i < data_size - 1; ++i) {
+ data_[i] = (data_[i] >> 2) | (((T) data_[i + 1] & 3) << (TBits - 2));
+ }
+
+ T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
+ data_[data_size - 1] = (data_[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
+ }
+
+//todo naming convention violation!
+ RuntimeSeq<max_size_, T> pushBack(char c) const {
+ //VERIFY(size_ + 1 <= max_size_);
+
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ //VERIFY(is_dignucl(c));
+ RuntimeSeq<max_size_, T> s(size_ + 1);
+ copy(this->data_.begin(), this->data_.end(), s.data_.begin());
+
+ size_t data_size = GetDataSize(size_ + 1);
+
+ s.data_[data_size - 1] |= ((T) c << ((size_ & (TNucl - 1)) << 1));
+
+ return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
+ }
+
+
+//todo naming convention violation!
+ void pushBackThis(char c) {
+ VERIFY(size_ + 1 <= max_size_);
+
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ size_ += 1;
+ size_t data_size = GetDataSize(size_);
+
+ data_[data_size - 1] |= ((T) c << (((size_ - 1) & (TNucl - 1)) << 1));
+ }
+
+ // /**
+ // * @todo optimize!!!
+ // */
+ // RuntimeSeq<max_size_, T> pushFront(char c) const {
+ // VERIFY(size_ + 1 < max_size_);
+ // if (is_nucl(c)) {
+ // c = dignucl(c);
+ // }
+ // VERIFY(is_dignucl(c));
+ // return RuntimeSeq<max_size_, T> (size_ + 1, nucl(c) + str());
+ // }
+
+ //todo naming convention violation!
+ RuntimeSeq<max_size_, T> pushFront(char c) const {
+ VERIFY(size_ + 1 <= max_size_);
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+ RuntimeSeq<max_size_, T> res(size_ + 1);
+
+ size_t data_size = GetDataSize(size_ + 1);
+
+ T rm = c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ return res;
+ }
+
+//todo naming convention violation!
+ void pushFrontThis(char c) {
+ VERIFY(size_ + 1 <= max_size_);
+
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+
+ size_ += 1;
+ size_t data_size = GetDataSize(size_);
+
+ T rm = c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+ }
+
+ /**
+ * Shift right
+ *
+ * @param c New 0123 char which should be added to the left.
+ * @return Shifted (to the right) sequence with 'c' char on the left.
+ */
+ RuntimeSeq<max_size_, T> operator>>(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+
+ RuntimeSeq<max_size_, T> res(*this);
+ size_t data_size = GetDataSize(size_);
+
+ T rm = c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (res.data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ res.data_[data_size - 1] &= MaskForLastBucket(size_);
+
+ return res;
+ }
+
+ //todo remove code duplication!
+ void operator>>=(char c) {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+
+ size_t data_size = GetDataSize(size_);
+
+ T rm = (T) c;
+ for (size_t i = 0; i < data_size; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ data_[data_size - 1] &= MaskForLastBucket(size_);
+ }
+
+ bool operator==(const RuntimeSeq<max_size_, T> &s) const {
+ VERIFY(size_ == s.size_);
+
+ size_t data_size = GetDataSize(size_);
+ for (size_t i = 0; i < data_size; ++i)
+ if (data_[i] != s.data_[i])
+ return false;
+
+ return true;
+ }
+
+ /**
+ * @see operator ==()
+ */
+ bool operator!=(const RuntimeSeq<max_size_, T> &s) const {
+ return !operator==(s);
+ }
+
+ /**
+ * String representation of this Seq
+ *
+ * @return ACGT-string of length size_
+ * @see nucl()
+ */
+ std::string str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i < size_; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ std::string err() const {
+ return "";
+ }
+
+
+ std::string full_str() const {
+ std::string res(max_size, '-');
+ for (size_t i = 0; i < max_size; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ size_t size() const {
+ return size_;
+ }
+
+ size_t data_size() const {
+ return GetDataSize(size_);
+ }
+
+ const T *data() const {
+ return data_.data();
+ }
+
+ template<size_t size2_, typename T2 = T>
+ Seq<size2_, T2> get_seq() const {
+ VERIFY(size2_ == size_);
+ return Seq<size2_, T2>((T2 *) data_.data());
+ }
+
+ template<size_t size2_, typename T2 = T>
+ SimpleSeq<size2_, T2> get_sseq() const {
+ VERIFY(size2_ <= max_size_);
+ return SimpleSeq<size2_, T2>((T2 *) data_.data());
+ }
+
+ void copy_data(void *dst) const {
+ memcpy(dst, (const void *) data_.data(), GetDataSize(size_) * sizeof(T));
+ }
+
+ char last() const {
+ return operator[](size_ - 1);
+ }
+
+ char first() const {
+ return operator[](0);
+ }
+
+ static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
+ return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+ }
+
+ size_t GetHash(unsigned seed = 0) const {
+ return GetHash(data_.data(), GetDataSize(size_), seed);
+ }
+
+ struct hash {
+ size_t operator()(const RuntimeSeq<max_size_, T> &seq, uint32_t seed = 0) const {
+ return seq.GetHash(seed);
+ }
+
+ size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
+ return GetHash(data, sz, seed);
+ }
+ };
+
+ struct less2 {
+ int operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
+ for (size_t i = 0; i < l.size(); ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+ return l.size() < r.size();
+ }
+ };
+
+ /**
+ * Denotes some (weird) order on k-mers. Works fast.
+ */
+ struct less2_fast {
+ bool operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
+ return 0 > memcmp(l.data(), r.data(), sizeof(T) * l.data_size());
+ }
+ };
+
+ struct less3 {
+ bool operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
+ VERIFY(l.size() == r.size());
+ const T* l_data = l.data();
+ const T* r_data = r.data();
+ for (size_t i = 0; i < l.data_size(); ++i)
+ if (l_data[i] != r_data[i])
+ return l_data[i] < r_data[i];
+ return false;
+ }
+ };
+};
+
+template<size_t max_size_, typename T = seq_element_type>
+bool operator<(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) {
+ for (size_t i = 0; i < l.size(); ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+
+ return l.size() < r.size();
+}
+
+template<size_t max_size_, typename T>
+std::ostream &operator<<(std::ostream &os, RuntimeSeq<max_size_, T> seq) {
+ os << seq.str();
+ return os;
+}
+
+namespace std {
+template<size_t max_size, typename T>
+struct hash<RuntimeSeq<max_size, T>> {
+ size_t operator()(const RuntimeSeq<max_size, T> &seq) const {
+ return seq.GetHash();
+ }
+};
+
+}
+
+typedef RuntimeSeq<UPPER_BOUND> RtSeq;
+
+#endif /* RTSEQ_HPP_ */
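
RuntimeSeq keeps the k-mer length as a run-time value while the capacity (max_size_) stays a compile-time constant, which is what the RtSeq typedef relies on. A hedged usage sketch (the capacity of 55 is an arbitrary choice here; the include path follows this diff's layout):

    #include "sequence/rtseq.hpp"
    #include <cassert>

    int main() {
        RuntimeSeq<55> kmer(5, "ACGTA");            // k = 5 inside a 55-nucleotide container
        assert(kmer.str() == "ACGTA");
        assert((!kmer).str() == "TACGT");           // reverse complement via FastRC()
        assert((kmer << 'C').str() == "CGTAC");     // drop the leftmost nucleotide, append 'C'
        assert(kmer.IsMinimal());                   // "ACGTA" precedes its reverse complement
        return 0;
    }
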
diff --git a/src/common/sequence/seq.hpp b/src/common/sequence/seq.hpp
new file mode 100755
index 0000000..bcaaa72
--- /dev/null
+++ b/src/common/sequence/seq.hpp
@@ -0,0 +1,529 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/**
+ * @file seq.hpp
+ * @author vyahhi
+ * @version 1.0
+ *
+ * @section LICENSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * @section DESCRIPTION
+ *
+ * Immutable ACGT-sequence with compile-time size.
+ * It compresses the sequence into an array of Ts (default: char).
+ */
+
+#ifndef SEQ_HPP_
+#define SEQ_HPP_
+
+#include <string>
+#include <array>
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+
+#include <city/city.h>
+
+#include "utils/verify.hpp"
+#include "nucl.hpp"
+#include "utils/log.hpp"
+#include "seq_common.hpp"
+
+
+/**
+ * @param T is max number of nucleotides, type for storage
+ */
+template<size_t size_, typename T = seq_element_type>
+class Seq {
+public:
+ /**
+ * @variable Number of bits in type T (e.g. 8 for char)
+ * @example 8 for char, 16 for a 16-bit T
+ */
+ const static size_t TBits = sizeof(T) << 3;
+
+ /**
+ * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
+ * TNucl MUST be a power of two
+ * @example 4: 8/2 = 4 or 16/2 = 8
+ */
+ const static size_t TNucl = TBits >> 1;
+
+ /**
+ * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
+ */
+ const static size_t TNuclBits = log_<TNucl, 2>::value;
+
+ /**
+ * @variable Number of Ts required to store the whole sequence.
+ */
+ const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
+
+ typedef T DataType;
+
+ /**
+ * @variable Number of meaningful bytes in which the seq is stored
+ */
+ const static size_t TotalBytes = sizeof(T) * DataSize;
+
+ static size_t GetDataSize(size_t size) {
+ VERIFY(size == size_);
+ return (size_ + TNucl - 1) >> TNuclBits;
+ }
+
+private:
+ /* *
+ * @variable Just some prime number used when computing the hash function of the kmer
+ * */
+ const static size_t PrimeNum = 239;
+
+ // number of nucleotides in the last data_ bucket
+ const static size_t NuclsRemain = size_ & (TNucl - 1);
+
+ // useful mask to fill the last element of the data_ array
+ const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1)) - 1;
+
+
+ /**
+ * @variable Inner representation of sequence: array of Ts with length = DataSize.
+ *
+ * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
+ */
+ std::array<T, DataSize> data_;
+
+ friend class Seq<size_ - 1, T>;
+
+ /**
+ * Initialize data_ array of this object with C-string
+ *
+ * @param s C-string (ACGT chars only), strlen(s) = size_
+ */
+ void init(const char *s) {
+ T data = 0;
+ size_t cnt = 0;
+ int cur = 0;
+ for (size_t pos = 0; pos != size_; ++pos, ++s) { // unsafe!
+ // VERIFY(is_nucl(*s)); // for performance
+ data = data | (T) ((T) dignucl(*s) << cnt);
+ cnt += 2;
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+ VERIFY(*s == 0); // C-string always ends on 0
+ }
+
+ // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_class<S>::value, T>::type * = 0) {
+ return t.size();
+ }
+
+ template<class S>
+ size_t size(const S &t,
+ typename std::enable_if<std::is_same<S, const char *>::value, T>::type * = 0) {
+ return strlen(t);
+ }
+
+public:
+ /**
+ * Default constructor, fills Seq with A's
+ */
+ Seq() {
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ Seq(const char *s) {
+ init(s);
+ }
+
+ explicit Seq(T *data_array) {
+ memcpy(data_.data(), data_array, TotalBytes);
+ }
+
+ explicit Seq(unsigned, const T *data_array) {
+ memcpy(data_.data(), data_array, TotalBytes);
+ }
+
+
+ /**
+ * Ultimate constructor from ACGT0123-string.
+ *
+ * @param s Any object with operator[], which returns 0123 chars
+ * @param offset Offset when this sequence starts
+ * @param number_to_read Number of nucleotides to fetch from this string
+ * @param raw Flag controlling whether the string length is checked (e.g. via strlen) or not
+ * @warning assumes that s is a correct string filled with ACGT _OR_ 0123;
+ * there is no init method, filling happens right here
+ */
+ template<typename S>
+ explicit Seq(const S &s, size_t offset = 0, size_t number_to_read = size_,
+ bool raw = false) {
+ if (this->size(s) == 0) {
+ return;
+ }
+ VERIFY(offset < this->size(s));
+ VERIFY(is_dignucl(s[offset]) || is_nucl(s[offset]));
+ if (!raw)
+ VERIFY(offset + number_to_read <= this->size(s));
+
+ // which symbols does our string contain : 0123 or ACGT?
+ bool digit_str = is_dignucl(s[offset]);
+
+ // data -- one temporary variable corresponding to the i-th array element
+ // and some counters
+ T data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+
+ for (size_t i = 0; i < number_to_read; ++i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+
+ // we fill everything with zeros (As) by default.
+ char c = digit_str ? s[offset + i] : (char) dignucl(s[offset + i]);
+
+ data = data | (T(c) << cnt);
+ cnt += 2;
+
+ if (cnt == TBits) {
+ this->data_[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+
+ if (cnt != 0) {
+ this->data_[cur++] = data;
+ }
+
+ for (; cur != DataSize; ++cur)
+ this->data_[cur] = 0;
+ }
+
+
+ /**
+ * Get i-th symbol of Seq.
+ *
+ * @param i Index of the symbol (0 <= i < size_)
+ * @return 0123-char on position i
+ */
+ char operator[](const size_t i) const {
+ return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
+ }
+
+ /**
+ * Reverse complement.
+ *
+ * @return Reverse complement Seq.
+ */
+ Seq<size_, T> operator!() const {
+ Seq<size_, T> res(*this);
+ for (size_t i = 0; i < (size_ >> 1); ++i) {
+ T front = complement(res[i]);
+ T end = complement(res[size_ - 1 - i]);
+ res.set(i, (char) end);
+ res.set(size_ - 1 - i, (char) front);
+ }
+ if ((size_ & 1) == 1) {
+ res.set(size_ >> 1, complement(res[size_ >> 1]));
+ }
+ // can be made without complement calls, but with xor on all bytes afterwards.
+ return res;
+ }
+
+ /**
+ * Shift left
+ *
+ * @param c New 0123 char which should be added to the right.
+ * @return Shifted (to the left) sequence with 'c' char on the right.
+ */
+ Seq<size_, T> operator<<(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ Seq<size_, T> res(*this);
+ std::array<T, DataSize> &data = res.data_;
+ if (DataSize != 0) { // unless empty sequence
+ T rm = data[DataSize - 1] & 3;
+ T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
+ data[DataSize - 1] = (data[DataSize - 1] >> 2) | ((T) c << lastnuclshift_);
+
+ if (DataSize >= 2) { // if we have at least 2 elements in data
+ int data_size = DataSize;
+ for (int i = data_size - 2; i >= 0; --i) {
+ T new_rm = data[i] & 3;
+ data[i] = (data[i] >> 2) |
+ (rm << (TBits - 2)); // we need & here because if we shift a negative value, it fills with ones :(
+ rm = new_rm;
+ }
+ }
+ }
+ return res;
+ }
+
+ Seq<size_ + 1, T> pushBack(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ //VERIFY(is_dignucl(c));
+ Seq<size_ + 1, T> s;
+ copy(this->data_.begin(), this->data_.end(), s.data_.begin());
+ s.data_[s.DataSize - 1] = s.data_[s.DataSize - 1] | ((T) c << ((size_ & (TNucl - 1)) << 1));
+
+ return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
+
+ }
+
+ // /**
+ // * @todo optimize!!!
+ // */
+ // Seq<size_ + 1, T> pushFront(char c) const {
+ // if (is_nucl(c)) {
+ // c = dignucl(c);
+ // }
+ // VERIFY(is_dignucl(c));
+ // return Seq<size_ + 1, T> (nucl(c) + str());
+ // }
+
+ Seq<size_ + 1, T> pushFront(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+ Seq<size_ + 1, T> res;
+
+ // if the new kmer needs one more T element
+ if (Seq<size_ + 1, T>::DataSize > DataSize) {
+ res.data_[DataSize] = (data_[DataSize - 1] >> (TBits - 2)) & 3;
+ }
+
+ T rm = c;
+ for (size_t i = 0; i < DataSize; ++i) {
+ T new_rm = (data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+
+ return res;
+ }
+
+ /**
+ * Shift right
+ *
+ * @param c New 0123 char which should be added to the left.
+ * @return Shifted (to the right) sequence with 'c' char on the left.
+ */
+ Seq<size_, T> operator>>(char c) const {
+ if (is_nucl(c)) {
+ c = dignucl(c);
+ }
+ VERIFY(is_dignucl(c));
+ Seq<size_, T> res(*this);
+ T rm = c;
+ for (size_t i = 0; i < DataSize; ++i) {
+ T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
+ res.data_[i] = (res.data_[i] << 2) | rm;
+ rm = new_rm;
+ }
+ if ((size_ & (TNucl - 1)) != 0) {
+ T lastnuclshift_ = (size_ & (TNucl - 1)) << 1;
+ res.data_[DataSize - 1] = res.data_[DataSize - 1] & (((T) 1
+ << lastnuclshift_) - 1);
+ }
+ return res;
+ }
+
+ /**
+ * Sets i-th symbol of Seq with 0123-char
+ */
+ inline void set(const size_t i, char c) {
+ data_[i >> TNuclBits] =
+ (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
+ }
+
+ bool operator==(const Seq<size_, T> &s) const {
+ for (size_t i = 0; i < DataSize; ++i)
+ if (data_[i] != s.data_[i])
+ return false;
+ return true;
+ }
+
+ /**
+ * @see operator ==()
+ */
+
+ bool operator!=(const Seq<size_, T> &s) const {
+ return !operator==(s);
+ }
+
+ /**
+ * String representation of this Seq
+ *
+ * @return ACGT-string of length size_
+ * @see nucl()
+ */
+ std::string str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i != size_; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ static size_t size() {
+ return size_;
+ }
+
+
+ void copy_data(void *dst) const {
+ memcpy(dst, (const void *) data_.data(), TotalBytes);
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinRead(std::istream &file, Seq<size_> *seq) {
+ file.read((char *) seq->data_.data(), sizeof(T) * DataSize);
+ return !file.fail();
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ static bool BinWrite(std::ostream &file, const Seq<size_> &seq) {
+ file.write((const char *) seq.data_.data(), sizeof(T) * DataSize);
+ return !file.fail();
+ }
+
+ /**
+ * Reads sequence from the file (in the same format as BinWrite writes it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinRead(std::istream &file) {
+ return BinRead(file, this);
+ }
+
+ /**
+ * Writes sequence to the file (in the same format as BinRead reads it)
+ * and returns false if an error occurred, true otherwise.
+ */
+ bool BinWrite(std::ostream &file) const {
+ return BinWrite(file, *this);
+ }
+
+ /**
+ * @see Seq
+ */
+ template<size_t size2_, typename T2 = T>
+ Seq<size2_, T2> start() const {
+ VERIFY(size2_ <= size_);
+ return Seq<size2_, T2>(*this);
+ }
+
+ template<size_t size2_/* = size_ - 1*/, typename T2 = T>
+ Seq<size2_, T2> end() const {
+ VERIFY(size2_ <= size_);
+ return Seq<size2_, T2>(*this, size_ - size2_);
+ }
+
+ const T *data() const {
+ return data_.data();
+ }
+
+ size_t data_size() const {
+ return DataSize;
+ }
+
+
+ char last() const {
+ return operator[](size_ - 1);
+ }
+
+ char first() const {
+ return operator[](0);
+ }
+
+ static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
+ return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+ }
+
+ size_t GetHash(uint32_t seed = 0) const {
+ return GetHash(data_.data(), DataSize, seed);
+ }
+
+ struct hash {
+ size_t operator()(const Seq<size_, T> &seq, uint32_t seed = 0) const {
+ return seq.GetHash(seed);
+ }
+
+ size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
+ return GetHash(data, sz, seed);
+ }
+ };
+
+ struct equal_to {
+ bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
+ return r == l;
+ }
+ };
+
+ struct less2 {
+ bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
+ for (size_t i = 0; i < size_; ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+ return false;
+ }
+ };
+
+ /**
+ * Denotes some (weird) order on k-mers. Works fast.
+ */
+ struct less2_fast {
+ bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
+ return 0 > memcmp(l.data_.data(), r.data_.data(), sizeof(T) * DataSize);
+ }
+ };
+};
+
+template<size_t size_, typename T>
+std::ostream &operator<<(std::ostream &os, Seq<size_, T> seq) {
+ os << seq.str();
+ return os;
+}
+
+//namespace std {
+//
+//template<size_t size_, typename T = seq_element_type>
+//struct hash<Seq<size_, T> {
+// typedef size_t result_type;
+// typedef Seq<size_, T> argument_type;
+//
+// result_type operator() (const argument_type& arg) {
+// return Seq<size_, T>::hash()(arg);
+// }
+//};
+//
+//}
+
+#endif /* SEQ_HPP_ */
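
Seq is the fully static counterpart of RuntimeSeq: the length is a template parameter, so growing a k-mer by one nucleotide yields a different type. A hedged sketch (the include path follows this diff's layout):

    #include "sequence/seq.hpp"
    #include <cassert>

    int main() {
        Seq<4> s("ACGT");
        assert(s.str() == "ACGT");
        assert((!s).str() == "ACGT");               // ACGT is its own reverse complement
        assert((s << 'A').str() == "CGTA");         // drop the leftmost nucleotide, append 'A'
        Seq<5> longer = s.pushBack('C');            // one nucleotide longer means another type
        assert(longer.str() == "ACGTC");
        return 0;
    }
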
diff --git a/src/common/sequence/seq_common.hpp b/src/common/sequence/seq_common.hpp
new file mode 100644
index 0000000..51ceb42
--- /dev/null
+++ b/src/common/sequence/seq_common.hpp
@@ -0,0 +1,44 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * seq_common.hpp
+ *
+ * Created on: Jun 25, 2012
+ * Author: andrey
+ */
+
+#ifndef SEQ_COMMON_HPP_
+#define SEQ_COMMON_HPP_
+
+#include "k_range.hpp"
+
+typedef u_int64_t seq_element_type;
+
+constexpr size_t t_size(void) {
+ return sizeof(seq_element_type);
+}
+
+constexpr size_t get_t_elements_number(size_t value) {
+ return ((value - 1) / (t_size() << 2) + 1);
+}
+
+constexpr size_t get_k_by_ts(size_t value) {
+ return (value * (t_size() << 2));
+}
+
+constexpr size_t get_upper_bound(size_t value) {
+ return get_k_by_ts(get_t_elements_number(value));
+}
+
+const size_t UPPER_BOUND = get_upper_bound(runtime_k::MAX_K); //((MAX_K - 1) / (sizeof(seq_element_type) << 2) + 1) * (sizeof(seq_element_type) << 2);
+
+const size_t MAX_TS = get_t_elements_number(runtime_k::MAX_K);
+
+const size_t MIN_TS = get_t_elements_number(runtime_k::MIN_K);
+
+#endif /* SEQ_COMMON_HPP_ */
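
With seq_element_type being a 64-bit word, each element packs 32 two-bit nucleotides, and the helpers above round k up to that granularity. A small compile-time sketch of the arithmetic (the value 55 is only an example and not part of the upstream sources):

    #include "seq_common.hpp"

    static_assert(t_size() == 8, "storage element is an 8-byte word");
    static_assert(get_t_elements_number(55) == 2, "a 55-mer needs two 64-bit words");
    static_assert(get_upper_bound(55) == 64, "55 is rounded up to the next multiple of 32");
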
diff --git a/src/common/sequence/sequence.hpp b/src/common/sequence/sequence.hpp
new file mode 100755
index 0000000..aaaf21b
--- /dev/null
+++ b/src/common/sequence/sequence.hpp
@@ -0,0 +1,553 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SEQUENCE_HPP_
+#define SEQUENCE_HPP_
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <cstring>
+
+#include "seq.hpp"
+#include "rtseq.hpp"
+
+class Sequence {
+ // Type to store Seq in Sequences
+ typedef seq_element_type ST;
+ // Number of bits in ST
+ const static size_t STBits = sizeof(ST) << 3;
+ // Number of nucleotides in ST
+ const static size_t STN = (STBits >> 1);
+ // Number of bits in STN (for faster div and mod)
+ const static size_t STNBits = log_<STN, 2>::value;
+
+ template<typename T>
+ struct array_deleter {
+ void operator()(const T *p) { delete[] p; }
+ };
+
+private:
+ size_t from_;
+ size_t size_;
+ bool rtl_; // Right-to-left + complementary, i.e. the reverse-complement view of data_
+ std::shared_ptr<ST> data_;
+
+ static size_t DataSize(size_t size) {
+ return (size + STN - 1) >> STNBits;
+ }
+
+ template<typename S>
+ void InitFromNucls(const S &s, bool rc = false) {
+ size_t bytes_size = DataSize(size_);
+ ST *bytes = data_.get();
+
+ VERIFY(is_dignucl(s[0]) || is_nucl(s[0]));
+
+ // Which symbols does our string contain : 0123 or ACGT?
+ bool digit_str = is_dignucl(s[0]);
+
+ // data -- one temporary variable corresponding to the i-th array element
+ // and some counters
+ ST data = 0;
+ size_t cnt = 0;
+ size_t cur = 0;
+
+ if (rc) {
+ for (int i = (int) size_ - 1; i >= 0; --i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+ char c = complement(digit_str ? s[(unsigned) i] : dignucl(s[(unsigned) i]));
+
+ data = data | (ST(c) << cnt);
+ cnt += 2;
+
+ if (cnt == STBits) {
+ bytes[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ } else {
+ for (size_t i = 0; i < size_; ++i) {
+ //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
+ char c = digit_str ? s[i] : dignucl(s[i]);
+
+ data = data | (ST(c) << cnt);
+ cnt += 2;
+
+ if (cnt == STBits) {
+ bytes[cur++] = data;
+ cnt = 0;
+ data = 0;
+ }
+ }
+ }
+
+ if (cnt != 0)
+ bytes[cur++] = data;
+
+ for (; cur < bytes_size; ++cur)
+ bytes[cur] = 0;
+ }
+
+
+public:
+ /**
+ * Sequence initialization (arbitrary size string)
+ *
+ * @param s ACGT or 0123-string
+ */
+ explicit Sequence(const char *s, bool rc = false) :
+ from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ InitFromNucls(s, rc);
+ }
+
+ explicit Sequence(char *s, bool rc = false) :
+ from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ InitFromNucls(s, rc);
+ }
+
+ template<typename S>
+ explicit Sequence(const S &s, bool rc = false) :
+ from_(0), size_(s.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ InitFromNucls(s, rc);
+ }
+
+ Sequence() :
+ from_(0), size_(0), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+ memset(data_.get(), 0, DataSize(size_));
+ }
+
+ template<size_t size2_>
+ explicit Sequence(const Seq<size2_> &kmer, size_t) :
+ from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+
+ kmer.copy_data(data_.get());
+ }
+
+ template<size_t size2_>
+ explicit Sequence(const RuntimeSeq<size2_> &kmer, size_t) :
+ from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
+
+ kmer.copy_data(data_.get());
+ }
+
+ Sequence(const Sequence &seq, size_t from, size_t size, bool rtl) :
+ from_(from), size_(size), rtl_(rtl), data_(seq.data_) {
+ }
+
+ Sequence(const Sequence &s) :
+ from_(s.from_), size_(s.size_), rtl_(s.rtl_), data_(s.data_) {
+ }
+
+ ~Sequence() { }
+
+ const Sequence &operator=(const Sequence &rhs) {
+ if (&rhs != this) {
+ from_ = rhs.from_;
+ size_ = rhs.size_;
+ rtl_ = rhs.rtl_;
+ data_ = rhs.data_;
+ }
+
+ return *this;
+ }
+
+ char operator[](const size_t index) const {
+ //todo re-enable once release builds are distributed without asserts
+ //VERIFY(index < size_);
+ const ST *bytes = data_.get();
+ if (rtl_) {
+ size_t i = from_ + size_ - 1 - index;
+ return complement((bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3);
+ } else {
+ size_t i = from_ + index;
+ return (bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3;
+ }
+ }
+
+ bool operator==(const Sequence &that) const {
+ if (size_ != that.size_) {
+ return false;
+ }
+
+ if (data_ == that.data_ && from_ == that.from_ && rtl_ == that.rtl_) {
+ return true;
+ }
+
+ for (size_t i = 0; i < size_; ++i) {
+ if (this->operator[](i) != that[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool operator!=(const Sequence &that) const {
+ return !(operator==(that));
+ }
+
+ /**
+ * @todo Might be optimized via int comparison (not so easy)
+ */
+ bool operator<(const Sequence &that) const {
+ size_t s = std::min(size_, that.size_);
+ for (size_t i = 0; i < s; ++i) {
+ if (this->operator[](i) != that[i]) {
+ return (this->operator[](i) < that[i]);
+ }
+ }
+ return (size_ < that.size_);
+ }
+
+ Sequence operator!() const {
+ return Sequence(*this, from_, size_, !rtl_);
+ }
+
+ inline Sequence operator<<(char c) const;
+
+ /**
+ * @param from inclusive
+ * @param to exclusive;
+ */
+ inline Sequence Subseq(size_t from, size_t to) const;
+
+ inline Sequence Subseq(size_t from) const; // up to size_ by default
+ inline Sequence First(size_t count) const;
+
+ inline Sequence Last(size_t count) const;
+
+ inline Sequence operator+(const Sequence &s) const;
+
+ //todo clarify the purpose of these similarity methods
+ inline size_t find(const Sequence &t, size_t from = 0) const;
+
+ inline size_t similar(const Sequence &t, size_t k, char directed = 0) const;
+
+ inline size_t leftSimilar(const Sequence &t, size_t k) const;
+
+ inline size_t rightSimilar(const Sequence &t, size_t k) const;
+
+ /**
+ * @return true if the two sequences agree at some position, compared position-wise over the length of the shorter one
+ */
+ inline bool intersects(const Sequence &t) const;
+
+ template<size_t size2_>
+ Seq<size2_> start() const;
+
+ template<size_t size2_>
+ Seq<size2_> fast_start() const;
+
+ template<size_t size2_>
+ Seq<size2_> end() const;
+
+ template<class Seq>
+ Seq start(size_t k) const;
+
+ template<class Seq>
+ Seq end(size_t k) const;
+
+ inline std::string str() const;
+
+ inline std::string err() const;
+
+ size_t size() const {
+ return size_;
+ }
+
+ template<class Seq>
+ bool contains(const Seq& s, size_t offset = 0) const {
+ VERIFY(offset + s.size() <= size());
+
+ for (size_t i = 0, e = s.size(); i != e; ++i)
+ if (operator[](offset + i) != s[i])
+ return false;
+
+ return true;
+ }
+
+private:
+ inline bool ReadHeader(std::istream &file);
+
+ inline bool WriteHeader(std::ostream &file) const;
+
+public:
+ inline bool BinRead(std::istream &file);
+
+ inline bool BinWrite(std::ostream &file) const;
+};
+
+inline std::ostream &operator<<(std::ostream &os, const Sequence &s);
+
+/**
+ * Returns the prefix of the Sequence as a Seq of the requested (template) size.
+ */
+template<size_t size2_>
+Seq<size2_> Sequence::start() const {
+ //VERIFY(size2_ <= size_);
+ return Seq<size2_>(*this);
+}
+
+template<size_t size2_>
+Seq<size2_> Sequence::fast_start() const {
+ ST result[(size2_ + STN - 1) >> STNBits] = {0};
+
+ size_t start = from_ >> STNBits;
+ size_t end = (from_ + size_ - 1) >> STNBits;
+ size_t shift = (from_ & (STN - 1)) << 1;
+ const ST *bytes = data_.get();
+
+ for (size_t i = start; i <= end; ++i) {
+ result[i - start] = bytes[i] >> shift;
+ }
+
+ if (shift != 0) {
+ shift = STBits - shift;
+
+ for (size_t i = start + 1; i <= end; ++i) {
+ result[i - start - 1] |= bytes[i] << shift;
+ }
+ }
+
+ return (rtl_ ? !Seq<size2_>(result) : Seq<size2_>(result));
+}
+
+template<size_t size2_>
+Seq<size2_> Sequence::end() const {
+ return Seq<size2_>(*this, size_ - size2_);
+}
+
+
+template<class Seq>
+Seq Sequence::start(size_t k) const {
+ return Seq(unsigned(k), *this);
+}
+
+template<class Seq>
+Seq Sequence::end(size_t k) const {
+ return Seq(unsigned(k), *this, size_ - k);
+}
+
+
+Sequence Sequence::First(size_t count) const {
+ return Subseq(0, count);
+}
+
+Sequence Sequence::Last(size_t count) const {
+ return Subseq(size_ - count);
+}
+
+bool Sequence::intersects(const Sequence &t) const {
+ for (size_t i = 0; i < std::min(size_, t.size_); ++i) {
+ if (this->operator[](i) == t[i]) {
+ return true;
+ }
+ }
+ return false;
+}
+
+// O(1)
+// from is inclusive, to is exclusive
+// bounds-checked unless NDEBUG is defined
+Sequence Sequence::Subseq(size_t from, size_t to) const {
+ // cerr << endl<<"subseq:" << from <<" " << to << " " << this->str() << endl;
+ VERIFY(to >= from);
+ VERIFY(to <= size_);
+ //VERIFY(to - from <= size_);
+ if (rtl_) {
+ return Sequence(*this, from_ + size_ - to, to - from, true);
+ } else {
+ return Sequence(*this, from_ + from, to - from, false);
+ }
+}
+
+// from is inclusive; the subsequence extends to the end of the sequence
+Sequence Sequence::Subseq(size_t from) const {
+ return Subseq(from, size_);
+}
+
+/**
+ * @todo replace this naive search with KMP or hashing
+ */
+size_t Sequence::find(const Sequence &t, size_t from) const {
+ for (size_t i = from; i <= size() - t.size(); i++) {
+ if (Subseq(i, i + t.size()) == t) {
+ return i;
+ }
+ }
+ return -1ULL;
+}
+
+/**
+ *
+ * @param k minimal required length of the overlap between the sequences
+ * @param directed 1 checks only whether t extends this to the right, -1 only whether t extends this to the left, 0 checks both directions
+ * @return 1 if the sequences overlap in the requested direction(s), 0 otherwise
+ *
+ */
+size_t Sequence::similar(const Sequence &t, size_t k, char directed) const {
+ size_t result = 0;
+ if (directed != -1)
+ result |= rightSimilar(t, k);
+ if (directed != 1)
+ result |= leftSimilar(t, k);
+ return result;
+}
+
+size_t Sequence::leftSimilar(const Sequence &t, size_t k) const {
+ return t.rightSimilar(*this, k);
+}
+
+size_t Sequence::rightSimilar(const Sequence &t, size_t k) const {
+ size_t tsz = t.size();
+ size_t sz = size();
+ Sequence d(t.Subseq(0, k));
+ for (size_t res = find(d, 0); res != -1ULL; res = find(d, res + 1)) {
+ if (res + tsz < sz)
+ continue;
+ size_t i;
+ for (i = k; i + res < sz; i++) {
+ if (t[i] != this->operator[](i + res)) {
+ break;
+ };
+ }
+ if (i == sz - res)
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * @todo optimize
+ */
+Sequence Sequence::operator+(const Sequence &s) const {
+ return Sequence(str() + s.str());
+ // TODO might be opposite to correct
+ // int total = size_ + s.size_;
+ // std::vector<Seq<4> > bytes((total + 3) >> 2);
+ // for (size_t i = 0; i < size_; ++i) {
+ // bytes[i / 4] = (bytes[i / 4] << operator [](i)); // TODO :-) use <<=
+ // }
+ // for (size_t i = 0, j = size_; i < s.size_; ++i, ++j) {
+ // bytes[j / 4] = (bytes[j / 4]) << s[i];
+ // }
+ // return Sequence(new Data(bytes), 0, total, false);
+}
+
+std::string Sequence::str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i < size_; ++i) {
+ res[i] = nucl(this->operator[](i));
+ }
+ return res;
+}
+
+std::string Sequence::err() const {
+ std::ostringstream oss;
+ oss << "{ *data=" << data_ <<
+ ", from_=" << from_ <<
+ ", size_=" << size_ <<
+ ", rtl_=" << int(rtl_) << " }";
+ return oss.str();
+}
+
+std::ostream &operator<<(std::ostream &os, const Sequence &s) {
+ os << s.str();
+ return os;
+}
+
+bool Sequence::ReadHeader(std::istream &file) {
+ file.read((char *) &size_, sizeof(size_));
+
+ from_ = 0;
+ rtl_ = false;
+
+ return !file.fail();
+}
+
+bool Sequence::WriteHeader(std::ostream &file) const {
+ VERIFY(from_ == 0);
+ VERIFY(!rtl_);
+
+ file.write((const char *) &size_, sizeof(size_));
+
+ return !file.fail();
+}
+
+
+bool Sequence::BinRead(std::istream &file) {
+ ReadHeader(file);
+
+ data_ = std::shared_ptr<ST>(new ST[DataSize(size_)], array_deleter<ST>());
+ file.read((char *) data_.get(), DataSize(size_) * sizeof(ST));
+
+ return !file.fail();
+}
+
+
+bool Sequence::BinWrite(std::ostream &file) const {
+ if (from_ != 0 || rtl_) {
+ Sequence clear(this->str());
+ return clear.BinWrite(file);
+ }
+
+ WriteHeader(file);
+
+ file.write((const char *) data_.get(), DataSize(size_) * sizeof(ST));
+
+ return !file.fail();
+}
+
+/**
+ * @class SequenceBuilder
+ * @section DESCRIPTION
+ *
+ * Helper class for incrementally building a Sequence; provides append(), size(), operator[], str() and BuildSequence().
+ */
+
+class SequenceBuilder {
+ std::vector<char> buf_;
+public:
+ template<typename S>
+ SequenceBuilder &append(const S &s) {
+ for (size_t i = 0; i < s.size(); ++i) {
+ buf_.push_back(s[i]);
+ }
+ return *this;
+ }
+
+ SequenceBuilder &append(char c) {
+ buf_.push_back(c);
+ return *this;
+ }
+
+ Sequence BuildSequence() {
+ return Sequence(buf_);
+ }
+
+ size_t size() const {
+ return buf_.size();
+ }
+
+ char operator[](const size_t index) const {
+ VERIFY(index < buf_.size());
+ return buf_[index];
+ }
+
+ std::string str() const {
+ std::string s(buf_.size(), '-');
+ for (size_t i = 0; i < s.size(); ++i) {
+ s[i] = nucl(buf_[i]);
+ }
+ return s;
+ }
+};
+
+#endif /* SEQUENCE_HPP_ */
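
A quick illustration of the Sequence/SequenceBuilder API defined above; the expected values in the comments assume the usual 0..3 = A,C,G,T encoding from nucl.hpp, and the snippet itself is an editorial sketch rather than part of the upstream sources.

    #include <iostream>
    #include "sequence.hpp"

    void sequence_demo() {
        Sequence s("ACCGT");              // packed 2-bit representation
        Sequence rc = !s;                 // O(1) reverse complement: shares data_, flips rtl_
        Sequence mid = s.Subseq(1, 4);    // O(1) view "CCG" into the same buffer

        SequenceBuilder sb;
        sb.append(s).append(rc);          // accumulates 0..3 nucleotide codes
        Sequence joined = sb.BuildSequence();

        std::cout << rc << " " << mid << " " << joined << std::endl;  // ACGGT CCG ACCGTACGGT
    }
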
diff --git a/src/common/sequence/sequence_tools.hpp b/src/common/sequence/sequence_tools.hpp
new file mode 100644
index 0000000..f2231e2
--- /dev/null
+++ b/src/common/sequence/sequence_tools.hpp
@@ -0,0 +1,159 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef SEQUENCE_TOOLS_HPP_
+#define SEQUENCE_TOOLS_HPP_
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "nucl.hpp"
+#include "sequence.hpp"
+#include "utils/levenshtein.hpp"
+
+inline const std::string Reverse(const std::string &s) {
+ return std::string(s.rbegin(), s.rend());
+}
+
+inline const std::string Complement(const std::string &s) {
+ std::string res(s.size(), 0);
+ transform(s.begin(), s.end(), res.begin(), nucl_complement);
+ return res;
+}
+
+inline const Sequence MergeOverlappingSequences(std::vector<Sequence>& ss,
+ size_t overlap, bool safe_merging = true) {
+ if (ss.empty()) {
+ return Sequence();
+ }
+ SequenceBuilder sb;
+ Sequence prev_end = ss.front().Subseq(0, overlap);
+ sb.append(prev_end);
+ for (auto it = ss.begin(); it != ss.end(); ++it) {
+ if(safe_merging)
+ VERIFY(prev_end == it->Subseq(0, overlap));
+ sb.append(it->Subseq(overlap));
+ prev_end = it->Subseq(it->size() - overlap);
+ }
+ return sb.BuildSequence();
+}
+
+inline size_t EditDistance(const Sequence& s1, const Sequence& s2) {
+ return edit_distance(s1.str(), s2.str());
+}
+
+inline bool Relax(int& val, int new_val) {
+ if (new_val > val) {
+ val = new_val;
+ return true;
+ }
+ return false;
+}
+
+inline std::pair<size_t, size_t> LocalSimilarity(const Sequence& s1, const Sequence& s2) {
+ size_t m = s1.size();
+ size_t n = s2.size();
+ std::vector<std::vector<int>> a(m + 1);
+ for (size_t i = 0; i <= m; ++i) {
+ a[i].resize(n + 1);
+ }
+ for (size_t i = 0; i <= m; ++i) {
+ for (size_t j = 0; j <= n; ++j) {
+ a[i][j] = 0;
+ }
+ }
+ for (size_t i = 1; i <= m; ++i) {
+ for (size_t j = 1; j <= n; ++j) {
+ Relax(a[i][j], a[i - 1][j] - 1);
+ Relax(a[i][j], a[i][j - 1] - 1);
+ if (s1[i - 1] == s2[j - 1]) {
+ Relax(a[i][j], a[i - 1][j - 1] + 1);
+ } else {
+ Relax(a[i][j], a[i - 1][j - 1] - 1);
+ }
+ }
+ }
+
+ //finding local alignment
+ int answer = 0;
+ size_t i_m = 0;
+ size_t j_m = 0;
+ for (size_t i = 0; i <= m; ++i) {
+ for (size_t j = 0; j <= n; ++j) {
+ if (Relax(answer, a[i][j])) {
+ i_m = i;
+ j_m = j;
+ }
+ }
+ }
+
+ //finding alignment lengths
+ size_t i = i_m;
+ size_t j = j_m;
+ while (a[i][j] > 0) {
+ if (a[i][j] == a[i][j - 1] - 1) {
+ j--;
+ } else if (a[i][j] == a[i-1][j] - 1) {
+ i--;
+ } else if (a[i][j] == a[i-1][j-1] + 1) {
+ VERIFY(s1[i-1] == s2[j-1]);
+ i--;
+ j--;
+ } else {
+ VERIFY(a[i-1][j-1] - 1 == a[i][j] && s1[i-1] != s2[j-1]);
+ i--;
+ j--;
+ }
+ }
+ return std::make_pair(size_t(answer), std::min(i_m - i, j_m - j));
+}
+
+inline const std::string ReverseComplement(const std::string &s) {
+ std::string res(s.size(), 0);
+ transform(s.begin(), s.end(), res.rbegin(), nucl_complement); // same as Complement() above, but writing to res.rbegin() instead of res.begin()
+ return res;
+}
+
+class UniformPositionAligner {
+private:
+ size_t upper_length_;
+ size_t lower_length_;
+public:
+ UniformPositionAligner(size_t upper_length, size_t lower_length) :
+ upper_length_(upper_length), lower_length_(lower_length) {
+ }
+
+ size_t GetPosition(size_t upper_position) {
+ if (upper_position * 2 + 1 >= upper_length_)
+ return (2 * upper_position + 1) * lower_length_
+ / (2 * upper_length_);
+ else
+ return lower_length_ - 1
+ - GetPosition(upper_length_ - 1 - upper_position);
+ }
+};
+
+class EnsureEndsPositionAligner {
+private:
+ size_t upper_length_;
+ size_t lower_length_;
+public:
+ EnsureEndsPositionAligner(size_t upper_length, size_t lower_length) :
+ upper_length_(upper_length), lower_length_(lower_length) {
+ }
+
+ size_t GetPosition(size_t upper_position) {
+ VERIFY(upper_position > 0);
+ if (lower_length_ == 1)
+ return 1;
+ return (2 * upper_position * lower_length_ + upper_length_)
+ / (2 * upper_length_);
+ }
+};
+
+#endif /* SEQUENCE_TOOLS_HPP_ */
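
A short sketch of the helpers above; the expected results in the comments follow directly from the definitions, and the snippet is illustrative only (not part of the upstream sources).

    #include <utility>
    #include <vector>
    #include "sequence_tools.hpp"

    void sequence_tools_demo() {
        // consecutive sequences must overlap by `overlap` bases (checked when safe_merging is on)
        std::vector<Sequence> parts = {Sequence("ACGT"), Sequence("GTAC")};
        Sequence merged = MergeOverlappingSequences(parts, 2);    // "ACGTAC"

        std::string rc = ReverseComplement("ACCGT");              // "ACGGT"

        // local alignment with +1 for a match, -1 for a mismatch or gap;
        // the shared "ACGTA" stretch gives score 5 over an aligned length of 5
        std::pair<size_t, size_t> sim = LocalSimilarity(Sequence("ACGTACGT"), Sequence("TTACGTAA"));
    }
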
diff --git a/src/common/sequence/simple_seq.hpp b/src/common/sequence/simple_seq.hpp
new file mode 100644
index 0000000..5bc144a
--- /dev/null
+++ b/src/common/sequence/simple_seq.hpp
@@ -0,0 +1,157 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * simple_seq.hpp
+ *
+ * Created on: Jul 23, 2012
+ * Author: andrey
+ */
+
+#ifndef SIMPLE_SEQ_HPP_
+#define SIMPLE_SEQ_HPP_
+
+#include <string>
+#include <array>
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+
+#include "utils/verify.hpp"
+#include "nucl.hpp"
+#include "utils/log.hpp"
+#include "seq_common.hpp"
+/**
+ * @param size_ maximal number of nucleotides
+ * @param T type used for storage
+ */
+template<size_t size_, typename T = seq_element_type>
+class SimpleSeq {
+public:
+ /**
+ * @variable Number of bits in type T
+ * @example 8 for char, 64 for a 64-bit integer type
+ */
+ const static size_t TBits = sizeof(T) << 3;
+
+ /**
+ * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
+ * TNucl MUST be a power of two
+ * @example 4 for char (8/2), 32 for a 64-bit integer type (64/2)
+ */
+ const static size_t TNucl = TBits >> 1;
+
+ /**
+ * @variable log2 of TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
+ */
+ const static size_t TNuclBits = log_<TNucl, 2>::value;
+
+ /**
+ * @variable Number of Ts required to store the whole sequence.
+ */
+ const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
+
+ typedef T DataType;
+
+ /**
+ * @variable Number of meaningful bytes in which the sequence is stored
+ */
+ const static size_t TotalBytes = sizeof(T) * DataSize;
+
+private:
+ // number of nucleotides in the last data_ bucket
+ const static size_t NuclsRemain = size_ & (TNucl - 1);
+
+ // useful mask to fill the last element of the data_ array
+ const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1) ) - 1;
+
+
+ /**
+ * @variable Inner representation of sequence: array of Ts with length = DataSize.
+ *
+ * @invariant all nucleotides at positions >= size_ are 'A's (useful for comparison)
+ */
+ std::array<T, DataSize> data_;
+
+
+public:
+
+ SimpleSeq() {
+ //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
+ std::fill(data_.begin(), data_.end(), 0);
+ }
+
+ explicit SimpleSeq(T * data_array) {
+ memcpy(data_.data(), data_array, TotalBytes);
+ }
+
+
+ char operator[](const size_t i) const {
+ //VERIFY(i >= 0);
+ //VERIFY(i < size_);
+ return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
+ }
+
+ std::string str() const {
+ std::string res(size_, '-');
+ for (size_t i = 0; i < size_; ++i) {
+ res[i] = nucl(operator[](i));
+ }
+ return res;
+ }
+
+ void copy_data(void * dst) const {
+ memcpy(dst, (const void *) data_.data(), TotalBytes);
+ }
+
+ static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
+ return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
+ }
+
+ size_t GetHash(uint32_t seed = 0) const {
+ return GetHash(data_.data(), DataSize, seed);
+ }
+
+ struct hash {
+ size_t operator()(const SimpleSeq<size_, T>& seq, uint32_t seed = 0) const {
+ return seq.GetHash(seed);
+ }
+
+ size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
+ return GetHash(data, sz, seed);
+ }
+ };
+
+ struct equal_to {
+ bool operator()(const SimpleSeq<size_, T>& l, const SimpleSeq<size_, T>& r) const {
+ for (size_t i = 0; i < DataSize; ++i)
+ if (l.data_[i] != r.data_[i])
+ return false;
+ return true;
+ }
+ };
+
+ struct less2 {
+ bool operator()(const SimpleSeq<size_, T> &l, const SimpleSeq<size_, T> &r) const {
+ for (size_t i = 0; i < size_; ++i) {
+ if (l[i] != r[i]) {
+ return (l[i] < r[i]);
+ }
+ }
+ return false;
+ }
+ };
+
+};
+
+template<size_t size_, typename T>
+std::ostream& operator<<(std::ostream& os, SimpleSeq<size_, T> seq) {
+ os << seq.str();
+ return os;
+}
+
+
+#endif /* SIMPLE_SEQ_HPP_ */
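
Unlike Sequence above, which is built from a nucleotide string, SimpleSeq is filled from an already packed word array. A minimal sketch, assuming the conventional 0..3 = A,C,G,T encoding from nucl.hpp (illustrative only, not part of the upstream sources):

    #include <iostream>
    #include "simple_seq.hpp"

    void simple_seq_demo() {
        seq_element_type words[1] = {0xE4};   // binary 11100100: two-bit codes 0,1,2,3 read from the lowest bits up
        SimpleSeq<4> s(words);                // four nucleotides in one 64-bit word
        std::cout << s << std::endl;          // prints "ACGT"
    }
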
diff --git a/src/modules/stages/CMakeLists.txt b/src/common/stages/CMakeLists.txt
similarity index 100%
rename from src/modules/stages/CMakeLists.txt
rename to src/common/stages/CMakeLists.txt
diff --git a/src/common/stages/construction.cpp b/src/common/stages/construction.cpp
new file mode 100644
index 0000000..6116a62
--- /dev/null
+++ b/src/common/stages/construction.cpp
@@ -0,0 +1,70 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/reads/vector_reader.hpp"
+#include "io/dataset_support/dataset_readers.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "io/dataset_support/read_converter.hpp"
+
+#include "modules/graph_construction.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "construction.hpp"
+
+namespace debruijn_graph {
+
+template<class Read>
+void construct_graph(io::ReadStreamList<Read>& streams,
+ conj_graph_pack& gp, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
+ config::debruijn_config::construction params = cfg::get().con;
+ params.early_tc.enable &= !cfg::get().gap_closer_enable;
+
+ ReadStatistics stats = ConstructGraphWithCoverage(params, streams, gp.g,
+ gp.index, gp.flanking_cov, contigs_stream);
+ size_t rl = stats.max_read_length_;
+
+ if (!cfg::get().ds.RL()) {
+ INFO("Figured out: read length = " << rl);
+ cfg::get_writable().ds.set_RL(rl);
+ cfg::get_writable().ds.set_aRL((double) stats.bases_ / (double) stats.reads_);
+ } else if (cfg::get().ds.RL() != rl)
+ WARN("In datasets.info, wrong RL is specified: " << cfg::get().ds.RL() << ", not " << rl);
+}
+
+void Construction::run(conj_graph_pack &gp, const char*) {
+ // Has to be a separate stream so that these reads are not counted in coverage
+ io::ReadStreamList<io::SingleRead> trusted_contigs;
+ if (cfg::get().use_additional_contigs) {
+ DEBUG("Contigs from previous K will be used: " << cfg::get().additional_contigs);
+ trusted_contigs.push_back(io::EasyStream(cfg::get().additional_contigs, true));
+ }
+
+ bool trusted_contigs_exist = false;
+ for (const auto& lib : cfg::get().ds.reads) {
+ if (lib.type() != io::LibraryType::TrustedContigs)
+ continue;
+
+ for (const auto& read : lib.single_reads()) {
+ trusted_contigs.push_back(io::EasyStream(read, true));
+ trusted_contigs_exist = true;
+ }
+ }
+
+ if (trusted_contigs_exist)
+ INFO("Trusted contigs will be used in graph construction");
+ auto contigs_stream = MultifileWrap(trusted_contigs);
+
+ auto& dataset = cfg::get_writable().ds;
+ std::vector<size_t> libs_for_construction;
+ for (size_t i = 0; i < dataset.reads.lib_count(); ++i)
+ if (dataset.reads[i].is_graph_contructable())
+ libs_for_construction.push_back(i);
+
+ auto streams = io::single_binary_readers_for_libs(dataset, libs_for_construction, true, true);
+ construct_graph<io::SingleReadSeq>(streams, gp, contigs_stream);
+}
+
+} //namespace debruijn_graph
diff --git a/src/modules/stages/construction.hpp b/src/common/stages/construction.hpp
similarity index 100%
rename from src/modules/stages/construction.hpp
rename to src/common/stages/construction.hpp
diff --git a/src/common/stages/simplification.cpp b/src/common/stages/simplification.cpp
new file mode 100644
index 0000000..f0cd8a9
--- /dev/null
+++ b/src/common/stages/simplification.cpp
@@ -0,0 +1,613 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "assembly_graph/core/basic_graph_stats.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "stages/simplification_pipeline/simplification_settings.hpp"
+#include "stages/simplification_pipeline/graph_simplification.hpp"
+#include "stages/simplification_pipeline/single_cell_simplification.hpp"
+#include "stages/simplification_pipeline/rna_simplification.hpp"
+
+#include "simplification.hpp"
+
+namespace debruijn_graph {
+
+using namespace debruijn::simplification;
+using namespace config;
+
+template<class graph_pack>
+shared_ptr<visualization::graph_colorer::GraphColorer<typename graph_pack::graph_t>> DefaultGPColorer(
+ const graph_pack& gp) {
+ io::SingleRead genome("ref", gp.genome.str());
+ auto mapper = MapperInstance(gp);
+ auto path1 = mapper->MapRead(genome).path();
+ auto path2 = mapper->MapRead(!genome).path();
+ return visualization::graph_colorer::DefaultColorer(gp.g, path1, path2);
+}
+
+class GraphSimplifier {
+ typedef std::function<void(EdgeId)> HandlerF;
+
+ typedef std::vector<std::pair<AlgoPtr<Graph>, std::string>> AlgoStorageT;
+
+ conj_graph_pack& gp_;
+ Graph& g_;
+ SimplifInfoContainer info_container_;
+ const debruijn_config::simplification simplif_cfg_;
+
+ CountingCallback<Graph> cnt_callback_;
+ HandlerF removal_handler_;
+ stats::detail_info_printer& printer_;
+
+ bool PerformInitCleaning() {
+
+ if (simplif_cfg_.init_clean.early_it_only && info_container_.main_iteration()) {
+ INFO("Most init cleaning disabled on main iteration");
+ return false;
+ }
+ if (math::ge(simplif_cfg_.init_clean.activation_cov, 0.)
+ && math::ls(info_container_.detected_mean_coverage(), simplif_cfg_.init_clean.activation_cov)) {
+ INFO("Most init cleaning disabled since detected mean " << info_container_.detected_mean_coverage()
+ << " was less than activation coverage " << simplif_cfg_.init_clean.activation_cov);
+ return false;
+ }
+
+ return true;
+ }
+
+ void RemoveShortPolyATEdges(size_t max_length,
+ HandlerF removal_handler = 0, size_t chunk_cnt = 1) {
+ INFO("Removing short polyAT");
+ EdgeRemover<Graph> er(g_, removal_handler);
+ ATCondition<Graph> condition (g_, 0.8, max_length, false);
+ for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter){
+ if (g_.length(*iter) == 1 && condition.Check(*iter)) {
+ er.DeleteEdgeNoCompress(*iter);
+ }
+ }
+ ParallelCompress(g_, chunk_cnt);
+ }
+
+ void InitialCleaning() {
+ INFO("PROCEDURE == InitialCleaning");
+
+ AlgoStorageT algos;
+
+ PushValid(
+ SelfConjugateEdgeRemoverInstance(g_,
+ simplif_cfg_.init_clean.self_conj_condition,
+ info_container_, removal_handler_),
+ "Self conjugate edge remover",
+ algos);
+
+ if (info_container_.mode() == config::pipeline_type::rna){
+ RemoveShortPolyATEdges(1, removal_handler_, info_container_.chunk_cnt());
+ PushValid(ShortPolyATEdgesRemoverInstance(g_, 1, removal_handler_, info_container_.chunk_cnt()), "Short PolyA/T Edges", algos);
+ PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
+ }
+
+ if (PerformInitCleaning()) {
+ PushValid(
+ IsolatedEdgeRemoverInstance(g_,
+ simplif_cfg_.init_clean.ier,
+ info_container_, removal_handler_),
+ "Initial isolated edge remover",
+ algos);
+
+ PushValid(
+ TipClipperInstance(g_,
+ debruijn_config::simplification::tip_clipper(simplif_cfg_.init_clean.tip_condition),
+ info_container_,
+ removal_handler_),
+ "Initial tip clipper",
+ algos);
+
+ PushValid(
+ ECRemoverInstance(g_,
+ debruijn_config::simplification::erroneous_connections_remover(simplif_cfg_.init_clean.ec_condition),
+ info_container_,
+ removal_handler_),
+ "Initial ec remover",
+ algos);
+
+ PushValid(
+ LowFlankDisconnectorInstance(g_, gp_.flanking_cov,
+ simplif_cfg_.init_clean.disconnect_flank_cov, info_container_,
+ removal_handler_),
+ "Disconnecting edges with low flanking coverage",
+ algos);
+ }
+
+ RunAlgos(algos);
+
+ if (info_container_.mode() == config::pipeline_type::rna){
+ RemoveHiddenLoopEC(g_, gp_.flanking_cov, info_container_.detected_coverage_bound(), simplif_cfg_.her, removal_handler_);
+ cnt_callback_.Report();
+ }
+ }
+
+ bool AllTopology() {
+ bool res = TopologyRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
+ removal_handler_);
+ cnt_callback_.Report();
+ res |= TopologyReliabilityRemoveErroneousEdges(gp_.g, simplif_cfg_.trec,
+ removal_handler_);
+ cnt_callback_.Report();
+ res |= RemoveThorns(gp_.g, simplif_cfg_.isec, removal_handler_);
+ cnt_callback_.Report();
+ res |= MultiplicityCountingRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
+ removal_handler_);
+ cnt_callback_.Report();
+ return res;
+ }
+
+ bool FinalRemoveErroneousEdges() {
+ bool changed = false;
+ if (simplif_cfg_.topology_simplif_enabled && info_container_.main_iteration()) {
+ changed |= AllTopology();
+ changed |= MaxFlowRemoveErroneousEdges(gp_.g, simplif_cfg_.mfec,
+ removal_handler_);
+ cnt_callback_.Report();
+ }
+ return changed;
+ }
+
+ void PostSimplification() {
+ using namespace omnigraph;
+ using namespace func;
+ INFO("PROCEDURE == Post simplification");
+
+ AlgoStorageT algos;
+
+ //auto colorer = debruijn_graph::DefaultGPColorer(gp_);
+ //visualization::graph_labeler::DefaultLabeler<Graph> labeler(g_, gp_.edge_pos);
+
+ // gp.ClearQuality();
+ // gp.FillQuality();
+ // QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
+ // cfg::get().output_dir + "pictures/colored_edges_deleted/");
+ //
+ // //positive quality edges removed (folder colored_edges_deleted)
+ // std::function<void(EdgeId)> qual_removal_handler_f = boost::bind(
+ // // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
+ // &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
+ // boost::ref(qual_removal_handler), _1);
+
+ //visualization::visualization_utils::LocalityPrintingRH<Graph> drawing_handler(gp_.g, labeler, colorer, "/home/snurk/pics");
+ //auto printing_handler=[&] (EdgeId e) {
+ // std::cout << "Edge:" << g_.str(e) << "; cov: " << g_.coverage(e) << "; start " << g_.str(g_.EdgeStart(e)) << "; end " << g_.str(g_.EdgeEnd(e)) << std::endl;
+ //};
+ //auto extensive_handler = [&] (EdgeId e) {removal_handler_(e) ; printing_handler(e); drawing_handler.HandleDelete(e);};
+
+
+ typename ComponentRemover<Graph>::HandlerF set_removal_handler_f;
+ if (removal_handler_) {
+ set_removal_handler_f = [=](const set<EdgeId>& edges) {
+ std::for_each(edges.begin(), edges.end(), removal_handler_);
+ };
+ }
+
+ PushValid(
+ RelativeECRemoverInstance(gp_.g,
+ simplif_cfg_.rcec, info_container_, removal_handler_),
+ "Relative coverage erroneous connection remover",
+ algos);
+
+ PushValid(
+ RelativeCoverageComponentRemoverInstance(gp_.g, gp_.flanking_cov,
+ simplif_cfg_.rcc, info_container_, set_removal_handler_f),
+ "Relative coverage component remover",
+ algos);
+
+
+ PushValid(
+ RelativelyLowCoverageDisconnectorInstance(gp_.g, gp_.flanking_cov,
+ simplif_cfg_.relative_ed, info_container_),
+ "Disconnecting edges with relatively low coverage",
+ algos);
+
+ PushValid(
+ ComplexTipClipperInstance(gp_.g, simplif_cfg_.complex_tc, info_container_, set_removal_handler_f),
+ "Complex tip clipper",
+ algos);
+
+ PushValid(
+ ComplexBRInstance(gp_.g, simplif_cfg_.cbr, info_container_),
+ "Complex bulge remover",
+ algos);
+
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.tc,
+ info_container_, removal_handler_),
+ "Tip clipper",
+ algos);
+
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.final_tc,
+ info_container_, removal_handler_),
+ "Final tip clipper",
+ algos);
+
+ PushValid(
+ BRInstance(g_, simplif_cfg_.br,
+ info_container_, removal_handler_),
+ "Bulge remover",
+ algos);
+
+ PushValid(
+ BRInstance(g_, simplif_cfg_.final_br,
+ info_container_, removal_handler_),
+ "Final bulge remover",
+ algos);
+
+ if (simplif_cfg_.topology_simplif_enabled) {
+ PushValid(
+ TopologyTipClipperInstance(g_, simplif_cfg_.ttc,
+ info_container_, removal_handler_),
+ "Topology tip clipper",
+ algos);
+ }
+
+ //FIXME need better configuration
+
+ if (info_container_.mode() == config::pipeline_type::meta) {
+ PushValid(
+ BRInstance(g_, simplif_cfg_.second_final_br,
+ info_container_, removal_handler_),
+ "Yet another final bulge remover",
+ algos);
+
+ EdgePredicate<Graph> meta_thorn_condition
+ = And(LengthUpperBound<Graph>(g_, LengthThresholdFinder::MaxErroneousConnectionLength(
+ g_.k(), simplif_cfg_.isec.max_ec_length_coefficient)),
+
+ And([&] (EdgeId e) {
+ //todo configure!
+ return simplification::relative_coverage::
+ RelativeCoverageHelper<Graph>(g_, gp_.flanking_cov, 2).AnyHighlyCoveredOnFourSides(e);
+ },
+
+ And(UniqueIncomingPathLengthLowerBound(g_, simplif_cfg_.isec.uniqueness_length),
+
+ //todo configure!
+ TopologicalThornCondition<Graph>(g_, simplif_cfg_.isec.span_distance, /*max edge cnt*/5))));
+
+ PushValid(std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g_, meta_thorn_condition, info_container_.chunk_cnt(),
+ removal_handler_),
+ "Thorn remover (meta)",
+ algos);
+ }
+
+ if (info_container_.mode() == config::pipeline_type::rna) {
+ PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
+ }
+
+ size_t iteration = 0;
+ bool enable_flag = true;
+ while (enable_flag) {
+ enable_flag = false;
+
+ INFO("Iteration " << iteration);
+
+ enable_flag |= FinalRemoveErroneousEdges();
+ cnt_callback_.Report();
+
+ enable_flag |= RunAlgos(algos);
+
+ iteration++;
+
+ // printer(ipp_before_final_err_con_removal);
+ // printer(ipp_final_tip_clipping, str(format("_%d") % iteration));
+ // printer(ipp_final_err_con_removal, str(format("_%d") % iteration));
+ // printer(ipp_final_bulge_removal, str(format("_%d") % iteration));
+ }
+
+ if (simplif_cfg_.topology_simplif_enabled) {
+ RemoveHiddenEC(gp_.g, gp_.flanking_cov, simplif_cfg_.her, info_container_, removal_handler_);
+
+ cnt_callback_.Report();
+ }
+
+ if (info_container_.mode() == config::pipeline_type::meta && simplif_cfg_.her.enabled) {
+ VERIFY(math::ls(simplif_cfg_.her.unreliability_threshold, 0.));
+ MetaHiddenECRemover<Graph> algo(g_, info_container_.chunk_cnt(), gp_.flanking_cov,
+ simplif_cfg_.her.uniqueness_length,
+ simplif_cfg_.her.relative_threshold,
+ removal_handler_);
+ INFO("Running Hidden EC remover (meta)");
+ LoopedRun(algo);
+ cnt_callback_.Report();
+ }
+
+ INFO("Disrupting self-conjugate edges");
+ SelfConjugateDisruptor<Graph>(gp_.g, removal_handler_).Run();
+ cnt_callback_.Report();
+ }
+
+ //inline
+ //void IdealSimplification(Graph& graph,
+ // std::function<double(EdgeId)> quality_handler_f) {
+ // for (auto iterator = graph.SmartEdgeBegin(); !iterator.IsEnd();
+ // ++iterator) {
+ // if (math::eq(quality_handler_f(*iterator), 0.))
+ // graph.DeleteEdge(*iterator);
+ // }
+ // CompressAllVertices(graph);
+ //}
+
+ void PushValid(const AlgoPtr<Graph>& algo_ptr, std::string comment, AlgoStorageT& algos) const {
+ if (algo_ptr) {
+ algos.push_back(std::make_pair(algo_ptr, comment));
+ }
+ }
+
+ bool RunAlgo(const AlgoPtr<Graph>& algo, const string &comment, bool force_primary_launch = false) {
+ INFO("Running " << comment);
+ size_t triggered = algo->Run(force_primary_launch);
+ INFO("Triggered " << triggered << " times");
+ cnt_callback_.Report();
+ return (triggered > 0);
+ }
+
+ bool RunAlgos(AlgoStorageT& algos, bool force_primary_launch = false) {
+ bool changed = false;
+ for (auto algo_comment : algos) {
+ changed |= RunAlgo(algo_comment.first, algo_comment.second, force_primary_launch);
+ }
+ return changed;
+ }
+
+public:
+ GraphSimplifier(conj_graph_pack &gp, const SimplifInfoContainer& info_container,
+ const debruijn_config::simplification& simplif_cfg,
+ const std::function<void(EdgeId)>& removal_handler,
+ stats::detail_info_printer& printer)
+ : gp_(gp),
+ g_(gp_.g),
+ info_container_(info_container),
+ simplif_cfg_(simplif_cfg),
+ removal_handler_(AddCountingCallback(cnt_callback_, removal_handler)),
+ printer_(printer) {
+
+ }
+
+ void SimplifyGraph() {
+ printer_(info_printer_pos::before_simplification);
+ INFO("Graph simplification started");
+
+ InitialCleaning();
+
+ AlgoStorageT algos;
+
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.tc, info_container_, removal_handler_),
+ "Tip clipper",
+ algos);
+ PushValid(
+ BRInstance(g_, simplif_cfg_.br, info_container_, removal_handler_),
+ "Bulge remover",
+ algos);
+ PushValid(
+ ECRemoverInstance(g_, simplif_cfg_.ec, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
+ "Low coverage edge remover",
+ algos);
+
+ size_t iteration = 0;
+ bool graph_changed = true;
+ //cannot simply stop when nothing has changed, since the threshold changes on every iteration
+ while (iteration < simplif_cfg_.cycle_iter_count || graph_changed) {
+ INFO("PROCEDURE == Simplification cycle, iteration " << iteration + 1);
+ graph_changed = RunAlgos(algos);
+ ++iteration;
+ }
+
+ printer_(info_printer_pos::before_post_simplification);
+
+ if (simplif_cfg_.post_simplif_enabled) {
+ PostSimplification();
+ } else {
+ INFO("PostSimplification disabled");
+ }
+ }
+
+ //TODO reduce code duplication
+ void SimplifyRNAGraph() {
+ printer_(info_printer_pos::before_simplification);
+ INFO("Graph simplification started");
+
+ InitialCleaning();
+
+ if (gp_.genome.GetSequence().size() > 0) {
+ DEBUG("Reference genome length = " + std::to_string(gp_.genome.GetSequence().size()));
+ }
+
+ auto ec_algo = ECRemoverInstance(g_, simplif_cfg_.ec, info_container_, removal_handler_,
+ simplif_cfg_.cycle_iter_count);
+
+ size_t iteration = 0;
+ bool graph_changed_ec = true;
+
+ //cannot simply stop when nothing has changed, since the threshold changes on every iteration
+ while (iteration < simplif_cfg_.cycle_iter_count || graph_changed_ec) {
+ //FIXME either algos creation can be moved out of the cycle,
+ // or checking graph_changed_ec is not enough for correct behaviour
+ AlgoStorageT algos;
+ PushValid(
+ TipClipperInstance(g_, simplif_cfg_.tc, info_container_, removal_handler_),
+ "Tip clipper",
+ algos);
+ PushValid(
+ DeadEndInstance(g_, simplif_cfg_.dead_end, info_container_, removal_handler_),
+ "Dead end clipper",
+ algos);
+ PushValid(
+ BRInstance(g_, simplif_cfg_.br, info_container_, removal_handler_),
+ "Bulge remover",
+ algos);
+
+ bool graph_changed = true;
+ size_t inner_iteration = 0;
+ while (graph_changed) {
+ INFO("PROCEDURE == Tip clipper and bulge removal cycle, iteration "
+ << iteration + 1 << "." << inner_iteration);
+ graph_changed = RunAlgos(algos);
+ ++inner_iteration;
+ }
+ INFO("PROCEDURE == Erroneous connection, iteration " << iteration + 1);
+ graph_changed_ec = RunAlgo(ec_algo, "Low coverage edge remover");
+ ++iteration;
+ }
+
+ printer_(info_printer_pos::before_post_simplification);
+
+ if (simplif_cfg_.post_simplif_enabled) {
+ PostSimplification();
+ } else {
+ INFO("PostSimplification disabled");
+ }
+ }
+};
+
+shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> DefaultGPColorer(
+ const conj_graph_pack &gp) {
+ auto mapper = MapperInstance(gp);
+ auto path1 = mapper->MapSequence(gp.genome.GetSequence()).path();
+ auto path2 = mapper->MapSequence(!gp.genome.GetSequence()).path();
+ return visualization::graph_colorer::DefaultColorer(gp.g, path1, path2);
+}
+
+void Simplification::run(conj_graph_pack &gp, const char*) {
+ using namespace omnigraph;
+
+ //no other handlers here; todo: replace with DetachAll
+ gp.index.Detach();
+ gp.index.clear();
+
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+
+ // QualityLoggingRemovalHandler<Graph> qual_removal_handler(gp.g, edge_qual);
+// auto colorer = debruijn_graph::DefaultGPColorer(gp);
+// QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
+// cfg::get().output_dir + "pictures/colored_edges_deleted/");
+//
+// //positive quality edges removed (folder colored_edges_deleted)
+// std::function<void(EdgeId)> removal_handler_f = boost::bind(
+// // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
+// &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
+// boost::ref(qual_removal_handler), _1);
+
+
+ SimplifInfoContainer info_container(cfg::get().mode);
+ info_container.set_read_length(cfg::get().ds.RL())
+ .set_main_iteration(cfg::get().main_iteration)
+ .set_chunk_cnt(5 * cfg::get().max_threads);
+
+ //0 if model didn't converge
+ //todo take max with trusted_bound
+ //FIXME add warning when used for uneven coverage applications
+ info_container.set_detected_mean_coverage(gp.ginfo.estimated_mean())
+ .set_detected_coverage_bound(gp.ginfo.ec_bound());
+
+ GraphSimplifier simplifier(gp, info_container,
+ preliminary_ ? *cfg::get().preliminary_simp : cfg::get().simp,
+ nullptr/*removal_handler_f*/,
+ printer);
+ if (cfg::get().mode == pipeline_type::rna)
+ simplifier.SimplifyRNAGraph();
+ else
+ simplifier.SimplifyGraph();
+
+}
+
+
+void SimplificationCleanup::run(conj_graph_pack &gp, const char*) {
+ SimplifInfoContainer info_container(cfg::get().mode);
+ info_container
+ .set_read_length(cfg::get().ds.RL())
+ .set_main_iteration(cfg::get().main_iteration)
+ .set_chunk_cnt(5 * cfg::get().max_threads);
+
+
+ auto isolated_edge_remover =
+ IsolatedEdgeRemoverInstance(gp.g, cfg::get().simp.ier, info_container, (EdgeRemovalHandlerF<Graph>)nullptr);
+ if (isolated_edge_remover != nullptr)
+ isolated_edge_remover->Run();
+
+ double low_threshold = gp.ginfo.trusted_bound();
+ if (math::gr(low_threshold, 0.0)) {
+ INFO("Removing all the edges having coverage " << low_threshold << " and less");
+ ParallelEdgeRemovingAlgorithm<Graph, CoverageComparator<Graph>>
+ cov_cleaner(gp.g,
+ CoverageUpperBound<Graph>(gp.g, low_threshold),
+ info_container.chunk_cnt(),
+ (EdgeRemovalHandlerF<Graph>)nullptr,
+ /*canonical_only*/true,
+ CoverageComparator<Graph>(gp.g));
+ cov_cleaner.Run();
+ }
+
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+ printer(info_printer_pos::final_simplified);
+
+ DEBUG("Graph simplification finished");
+
+ INFO("Counting average coverage");
+ AvgCovereageCounter<Graph> cov_counter(gp.g);
+
+ cfg::get_writable().ds.set_avg_coverage(cov_counter.Count());
+
+ INFO("Average coverage = " << cfg::get().ds.avg_coverage());
+ if (!cfg::get().uneven_depth) {
+ if (cfg::get().ds.avg_coverage() < gp.ginfo.ec_bound())
+ WARN("The erroneous connection coverage threshold may have been determined improperly\n");
+ }
+}
+
+
+#if 0
+void corrected_and_save_reads(const conj_graph_pack& gp) {
+ //saving corrected reads
+ //todo read input files, correct, save and use on the next iteration
+
+ auto_ptr<io::IReader<io::PairedReadSeq>> paired_stream =
+ paired_binary_multireader(false, /*insert_size*/0);
+ io::ModifyingWrapper<io::PairedReadSeq> refined_paired_stream(
+ *paired_stream,
+ GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
+
+ auto_ptr<io::IReader<io::SingleReadSeq>> single_stream =
+ single_binary_multireader(false, /*include_paired_reads*/false);
+ io::ModifyingWrapper<io::SingleReadSeq> refined_single_stream(
+ *single_stream,
+ GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
+
+ if (cfg::get().graph_read_corr.binary) {
+ INFO("Correcting paired reads");
+
+ io::BinaryWriter paired_converter(
+ cfg::get().paired_read_prefix + "_cor", cfg::get().max_threads,
+ cfg::get().buffer_size);
+ paired_converter.ToBinary(refined_paired_stream);
+
+ INFO("Correcting single reads");
+ io::BinaryWriter single_converter(
+ cfg::get().single_read_prefix + "_cor", cfg::get().max_threads,
+ cfg::get().buffer_size);
+ single_converter.ToBinary(refined_single_stream);
+ } else {
+ //save in fasta
+ VERIFY(false);
+ }
+
+ INFO("Error correction done");
+}
+#endif
+
+} //debruijn_graph
diff --git a/src/modules/stages/simplification.hpp b/src/common/stages/simplification.hpp
similarity index 100%
rename from src/modules/stages/simplification.hpp
rename to src/common/stages/simplification.hpp
diff --git a/src/common/stages/simplification_pipeline/graph_simplification.hpp b/src/common/stages/simplification_pipeline/graph_simplification.hpp
new file mode 100644
index 0000000..99937ed
--- /dev/null
+++ b/src/common/stages/simplification_pipeline/graph_simplification.hpp
@@ -0,0 +1,678 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * graph_simplification.hpp
+ *
+ * Created on: Aug 12, 2011
+ * Author: sergey
+ */
+
+#pragma once
+
+#include "pipeline/config_struct.hpp"
+
+#include "modules/simplification/tip_clipper.hpp"
+#include "modules/simplification/complex_tip_clipper.hpp"
+#include "modules/simplification/bulge_remover.hpp"
+#include "modules/simplification/complex_bulge_remover.hpp"
+#include "modules/simplification/erroneous_connection_remover.hpp"
+#include "modules/simplification/relative_coverage_remover.hpp"
+#include "modules/simplification/mf_ec_remover.hpp"
+#include "modules/simplification/parallel_simplification_algorithms.hpp"
+#include "stages/simplification_pipeline/simplification_settings.hpp"
+
+#include "modules/graph_read_correction.hpp"
+
+#include "assembly_graph/graph_support/chimera_stats.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+
+#include "assembly_graph/core/graph.hpp"
+
+#include "visualization/graph_colorer.hpp"
+#include "utils/standard_base.hpp"
+
+namespace debruijn {
+
+namespace simplification {
+
+//todo remove this line
+using namespace debruijn_graph;
+
+template<class Graph>
+using AlgoPtr = std::shared_ptr<omnigraph::PersistentAlgorithmBase<Graph>>;
+
+template<class Graph>
+using EdgeConditionT = func::TypedPredicate<typename Graph::EdgeId>;
+
+template<class Graph>
+class ConditionParser {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &g_;
+ string next_token_;
+ string input_;
+ const SimplifInfoContainer settings_;
+ size_t curr_iteration_;
+ size_t iteration_cnt_;
+ std::queue<string> tokenized_input_;
+
+ size_t max_length_bound_;
+ double max_coverage_bound_;
+
+ string ReadNext() {
+ if (!tokenized_input_.empty()) {
+ next_token_ = tokenized_input_.front();
+ tokenized_input_.pop();
+ } else {
+ next_token_ = "";
+ }
+ return next_token_;
+ }
+
+ template<typename T>
+ bool RelaxMax(T &cur_max, T t) {
+ if (t > cur_max) {
+ cur_max = t;
+ return true;
+ }
+ return false;
+ }
+
+ template<typename T>
+ bool RelaxMin(T &cur_min, T t) {
+ if (t < cur_min) {
+ cur_min = t;
+ return true;
+ }
+ return false;
+ }
+
+ double GetCoverageBound() {
+ if (next_token_ == "auto") {
+ return settings_.detected_coverage_bound();
+ } else {
+ return std::stod(next_token_);
+ }
+ }
+
+ func::TypedPredicate<EdgeId> ParseCondition(size_t &min_length_bound,
+ double &min_coverage_bound) {
+ if (next_token_ == "tc_lb") {
+ double length_coeff = std::stod(ReadNext());
+
+ DEBUG("Creating tip length bound. Coeff " << length_coeff);
+ size_t length_bound = LengthThresholdFinder::MaxTipLength(
+ settings_.read_length(), g_.k(), length_coeff);
+
+ DEBUG("Length bound " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+
+ } else if (next_token_ == "rlmk") {
+ //Read length minus k
+ VERIFY_MSG(settings_.read_length() > g_.k(), "Read length must be greater than k");
+ DEBUG("Creating (rl - k) bound");
+ size_t length_bound = settings_.read_length() - g_.k();
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+
+ } else if (next_token_ == "to_ec_lb") {
+ double length_coeff = std::stod(ReadNext());
+
+ DEBUG( "Creating length bound for erroneous connections originated from tip merging. Coeff " << length_coeff);
+ size_t length_bound =
+ LengthThresholdFinder::MaxTipOriginatedECLength(
+ settings_.read_length(), g_.k(), length_coeff);
+
+ DEBUG("Length bound " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+
+ } else if (next_token_ == "ec_lb") {
+ size_t length_coeff = std::stoll(ReadNext());
+
+ DEBUG("Creating ec length bound. Coeff " << length_coeff);
+ size_t length_bound =
+ LengthThresholdFinder::MaxErroneousConnectionLength(
+ g_.k(), length_coeff);
+
+ DEBUG("Length bound " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+ } else if (next_token_ == "lb") {
+ size_t length_bound = std::stoll(ReadNext());
+
+ DEBUG("Creating length bound. Value " << length_bound);
+
+ RelaxMin(min_length_bound, length_bound);
+ DEBUG("Min length bound - " << min_length_bound);
+ return LengthUpperBound<Graph>(g_, length_bound);
+ } else if (next_token_ == "cb") {
+ ReadNext();
+ double cov_bound = GetCoverageBound();
+ DEBUG("Creating coverage upper bound " << cov_bound);
+ RelaxMin(min_coverage_bound, cov_bound);
+ return CoverageUpperBound<Graph>(g_, cov_bound);
+ } else if (next_token_ == "icb") {
+ VERIFY(iteration_cnt_ != -1ul && curr_iteration_ != -1ul);
+ ReadNext();
+ double cov_bound = GetCoverageBound();
+ cov_bound = cov_bound / (double) iteration_cnt_ * (double) (curr_iteration_ + 1);
+ DEBUG("Creating iterative coverage upper bound " << cov_bound);
+ RelaxMin(min_coverage_bound, cov_bound);
+ return CoverageUpperBound<Graph>(g_, cov_bound);
+ } else if (next_token_ == "nbr") {
+ return NotBulgeECCondition<Graph>(g_);
+ } else if (next_token_ == "rctc") {
+ ReadNext();
+ DEBUG("Creating relative cov tip cond " << next_token_);
+ return RelativeCoverageTipCondition<Graph>(g_, std::stod(next_token_));
+ } else if (next_token_ == "disabled") {
+ DEBUG("Creating disabling condition");
+ return func::AlwaysFalse<EdgeId>();
+ } else if (next_token_ == "mmm") {
+ ReadNext();
+ DEBUG("Creating max mismatches cond " << next_token_);
+ return MismatchTipCondition<Graph>(g_, std::stoll(next_token_));
+ } else {
+ VERIFY(false);
+ return func::AlwaysTrue<EdgeId>();
+ }
+ }
+
+ func::TypedPredicate<EdgeId> ParseConjunction(size_t &min_length_bound,
+ double &min_coverage_bound) {
+ func::TypedPredicate<EdgeId> answer = func::AlwaysTrue<EdgeId>();
+ VERIFY(next_token_ == "{");
+ ReadNext();
+ while (next_token_ != "}") {
+ answer = func::And(answer,
+ ParseCondition(min_length_bound, min_coverage_bound));
+ ReadNext();
+ }
+ return answer;
+ }
+
+public:
+
+ ConditionParser(const Graph &g, string input, const SimplifInfoContainer &settings,
+ size_t curr_iteration = -1ul, size_t iteration_cnt = -1ul)
+ : g_(g),
+ input_(input),
+ settings_(settings),
+ curr_iteration_(curr_iteration),
+ iteration_cnt_(iteration_cnt),
+ max_length_bound_(0),
+ max_coverage_bound_(0.) {
+ DEBUG("Creating parser for string " << input);
+ using namespace boost;
+ vector<string> tmp_tokenized_input;
+ boost::split(tmp_tokenized_input, input_, boost::is_any_of(" ,;"), boost::token_compress_on);
+ for (auto it = tmp_tokenized_input.begin();
+ it != tmp_tokenized_input.end(); ++it) {
+ tokenized_input_.push(*it);
+ }
+ ReadNext();
+ }
+
+ func::TypedPredicate<EdgeId> operator()() {
+ DEBUG("Parsing");
+ func::TypedPredicate<EdgeId> answer = func::AlwaysFalse<EdgeId>();
+ VERIFY_MSG(next_token_ == "{", "Expected \"{\", but next token was " << next_token_);
+ while (next_token_ == "{") {
+ size_t min_length_bound = numeric_limits<size_t>::max();
+ double min_coverage_bound = numeric_limits<double>::max();
+ answer = func::Or(answer,
+ ParseConjunction(min_length_bound, min_coverage_bound));
+ RelaxMax(max_length_bound_, min_length_bound);
+ RelaxMax(max_coverage_bound_, min_coverage_bound);
+ ReadNext();
+ }
+ return answer;
+ }
+
+ size_t max_length_bound() const {
+ return max_length_bound_;
+ }
+
+ double max_coverage_bound() const {
+ return max_coverage_bound_;
+ }
+
+private:
+ DECL_LOGGER("ConditionParser");
+};
+
+template<class Graph>
+class EditDistanceTrackingCallback {
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &g_;
+
+public:
+ EditDistanceTrackingCallback(const Graph &g)
+ : g_(g) {
+ }
+
+ bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
+ vector<Sequence> path_sequences;
+ for (EdgeId e : path) {
+ path_sequences.push_back(g_.EdgeNucls(e));
+ }
+ Sequence path_sequence(
+ MergeOverlappingSequences(path_sequences, g_.k()));
+ size_t dist = EditDistance(g_.EdgeNucls(edge), path_sequence);
+ TRACE( "Bulge sequences with distance " << dist << " were " << g_.EdgeNucls(edge) << " and " << path_sequence);
+ return true;
+ }
+
+private:
+ DECL_LOGGER("EditDistanceTrackingCallback");
+};
+
+//enabling tip projection
+template<class gp_t>
+EdgeRemovalHandlerF<typename gp_t::graph_t> WrapWithProjectionCallback(
+ gp_t &gp,
+ EdgeRemovalHandlerF<typename gp_t::graph_t> removal_handler) {
+ typedef typename gp_t::graph_t Graph;
+ typedef typename Graph::EdgeId EdgeId;
+ TipsProjector<gp_t> tip_projector(gp);
+
+ EdgeRemovalHandlerF<Graph> projecting_callback = std::bind(&TipsProjector<gp_t>::ProjectTip,
+ tip_projector, std::placeholders::_1);
+
+ return func::CombineCallbacks<EdgeId>(std::ref(removal_handler), projecting_callback);
+}
+
+template<class Graph>
+class LowCoverageEdgeRemovingAlgorithm : public PersistentProcessingAlgorithm<Graph,
+ typename Graph::EdgeId,
+ omnigraph::CoverageComparator<Graph>> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef PersistentProcessingAlgorithm<Graph, EdgeId, omnigraph::CoverageComparator<Graph>> base;
+
+ const SimplifInfoContainer simplif_info_;
+ const std::string condition_str_;
+ EdgeRemover<Graph> edge_remover_;
+
+ func::TypedPredicate<EdgeId> remove_condition_;
+ func::TypedPredicate<EdgeId> proceed_condition_;
+
+protected:
+
+ void PrepareIteration(size_t it_cnt, size_t total_it_estimate) override {
+ TRACE("Preparing iteration " << it_cnt << " out of total estimate " << total_it_estimate);
+ ConditionParser<Graph> parser(this->g(), condition_str_,
+ simplif_info_, it_cnt, total_it_estimate);
+ remove_condition_ = omnigraph::AddAlternativesPresenceCondition(this->g(), parser());
+ TRACE("Updated remove condition");
+ proceed_condition_ = CoverageUpperBound<Graph>(this->g(), parser.max_coverage_bound());
+ TRACE("Updated proceed condition up to coverage " << parser.max_coverage_bound());
+ }
+
+ bool Proceed(EdgeId e) const override {
+ return proceed_condition_(e);
+ }
+
+ bool Process(EdgeId e) override {
+ TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
+ if (remove_condition_(e)) {
+ TRACE("Check passed, removing");
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+ TRACE("Check not passed");
+ return false;
+ }
+
+public:
+ LowCoverageEdgeRemovingAlgorithm(Graph &g,
+ const std::string &condition_str,
+ const SimplifInfoContainer &simplif_info,
+ std::function<void(EdgeId)> removal_handler = nullptr,
+ bool canonical_only = true,
+ bool track_changes = true,
+ size_t total_iteration_estimate = -1ul)
+ : base(g, nullptr,
+ canonical_only,
+ omnigraph::CoverageComparator<Graph>(g),
+ track_changes,
+ total_iteration_estimate),
+ simplif_info_(simplif_info),
+ condition_str_(condition_str),
+ edge_remover_(g, removal_handler),
+ remove_condition_(func::AlwaysFalse<EdgeId>()),
+ proceed_condition_(func::AlwaysTrue<EdgeId>()) {
+
+ ConditionParser<Graph> parser(g, condition_str, simplif_info,
+ total_iteration_estimate - 1, total_iteration_estimate);
+ this->interest_el_finder_ =
+ std::make_shared<omnigraph::ParallelInterestingElementFinder<Graph>>(
+ AddAlternativesPresenceCondition(g, parser()),
+ simplif_info.chunk_cnt());
+ }
+
+private:
+ DECL_LOGGER("LowCoverageEdgeRemovingAlgorithm");
+};
+
+template<class Graph>
+AlternativesAnalyzer<Graph> ParseBRConfig(const Graph &g,
+ const config::debruijn_config::simplification::bulge_remover &config) {
+ size_t max_length = LengthThresholdFinder::MaxBulgeLength(
+ g.k(), config.max_bulge_length_coefficient,
+ config.max_additive_length_coefficient);
+
+ DEBUG("Length bound " << max_length);
+
+ return AlternativesAnalyzer<Graph>(g, config.max_coverage,
+ max_length,
+ config.max_relative_coverage,
+ config.max_delta,
+ config.max_relative_delta,
+ config.max_number_edges);
+}
+
+template<class Graph>
+AlgoPtr<Graph> SelfConjugateEdgeRemoverInstance(Graph &g, const string &condition_str,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler = 0) {
+ ConditionParser<Graph> parser(g, condition_str, info);
+ auto condition = func::And(SelfConjugateCondition<Graph>(g), parser());
+
+ return std::make_shared<omnigraph::ParallelEdgeRemovingAlgorithm<Graph>>(g,
+ condition,
+ info.chunk_cnt(),
+ removal_handler,
+ /*canonical_only*/true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> RelativeCoverageComponentRemoverInstance (
+ Graph &g,
+ const FlankingCoverage<Graph> &flanking_cov,
+ const config::debruijn_config::simplification::relative_coverage_comp_remover &rcc_config,
+ const SimplifInfoContainer &info,
+ typename ComponentRemover<Graph>::HandlerF removal_handler = nullptr) {
+ if (!rcc_config.enabled) {
+ INFO("Removal of relatively low covered connections disabled");
+ return nullptr;
+ }
+
+ // INFO("Removing relatively low covered connections");
+ size_t connecting_path_length_bound = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), rcc_config.max_ec_length_coefficient);
+
+ std::string pics_dir = "";
+
+ double max_coverage = math::ge(rcc_config.max_coverage_coeff, 0.)
+ ? info.detected_coverage_bound() * rcc_config.max_coverage_coeff
+ : std::numeric_limits<double>::max();
+
+ return std::make_shared<omnigraph::simplification::relative_coverage::
+ RelativeCoverageComponentRemover<Graph>>(g,
+ info.chunk_cnt(),
+ flanking_cov,
+ rcc_config.coverage_gap,
+ size_t(double(info.read_length()) * rcc_config.length_coeff),
+ size_t(double(info.read_length()) * rcc_config.tip_allowing_length_coeff),
+ connecting_path_length_bound,
+ max_coverage,
+ removal_handler, rcc_config.vertex_count_limit, pics_dir);
+}
+
+template<class Graph>
+AlgoPtr<Graph> RelativelyLowCoverageDisconnectorInstance(Graph &g,
+ const FlankingCoverage<Graph> &flanking_cov,
+ const config::debruijn_config::simplification::relative_coverage_edge_disconnector &rced_config,
+ const SimplifInfoContainer &info) {
+ if (!rced_config.enabled) {
+ INFO("Disconnection of relatively low covered edges disabled");
+ return nullptr;
+ }
+
+ return std::make_shared<omnigraph::DisconnectionAlgorithm<Graph>>(g,
+ omnigraph::simplification::relative_coverage::
+ RelativeCovDisconnectionCondition<Graph>(g, flanking_cov, rced_config.diff_mult, rced_config.edge_sum),
+ info.chunk_cnt(),
+ nullptr);
+}
+
+template<class Graph>
+AlgoPtr<Graph> ComplexBRInstance(
+ Graph &g,
+ config::debruijn_config::simplification::complex_bulge_remover cbr_config,
+ const SimplifInfoContainer &info) {
+ if (!cbr_config.enabled)
+ return nullptr;
+ size_t max_length = (size_t) ((double) g.k() * cbr_config.max_relative_length);
+ size_t max_diff = cbr_config.max_length_difference;
+ return std::make_shared<omnigraph::complex_br::ComplexBulgeRemover<Graph>>(g, max_length,
+ max_diff, info.chunk_cnt());
+}
+
+template<class Graph>
+AlgoPtr<Graph> ComplexTipClipperInstance(Graph &g,
+ config::debruijn_config::simplification::complex_tip_clipper ctc_conf,
+ const SimplifInfoContainer &info,
+ typename ComponentRemover<Graph>::HandlerF removal_handler = 0) {
+ if (!ctc_conf.enabled) {
+ INFO("Complex tip clipping disabled");
+ return nullptr;
+ }
+
+ ConditionParser<Graph> parser(g, ctc_conf.condition, info);
+ parser();
+
+ return std::make_shared<omnigraph::ComplexTipClipper<Graph>>(g, ctc_conf.max_relative_coverage,
+ ctc_conf.max_edge_len,
+ parser.max_length_bound(), info.chunk_cnt(),
+ "", removal_handler);
+}
+
+template<class Graph>
+AlgoPtr<Graph> IsolatedEdgeRemoverInstance(Graph &g,
+ config::debruijn_config::simplification::isolated_edges_remover ier,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler = 0) {
+ if (!ier.enabled) {
+ return nullptr;
+ }
+ size_t max_length_any_cov = std::max(info.read_length(), ier.max_length_any_cov);
+
+ auto condition = func::And(IsolatedEdgeCondition<Graph>(g),
+ func::Or(LengthUpperBound<Graph>(g, max_length_any_cov),
+ func::And(LengthUpperBound<Graph>(g, ier.max_length),
+ CoverageUpperBound<Graph>(g, ier.max_coverage))));
+
+ return std::make_shared<omnigraph::ParallelEdgeRemovingAlgorithm<Graph>>(g,
+ condition,
+ info.chunk_cnt(),
+ removal_handler,
+ /*canonical_only*/true);
+}
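
In words, the predicate assembled above marks an edge for removal only if it is isolated and either short enough regardless of coverage, or both short and low-covered; an equivalent boolean sketch:

    // IsolatedEdgeCondition(e)
    //     && (Length(e) <= std::max(info.read_length(), ier.max_length_any_cov)
    //         || (Length(e) <= ier.max_length && Coverage(e) <= ier.max_coverage))
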
+
+template<class Graph>
+AlgoPtr<Graph> RelativeECRemoverInstance(Graph &g,
+ const config::debruijn_config::simplification::relative_coverage_ec_remover &rcec_config,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler) {
+ if (!rcec_config.enabled)
+ return nullptr;
+
+ return std::make_shared<omnigraph::ParallelEdgeRemovingAlgorithm<Graph>>(g,
+ AddRelativeCoverageECCondition(g, rcec_config.rcec_ratio,
+ AddAlternativesPresenceCondition(g, func::TypedPredicate<typename Graph::EdgeId>
+ (LengthUpperBound<Graph>(g, rcec_config.max_ec_length)))),
+ info.chunk_cnt(), removal_handler, /*canonical_only*/true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> ECRemoverInstance(Graph &g,
+ const config::debruijn_config::simplification::erroneous_connections_remover &ec_config,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler = nullptr,
+ size_t iteration_cnt = 1) {
+ if (ec_config.condition.empty())
+ return nullptr;
+
+ return std::make_shared<LowCoverageEdgeRemovingAlgorithm<Graph>>(
+ g, ec_config.condition, info, removal_handler,
+ /*canonical only*/ true, /*track changes*/ true, iteration_cnt);
+}
+
+template<class Graph>
+AlgoPtr<Graph> TipClipperInstance(Graph &g,
+ const EdgeConditionT<Graph> &condition,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler = nullptr,
+ bool track_changes = true) {
+ return make_shared<omnigraph::ParallelEdgeRemovingAlgorithm<Graph, omnigraph::LengthComparator<Graph>>>(g,
+ AddTipCondition(g, condition),
+ info.chunk_cnt(),
+ removal_handler,
+ /*canonical_only*/true,
+ LengthComparator<Graph>(g),
+ track_changes);
+}
+
+template<class Graph>
+AlgoPtr<Graph> TipClipperInstance(Graph &g,
+ const config::debruijn_config::simplification::tip_clipper &tc_config,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler = nullptr) {
+ if (tc_config.condition.empty())
+ return nullptr;
+
+ ConditionParser<Graph> parser(g, tc_config.condition, info);
+ auto condition = parser();
+ return TipClipperInstance(g, condition, info, removal_handler, /*track changes*/true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> DeadEndInstance(Graph &g,
+ const config::debruijn_config::simplification::dead_end_clipper &dead_end_config,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler) {
+ if (!dead_end_config.enabled || dead_end_config.condition.empty())
+ return nullptr;
+
+ ConditionParser<Graph> parser(g, dead_end_config.condition, info);
+ auto condition = parser();
+ return make_shared<omnigraph::ParallelEdgeRemovingAlgorithm<Graph, omnigraph::LengthComparator<Graph>>>(g,
+ AddDeadEndCondition(g, condition), info.chunk_cnt(), removal_handler, /*canonical_only*/true,
+ LengthComparator<Graph>(g), /*track changes*/true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> TopologyTipClipperInstance(
+ Graph &g,
+ const config::debruijn_config::simplification::topology_tip_clipper &ttc_config,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler = nullptr) {
+
+ auto condition
+ = func::And(LengthUpperBound<Graph>(g,
+ LengthThresholdFinder::MaxTipLength(info.read_length(), g.k(), ttc_config.length_coeff)),
+ DefaultUniquenessPlausabilityCondition<Graph>(g,
+ ttc_config.uniqueness_length, ttc_config.plausibility_length));
+
+ return TipClipperInstance(g,
+ condition, info, removal_handler, /*track changes*/false);
+}
+
+template<class Graph>
+AlgoPtr<Graph> BRInstance(Graph &g,
+ const config::debruijn_config::simplification::bulge_remover &br_config,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler = nullptr) {
+ if (!br_config.enabled || (br_config.main_iteration_only && !info.main_iteration())) {
+ return nullptr;
+ }
+
+ auto alternatives_analyzer = ParseBRConfig(g, br_config);
+
+ auto candidate_finder = std::make_shared<omnigraph::ParallelInterestingElementFinder<Graph>>(
+ omnigraph::NecessaryBulgeCondition(g,
+ alternatives_analyzer.max_length(),
+ alternatives_analyzer.max_coverage()),
+ info.chunk_cnt());
+ if (br_config.parallel) {
+ INFO("Creating parallel br instance");
+ return make_shared<omnigraph::ParallelBulgeRemover<Graph>>(g,
+ candidate_finder,
+ br_config.buff_size,
+ br_config.buff_cov_diff,
+ br_config.buff_cov_rel_diff,
+ alternatives_analyzer,
+ nullptr,
+ removal_handler,
+ /*track_changes*/true);
+ } else {
+ INFO("Creating br instance");
+ return make_shared<omnigraph::BulgeRemover<Graph>>(g,
+ candidate_finder,
+ alternatives_analyzer,
+ nullptr,
+ removal_handler,
+ /*track_changes*/true);
+ }
+}
+
+template<class Graph>
+AlgoPtr<Graph> LowFlankDisconnectorInstance(Graph &g,
+ const FlankingCoverage<Graph> &flanking_cov,
+ double cov_bound,
+ const SimplifInfoContainer &info,
+ EdgeRemovalHandlerF<Graph> removal_handler) {
+ if (math::ls(cov_bound, 0.)) {
+ INFO("Flanking coverage based disconnection disabled");
+ return nullptr;
+ }
+
+ auto condition = [&,cov_bound] (EdgeId e) {
+ return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1
+ && math::le(flanking_cov.CoverageOfStart(e), cov_bound);
+ };
+
+ return make_shared<omnigraph::DisconnectionAlgorithm<Graph>>(g, condition,
+ info.chunk_cnt(),
+ removal_handler);
+}
+
+template<class Graph>
+bool RemoveHiddenLoopEC(Graph &g,
+ const FlankingCoverage<Graph> &flanking_cov,
+ double determined_coverage_threshold,
+ config::debruijn_config::simplification::hidden_ec_remover her_config,
+ EdgeRemovalHandlerF<Graph> removal_handler) {
+ if (her_config.enabled) {
+ INFO("Removing loops and rc loops with erroneous connections");
+ ECLoopRemover<Graph> hc(g, flanking_cov,
+ determined_coverage_threshold,
+ her_config.relative_threshold, removal_handler);
+ bool res = hc.Run();
+ hc.PrintLoopStats();
+ return res;
+ }
+ return false;
+}
+
+}
+}
diff --git a/src/common/stages/simplification_pipeline/rna_simplification.hpp b/src/common/stages/simplification_pipeline/rna_simplification.hpp
new file mode 100644
index 0000000..050fa61
--- /dev/null
+++ b/src/common/stages/simplification_pipeline/rna_simplification.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "assembly_graph/graph_support/parallel_processing.hpp"
+#include "stages/simplification_pipeline/simplification_settings.hpp"
+
+namespace debruijn {
+namespace simplification {
+
+template<class Graph>
+AlgoPtr<Graph> ShortPolyATEdgesRemoverInstance(Graph &g, size_t max_length, EdgeRemovalHandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
+ auto condition = func::And(ATCondition<Graph>(g, 0.8, max_length, false), LengthUpperBound<Graph>(g, 1));
+ return std::make_shared<omnigraph::ParallelEdgeRemovingAlgorithm<Graph>>(g, condition, chunk_cnt, removal_handler, true);
+}
+
+template<class Graph>
+AlgoPtr<Graph> ATTipClipperInstance(Graph &g, EdgeRemovalHandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
+//TODO: review params 0.8, 200?
+ return std::make_shared<omnigraph::ParallelEdgeRemovingAlgorithm<Graph>>(g, ATCondition<Graph>(g, 0.8, 200, true), chunk_cnt, removal_handler, true);
+}
+
+}
+}
diff --git a/src/common/stages/simplification_pipeline/simplification_settings.hpp b/src/common/stages/simplification_pipeline/simplification_settings.hpp
new file mode 100644
index 0000000..ae0edf7
--- /dev/null
+++ b/src/common/stages/simplification_pipeline/simplification_settings.hpp
@@ -0,0 +1,112 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "pipeline/config_struct.hpp"
+
+namespace debruijn {
+
+namespace simplification {
+
+class LengthThresholdFinder {
+public:
+ static size_t MaxTipLength(size_t read_length, size_t k, double coeff) {
+ return std::max((size_t) math::round((double)std::min(k, read_length / 2) * coeff),
+ read_length);
+ }
+
+ static size_t MaxBulgeLength(size_t k, double coeff,
+ size_t additive_coeff) {
+ return std::max((size_t) math::round((double)k * coeff), k + additive_coeff);
+ }
+
+ static size_t MaxErroneousConnectionLength(size_t k, size_t param) {
+ return k + param;
+ }
+
+ static size_t MaxTipOriginatedECLength(size_t read_length, size_t k,
+ double coeff) {
+ return 2 * MaxTipLength(read_length, k, coeff) - 1;
+ }
+};
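
Worked example with hypothetical values (read_length = 100, k = 55, coeff = 3.5, additive = 100): MaxTipLength gives max(round(min(55, 50) * 3.5), 100) = max(175, 100) = 175, and MaxBulgeLength(55, 2.0, 100) gives max(round(55 * 2.0), 55 + 100) = max(110, 155) = 155.
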
+
+//todo use GenomicInfo as field!
+class SimplifInfoContainer {
+ size_t read_length_;
+ double detected_mean_coverage_;
+ double detected_coverage_bound_;
+ bool main_iteration_;
+ size_t chunk_cnt_;
+ debruijn_graph::config::pipeline_type mode_;
+
+public:
+ SimplifInfoContainer(debruijn_graph::config::pipeline_type mode) :
+ read_length_(-1ul),
+ detected_mean_coverage_(-1.0),
+ detected_coverage_bound_(-1.0),
+ main_iteration_(false),
+ chunk_cnt_(-1ul),
+ mode_(mode) {
+ }
+
+ size_t read_length() const {
+ VERIFY(read_length_ != -1ul);
+ return read_length_;
+ }
+
+ double detected_mean_coverage() const {
+ VERIFY(math::ge(detected_mean_coverage_, 0.));
+ return detected_mean_coverage_;
+ }
+
+ double detected_coverage_bound() const {
+ VERIFY(math::ge(detected_coverage_bound_, 0.));
+ return detected_coverage_bound_;
+ }
+
+ bool main_iteration() const {
+ return main_iteration_;
+ }
+
+ size_t chunk_cnt() const {
+ VERIFY(chunk_cnt_ != -1ul);
+ return chunk_cnt_;
+ }
+
+ debruijn_graph::config::pipeline_type mode() const {
+ return mode_;
+ }
+
+ SimplifInfoContainer& set_read_length(size_t read_length) {
+ read_length_ = read_length;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_detected_coverage_bound(double detected_coverage_bound) {
+ detected_coverage_bound_ = detected_coverage_bound;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_detected_mean_coverage(double detected_mean_coverage) {
+ detected_mean_coverage_ = detected_mean_coverage;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_main_iteration(bool main_iteration) {
+ main_iteration_ = main_iteration;
+ return *this;
+ }
+
+ SimplifInfoContainer& set_chunk_cnt(size_t chunk_cnt) {
+ chunk_cnt_ = chunk_cnt;
+ return *this;
+ }
+};
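
Since every setter returns *this, callers can configure the container in one chained expression; a sketch with placeholder values (pipeline_type::base is assumed to be a valid mode in the config):

    SimplifInfoContainer info(debruijn_graph::config::pipeline_type::base);
    info.set_read_length(100)
        .set_detected_mean_coverage(30.0)
        .set_detected_coverage_bound(25.0)
        .set_main_iteration(true)
        .set_chunk_cnt(16);
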
+
+}
+
+}
diff --git a/src/common/stages/simplification_pipeline/single_cell_simplification.hpp b/src/common/stages/simplification_pipeline/single_cell_simplification.hpp
new file mode 100644
index 0000000..ae5a208
--- /dev/null
+++ b/src/common/stages/simplification_pipeline/single_cell_simplification.hpp
@@ -0,0 +1,142 @@
+#pragma once
+
+#include "pipeline/config_struct.hpp"
+#include "assembly_graph/graph_support/comparators.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "assembly_graph/graph_support/detail_coverage.hpp"
+#include "modules/simplification/erroneous_connection_remover.hpp"
+#include "modules/simplification/mf_ec_remover.hpp"
+#include "stages/simplification_pipeline/simplification_settings.hpp"
+
+namespace debruijn {
+namespace simplification {
+
+//deprecated
+template<class Graph>
+bool RemoveErroneousEdgesInCoverageOrder(Graph &g,
+ func::TypedPredicate<typename Graph::EdgeId> removal_condition,
+ double max_coverage,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
+ AddAlternativesPresenceCondition(g, removal_condition),
+ removal_handler);
+
+ return erroneous_edge_remover.Run(omnigraph::CoverageComparator<Graph>(g),
+ omnigraph::CoverageUpperBound<Graph>(g, max_coverage));
+}
+
+//deprecated
+template<class Graph>
+bool RemoveErroneousEdgesInLengthOrder(Graph &g,
+ func::TypedPredicate<typename Graph::EdgeId> removal_condition,
+ size_t max_length,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
+ AddAlternativesPresenceCondition(g, removal_condition),
+ removal_handler);
+
+ return erroneous_edge_remover.Run(omnigraph::LengthComparator<Graph>(g),
+ omnigraph::LengthUpperBound<Graph>(g, max_length));
+}
+
+template<class Graph>
+bool TopologyRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::topology_based_ec_remover& tec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing connections based on topology");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), tec_config.max_ec_length_coefficient);
+
+ func::TypedPredicate<typename Graph::EdgeId>
+ condition(omnigraph::DefaultUniquenessPlausabilityCondition<Graph>(g, tec_config.uniqueness_length, tec_config.plausibility_length));
+
+ return RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
+}
+
+template<class Graph>
+bool MultiplicityCountingRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::topology_based_ec_remover& tec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing connections based on topological multiplicity counting");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), tec_config.max_ec_length_coefficient);
+
+ func::TypedPredicate<typename Graph::EdgeId>
+ condition(omnigraph::MultiplicityCountingCondition<Graph>(g, tec_config.uniqueness_length,
+ /*plausibility*/ MakePathLengthLowerBound(g,
+ omnigraph::PlausiblePathFinder<Graph>(g, 2 * tec_config.plausibility_length), tec_config.plausibility_length)));
+
+ return RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
+}
+
+template<class Graph>
+bool RemoveThorns(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::interstrand_ec_remover& isec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing interstrand connections");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), isec_config.max_ec_length_coefficient);
+
+ auto condition
+ = func::And(omnigraph::LengthUpperBound<Graph>(g, max_length),
+ func::And(omnigraph::AdditionalMDAThornCondition<Graph>(g, isec_config.uniqueness_length),
+ omnigraph::TopologicalThornCondition<Graph>(g, isec_config.span_distance)));
+
+ return RemoveErroneousEdgesInCoverageOrder(g, condition, numeric_limits<double>::max(), removal_handler);
+}
+
+template<class Graph>
+bool TopologyReliabilityRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::tr_based_ec_remover& trec_config,
+ std::function<void(typename Graph::EdgeId)> removal_handler) {
+ INFO("Removing connections based on topology and reliable coverage");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), trec_config.max_ec_length_coefficient);
+
+ auto condition
+ = func::And(omnigraph::CoverageUpperBound<Graph>(g, trec_config.unreliable_coverage),
+ omnigraph::PredicateUniquenessPlausabilityCondition<Graph>(g,
+ /*uniqueness*/omnigraph::MakePathLengthLowerBound(g, omnigraph::UniquePathFinder<Graph>(g), trec_config.uniqueness_length),
+ /*plausibility*/func::AlwaysTrue<typename Graph::EdgeId>()));
+
+ return RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
+}
+
+template<class Graph>
+bool MaxFlowRemoveErroneousEdges(
+ Graph &g,
+ const debruijn_graph::config::debruijn_config::simplification::max_flow_ec_remover& mfec_config,
+ omnigraph::EdgeRemovalHandlerF<Graph> removal_handler = 0) {
+ if (!mfec_config.enabled)
+ return false;
+ INFO("Removing connections based on max flow strategy");
+ size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
+ g.k(), (size_t) mfec_config.max_ec_length_coefficient);
+ omnigraph::MaxFlowECRemover<Graph> erroneous_edge_remover(
+ g, max_length, mfec_config.uniqueness_length,
+ mfec_config.plausibility_length, removal_handler);
+ return erroneous_edge_remover.Process();
+}
+
+template<class Graph>
+bool RemoveHiddenEC(Graph& g,
+ const omnigraph::FlankingCoverage<Graph>& flanking_cov,
+ const debruijn_graph::config::debruijn_config::simplification::hidden_ec_remover& her_config,
+ const SimplifInfoContainer& info,
+ omnigraph::EdgeRemovalHandlerF<Graph> removal_handler) {
+ if (her_config.enabled) {
+ INFO("Removing hidden erroneous connections");
+ omnigraph::HiddenECRemover<Graph> remover(g, info.chunk_cnt(), flanking_cov, her_config.uniqueness_length,
+ her_config.unreliability_threshold, info.detected_coverage_bound(),
+ her_config.relative_threshold, removal_handler);
+ return LoopedRun(remover) > 0;
+ }
+ return false;
+}
+
+}
+}
diff --git a/src/common/utils/CMakeLists.txt b/src/common/utils/CMakeLists.txt
new file mode 100644
index 0000000..40c2d20
--- /dev/null
+++ b/src/common/utils/CMakeLists.txt
@@ -0,0 +1,20 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(utils CXX)
+
+set(utils_src
+ copy_file.cpp
+ path_helper.cpp
+ logger/logger_impl.cpp)
+
+if (READLINE_FOUND)
+ set(utils_src ${utils_src} autocompletion.cpp)
+endif()
+
+add_library(utils STATIC
+ ${utils_src})
diff --git a/src/modules/dev_support/autocompletion.cpp b/src/common/utils/autocompletion.cpp
similarity index 100%
rename from src/modules/dev_support/autocompletion.cpp
rename to src/common/utils/autocompletion.cpp
diff --git a/src/modules/dev_support/autocompletion.hpp b/src/common/utils/autocompletion.hpp
similarity index 100%
rename from src/modules/dev_support/autocompletion.hpp
rename to src/common/utils/autocompletion.hpp
diff --git a/src/common/utils/copy_file.cpp b/src/common/utils/copy_file.cpp
new file mode 100644
index 0000000..289ff34
--- /dev/null
+++ b/src/common/utils/copy_file.cpp
@@ -0,0 +1,158 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "copy_file.hpp"
+
+#include "utils/path_helper.hpp"
+#include "utils/logger/logger.hpp"
+
+#include <boost/algorithm/string.hpp>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include <cstring>
+#include <cerrno>
+
+#include <unistd.h>
+#include <dirent.h>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace path {
+
+namespace details {
+
+using namespace path;
+
+void copy_file(std::string from_path, std::string to_path) {
+ using namespace std;
+
+ make_full_path(from_path);
+ make_full_path(to_path );
+
+ if (from_path == to_path)
+ return;
+
+ std::ifstream source(from_path, ios::binary);
+ std::ofstream dest (to_path.c_str() , ios::binary);
+
+ dest << source.rdbuf();
+}
+
+
+void hard_link(std::string from_path, std::string to_path) {
+ make_full_path(from_path);
+ make_full_path(to_path );
+
+ if (from_path == to_path)
+ return;
+
+ if (link(from_path.c_str(), to_path.c_str()) == -1) {
+ WARN("Failed to create link. Reason: " << strerror(errno) << ". Error code: " << errno << ". Copying instead");
+ copy_file(from_path, to_path);
+ }
+}
+
+files_t files_in_folder(std::string const& path) {
+ DIR *dp;
+ if ((dp = opendir(path.c_str())) == NULL)
+ throw std::runtime_error("can not open folder " + path);
+
+ files_t files;
+
+ struct dirent *dirp;
+ while ((dirp = readdir(dp)) != NULL)
+ if (dirp->d_type == DT_REG)
+ files.push_back(append_path(path, dirp->d_name));
+
+ closedir(dp);
+ return files;
+}
+
+files_t folders_in_folder(std::string const& path) {
+ DIR *dp;
+ if ((dp = opendir(path.c_str())) == NULL)
+ throw std::runtime_error("can not open folder " + path);
+
+ files_t folders;
+
+ struct dirent *dirp;
+ while ((dirp = readdir(dp)) != NULL)
+ if (dirp->d_type == DT_DIR) {
+ std::string folder = dirp->d_name;
+
+ if (folder != "." && folder != "..")
+ folders.push_back(append_path(path, folder));
+ }
+
+ closedir(dp);
+ return folders;
+}
+
+} // details
+
+path::files_t files_by_prefix(std::string const& path) {
+ using namespace details;
+ files_t files;
+
+ std::string folder(parent_path(path));
+ std::string prefix = filename(path);
+
+ files_t out_files;
+ const files_t all_files = files_in_folder(folder);
+
+ for (auto it = all_files.begin(); it != all_files.end(); ++it) // no std::copy_if before C++11
+ if (boost::starts_with(filename(*it), prefix))
+ out_files.push_back(*it);
+
+ return out_files;
+}
+
+void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
+ using namespace details;
+
+ for (auto it = files.begin(); it != files.end(); ++it) {
+ files_t files_to_copy = files_by_prefix(*it);
+
+ for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
+ copy_file(*it, append_path(to_folder, filename(*it)));
+ }
+}
+
+void link_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
+ using namespace details;
+
+ for (auto it = files.begin(); it != files.end(); ++it) {
+ files_t files_to_copy = files_by_prefix(*it);
+
+ for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
+ hard_link(*it, append_path(to_folder, filename(*it)));
+ }
+}
+
+void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive) {
+ using namespace details;
+
+ files_t files = files_in_folder(from_folder);
+
+ for (auto it = files.begin(); it != files.end(); ++it)
+ if (boost::ends_with(*it, ext))
+ copy_file(*it, append_path(to_folder, filename(*it)));
+
+ if (recursive) {
+ files_t folders = folders_in_folder(from_folder);
+
+ for (auto it = folders.begin(); it != folders.end(); ++it) {
+ std::string subdir = append_path(to_folder, filename(*it));
+ path::make_dir(subdir);
+ copy_files_by_ext(*it, subdir, ext, recursive);
+ }
+ }
+}
+
+}
diff --git a/src/common/utils/copy_file.hpp b/src/common/utils/copy_file.hpp
new file mode 100644
index 0000000..4f0e4ab
--- /dev/null
+++ b/src/common/utils/copy_file.hpp
@@ -0,0 +1,18 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/path_helper.hpp"
+#include <string>
+
+namespace path {
+
+path::files_t files_by_prefix(std::string const& path);
+void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder);
+void link_files_by_prefix(path::files_t const& files, std::string const& to_folder);
+void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive);
+
+}
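
A hypothetical usage sketch for the helpers declared above (paths are placeholders; path::files_t is assumed to be a vector of path strings):

    path::copy_files_by_ext("/tmp/in", "/tmp/out", ".info", /*recursive*/ true);
    path::files_t roots = {"/tmp/in/run1"};        // prefix paths, not exact file names
    path::link_files_by_prefix(roots, "/tmp/out");
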
diff --git a/src/common/utils/coverage_model/CMakeLists.txt b/src/common/utils/coverage_model/CMakeLists.txt
new file mode 100644
index 0000000..4df6767
--- /dev/null
+++ b/src/common/utils/coverage_model/CMakeLists.txt
@@ -0,0 +1,14 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(coverage_model CXX)
+
+add_library(coverage_model STATIC
+ kmer_coverage_model.cpp)
+
+target_link_libraries(coverage_model nlopt)
+
diff --git a/src/common/utils/coverage_model/kmer_coverage_model.cpp b/src/common/utils/coverage_model/kmer_coverage_model.cpp
new file mode 100644
index 0000000..ce77e11
--- /dev/null
+++ b/src/common/utils/coverage_model/kmer_coverage_model.cpp
@@ -0,0 +1,380 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "kmer_coverage_model.hpp"
+
+#include "utils/logger/logger.hpp"
+#include "utils/verify.hpp"
+#include "math/xmath.h"
+#include "math/smooth.hpp"
+
+#include <boost/math/special_functions/zeta.hpp>
+#include <boost/math/distributions/normal.hpp>
+#include <boost/math/distributions/skew_normal.hpp>
+#include <boost/math/distributions/geometric.hpp>
+#include <boost/math/distributions/pareto.hpp>
+
+#include <nlopt/nlopt.hpp>
+
+#include <vector>
+
+#include <cstring>
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+
+namespace utils {
+namespace coverage_model {
+
+using std::isfinite;
+
+static const size_t MaxCopy = 10;
+
+static double dzeta(double x, double p) {
+ return pow(x, -p - 1) / boost::math::zeta(p + 1);
+}
+
+static double perr(size_t i, double scale, double shape) {
+ return pow((1 + shape * ((double) (i - 1)) / scale), -1.0 / shape) -
+ pow((1 + shape * ((double) i) / scale), -1.0 / shape);
+}
+
+static double pgood(size_t i, double zp, double u, double sd, double shape,
+ double* mixprobs = NULL) {
+ double res = 0;
+
+ for (unsigned copy = 0; copy < MaxCopy; ++copy) {
+ boost::math::skew_normal snormal((copy + 1) * u, sd * sqrt(copy + 1), shape);
+ // res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * (boost::math::cdf(snormal, i + 1) - boost::math::cdf(snormal, i));
+ res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * boost::math::pdf(snormal, i);
+ }
+
+ return res;
+}
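
The two densities above form the error/good mixture used throughout the model: perr(i) is the mass a Pareto-like tail assigns to coverage bin i, and pgood(i) sums skew-normal components over copy numbers weighted by a zeta prior; roughly:

    P_err(i)  = (1 + shape*(i-1)/scale)^(-1/shape) - (1 + shape*i/scale)^(-1/shape)
    P_good(i) = sum_{c=1..MaxCopy} w_c * SkewNormalPdf(i; mean = c*u, sd = sd*sqrt(c), shape2)
                where w_c = dzeta(c, zp) unless explicit mixing probabilities are supplied
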
+
+class CovModelLogLike {
+ const std::vector<size_t>& cov;
+
+public:
+ CovModelLogLike(const std::vector<size_t>& cov)
+ : cov(cov) {}
+
+ int getN() const { return 7; };
+
+private:
+
+ double eval_(const double* x) const {
+ double zp = x[0], p = x[1], shape = x[2], u = x[3], sd = x[4], scale = x[5], shape2 = x[6];
+
+ if (zp <= 1 || shape <= 0 || sd <= 0 || p < 1e-9 || p > 1 - 1e-9 || u <= 0 || scale <= 0 ||
+ !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(p) || !isfinite(u) ||
+ !isfinite(scale) || !isfinite(shape2))
+ return +std::numeric_limits<double>::infinity();
+
+ std::vector<double> kmer_probs(cov.size());
+
+ // Error
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ kmer_probs[i] += p * perr(i + 1, scale, shape);
+
+ // Good
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ kmer_probs[i] += (1 - p) * pgood(i + 1, zp, u, sd, shape2);
+
+ double res = 0;
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ res += (double) (cov[i]) * log(kmer_probs[i]);
+
+ return -res;
+ }
+};
+
+struct CovModelLogLikeEMData {
+ const std::vector<size_t>& cov;
+ const std::vector<double>& z;
+};
+
+static double CovModelLogLikeEM(unsigned, const double* x, double*, void* data) {
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+
+ // INFO("Entry: " << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4]);
+
+ if (zp <= 1 || shape <= 0 || sd <= 0 || u <= 0 || scale <= 0 ||
+ !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(u) ||
+ !isfinite(scale) || !isfinite(shape2))
+ return -std::numeric_limits<double>::infinity();
+
+ const std::vector<size_t>& cov = static_cast<CovModelLogLikeEMData*>(data)->cov;
+ const std::vector<double>& z = static_cast<CovModelLogLikeEMData*>(data)->z;
+
+ std::vector<double> kmer_probs(cov.size(), 0);
+
+ // Error
+ for (size_t i = 0; i < kmer_probs.size(); ++i) {
+ if (cov[i] == 0)
+ continue;
+
+ kmer_probs[i] += z[i] * log(perr(i + 1, scale, shape));
+ }
+
+ // Good
+ // Pre-compute mixing probabilities
+ std::vector<double> mixprobs(MaxCopy, 0);
+ for (unsigned copy = 0; copy < MaxCopy; ++copy)
+ mixprobs[copy] = dzeta(copy + 1, zp);
+
+ // Compute the density
+ for (size_t i = 0; i < kmer_probs.size(); ++i) {
+ if (cov[i] == 0)
+ continue;
+
+ double val = log(pgood(i + 1, zp, u, sd, shape2, &mixprobs[0]));
+ if (!isfinite(val))
+ val = -1000.0;
+ kmer_probs[i] += (1 - z[i]) * val;
+ }
+
+ double res = 0;
+ for (size_t i = 0; i < kmer_probs.size(); ++i)
+ res += (double) (cov[i]) * kmer_probs[i];
+
+ // INFO("f: " << res);
+ return res;
+}
+
+
+static std::vector<double> EStep(const std::vector<double>& x,
+ double p, size_t N) {
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+
+ std::vector<double> res(N);
+ for (size_t i = 0; i < N; ++i) {
+ double pe = p * perr(i + 1, scale, shape);
+ res[i] = pe / (pe + (1 - p) * pgood(i + 1, zp, u, sd, shape2));
+ if (!isfinite(res[i]))
+ res[i] = 1.0;
+ }
+
+ return res;
+}
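
The E-step above is the usual two-component mixture posterior: for each coverage bin i (1-based), z_i = p*perr(i) / (p*perr(i) + (1-p)*pgood(i)), with non-finite values clamped to 1, i.e. treated as purely erroneous.
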
+
+// Find the first valley of the k-mer coverage histogram; the coverage mean
+// is later estimated from the maximum past this valley (see Fit()).
+size_t KMerCoverageModel::EstimateValley() const {
+ // Smooth the histogram
+ std::vector<size_t> scov;
+ math::Smooth3RS3R(scov, cov_);
+
+ size_t Valley = scov[0];
+
+ // Start finding the valley
+ size_t Idx = 1;
+ while (Idx < scov.size() && scov[Idx] < Valley) {
+ Valley = scov[Idx];
+ Idx += 1;
+ }
+ Idx -= 1;
+
+ INFO("Kmer coverage valley at: " << Idx);
+
+ return Idx;
+}
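
The valley scan above can be read as: walk right from bin 0 of the smoothed histogram while the counts keep strictly decreasing, and report the last position of the decrease. A condensed sketch of the same idea:

    // Same idea as EstimateValley(), stated over an arbitrary smoothed histogram.
    size_t FirstValley(const std::vector<size_t>& smoothed) {
        size_t idx = 1;
        while (idx < smoothed.size() && smoothed[idx] < smoothed[idx - 1])
            ++idx;
        return idx - 1;
    }
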
+
+void KMerCoverageModel::Fit() {
+ VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram, make sure that the coverage is indeed uniform");
+
+ // Find the minimal coverage point using smoothed histogram.
+ Valley_ = EstimateValley();
+
+ // First estimate of coverage is the first maximum after the valley.
+ MaxCov_ = Valley_ + 1;
+ size_t MaxHist = cov_[MaxCov_];
+ for (size_t i = Valley_ + 1; i < cov_.size(); ++i) {
+ if (cov_[i] > MaxHist) {
+ MaxHist = cov_[i];
+ MaxCov_ = i;
+ }
+ }
+ INFO("K-mer histogram maximum: " << MaxCov_);
+
+ // Refine the estimate via median
+ size_t AfterValley = 0, SecondValley = std::min(2 * MaxCov_ - Valley_, cov_.size());
+ for (size_t i = Valley_ + 1; i < SecondValley; ++i)
+ AfterValley += cov_[i];
+
+ size_t ccov = 0;
+ for (size_t i = Valley_ + 1; i < SecondValley; ++i) {
+ if (ccov > AfterValley / 2) {
+ MaxCov_ = std::max(i, MaxCov_);
+ break;
+ }
+ ccov += cov_[i];
+ }
+
+ if (MaxCov_ - Valley_ < 3)
+ WARN("Too many erroneous kmers, the estimates might be unreliable");
+
+ std::vector<size_t> mvals(1 + MaxCov_ - Valley_);
+ mvals[0] = cov_[MaxCov_];
+ size_t tmadcov = mvals[0];
+ for (size_t i = 1; i < std::min(MaxCov_ - Valley_, cov_.size() - MaxCov_); ++i) {
+ mvals[i] = cov_[MaxCov_ + i] + cov_[MaxCov_ - i];
+ tmadcov += mvals[i];
+ }
+ size_t madcov = 0;
+ double CovSd = sqrt((double) (5 * MaxCov_));
+ for (size_t i = 0; i < MaxCov_ - Valley_; ++i) {
+ if (madcov > tmadcov / 2) {
+ CovSd = (double) i;
+ break;
+ }
+ madcov += mvals[i];
+ }
+ CovSd *= 1.4826;
+ INFO("Estimated median coverage: " << MaxCov_ << ". Coverage mad: " << CovSd);
+
+ // Estimate error probability as ratio of kmers before the valley.
+ size_t BeforeValley = 0, Total = 0;
+ double ErrorProb = 0;
+ for (size_t i = 0; i < cov_.size(); ++i) {
+ if (i <= Valley_)
+ BeforeValley += cov_[i];
+ Total += cov_[i];
+ }
+ ErrorProb = (double) BeforeValley / (double) Total;
+ // Allow some erroneous / good kmers.
+ ErrorProb = std::min(1 - 1e-3, ErrorProb);
+ ErrorProb = std::max(1e-3, ErrorProb);
+
+ TRACE("Total: " << Total << ". Before: " << BeforeValley);
+ TRACE("p: " << ErrorProb);
+
+ std::vector<double> x = {3.0, 3.0, (double) MaxCov_, CovSd, 1.0, 0.0},
+ lb = {0.0, 0.0, 0.0, (double) (MaxCov_ - Valley_), 0.0, -6.0},
+ ub = {2000.0, 2000.0, (double) (2 * MaxCov_), (double) SecondValley, 2000.0, 6.0};
+
+ INFO("Fitting coverage model");
+ // Ensure that there will be at least 2 iterations.
+ double PrevErrProb = 2;
+ const double ErrProbThr = 1e-8;
+ auto GoodCov = cov_;
+ GoodCov.resize(std::min(cov_.size(), 5 * MaxCopy * MaxCov_ / 4));
+ converged_ = true;
+ unsigned it = 1;
+ while (fabs(PrevErrProb - ErrorProb) > ErrProbThr) {
+ // Recalculate the vector of posterior error probabilities
+ std::vector<double> z = EStep(x, ErrorProb, GoodCov.size());
+
+ // Recalculate the probability of error
+ PrevErrProb = ErrorProb;
+ ErrorProb = 0;
+ for (size_t i = 0; i < GoodCov.size(); ++i)
+ ErrorProb += z[i] * (double) GoodCov[i];
+ ErrorProb /= (double) Total;
+
+ bool LastIter = fabs(PrevErrProb - ErrorProb) <= ErrProbThr;
+
+ nlopt::opt opt(nlopt::LN_NELDERMEAD, 6);
+ CovModelLogLikeEMData data = {GoodCov, z};
+ opt.set_max_objective(CovModelLogLikeEM, &data);
+ if (!LastIter)
+ opt.set_maxeval(5 * 6 * it);
+ opt.set_xtol_rel(1e-8);
+ opt.set_ftol_rel(1e-8);
+
+ double fMin;
+ nlopt::result Results = nlopt::FAILURE;
+ try {
+ Results = opt.optimize(x, fMin);
+ } catch (nlopt::roundoff_limited&) {
+ }
+
+ VERBOSE_POWER_T2(it, 1, "... iteration " << it);
+ TRACE("Results: ");
+ TRACE("Converged: " << Results << " " << "F: " << fMin);
+
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+ TRACE("zp: " << zp << " p: " << ErrorProb << " shape: " << shape << " u: " << u << " sd: " << sd <<
+ " scale: " << scale << " shape2: " << shape2);
+
+ it += 1;
+ }
+
+ double delta = x[5] / sqrt(1 + x[5] * x[5]);
+ mean_coverage_ = x[2] + x[3] * delta * sqrt(2 / M_PI);
+ sd_coverage_ = x[3] * sqrt(1 - 2 * delta * delta / M_PI);
+ INFO("Fitted mean coverage: " << mean_coverage_ << ". Fitted coverage std. dev: " << sd_coverage_);
+
+ // Now let us check whether we have sane results
+ for (size_t i = 0; i < x.size(); ++i)
+ if (!isfinite(x[i])) {
+ converged_ = false;
+ break;
+ }
+
+ if (!isfinite(ErrorProb))
+ converged_ = false;
+
+ // See if we can deduce a proper threshold
+
+ // First, check whether initial estimate of Valley was sane.
+ ErrorThreshold_ = 0;
+ if (converged_ && Valley_ > x[2] && x[2] > 2) {
+ Valley_ = (size_t) math::round(x[2] / 2.0);
+ WARN("Valley value was estimated improperly, reset to " << Valley_);
+ }
+
+ // If the model converged, then use it to estimate the thresholds.
+ if (converged_) {
+ std::vector<double> z = EStep(x, ErrorProb, GoodCov.size());
+
+ INFO("Probability of erroneous kmer at valley: " << z[Valley_]);
+ converged_ = false;
+ for (size_t i = 0; i < z.size(); ++i)
+ if (z[i] > strong_probability_threshold_) //0.999
+ LowThreshold_ = std::min(i + 1, Valley_);
+ else if (z[i] < probability_threshold_) {//0.05?
+ ErrorThreshold_ = std::max(i + 1, Valley_);
+ converged_ = true;
+ break;
+ }
+
+#if 0
+ for (size_t i = 0; i < z.size(); ++i) {
+ double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
+ double pe = ErrorProb * perr(i + 1, scale, shape);
+ double pg = (1 - ErrorProb) * pgood(i + 1, zp, u, sd, shape2);
+
+ fprintf(stderr, "%e %e %e %e\n", pe, pg, z[i], perr(i + 1, scale, shape));
+ }
+#endif
+ }
+
+ // Check whether ErrorThreshold_ is sane; fall back to something conservative if not.
+ if (converged_) {
+ INFO("Preliminary threshold calculated as: " << ErrorThreshold_);
+ ErrorThreshold_ = (Valley_ < mean_coverage_ ?
+ std::min(Valley_ + (size_t) (mean_coverage_ - (double) Valley_) / 2, ErrorThreshold_) :
+ Valley_);
+ INFO("Threshold adjusted to: " << ErrorThreshold_);
+ } else {
+ ErrorThreshold_ = Valley_;
+ LowThreshold_ = 1;
+ WARN("Failed to determine erroneous kmer threshold. Threshold set to: " << ErrorThreshold_);
+ }
+
+ // Now the bonus: estimate the genome size!
+ GenomeSize_ = 0;
+ for (size_t i = ErrorThreshold_ - 1; i < GoodCov.size(); ++i)
+ GenomeSize_ += GoodCov[i];
+ GenomeSize_ /= 2;
+
+ INFO("Estimated genome size (ignoring repeats): " << GenomeSize_);
+}
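
Taken together, Fit() alternates an E-step and an M-step until the error probability stabilizes: the E-step recomputes the per-bin posteriors z_i (EStep above), the error probability is then re-estimated as ErrorProb = sum_i z[i]*cov[i] / Total, and the M-step maximizes CovModelLogLikeEM over the six distribution parameters with Nelder-Mead (nlopt::LN_NELDERMEAD), capping the number of evaluations on all but the final iteration.
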
+
+}
+}
diff --git a/src/common/utils/coverage_model/kmer_coverage_model.hpp b/src/common/utils/coverage_model/kmer_coverage_model.hpp
new file mode 100644
index 0000000..2268262
--- /dev/null
+++ b/src/common/utils/coverage_model/kmer_coverage_model.hpp
@@ -0,0 +1,48 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <vector>
+#include <cstddef>
+
+namespace utils {
+namespace coverage_model {
+
+class KMerCoverageModel {
+ const std::vector<size_t>& cov_;
+ size_t MaxCov_, Valley_, ErrorThreshold_, LowThreshold_, GenomeSize_;
+ double probability_threshold_, strong_probability_threshold_, mean_coverage_, sd_coverage_;
+ bool converged_;
+
+public:
+ KMerCoverageModel(const std::vector<size_t>& cov, double probability_threshold,
+ double strong_probability_threshold)
+ : cov_(cov), LowThreshold_(0), probability_threshold_(probability_threshold),
+ strong_probability_threshold_(strong_probability_threshold),
+ mean_coverage_(0.0), sd_coverage_(0.0), converged_(false) {}
+
+ void Fit();
+
+ size_t GetErrorThreshold() const { return ErrorThreshold_; }
+
+ size_t GetLowThreshold() const { return LowThreshold_; }
+
+ size_t GetGenomeSize() const { return GenomeSize_; }
+
+ double GetMeanCoverage() const { return mean_coverage_; }
+
+ double GetSdCoverage() const { return sd_coverage_; }
+
+ bool converged() const { return converged_; }
+
+private:
+ size_t EstimateValley() const;
+};
+
+}
+}
diff --git a/src/modules/dev_support/cpp_utils.hpp b/src/common/utils/cpp_utils.hpp
similarity index 100%
rename from src/modules/dev_support/cpp_utils.hpp
rename to src/common/utils/cpp_utils.hpp
diff --git a/src/common/utils/debruijn_graph/debruijn_graph_constructor.hpp b/src/common/utils/debruijn_graph/debruijn_graph_constructor.hpp
new file mode 100644
index 0000000..47aed1d
--- /dev/null
+++ b/src/common/utils/debruijn_graph/debruijn_graph_constructor.hpp
@@ -0,0 +1,558 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "assembly_graph/core/graph.hpp"
+#include "assembly_graph/core/construction_helper.hpp"
+#include "utils/standard_base.hpp"
+#include "utils/indices/kmer_extension_index.hpp"
+#include "utils/openmp_wrapper.h"
+#include "utils/parallel_wrapper.hpp"
+
+namespace debruijn_graph {
+
+/*
+ * Constructs a de Bruijn graph from a de Bruijn k-mer index:
+ * DeBruijnGraphConstructor<Graph, Index>(graph, index).ConstructGraph(queue_min_size, queue_max_size, queue_growth_rate)
+ */
+template<class Graph, class Index>
+class DeBruijnGraphConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef Index DeBruijn;
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Index::KMer Kmer;
+ typedef typename DeBruijn::KeyWithHash KeyWithHash;
+ typedef typename DeBruijn::kmer_iterator kmer_iterator;
+
+ Graph &graph_;
+ DeBruijn &origin_;
+ size_t kmer_size_;
+
+ bool StepRightIfPossible(KeyWithHash &kwh) {
+ // VERIFY(origin_.contains(edge));
+ if (origin_.RivalEdgeCount(kwh) == 1
+ && origin_.NextEdgeCount(kwh) == 1) {
+ kwh = origin_.NextEdge(kwh);
+ // VERIFY(origin_.contains(next_edge));
+ return true;
+ }
+ return false;
+ }
+
+ KeyWithHash &GoRight(KeyWithHash &kwh) {
+ KeyWithHash initial = kwh;
+ while (StepRightIfPossible(kwh) && kwh != initial) {
+ ;
+ }
+ return kwh;
+ }
+
+ KeyWithHash &GoLeft(KeyWithHash &kwh) {
+ //Written this way (in-place negation) to avoid making copies of kwh
+ kwh = !kwh;
+ kwh = !GoRight(kwh);
+ return kwh;
+ }
+
+ Sequence ConstructSeqGoingRight(KeyWithHash &kwh) {
+ SequenceBuilder s;
+ s.append(kwh.key());
+ KeyWithHash initial = kwh;
+ while (StepRightIfPossible(kwh) && kwh != initial) {
+ s.append(kwh[kmer_size_]);
+ }
+ return s.BuildSequence();
+ }
+
+ Sequence ConstructSequenceWithEdge(const KeyWithHash &kwh) {
+ KeyWithHash tmp = kwh;
+ return ConstructSeqGoingRight(GoLeft(tmp));
+ }
+
+ VertexId FindVertexByOutgoingEdges(Kmer kmer) {
+ for (char c = 0; c < 4; ++c) {
+ KeyWithHash edge = origin_.ConstructKWH(kmer.pushBack(c));
+ if (origin_.contains(edge))
+ return graph_.EdgeStart(origin_.get_value(edge).edge_id);
+ }
+ return VertexId(NULL);
+ }
+
+ VertexId FindVertexByIncomingEdges(Kmer kmer) {
+ for (char c = 0; c < 4; ++c) {
+ KeyWithHash edge = origin_.ConstructKWH(kmer.pushFront(c));
+ if (origin_.contains(edge)) {
+ return graph_.EdgeEnd(origin_.get_value(edge).edge_id);
+ }
+ }
+ return VertexId(NULL);
+ }
+
+ VertexId FindVertex(Kmer kmer) {
+ VertexId v = FindVertexByOutgoingEdges(kmer);
+ return v == VertexId(NULL) ? FindVertexByIncomingEdges(kmer) : v;
+ }
+
+ VertexId FindVertexMaybeMissing(Kmer kmer) {
+ VertexId v = FindVertex(kmer);
+ return v != VertexId(NULL) ? v : graph_.AddVertex();
+ }
+
+ VertexId FindEndMaybeMissing(const ConjugateDeBruijnGraph& graph,
+ VertexId start, Kmer start_kmer, Kmer end_kmer) {
+ if (start_kmer == end_kmer) {
+ return start;
+ } else if (start_kmer == !end_kmer) {
+ return graph.conjugate(start);
+ } else {
+ return FindVertexMaybeMissing(end_kmer);
+ }
+ }
+
+ void ConstructPart(const std::vector<KeyWithHash>& kwh_list,
+ std::vector<Sequence>& sequences) {
+ for (size_t i = 0; i < sequences.size(); ++i) {
+ if (origin_.contains(kwh_list[i])) {
+ continue;
+ }
+
+ Kmer start_kmer = sequences[i].start<Kmer>(kmer_size_);
+ Kmer end_kmer = sequences[i].end<Kmer>(kmer_size_);
+
+ VertexId start = FindVertexMaybeMissing(start_kmer);
+ VertexId end = FindEndMaybeMissing(graph_, start, start_kmer,
+ end_kmer);
+
+ graph_.AddEdge(start, end, sequences[i]);
+ }
+ }
+
+ void AddKmers(kmer_iterator &it, kmer_iterator &end, size_t queueSize,
+ std::vector<KeyWithHash>& kwh_list) {
+ for (; kwh_list.size() != queueSize && it != end; ++it) {
+ KeyWithHash kwh = origin_.ConstructKWH(Kmer(unsigned(kmer_size_ + 1), (*it).data()));
+
+ if (!origin_.contains(kwh))
+ kwh_list.push_back(kwh);
+ }
+ }
+
+ void CalculateSequences(std::vector<KeyWithHash> &kwh_list,
+ std::vector<Sequence> &sequences) {
+ size_t size = kwh_list.size();
+ sequences.resize(size);
+
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < size; ++i) {
+ sequences[i] = ConstructSequenceWithEdge(kwh_list[i]);
+ }
+ }
+
+public:
+ DeBruijnGraphConstructor(Graph& graph, DeBruijn &origin) :
+ graph_(graph), origin_(origin), kmer_size_(graph_.k()) {
+ }
+
+ void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate) {
+ kmer_iterator it = origin_.kmer_begin();
+ kmer_iterator end = origin_.kmer_end();
+ size_t queueSize = queueMinSize;
+ std::vector<KeyWithHash> kwh_list;
+ std::vector<Sequence> sequences;
+ kwh_list.reserve(queueSize);
+ sequences.reserve(queueMaxSize);
+ while (it != end) {
+ AddKmers(it, end, queueSize, kwh_list); // form a queue of k-mers that are not yet in the index
+ CalculateSequences(kwh_list, sequences); // in parallel
+ ConstructPart(kwh_list, sequences);
+ kwh_list.clear();
+ queueSize = min(size_t(double(queueSize) * queueGrowthRate), queueMaxSize);
+ }
+ }
+
+private:
+ DECL_LOGGER("DeBruijnGraphConstructor")
+};
+
+class UnbranchingPathFinder {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef RtSeq Kmer;
+ typedef Index::kmer_iterator kmer_iterator;
+ typedef Index::KeyWithHash KeyWithHash;
+ typedef Index::DeEdge DeEdge;
+
+ Index &origin_;
+ size_t kmer_size_;
+
+public:
+ UnbranchingPathFinder(Index &origin, size_t kmer_size) : origin_(origin), kmer_size_(kmer_size) {
+ }
+
+ bool StepRightIfPossible(DeEdge &edge) {
+ if (origin_.CheckUniqueOutgoing(edge.end) && origin_.CheckUniqueIncoming(edge.end)) {
+ edge = DeEdge(edge.end, origin_.GetUniqueOutgoing(edge.end));
+ return true;
+ }
+ return false;
+ }
+
+ Sequence ConstructSeqGoingRight(DeEdge edge) {
+ SequenceBuilder s;
+ s.append(edge.start.key());
+ s.append(edge.end[kmer_size_ - 1]);
+ DeEdge initial = edge;
+ while (StepRightIfPossible(edge) && edge != initial) {
+ s.append(edge.end[kmer_size_ - 1]);
+ }
+ return s.BuildSequence();
+ }
+
+ Sequence ConstructSequenceWithEdge(DeEdge edge) {
+ return ConstructSeqGoingRight(edge);
+ }
+
+ //A loop consists of 4 parts: two self-RC (k+1)-mers and two sequences of arbitrary length that are RC of each other; pos is the position of one of the self-RC edges
+ vector<Sequence> SplitLoop(Sequence s, size_t pos) {
+ return {s.Subseq(pos, pos + kmer_size_ + 1), s.Subseq(pos + 1, s.size() - kmer_size_) + s.Subseq(0, pos + kmer_size_)};
+
+ }
+
+//TODO Think about what happens to self-RC perfect loops
+ vector<Sequence> ConstructLoopFromVertex(const KeyWithHash &kh) {
+ DeEdge break_point(kh, origin_.GetUniqueOutgoing(kh));
+ Sequence s = ConstructSequenceWithEdge(break_point);
+ Kmer kmer = s.start<Kmer>(kmer_size_ + 1) >> 'A';
+ for(size_t i = kmer_size_; i < s.size(); i++) {
+ kmer = kmer << s[i];
+ if (kmer == !kmer) {
+ return SplitLoop(s, i - kmer_size_);
+ }
+ }
+ return {s};
+ }
+};
+
+class UnbranchingPathExtractor {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef RtSeq Kmer;
+ typedef Index::kmer_iterator kmer_iterator;
+ typedef Index::DeEdge DeEdge;
+ typedef Index::KeyWithHash KeyWithHash;
+
+ Index &origin_;
+ size_t kmer_size_;
+
+ bool IsJunction(KeyWithHash kh) const {
+ return !(origin_.CheckUniqueOutgoing(kh) && origin_.CheckUniqueIncoming(kh));
+ }
+
+ void AddStartDeEdgesForVertex(KeyWithHash kh, std::vector<DeEdge>& start_edges) const {
+ for (char next = 0; next < 4; next++) {
+ if (origin_.CheckOutgoing(kh, next)) {
+ TRACE("Added to queue " << DeEdge(kh, origin_.GetOutgoing(kh, next)));
+ start_edges.push_back(DeEdge(kh, origin_.GetOutgoing(kh, next)));
+ }
+ }
+ }
+
+ void AddStartDeEdges(kmer_iterator &it, size_t queueSize,
+ std::vector<DeEdge>& start_edges) const {
+ for (; start_edges.size() < queueSize && it.good(); ++it) {
+ KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
+ if (IsJunction(kh)) {
+ AddStartDeEdgesForVertex(kh, start_edges);
+ KeyWithHash kh_inv = !kh;
+ if(!(kh_inv.is_minimal())) {
+ AddStartDeEdgesForVertex(kh_inv, start_edges);
+ }
+ }
+ }
+ }
+
+ void CalculateSequences(std::vector<DeEdge> &edges,
+ std::vector<Sequence> &sequences, UnbranchingPathFinder &finder) const {
+ size_t size = edges.size();
+ size_t start = sequences.size();
+ sequences.resize(start + size);
+
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < size; ++i) {
+ sequences[start + i] = finder.ConstructSequenceWithEdge(edges[i]);
+ TRACE("From " << edges[i] << " calculated sequence");
+ TRACE(sequences[start + i]);
+ }
+ }
+
+ void CleanCondensed(const Sequence &sequence) {
+ Kmer kmer = sequence.start<Kmer>(kmer_size_);
+ KeyWithHash kwh = origin_.ConstructKWH(kmer);
+ origin_.IsolateVertex(kwh);
+ for(size_t pos = kmer_size_; pos < sequence.size(); pos++) {
+ kwh = kwh << sequence[pos];
+ origin_.IsolateVertex(kwh);
+ }
+ }
+
+ void CleanCondensed(const std::vector<Sequence> &sequences) {
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < sequences.size(); ++i) {
+ CleanCondensed(sequences[i]);
+ }
+ }
+
+ //This method collects all loops that were not extracted as unbranching paths, because loops contain no junctions.
+ //TODO make parallel
+ const std::vector<Sequence> CollectLoops() {
+ INFO("Collecting perfect loops");
+ UnbranchingPathFinder finder(origin_, kmer_size_);
+ std::vector<Sequence> result;
+ for (kmer_iterator it = origin_.kmer_begin(); it.good(); ++it) {
+ KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
+ if (!IsJunction(kh)) {
+ vector<Sequence> loop = finder.ConstructLoopFromVertex(kh);
+ for(Sequence s: loop) {
+ result.push_back(s);
+ CleanCondensed(s);
+ if(s != (!s)) {
+ result.push_back(!s);
+ }
+ }
+ }
+ }
+ INFO("Collecting perfect loops finished. " << result.size() << " loops collected");
+ return result;
+ }
+
+public:
+ UnbranchingPathExtractor(Index &origin, size_t k) : origin_(origin), kmer_size_(k) {
+ }
+
+ //TODO very large vector is returned. But I hate to make all those artificial changes that can fix it.
+ const std::vector<Sequence> ExtractUnbranchingPaths(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate) {
+ INFO("Extracting unbranching paths");
+ UnbranchingPathFinder finder(origin_, kmer_size_);
+ std::vector<Sequence> result;
+ size_t queueSize = queueMinSize;
+ std::vector<DeEdge> start_edges;
+ std::vector<Sequence> sequences;
+ start_edges.reserve(queueSize);
+ auto it = origin_.kmer_begin();
+ while (it.good()) {
+ AddStartDeEdges(it, queueSize, start_edges); // form a queue of junction k-mers
+ CalculateSequences(start_edges, sequences, finder); // in parallel
+ start_edges.clear();
+ queueSize = min((size_t) ((double) queueSize * queueGrowthRate), queueMaxSize);
+ }
+ INFO("Extracting unbranching paths finished. " << sequences.size() << " sequences extracted");
+ return sequences;
+ }
+
+ const std::vector<Sequence> ExtractUnbranchingPathsAndLoops(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate) {
+ std::vector<Sequence> result = ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
+ CleanCondensed(result);
+ std::vector<Sequence> loops = CollectLoops();
+ for(auto it = loops.begin(); it != loops.end(); ++it) {
+ result.push_back(*it);
+ }
+ return result;
+ }
+
+private:
+ DECL_LOGGER("UnbranchingPathExtractor")
+};
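
A sketch of how the extractor is typically driven; the index, k-mer size and queue parameters are placeholders:

    // index is a DeBruijnExtensionIndex<>; k is the k-mer size.
    UnbranchingPathExtractor extractor(index, k);
    std::vector<Sequence> edge_seqs =
        extractor.ExtractUnbranchingPathsAndLoops(/*queueMinSize*/ 1000,
                                                  /*queueMaxSize*/ 1000000,
                                                  /*queueGrowthRate*/ 1.1);
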
+
+/*
+ * Only works for Conjugate dbg
+ */
+template<class Graph>
+class FastGraphFromSequencesConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ typedef RtSeq Kmer;
+ typedef DeBruijnExtensionIndex<> Index;
+ size_t kmer_size_;
+ Index &origin_;
+
+ class LinkRecord {
+ private:
+ size_t hash_and_mask_;
+ EdgeId edge_;
+
+ size_t BitBool(bool flag) const {
+ if(flag)
+ return 1;
+ return 0;
+ }
+
+ public:
+ size_t GetHash() const {
+ return hash_and_mask_ >> 2;
+ }
+
+ bool IsRC() const {
+ return hash_and_mask_ & 2;
+ }
+
+ bool IsStart() const {
+ return hash_and_mask_ & 1;
+ }
+
+
+ EdgeId GetEdge() const {
+ return edge_;
+ }
+
+ LinkRecord(size_t hash, EdgeId edge, bool is_start, bool is_rc) :
+ hash_and_mask_((hash << 2) | (BitBool(is_rc) << 1)| BitBool(is_start)), edge_(edge) {
+ }
+
+ LinkRecord() :
+ hash_and_mask_(-1ul), edge_(0) {
+ }
+
+ bool IsInvalid() {
+ return hash_and_mask_ + 1 == 0 && edge_ == EdgeId(0);
+ }
+
+ bool operator<(const LinkRecord &other) const {
+ if(this->hash_and_mask_ == other.hash_and_mask_)
+ return this->edge_ < other.edge_;
+ return this->hash_and_mask_ < other.hash_and_mask_;
+ }
+ };
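
For orientation, LinkRecord packs three fields into hash_and_mask_; a worked example with hypothetical values:

    // hash_and_mask_ layout: [ hash (bits 2 and up) | is_rc (bit 1) | is_start (bit 0) ]
    // e.g. hash = 0x2A, is_rc = true, is_start = false:
    //   (0x2A << 2) | (1 << 1) | 0  ==  0xAA
    // GetHash()  -> 0x2A   (hash_and_mask_ >> 2)
    // IsRC()     -> true   (bit 1 set)
    // IsStart()  -> false  (bit 0 clear)
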
+
+ LinkRecord StartLink(const EdgeId &edge, const Sequence &sequence) const {
+ Kmer kmer(kmer_size_, sequence);
+ Kmer kmer_rc = !kmer;
+ if(kmer < kmer_rc)
+ return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, true, false);
+ else
+ return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, true, true);
+ }
+
+ LinkRecord EndLink(const EdgeId &edge, const Sequence &sequence) const {
+ Kmer kmer(kmer_size_, sequence, sequence.size() - kmer_size_);
+ Kmer kmer_rc = !kmer;
+ if(kmer < kmer_rc)
+ return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, false, false);
+ else
+ return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, false, true);
+ }
+
+ void CollectLinkRecords(typename Graph::HelperT &helper, const Graph &graph, vector<LinkRecord> &records, const vector<Sequence> &sequences) const {
+ size_t size = sequences.size();
+ records.resize(size * 2, LinkRecord(0, EdgeId(0), false, false));
+ restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
+# pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < size; ++i) {
+ size_t j = i << 1;
+ auto id_distributor = id_storage.GetSegmentIdDistributor(j, j + 2);//indices for two edges are required
+ EdgeId edge = helper.AddEdge(DeBruijnEdgeData(sequences[i]), id_distributor);
+ records[j] = StartLink(edge, sequences[i]);
+ if(graph.conjugate(edge) != edge)
+ records[j + 1] = EndLink(edge, sequences[i]);
+ else
+ records[j + 1] = LinkRecord();
+ }
+ }
+
+ void LinkEdge(typename Graph::HelperT &helper, const Graph &graph, const VertexId v, const EdgeId edge, const bool is_start, const bool is_rc) const {
+ VertexId v1 = v;
+ if(is_rc) {
+ v1 = graph.conjugate(v);
+ }
+ if(is_start) {
+ helper.LinkOutgoingEdge(v1, edge);
+ } else {
+ helper.LinkIncomingEdge(v1, edge);
+ }
+ }
+
+public:
+ FastGraphFromSequencesConstructor(size_t k, Index &origin) : kmer_size_(k), origin_(origin) {
+ }
+
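+ //Builds the graph in three phases: create a LinkRecord for each edge end, sort the records by
+ //terminal k-mer hash so that ends sharing a vertex become adjacent, then create one vertex per
+ //distinct hash and link every edge of its group to it.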
+ void ConstructGraph(Graph &graph, const vector<Sequence> &sequences) const {
+ typename Graph::HelperT helper = graph.GetConstructionHelper();
+ vector<LinkRecord> records;
+ CollectLinkRecords(helper, graph, records, sequences);//TODO make parallel
+ parallel::sort(records.begin(), records.end());
+ size_t size = records.size();
+ vector<vector<VertexId>> vertices_list(omp_get_max_threads());
+ restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
+# pragma omp parallel for schedule(guided)
+ for(size_t i = 0; i < size; i++) {
+ if(i != 0 && records[i].GetHash() == records[i - 1].GetHash()) {
+ continue;
+ }
+ if(records[i].IsInvalid())
+ continue;
+ auto id_distributor = id_storage.GetSegmentIdDistributor(i << 1, (i << 1) + 2);
+ VertexId v = helper.CreateVertex(DeBruijnVertexData(), id_distributor);
+ vertices_list[omp_get_thread_num()].push_back(v);
+ for(size_t j = i; j < size && records[j].GetHash() == records[i].GetHash(); j++) {
+ LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC());
+ }
+ }
+ for(size_t i = 0; i < vertices_list.size(); i++)
+ helper.AddVerticesToGraph(vertices_list[i].begin(), vertices_list[i].end());
+ }
+};
+
+/*
+ * Constructs a DeBruijnGraph from a DeBruijnExtensionIndex; typical usage is DeBruijnGraphExtentionConstructor<Graph>(graph, index).ConstructGraph(queue_min_size, queue_max_size, queue_growth_rate, keep_perfect_loops).
+ */
+template<class Graph>
+class DeBruijnGraphExtentionConstructor {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef DeBruijnExtensionIndex<> DeBruijn;
+ typedef typename Graph::VertexId VertexId;
+ typedef RtSeq Kmer;
+
+ Graph &graph_;
+ DeBruijn &origin_;
+ size_t kmer_size_;
+
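+ //Keeps a single representative of every reverse-complement pair (the sequence that is not smaller
+ //than its complement) and compacts the vector in place.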
+ void FilterRC(std::vector<Sequence> &edge_sequences) {
+ size_t size = 0;
+ for(size_t i = 0; i < edge_sequences.size(); i++) {
+ if(!(edge_sequences[i] < !edge_sequences[i])) {
+ edge_sequences[size] = edge_sequences[i];
+ size++;
+ }
+ }
+ edge_sequences.resize(size);
+ }
+
+public:
+ DeBruijnGraphExtentionConstructor(Graph& graph, DeBruijn &origin) :
+ graph_(graph), origin_(origin), kmer_size_(graph.k()) {
+ }
+
+ void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
+ double queueGrowthRate, bool keep_perfect_loops) {
+ std::vector<Sequence> edge_sequences;
+ if(keep_perfect_loops)
+ edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPathsAndLoops(queueMinSize, queueMaxSize, queueGrowthRate);
+ else
+ edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
+ FilterRC(edge_sequences);
+ FastGraphFromSequencesConstructor<Graph>(kmer_size_, origin_).ConstructGraph(graph_, edge_sequences);
+ }
+
+private:
+ DECL_LOGGER("DeBruijnGraphConstructor")
+};
+
+}
diff --git a/src/common/utils/debruijn_graph/early_simplification.hpp b/src/common/utils/debruijn_graph/early_simplification.hpp
new file mode 100644
index 0000000..d85649f
--- /dev/null
+++ b/src/common/utils/debruijn_graph/early_simplification.hpp
@@ -0,0 +1,191 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "utils/standard_base.hpp"
+#include "utils/indices/perfect_hash_map.hpp"
+#include "utils/mph_index/kmer_index.hpp"
+
+namespace debruijn_graph {
+
+class LinkCleaner {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef Index::KMer Kmer;
+ typedef Index::KeyWithHash KeyWithHash;
+ Index &index_;
+
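+ //Removes one-sided links: an outgoing extension is dropped if the successor k-mer does not record
+ //the matching incoming extension; CleanBackwardLinks performs the symmetric check for incoming links.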
+ void CleanForwardLinks(KeyWithHash &kh, char i) {
+ if(index_.CheckOutgoing(kh, i)) {
+ KeyWithHash next_kh = index_.GetOutgoing(kh, i);
+ if(!index_.CheckIncoming(next_kh, kh[0])) {
+ index_.DeleteOutgoing(kh, i);
+ }
+ }
+ }
+
+ void CleanBackwardLinks(KeyWithHash &kh, char i) {
+ if(index_.CheckIncoming(kh, i)) {
+ KeyWithHash prev_kh = index_.GetIncoming(kh, i);
+ if(!index_.CheckOutgoing(prev_kh, kh[index_.k() - 1])) {
+ index_.DeleteIncoming(kh, i);
+ }
+ }
+ }
+
+public:
+ LinkCleaner(Index &index) : index_(index) {}
+
+ //TODO make parallel
+ void CleanLinks() {
+ vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * omp_get_max_threads());
+# pragma omp parallel for schedule(guided)
+ for(size_t i = 0; i < iters.size(); i++) {
+ for (Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
+ KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it));
+ if (kh.is_minimal()) {
+ for (char i = 0; i < 4; i++) {
+ CleanForwardLinks(kh, i);
+ CleanBackwardLinks(kh, i);
+ }
+ }
+ }
+ }
+ }
+};
+
+class AlternativeEarlyTipClipper {
+private:
+ typedef DeBruijnExtensionIndex<> Index;
+ typedef Index::KMer Kmer;
+ typedef Index::KeyWithHash KeyWithHash;
+ Index &index_;
+ size_t length_bound_;
+
+ /*
+ * This method starts from the k-mer that is second in the tip, counting from the junction vertex, and records all k-mers of the tip into the tip vector.
+ * It returns the length of the tip.
+ * If the path did not end as a tip, or if the tip was too long, the tip vector is cleared and an effectively infinite length is returned.
+ * Thus the tip vector contains only k-mers to be removed, while the returned length describes what happened.
+ */
+ size_t FindForward(KeyWithHash kh, vector<KeyWithHash> &tip) {
+ while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh)) {
+ tip.push_back(kh);
+ kh = index_.GetUniqueOutgoing(kh);
+ }
+ tip.push_back(kh);
+ if(index_.CheckUniqueIncoming(kh) && index_.IsDeadEnd(kh)) {
+ return tip.size();
+ }
+ tip.clear();
+ return -1;
+ }
+
+ size_t FindBackward(KeyWithHash kh, vector<KeyWithHash> &tip) {
+ while(tip.size() < length_bound_ && index_.CheckUniqueOutgoing(kh) && index_.CheckUniqueIncoming(kh)) {
+ tip.push_back(kh);
+ kh = index_.GetUniqueIncoming(kh);
+ }
+ tip.push_back(kh);
+ if(index_.CheckUniqueOutgoing(kh) && index_.IsDeadStart(kh)) {
+ return tip.size();
+ }
+ tip.clear();
+ return -1;
+ }
+
+ size_t RemoveTip(vector<KeyWithHash > &tip) {
+ for(size_t i = 0; i < tip.size(); i++)
+ index_.IsolateVertex(tip[i]);
+ return tip.size();
+ }
+
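+ //Removes every collected tip that is strictly shorter than the longest branch (max); branches that
+ //reach the maximum, including non-tip branches reported with infinite length, are preserved.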
+ size_t RemoveTips(vector<vector<KeyWithHash > > tips, size_t max) {
+ size_t result = 0;
+ for(char c = 0; c < 4; c++) {
+ if(tips[c].size() < max) {
+ result += RemoveTip(tips[c]);
+ }
+ }
+ return result;
+ }
+
+ size_t RemoveForward(KeyWithHash kh) {
+ vector<vector<KeyWithHash >> tips;
+ tips.resize(4);
+ size_t max = 0;
+ for(char c = 0; c < 4; c++) {
+ if(index_.CheckOutgoing(kh, c)) {
+ KeyWithHash khc = index_.GetOutgoing(kh, c);
+ size_t len = FindForward(khc, tips[c]);
+ if(len > max)
+ max = len;
+ }
+ }
+ return RemoveTips(tips, max);
+ }
+
+ size_t RemoveBackward(KeyWithHash kh) {
+ vector<vector<KeyWithHash >> tips;
+ tips.resize(4);
+ size_t max = 0;
+ for(char c = 0; c < 4; c++) {
+ if(index_.CheckIncoming(kh, c)) {
+ KeyWithHash khc = index_.GetIncoming(kh, c);
+ size_t len = FindBackward(khc, tips[c]);
+ if(len > max)
+ max = len;
+ }
+ }
+ return RemoveTips(tips, max);
+ }
+
+ //TODO make parallel
+ size_t RoughClipTips() {
+ vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * omp_get_max_threads());
+ vector<size_t> result(iters.size());
+# pragma omp parallel for schedule(guided)
+ for(size_t i = 0; i < iters.size(); i++) {
+ for(Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
+ KeyWithHash kh = index_.ConstructKWH(RtSeq(index_.k(), *it));
+ if(kh.is_minimal()) {
+ if (index_.OutgoingEdgeCount(kh) >= 2) {
+ result[i] += RemoveForward(kh);
+ }
+ if (index_.IncomingEdgeCount(kh) >= 2) {
+ result[i] += RemoveBackward(kh);
+ }
+ }
+ }
+ }
+ size_t sum = 0;
+ for(size_t i = 0; i < result.size(); i++)
+ sum += result[i];
+ return sum;
+ }
+
+
+public:
+ AlternativeEarlyTipClipper(Index &index, size_t length_bound) : index_(index), length_bound_(length_bound) {
+ }
+
+ /*
+ * Method returns the number of removed edges
+ */
+ size_t ClipTips() {
+ INFO("Early tip clipping");
+ size_t result = RoughClipTips();
+ LinkCleaner(index_).CleanLinks();
+ INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper");
+ return result;
+ }
+protected:
+ DECL_LOGGER("Early tip clipping");
+};
+
+}
diff --git a/src/common/utils/file_limit.hpp b/src/common/utils/file_limit.hpp
new file mode 100644
index 0000000..d97c791
--- /dev/null
+++ b/src/common/utils/file_limit.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "utils/verify.hpp"
+
+inline rlim_t limit_file(size_t limit) {
+ struct rlimit rl;
+
+ int res = getrlimit(RLIMIT_NOFILE, &rl);
+ VERIFY_MSG(res == 0,
+ "getrlimit(2) call failed, errno = " << errno);
+
+ // We cannot go beyond hard limit and we might not have enough privileges to
+ // increase the hard limit
+ limit = std::max<size_t>(limit, rl.rlim_cur);
+ rl.rlim_cur = std::min<size_t>(limit, rl.rlim_max);
+ res = setrlimit(RLIMIT_NOFILE, &rl);
+ VERIFY_MSG(res == 0,
+ "setrlimit(2) call failed, errno = " << errno);
+ INFO("Open file limit set to " << rl.rlim_cur);
+
+ return rl.rlim_cur;
+}
diff --git a/src/common/utils/indices/edge_index_builders.hpp b/src/common/utils/indices/edge_index_builders.hpp
new file mode 100644
index 0000000..95d5831
--- /dev/null
+++ b/src/common/utils/indices/edge_index_builders.hpp
@@ -0,0 +1,174 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "edge_info_updater.hpp"
+#include "perfect_hash_map_builder.hpp"
+
+namespace debruijn_graph {
+
+template<class Index>
+class GraphPositionFillingIndexBuilder {
+public:
+ typedef Index IndexT;
+ typedef typename Index::KMer Kmer;
+
+ template<class Graph>
+ void BuildIndexFromGraph(Index &index,
+ const Graph/*T*/ &g, size_t read_buffer_size = 0) const {
+ debruijn_graph::BuildIndexFromGraph(index, g, read_buffer_size);
+
+ // Now use the index to fill the coverage and EdgeId's
+ INFO("Collecting k-mer coverage information from graph, this takes a while.");
+ EdgeInfoUpdater<Index, Graph> updater(g, index);
+ updater.UpdateAll();
+ }
+
+};
+
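+ //Compile-time detection of whether the index type provides contains(KeyWithHash); used below to
+ //select the appropriate ContainsWrap overload via tag dispatch.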
+template<typename> struct Void { typedef void type; };
+
+template<typename T, typename Sfinae = void>
+struct has_contains: std::false_type {};
+
+template<typename T>
+struct has_contains<
+ T
+ , typename Void<
+ //decltype( std::declval<T&>().contains(typename T::KMerIdx(0), typename T::KMer()) )
+ decltype( ((T*)(0))->contains(*((typename T::KeyWithHash*)(0))) )
+ >::type
+>: std::true_type {};
+
+template <class Builder>
+class CoverageFillingEdgeIndexBuilder : public Builder {
+ typedef Builder base;
+ public:
+ typedef typename Builder::IndexT IndexT;
+ typedef typename IndexT::KMer Kmer;
+ typedef typename IndexT::KMerIdx KmerIdx;
+ typedef typename IndexT::KeyWithHash KeyWithHash;
+
+ private:
+
+
+ bool ContainsWrap(bool check_contains, IndexT& index, const KeyWithHash &kwh, std::true_type) const {
+ return !check_contains || index.contains(kwh);
+ }
+
+ bool ContainsWrap(bool /*check_contains*/, IndexT&/* index*/, const KeyWithHash &/*kwh*/, std::false_type) const {
+ VERIFY(false);
+// VERIFY(!check_contains);
+ return true;
+ }
+
+ template<class ReadStream>
+ size_t FillCoverageFromStream(ReadStream &stream,
+ IndexT &index, bool check_contains) const {
+ unsigned k = index.k();
+ size_t rl = 0;
+
+ while (!stream.eof()) {
+ typename ReadStream::ReadT r;
+ stream >> r;
+ rl = std::max(rl, r.size());
+
+ const Sequence &seq = r.sequence();
+ if (seq.size() < k)
+ continue;
+
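+ //Prime the sliding window with the first k-1 nucleotides behind a dummy 'A'; the first shift in the
+ //loop below then yields the initial k-mer, and each further shift the next one.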
+ KeyWithHash kwh = index.ConstructKWH(seq.start<Kmer>(k) >> 'A');
+ for (size_t j = k - 1; j < seq.size(); ++j) {
+ kwh <<= seq[j];
+ //contains() is not used since the index might still be empty here
+ if (kwh.is_minimal() && index.valid(kwh) && ContainsWrap(check_contains, index, kwh, has_contains<IndexT>())) {
+# pragma omp atomic
+ index.get_raw_value_reference(kwh).count += 1;
+ }
+ }
+ }
+
+ return rl;
+ }
+
+ public:
+
+ template<class Streams>
+ size_t ParallelFillCoverage(IndexT &index,
+ Streams &streams,
+ bool check_contains = true) const {
+ INFO("Collecting k-mer coverage information from reads, this takes a while.");
+ unsigned nthreads = (unsigned) streams.size();
+ size_t rl = 0;
+ streams.reset();
+#pragma omp parallel for num_threads(nthreads) shared(rl)
+ for (size_t i = 0; i < nthreads; ++i) {
+ size_t crl = FillCoverageFromStream(streams[i], index, check_contains);
+
+ // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :(
+#pragma omp flush(rl)
+ if (crl > rl)
+#pragma omp critical
+ {
+ rl = std::max(rl, crl);
+ }
+ }
+
+ // Contigs have zero coverage!
+#if 0
+ if (contigs_stream) {
+ contigs_stream->reset();
+ FillCoverageFromStream(*contigs_stream, index, check_contains);
+ }
+#endif
+
+//todo if this verify is needed, put it outside
+//#ifndef NDEBUG
+// for (auto idx = index.kmer_idx_begin(), eidx = index.kmer_idx_end();
+// idx != eidx; ++idx) {
+//
+// Kmer k = index.kmer(idx);
+//
+// VERIFY(index[k].count == index[!k].count);
+// }
+//#endif
+
+ return rl;
+ }
+
+ template<class Streams>
+ size_t BuildIndexFromStream(IndexT &index,
+ Streams &streams,
+ io::SingleStream* contigs_stream = 0) const {
+ debruijn_graph::BuildIndexFromStream(index, streams, contigs_stream);
+
+ return ParallelFillCoverage(index, streams, false);
+ }
+
+// template<class Streams>
+// size_t BuildIndexWithCoverageFromGraph(
+// GraphT &graph, IndexT &index,
+// Streams &streams,
+// SingleReadStream* contigs_stream = 0) const {
+// this->BuildIndexFromGraph(index, graph);
+//
+// return ParallelFillCoverage(index, streams, contigs_stream, true);
+// }
+};
+
+template<class Index>
+struct EdgeIndexHelper {
+ typedef typename Index::KMer Kmer;
+ typedef typename Index::KMerIdx KMerIdx;
+ typedef typename Index::traits_t traits_t;
+ typedef CoverageFillingEdgeIndexBuilder<Index> CoverageFillingEdgeIndexBuilderT;
+ typedef GraphPositionFillingIndexBuilder<Index> GraphPositionFillingIndexBuilderT;
+ typedef CoverageFillingEdgeIndexBuilder<GraphPositionFillingIndexBuilderT> CoverageAndGraphPositionFillingIndexBuilderT;
+};
+
+}
diff --git a/src/common/utils/indices/edge_info_updater.hpp b/src/common/utils/indices/edge_info_updater.hpp
new file mode 100644
index 0000000..3760f00
--- /dev/null
+++ b/src/common/utils/indices/edge_info_updater.hpp
@@ -0,0 +1,109 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/standard_base.hpp"
+#include "utils/openmp_wrapper.h"
+#include "sequence/sequence.hpp"
+#include "assembly_graph/core/graph_iterators.hpp"
+#include "utils/indices/edge_position_index.hpp"
+
+namespace debruijn_graph {
+
+template<typename Index, typename Graph>
+class EdgeInfoUpdater {
+ typedef typename Index::KMer Kmer;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Index::KeyWithHash KeyWithHash;
+
+ const Graph &g_;
+ Index &index_;
+
+// void PutInIndex(const KeyWithHash &kwh, EdgeId id, size_t offset) {
+// if (index_.valid(kwh)) {
+// auto &entry = index_.get_raw_value_reference(kwh);
+// if (!entry.valid() || index_.contains(kwh)) {
+// index_.put_value(kwh, EdgeInfo(id, (unsigned)offset, entry.count));
+// }
+// }
+// }
+
+ //todo why do we need to check equality???!!!
+ bool DeleteIfEqual(const KeyWithHash& kwh, EdgeId e) {
+ if (!index_.contains(kwh))
+ return false;
+ if (index_.get_value(kwh).edge_id == e) {
+ index_.get_raw_value_reference(kwh).clear();
+ return true;
+ }
+ return false;
+ }
+
+ void UpdateKMers(const Sequence &nucls, EdgeId e) {
+ VERIFY(nucls.size() >= index_.k());
+ KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
+ if (kwh.is_minimal())
+ index_.PutInIndex(kwh, e, 0);
+ for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
+ kwh <<= nucls[i];
+ if (kwh.is_minimal())
+ index_.PutInIndex(kwh, e, i - index_.k() + 1);
+ }
+ }
+
+ void DeleteKMers(const Sequence &nucls, EdgeId e) {
+ VERIFY(nucls.size() >= index_.k());
+ KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
+ DeleteIfEqual(kwh, e);
+ for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
+ kwh <<= nucls[i];
+ DeleteIfEqual(kwh, e);
+ }
+ }
+
+ public:
+ /**
+ * Creates DataHashRenewer for specified graph and index
+ * @param g graph to be indexed
+ * @param index index to be synchronized with graph
+ */
+ EdgeInfoUpdater(const Graph& g, Index& index)
+ : g_(g),
+ index_(index) {
+ }
+
+ void UpdateKmers(EdgeId e) {
+ Sequence nucls = g_.EdgeNucls(e);
+ UpdateKMers(nucls, e);
+ }
+
+ void DeleteKmers(EdgeId e) {
+ Sequence nucls = g_.EdgeNucls(e);
+ DeleteKMers(nucls, e);
+ }
+
+ void UpdateAll() {
+ unsigned nthreads = omp_get_max_threads();
+
+ omnigraph::IterationHelper<Graph, EdgeId> edges(g_);
+ auto iters = edges.Chunks(16 * nthreads);
+
+ #pragma omp parallel for schedule(guided)
+ for (size_t i = 0; i < iters.size() - 1; ++i) {
+ TRACE("Processing chunk #" << i);
+ for (auto it = iters[i]; it != iters[i + 1]; ++it) {
+ UpdateKmers(*it);
+ }
+ }
+ }
+
+ private:
+ DECL_LOGGER("EdgeInfoUpdater")
+};
+
+}
diff --git a/src/common/utils/indices/edge_multi_index.hpp b/src/common/utils/indices/edge_multi_index.hpp
new file mode 100644
index 0000000..763e9a5
--- /dev/null
+++ b/src/common/utils/indices/edge_multi_index.hpp
@@ -0,0 +1,155 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "perfect_hash_map.hpp"
+#include "edge_info_updater.hpp"
+#include "edge_position_index.hpp"
+
+#include <folly/SmallLocks.h>
+
+namespace debruijn_graph {
+
+template<class IdType>
+class EdgeInfoStorage {
+public:
+ typedef vector<EdgeInfo<IdType>> Content;
+ typedef typename Content::iterator iterator;
+ typedef typename Content::const_iterator const_iterator;
+ Content content_;
+ folly::MicroSpinLock lock_;
+
+ EdgeInfoStorage(const Content &content) : content_(content) {
+ lock_.init();
+ }
+
+ EdgeInfoStorage() {
+ lock_.init();
+ }
+
+ EdgeInfo<IdType> &operator[](size_t i) {
+ return content_[i];
+ }
+
+ iterator begin() {
+ return content_.begin();
+ }
+
+ iterator end() {
+ return content_.end();
+ }
+
+ const_iterator begin() const {
+ return content_.cbegin();
+ }
+
+ const_iterator end() const {
+ return content_.cend();
+ }
+
+ iterator find(const EdgeInfo<IdType> &info) {
+ return content_.find(info);
+ }
+
+ const_iterator find(const EdgeInfo<IdType> &info) const {
+ return content_.find(info);
+ }
+
+ void push_back(const EdgeInfo<IdType> &info) {
+ folly::MSLGuard g(lock_);
+ content_.push_back(info);
+ }
+
+ template<class... Args>
+ void emplace_back(Args&&... args) {
+ folly::MSLGuard g(lock_);
+ content_.emplace_back(std::forward<Args>(args)...);
+ }
+
+ size_t size() const{
+ return content_.size();
+ }
+
+ bool valid() const {
+ //what's invalid edge info storage?
+ return true;
+ }
+
+ EdgeInfoStorage conjugate(size_t k) const {
+ EdgeInfoStorage result;
+ for(auto it = content_.rbegin(); it != content_.rend(); ++it) {
+ result.push_back(it->conjugate(k));
+ }
+ return result;
+ }
+};
+
+//todo it is not handling graph events!!!
+template<class IdType, class Seq = RtSeq,
+ class traits = kmer_index_traits<Seq>, class StoringType = SimpleStoring >
+class DeBruijnEdgeMultiIndex : public KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > {
+ typedef KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > base;
+ public:
+ typedef StoringType storing_type;
+ typedef typename base::traits_t traits_t;
+ typedef typename base::KMer KMer;
+ typedef typename base::KMerIdx KMerIdx;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef EdgeInfoStorage<IdType> Value;
+
+ using base::ConstructKWH;
+// typedef typename base::IdType IdType;
+ //todo move this typedef up in hierarchy (need some c++ tricks)
+
+ DeBruijnEdgeMultiIndex(unsigned k, const std::string &workdir)
+ : base(k, workdir) {
+ INFO("Constructing multi-kmer index");
+ }
+
+ ~DeBruijnEdgeMultiIndex() {}
+
+
+ Value get(const KeyWithHash &kwh) const {
+ VERIFY(contains(kwh));
+ return base::get_value(kwh);
+ }
+
+ bool contains(const KeyWithHash &kwh) const {
+ if (!base::valid(kwh))
+ return false;
+ return this->get_raw_value_reference(kwh).valid();
+ }
+
+ bool valid(const KMer &kmer) const {
+ KeyWithHash kwh = base::ConstructKWH(kmer);
+ return base::valid(kwh);
+ }
+
+ void PutInIndex(const KeyWithHash &kwh, IdType id, size_t offset) {
+ if (!contains(kwh))
+ return;
+
+ EdgeInfoStorage<IdType> &entry = this->get_raw_value_reference(kwh);
+ entry.emplace_back(id, (unsigned int)offset);
+ }
+
+ const EdgeInfoStorage<IdType> get(const KMer& kmer) const {
+ auto kwh = base::ConstructKWH(kmer);
+ auto entry = this->get_value(kwh);
+ return entry;
+ }
+
+ //todo delete if equal seems to work improperly!!!
+ bool DeleteIfEqual(const KeyWithHash &, IdType) {
+ VERIFY(false);
+ return false;
+ }
+
+};
+
+}
diff --git a/src/common/utils/indices/edge_position_index.hpp b/src/common/utils/indices/edge_position_index.hpp
new file mode 100644
index 0000000..446fad4
--- /dev/null
+++ b/src/common/utils/indices/edge_position_index.hpp
@@ -0,0 +1,216 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "perfect_hash_map.hpp"
+#include "io/reads/single_read.hpp"
+
+namespace debruijn_graph {
+
+template<class IdType>
+struct EdgeInfo {
+ IdType edge_id;
+ unsigned offset;
+ unsigned count;
+
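+ //offset uses two sentinel values: unsigned(-1) marks a clean (empty) entry, unsigned(-2) an entry
+ //that has been explicitly removed.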
+ EdgeInfo(IdType edge_id_ = IdType(), unsigned offset_ = unsigned(-1), unsigned count_ = 0) :
+ edge_id(edge_id_), offset(offset_), count(count_) {
+ VERIFY(edge_id != IdType() || clean());
+ }
+
+ template<class KWH>
+ EdgeInfo conjugate(const KWH &kwh) const {
+ return conjugate(kwh.key().size());
+ }
+
+ EdgeInfo conjugate(size_t k) const {
+ if(!valid()) {
+ return EdgeInfo(IdType(0), unsigned(-1), count);
+ } else {
+ return EdgeInfo(edge_id->conjugate(), (unsigned)edge_id->length(k) - offset, count);
+ }
+ }
+
+ void clear() {
+ offset = unsigned(-1);
+ }
+
+ bool clean() const {
+ return offset == unsigned(-1);
+ }
+
+ void remove() {
+ offset = unsigned(-2);
+ }
+
+ bool removed() const {
+ return offset == unsigned(-2);
+ }
+
+ bool valid() const {
+ return !clean() && !removed();
+ }
+};
+
+template<class stream, class IdType>
+stream &operator<<(stream &s, const EdgeInfo<IdType> &info) {
+ return s << "EdgeInfo[" << info.edge_id.int_id() << ", " << info.offset << ", " << info.count << "]";
+}
+
+template<class Graph, class StoringType = DefaultStoring>
+class KmerFreeEdgeIndex : public KeyIteratingMap<RtSeq, EdgeInfo<typename Graph::EdgeId>,
+ kmer_index_traits<RtSeq>, StoringType> {
+ typedef KeyIteratingMap<RtSeq, EdgeInfo<typename Graph::EdgeId>,
+ kmer_index_traits<RtSeq>, StoringType> base;
+ const Graph &graph_;
+
+public:
+ typedef typename base::traits_t traits_t;
+ typedef StoringType storing_type;
+ typedef typename base::KMer KMer;
+ typedef typename base::KMerIdx KMerIdx;
+ typedef Graph GraphT;
+ typedef typename Graph::EdgeId IdType;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef EdgeInfo<typename Graph::EdgeId> KmerPos;
+ using base::valid;
+ using base::ConstructKWH;
+
+public:
+
+ KmerFreeEdgeIndex(const Graph &graph, const std::string &workdir)
+ : base(unsigned(graph.k() + 1), workdir), graph_(graph) {}
+
+ /**
+ * Shows if kmer has some entry associated with it
+ */
+ bool contains(const KeyWithHash &kwh) const {
+ // Sanity check
+ if (!valid(kwh))
+ return false;
+
+ KmerPos entry = base::get_value(kwh);
+ if (!entry.valid())
+ return false;
+ return graph_.EdgeNucls(entry.edge_id).contains(kwh.key(), entry.offset);
+ }
+
+ void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
+ if (!valid(kwh))
+ return;
+
+ KmerPos &entry = this->get_raw_value_reference(kwh);
+ if (entry.removed()) {
+ //VERIFY(false);
+ return;
+ }
+ if (entry.clean()) {
+ //put verify on this conversion!
+ this->put_value(kwh, KmerPos(id, (unsigned)offset, entry.count));
+ } else if (contains(kwh)) {
+ //VERIFY(false);
+ entry.remove();
+ } else {
+ //VERIFY(false);
+ //FIXME bad situation; some other kmer is there; think of putting verify
+ }
+ }
+
+ //Only coverage counts are serialized here and restored in BinRead below
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ this->index_ptr_->serialize(writer);
+ size_t sz = this->data_.size();
+ writer.write((char*)&sz, sizeof(sz));
+ for (size_t i = 0; i < sz; ++i)
+ writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string/* &FileName*/) {
+ this->clear();
+ this->index_ptr_->deserialize(reader);
+ size_t sz = 0;
+ reader.read((char*)&sz, sizeof(sz));
+ this->data_.resize(sz);
+ for (size_t i = 0; i < sz; ++i)
+ reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ }
+};
+
+template<class Graph, class StoringType = DefaultStoring>
+class KmerStoringEdgeIndex : public KeyStoringMap<RtSeq, EdgeInfo<typename Graph::EdgeId>,
+ kmer_index_traits<RtSeq>, StoringType> {
+ typedef KeyStoringMap<RtSeq, EdgeInfo<typename Graph::EdgeId>,
+ kmer_index_traits<RtSeq>, StoringType> base;
+
+public:
+ typedef typename base::traits_t traits_t;
+ typedef StoringType storing_type;
+ typedef typename base::KMer KMer;
+ typedef typename base::KMerIdx KMerIdx;
+ typedef Graph GraphT;
+ typedef typename Graph::EdgeId IdType;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef EdgeInfo<typename Graph::EdgeId> KmerPos;
+ using base::valid;
+ using base::ConstructKWH;
+
+
+ KmerStoringEdgeIndex(const Graph& g, const std::string &workdir)
+ : base(unsigned(g.k() + 1), workdir) {}
+
+ ~KmerStoringEdgeIndex() {}
+
+ /**
+ * Shows if kmer has some entry associated with it
+ */
+ bool contains(const KeyWithHash &kwh) const {
+ if (!base::valid(kwh))
+ return false;
+ return this->get_raw_value_reference(kwh).valid();
+ }
+
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ this->index_ptr_->serialize(writer);
+ size_t sz = this->data_.size();
+ writer.write((char*)&sz, sizeof(sz));
+ for (size_t i = 0; i < sz; ++i)
+ writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ this->BinWriteKmers(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &FileName) {
+ this->clear();
+ this->index_ptr_->deserialize(reader);
+ size_t sz = 0;
+ reader.read((char*)&sz, sizeof(sz));
+ this->data_.resize(sz);
+ for (size_t i = 0; i < sz; ++i)
+ reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
+ this->BinReadKmers(reader, FileName);
+ }
+
+ void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
+ //here valid already checks equality of query-kmer and stored-kmer sequences
+ if (base::valid(kwh)) {
+ KmerPos &entry = this->get_raw_value_reference(kwh);
+ if (entry.removed())
+ return;
+ if (entry.clean()) {
+ //first edge claiming this k-mer: record its position and keep the accumulated count
+ this->put_value(kwh, KmerPos(id, (unsigned)offset, entry.count));
+ } else {
+ //k-mer already belongs to another edge: mark it as removed, as in KmerFreeEdgeIndex
+ entry.remove();
+ }
+ }
+ }
+};
+
+}
diff --git a/src/common/utils/indices/editable_index.hpp b/src/common/utils/indices/editable_index.hpp
new file mode 100644
index 0000000..60b629e
--- /dev/null
+++ b/src/common/utils/indices/editable_index.hpp
@@ -0,0 +1,270 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "perfect_hash_map.hpp.hpp"
+
+namespace debruijn_graph {
+
+//template<class ValueType, class traits>
+//class EditableDeBruijnKMerIndex: public DeBruijnKMerIndex<ValueType, traits> {
+//public:
+// typedef size_t KMerIdx;
+//private:
+// typedef typename traits::SeqType KMer;
+// typedef KMerIndex<traits> KMerIndexT;
+// typedef ValueType KMerIndexValueType;
+// typedef std::vector<KMerIndexValueType> KMerIndexStorageType;
+// typedef boost::bimap<KMer, size_t> KMerPushBackIndexType;
+//
+// KMerPushBackIndexType push_back_index_;
+// KMerIndexStorageType push_back_buffer_;
+//
+// using DeBruijnKMerIndex<ValueType, traits>::index_;
+// using DeBruijnKMerIndex<ValueType, traits>::data_;
+// using DeBruijnKMerIndex<ValueType, traits>::kmers;
+// using DeBruijnKMerIndex<ValueType, traits>::K_;
+// using DeBruijnKMerIndex<ValueType, traits>::InvalidKMerIdx;
+//public:
+// EditableDeBruijnKMerIndex(unsigned K, const std::string &workdir) :
+// DeBruijnKMerIndex<ValueType, traits>(K, workdir) {
+// }
+//
+// KMerIdx seq_idx(const KMer &s) const {
+// KMerIdx idx = index_.seq_idx(s);
+//
+// // First, check whether we're insert index itself.
+// if (contains(idx, s, /* check push back */false))
+// return idx;
+//
+// // Maybe we're inside push_back buffer then?
+// auto it = push_back_index_.left.find(s);
+// if (it != push_back_index_.left.end())
+// return data_.size() + it->second;
+//
+// return InvalidKMerIdx;
+// }
+//
+// KMerIndexValueType &operator[](const KMer &s) {
+// return operator[](index_.seq_idx(s));
+// }
+//
+// const KMerIndexValueType &operator[](const KMer &s) const {
+// return operator[](index_.seq_idx(s));
+// }
+//
+//
+// const KMerIndexValueType &operator[](KMerIdx idx) const {
+// if (idx < this->data_.size())
+// return this->data_[idx];
+// return push_back_buffer_[idx - this->data_.size()];
+// }
+//
+// KMerIndexValueType &operator[](KMerIdx idx) {
+// if (idx < this->data_.size())
+// return this->data_[idx];
+//
+// return push_back_buffer_[idx - this->data_.size()];
+// }
+//
+// size_t size() const {
+// return this->data_.size() + push_back_buffer_.size();
+// }
+//
+// bool contains(const KMer &k) const {
+// KMerIdx idx = seq_idx(k);
+//
+// return idx != InvalidKMerIdx;
+// }
+// bool contains(KMerIdx idx) const {
+// return idx < size();
+// }
+//
+// size_t insert(const KMer &s, const KMerIndexValueType &value) {
+// size_t idx = push_back_buffer_.size();
+// push_back_index_.insert(
+// typename KMerPushBackIndexType::value_type(s, idx));
+// push_back_buffer_.push_back(value);
+//
+// return idx;
+// }
+//
+// KMer kmer(KMerIdx idx) const {
+// VERIFY(contains(idx));
+//
+// if (idx < this->data_.size()) {
+// auto it = kmers->begin() + idx;
+// return (typename traits::raw_create()(K_, *it));
+// }
+//
+// idx -= this->data_.size();
+// return push_back_index_.right.find(idx)->second;
+// }
+//
+// template<class Writer>
+// void BinWrite(Writer &writer) const {
+// index_.serialize(writer);
+// size_t sz = this->data_.size();
+// writer.write((char*) &sz, sizeof(sz));
+// writer.write((char*) &this->data_[0], sz * sizeof(data_[0]));
+// sz = push_back_buffer_.size();
+// writer.write((char*) &sz, sizeof(sz));
+// writer.write((char*) &push_back_buffer_[0],
+// sz * sizeof(push_back_buffer_[0]));
+// for (auto it = push_back_index_.left.begin(), e =
+// push_back_index_.left.end(); it != e; ++it) {
+// size_t idx = it->second;
+// KMer::BinWrite(writer, it->first);
+// writer.write((char*) &idx, sizeof(idx));
+// sz -= 0;
+// }
+// VERIFY(sz == 0);
+// traits::raw_serialize(writer, kmers);
+// }
+//
+// template<class Reader>
+// void BinRead(Reader &reader, const std::string &FileName) {
+// clear();
+// index_.deserialize(reader);
+// size_t sz = 0;
+// reader.read((char*) &sz, sizeof(sz));
+// data_.resize(sz);
+// reader.read((char*) &data_[0], sz * sizeof(data_[0]));
+// reader.read((char*) &sz, sizeof(sz));
+// push_back_buffer_.resize(sz);
+// reader.read((char*) &push_back_buffer_[0],
+// sz * sizeof(push_back_buffer_[0]));
+// for (size_t i = 0; i < sz; ++i) {
+// KMer s(K_);
+// size_t idx;
+//
+// s.BinRead(reader);
+// reader.read((char*) &idx, sizeof(idx));
+//
+// push_back_index_.insert(
+// typename KMerPushBackIndexType::value_type(s, idx));
+// }
+//
+// kmers = traits::raw_deserialize(reader, FileName);
+// }
+//
+// void clear() {
+// index_.clear();
+// this->data_.clear();
+// KMerIndexStorageType().swap(data_);
+// push_back_index_.clear();
+// push_back_buffer_.clear();
+// delete kmers;
+// kmers = NULL;
+// }
+//
+//protected:
+// bool contains(KMerIdx idx, const KMer &k,
+// bool check_push_back = true) const {
+// // Sanity check
+// if (idx == InvalidKMerIdx || idx >= size())
+// return false;
+//
+// if (idx < data_.size()) {
+// auto it = kmers->begin() + idx;
+// return (typename traits::raw_equal_to()(k, *it));
+// }
+//
+// if (check_push_back) {
+// auto it = push_back_index_.right.find(idx - data_.size());
+// return (it != push_back_index_.right.end() && it->second == k);
+// }
+//
+// return false;
+// }
+//
+//};
+
+//template <class kmer_index_traits>
+//class EditableDeBruijnKMerIndexBuilder {
+// public:
+// template <class IdType, class Read>
+// size_t BuildIndexFromStream(EditableDeBruijnKMerIndex<IdType, kmer_index_traits> &index,
+// io::ReadStreamVector<io::IReader<Read> > &streams,
+// SingleReadStream* contigs_stream = 0) const;
+//
+// template <class IdType, class Graph>
+// void BuildIndexFromGraph(EditableDeBruijnKMerIndex<IdType, kmer_index_traits> &index,
+// const Graph &g) const;
+//
+// protected:
+// template <class KMerCounter, class Index>
+// void SortUniqueKMers(KMerCounter &counter, Index &index) const;
+//
+// protected:
+// DECL_LOGGER("K-mer Index Building");
+//};
+
+//template <>
+//class EditableDeBruijnKMerIndexBuilder<kmer_index_traits<RtSeq>> {
+// public:
+// template <class IdType, class Read>
+// size_t BuildIndexFromStream(EditableDeBruijnKMerIndex<IdType, kmer_index_traits<RtSeq>> &index,
+// io::ReadStreamVector<io::IReader<Read> > &streams,
+// SingleReadStream* contigs_stream = 0) const {
+// DeBruijnReadKMerSplitter<Read> splitter(index.workdir(),
+// index.K(), 0,
+// streams, contigs_stream);
+// KMerDiskCounter<RtSeq> counter(index.workdir(), splitter);
+// KMerIndexBuilder<typename DeBruijnKMerIndex<IdType, kmer_index_traits<RtSeq>>::KMerIndexT> builder(index.workdir(), 16, streams.size());
+// size_t sz = builder.BuildIndex(index.index_, counter, /* save final */ true);
+// index.data_.resize(sz);
+//
+// if (!index.kmers)
+// index.kmers = counter.GetFinalKMers();
+//
+// SortUniqueKMers(counter, index);
+//
+// return 0;
+// }
+//
+// template <class IdType, class Graph>
+// void BuildIndexFromGraph(EditableDeBruijnKMerIndex<IdType, RtSeq> &index,
+// const Graph &g) const {
+// DeBruijnGraphKMerSplitter<Graph> splitter(index.workdir(), index.K(), g);
+// KMerDiskCounter<RtSeq> counter(index.workdir(), splitter);
+// KMerIndexBuilder<typename DeBruijnKMerIndex<typename Graph::EdgeId, kmer_index_traits<RtSeq>>::KMerIndexT> builder(index.workdir(), 16, 1);
+// size_t sz = builder.BuildIndex(index.index_, counter, /* save final */ true);
+// index.data_.resize(sz);
+//
+// if (!index.kmers)
+// index.kmers = counter.GetFinalKMers();
+//
+// SortUniqueKMers(counter, index);
+// }
+//
+// protected:
+// template <class KMerCounter, class Index>
+// void SortUniqueKMers(KMerCounter &counter, Index &index) const {
+// size_t swaps = 0;
+// INFO("Arranging kmers in hash map order");
+// for (auto I = index.kmers->begin(), E = index.kmers->end(); I != E; ++I) {
+// size_t cidx = I - index.kmers->begin();
+// size_t kidx = index.raw_seq_idx(*I);
+// while (cidx != kidx) {
+// auto J = index.kmers->begin() + kidx;
+// using std::swap;
+// swap(*I, *J);
+// swaps += 1;
+//
+// kidx = index.raw_seq_idx(*I);
+// }
+// }
+// INFO("Done. Total swaps: " << swaps);
+// }
+//
+// protected:
+// DECL_LOGGER("K-mer Index Building");
+//};
+
+}
diff --git a/src/common/utils/indices/key_with_hash.hpp b/src/common/utils/indices/key_with_hash.hpp
new file mode 100644
index 0000000..57e5a5a
--- /dev/null
+++ b/src/common/utils/indices/key_with_hash.hpp
@@ -0,0 +1,229 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "storing_traits.hpp"
+
+namespace debruijn_graph {
+
+template<typename Key, class HashFunction>
+class SimpleKeyWithHash {
+public:
+ typedef Key KeyType;
+private:
+ typedef typename HashFunction::IdxType IdxType;
+ const HashFunction &hash_;
+ Key key_;
+ mutable IdxType idx_; //lazy computation
+ mutable bool ready_;
+
+ void CountIdx() const {
+ ready_ = true;
+ idx_ = hash_.seq_idx(key_);
+ }
+
+ void SetKey(const Key &key) {
+ ready_ = false;
+ key_ = key;
+ }
+public:
+
+ SimpleKeyWithHash(Key key, const HashFunction &hash) : hash_(hash), key_(key), idx_(0), ready_(false) {
+ }
+
+ Key key() const {
+ return key_;
+ }
+
+ IdxType idx() const {
+ if(!ready_) {
+ CountIdx();
+ }
+ return idx_;
+ }
+
+ SimpleKeyWithHash &operator=(const SimpleKeyWithHash &that) {
+ VERIFY(&this->hash_ == &that.hash_);
+ this->key_= that.key_;
+ this->idx_ = that.idx_;
+ this->ready_ = that.ready_;
+ return *this;
+ }
+
+ bool operator==(const SimpleKeyWithHash &that) const {
+ VERIFY(&this->hash_ == &that.hash_);
+ if (this->ready_ && that.ready_)
+ return this->idx_ == that.idx_;
+ return this->key_ == that.key_;
+ }
+
+ bool operator!=(const SimpleKeyWithHash &that) const {
+ VERIFY(&this->hash_ == &that.hash_);
+ return this->key_ != that.key_;
+ }
+
+ SimpleKeyWithHash operator!() const {
+ return SimpleKeyWithHash(!key_, hash_);
+ }
+
+ SimpleKeyWithHash operator<<(char nucl) const {
+ return SimpleKeyWithHash(key_ << nucl, hash_);
+ }
+
+ SimpleKeyWithHash operator>>(char nucl) const {
+ return SimpleKeyWithHash(key_ >> nucl, hash_);
+ }
+
+ void operator<<=(char nucl) {
+ SetKey(key_ << nucl);
+ }
+
+ void operator>>=(char nucl) {
+ SetKey(key_ >> nucl);
+ }
+
+ char operator[](size_t i) const {
+ return key_[i];
+ }
+
+ bool is_minimal() const {
+ return true;
+ }
+};
+
+template<class stream, class Key, class Index>
+stream &operator<<(stream &s, const SimpleKeyWithHash<Key, Index> &kwh) {
+ return s << "SKWH[" << kwh.key() << ", " << kwh.idx() << "]";
+}
+
+//Would it make sense to also store the inverted kmer for a non-minimal kwh?
+template<typename Key, class HashFunction>
+class InvertableKeyWithHash {
+private:
+ typedef typename HashFunction::IdxType IdxType;
+
+ const HashFunction &hash_;
+ Key key_;
+ mutable IdxType idx_; //lazy computation
+ mutable bool is_minimal_;
+ mutable bool ready_;
+
+ void CountIdx() const {
+ ready_ = true;
+ is_minimal_ = key_.IsMinimal();
+ if(is_minimal_)
+ idx_ = hash_.seq_idx(key_);
+ else{
+ idx_ = hash_.seq_idx(!key_);
+ }
+ }
+
+ InvertableKeyWithHash(Key key, const HashFunction &hash, bool is_minimal,
+ size_t idx, bool ready)
+ : hash_(hash), key_(key), idx_(idx),
+ is_minimal_(is_minimal), ready_(ready) {
+ }
+ public:
+
+ InvertableKeyWithHash(Key key, const HashFunction &hash)
+ : hash_(hash), key_(key), idx_(0), is_minimal_(false), ready_(false) {}
+
+ const Key &key() const {
+ return key_;
+ }
+
+ IdxType idx() const {
+ if (!ready_)
+ CountIdx();
+
+ return idx_;
+ }
+
+ bool is_minimal() const {
+ if(!ready_) {
+ return key_.IsMinimal();
+ }
+ return is_minimal_;
+ }
+
+ bool ready() const {
+ return ready_;
+ }
+
+ InvertableKeyWithHash &operator=(const InvertableKeyWithHash &that) {
+ VERIFY(&this->hash_ == &that.hash_);
+ this->key_= that.key_;
+ this->idx_ = that.idx_;
+ this->ready_ = that.ready_;
+ this->is_minimal_ = that.is_minimal_;
+ return *this;
+ }
+
+ bool operator==(const InvertableKeyWithHash &that) const {
+ VERIFY(&this->hash_ == &that.hash_);
+ return this->key_ == that.key_;
+ }
+
+ bool operator!=(const InvertableKeyWithHash &that) const {
+ VERIFY(&this->hash_ == &that.hash_);
+ return this->key_ != that.key_;
+ }
+
+ InvertableKeyWithHash operator!() const {
+ if(!ready_)
+ return InvertableKeyWithHash(!key_, hash_);
+ return InvertableKeyWithHash(!key_, hash_, !is_minimal_, idx_, ready_);
+ }
+
+ InvertableKeyWithHash operator<<(char nucl) const {
+ return InvertableKeyWithHash(key_ << nucl, hash_);
+ }
+
+ InvertableKeyWithHash operator>>(char nucl) const {
+ return InvertableKeyWithHash(key_ >> nucl, hash_);
+ }
+
+ void operator<<=(char nucl) {
+ key_ <<= nucl;
+ ready_ = false;
+ }
+
+ void operator>>=(char nucl) {
+ key_ >>= nucl;
+ ready_ = false;
+ }
+
+ char operator[](size_t i) const {
+ return key_[i];
+ }
+};
+
+template<class stream, class Key, class Index>
+stream &operator<<(stream &s, const InvertableKeyWithHash<Key, Index> &kwh) {
+ s << "IKWH[" << kwh.key();
+ if(kwh.ready()) {
+ return s << ", " << kwh.is_minimal() << ", " << kwh.idx() << "]";
+ } else {
+ return s << ", not ready]";
+ }
+}
+
+template<class K, class Index, class StoringType>
+struct StoringTraits;
+
+template<class K, class Index>
+struct StoringTraits<K, Index, SimpleStoring> {
+ typedef SimpleKeyWithHash<K, Index> KeyWithHash;
+};
+
+template<class K, class Index>
+struct StoringTraits<K, Index, InvertableStoring> {
+ typedef InvertableKeyWithHash<K, Index> KeyWithHash;
+};
+
+}
diff --git a/src/common/utils/indices/kmer_extension_index.hpp b/src/common/utils/indices/kmer_extension_index.hpp
new file mode 100644
index 0000000..b72be84
--- /dev/null
+++ b/src/common/utils/indices/kmer_extension_index.hpp
@@ -0,0 +1,309 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "perfect_hash_map.hpp"
+#include "utils/simple_tools.hpp"
+#include "storing_traits.hpp"
+#include <bitset>
+
+namespace debruijn_graph {
+
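+ //Reversing the bit order of an extension mask swaps its incoming and outgoing halves and complements
+ //each nucleotide, which yields the mask of the reverse-complement k-mer (see InOutMask::conjugate).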
+inline uint8_t invert_byte_slow(uint8_t a) {
+ size_t res = 0;
+ for(size_t i = 0; i < 8; i++) {
+ res <<= 1;
+ res += a & 1;
+ a = uint8_t(a >> 1);
+ }
+ return uint8_t(res);
+}
+
+inline vector<uint8_t> count_invert_byte() {
+ vector<uint8_t> result;
+ for (size_t a = 0; a < 256; a++) {
+ result.push_back(invert_byte_slow((uint8_t)a));
+ }
+ return result;
+}
+
+inline uint8_t invert_byte(uint8_t a) {
+ static vector<uint8_t> precalc = count_invert_byte();
+ return precalc[a];
+}
+
+class InOutMask {
+private:
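+ //Low four bits encode outgoing extensions (one bit per nucleotide), high four bits encode incoming
+ //extensions; see CheckOutgoing/CheckIncoming below.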
+ uint8_t mask_;
+
+ bool CheckUnique(uint8_t mask) const {
+ static bool unique[] =
+ { 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 };
+ return unique[mask];
+ }
+
+ char GetUnique(uint8_t mask) const {
+ static char next[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1,
+ -1, -1, -1 };
+ VERIFY(next[mask] != -1);
+ return next[mask];
+ }
+
+ size_t Count(uint8_t mask) const {
+ static char count[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+ return count[mask];
+ }
+
+
+ char inv_position(char nucl, bool as_is) const {
+ if(as_is)
+ return nucl;
+ else
+ return char(7 - nucl);
+ }
+
+public:
+ explicit InOutMask(uint8_t mask = 0) : mask_(mask){
+ }
+
+ uint8_t get_mask() const {
+ return mask_;
+ }
+
+ template<class Key>
+ InOutMask conjugate(const Key & /*k*/) const {
+ return InOutMask(invert_byte(mask_));
+ }
+
+ void AddOutgoing(char nnucl, bool as_is) {
+ unsigned nmask = (unsigned) (1 << inv_position(nnucl, as_is));
+ if (!(mask_ & nmask)) {
+# pragma omp atomic
+ mask_ |= (unsigned char) nmask;
+ }
+ }
+
+ void AddIncoming(char pnucl, bool as_is) {
+ unsigned pmask = (unsigned) (1 << inv_position(char(pnucl + 4), as_is));
+ if (!(mask_ & pmask)) {
+# pragma omp atomic
+ mask_|= (unsigned char) pmask;
+ }
+ }
+
+ void DeleteOutgoing(char nnucl, bool as_is) {
+ unsigned nmask = (1 << inv_position(nnucl, as_is));
+ if (mask_ & nmask) {
+# pragma omp atomic
+ mask_ &= (unsigned char) ~nmask;
+ }
+ }
+
+ void DeleteIncoming(char pnucl, bool as_is) {
+ unsigned pmask = (1 << inv_position(char(pnucl + 4), as_is));
+ if (mask_ & pmask) {
+# pragma omp atomic
+ mask_ &= (unsigned char) ~pmask;
+ }
+ }
+
+ void IsolateVertex() {
+ mask_ = 0;
+ }
+
+ bool CheckOutgoing(char nucl) const {
+ return mask_ & (1 << nucl);
+ }
+
+ bool CheckIncoming(char nucl) const {
+ return mask_ & (1 << (4 + nucl));
+ }
+
+ bool IsDeadEnd() const {
+ return !(mask_ & 15);
+ }
+
+ bool IsDeadStart() const {
+ return !(mask_ >> 4);
+ }
+
+ bool CheckUniqueOutgoing() const {
+ return CheckUnique(mask_ & 15);
+ }
+
+ bool CheckUniqueIncoming() const {
+ return CheckUnique(uint8_t(mask_ >> 4));
+ }
+
+ char GetUniqueOutgoing() const {
+ return GetUnique(mask_ & 15);
+ }
+
+ char GetUniqueIncoming() const {
+ return GetUnique(uint8_t(mask_ >> 4));
+ }
+
+ size_t OutgoingEdgeCount() const {
+ return Count(mask_ & 15);
+ }
+
+ size_t IncomingEdgeCount() const {
+ return Count(uint8_t(mask_ >> 4));
+ }
+};
+
+template<class Stream>
+Stream &operator<<(Stream& stream, const InOutMask &mask) {
+ return stream << std::bitset<8>(mask.get_mask());
+}
+
+template<class Seq>
+struct slim_kmer_index_traits : public kmer_index_traits<Seq> {
+ typedef kmer_index_traits<Seq> __super;
+
+ typedef MMappedRecordReader<typename Seq::DataType> FinalKMerStorage;
+
+ template<class Writer>
+ static void raw_serialize(Writer&, typename __super::RawKMerStorage*) {
+ VERIFY(false && "Cannot save extension index");
+ }
+
+ template<class Reader>
+ static typename __super::RawKMerStorage *raw_deserialize(
+ Reader&, const std::string &) {
+ VERIFY(false && "Cannot load extension index");
+ return NULL;
+ }
+
+};
+
+template<typename KeyWithHash>
+struct AbstractDeEdge {
+ KeyWithHash start;
+ KeyWithHash end;
+ AbstractDeEdge(KeyWithHash _start, KeyWithHash _end) : start(_start), end(_end) {
+ }
+
+ AbstractDeEdge<KeyWithHash> &operator=(const AbstractDeEdge<KeyWithHash> &that) {
+ this->start = that.start;
+ this->end = that.end;
+ return *this;
+ }
+
+ bool operator==(const AbstractDeEdge &other) {
+ return start == other.start && end == other.end;
+ }
+
+ bool operator!=(const AbstractDeEdge &other) {
+ return !(*this == other);
+ }
+};
+
+template<class stream, class KWH>
+stream &operator<<(stream &s, const AbstractDeEdge<KWH> de_edge) {
+ return s << "DeEdge[" << de_edge.start << ", " << de_edge.end << "]";
+}
+
+template<class traits = slim_kmer_index_traits<RtSeq>, class StoringType = DefaultStoring>
+class DeBruijnExtensionIndex : public KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> {
+ typedef KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> base;
+
+public:
+ typedef typename base::traits_t traits_t;
+ typedef StoringType storing_type;
+ typedef typename base::KeyType KMer;
+ typedef typename base::IdxType KMerIdx;
+ typedef typename base::KeyWithHash KeyWithHash;
+ typedef AbstractDeEdge<KeyWithHash> DeEdge;
+ using base::ConstructKWH;
+
+ DeBruijnExtensionIndex(unsigned K, const std::string &workdir)
+ : base((size_t) K, workdir) {
+ }
+
+ void AddOutgoing(const KeyWithHash &kwh, char nucl) {
+ TRACE("Add outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).AddOutgoing(nucl, kwh.is_minimal());
+ }
+
+ void AddIncoming(const KeyWithHash &kwh, char nucl) {
+ TRACE("Add incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).AddIncoming(nucl, kwh.is_minimal());
+ }
+
+ void DeleteOutgoing(const KeyWithHash &kwh, char nucl) {
+ TRACE("Delete outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).DeleteOutgoing(nucl, kwh.is_minimal());
+ }
+
+ void DeleteIncoming(const KeyWithHash &kwh, char nucl) {
+ TRACE("Delete incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
+ this->get_raw_value_reference(kwh).DeleteIncoming(nucl, kwh.is_minimal());
+ }
+
+ void IsolateVertex(const KeyWithHash &kwh) {
+ TRACE("Isolate vertex " << kwh);
+ this->get_raw_value_reference(kwh).IsolateVertex();
+ }
+
+ bool CheckOutgoing(const KeyWithHash &kwh, char nucl) const {
+ return this->get_value(kwh).CheckOutgoing(nucl);
+ }
+
+ KeyWithHash GetOutgoing(const KeyWithHash &kwh, char nucl) const {
+ return kwh << nucl;
+ }
+
+ bool CheckIncoming(const KeyWithHash &kwh, char nucl) const {
+ return this->get_value(kwh).CheckIncoming(nucl);
+ }
+
+ KeyWithHash GetIncoming(const KeyWithHash &kwh, char nucl) const {
+ return kwh >> nucl;
+ }
+
+ bool IsDeadEnd(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).IsDeadEnd();
+ }
+
+ bool IsDeadStart(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).IsDeadStart();
+ }
+
+ bool CheckUniqueOutgoing(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).CheckUniqueOutgoing();
+ }
+
+ KeyWithHash GetUniqueOutgoing(const KeyWithHash &kwh) const {
+ return GetOutgoing(kwh, this->get_value(kwh).GetUniqueOutgoing());
+ }
+
+ bool CheckUniqueIncoming(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).CheckUniqueIncoming();
+ }
+
+ KeyWithHash GetUniqueIncoming(const KeyWithHash &kwh) const {
+ return GetIncoming(kwh, this->get_value(kwh).GetUniqueIncoming());
+ }
+
+ size_t OutgoingEdgeCount(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).OutgoingEdgeCount();
+ }
+
+ size_t IncomingEdgeCount(const KeyWithHash &kwh) const {
+ return this->get_value(kwh).IncomingEdgeCount();
+ }
+
+ ~DeBruijnExtensionIndex() {
+ }
+
+private:
+ DECL_LOGGER("ExtentionIndex");
+};
+
+}
diff --git a/src/common/utils/indices/kmer_extension_index_builder.hpp b/src/common/utils/indices/kmer_extension_index_builder.hpp
new file mode 100644
index 0000000..4ca9089
--- /dev/null
+++ b/src/common/utils/indices/kmer_extension_index_builder.hpp
@@ -0,0 +1,106 @@
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "kmer_extension_index.hpp"
+#include "kmer_splitters.hpp"
+
+class DeBruijnExtensionIndexBuilder {
+public:
+ template<class ReadStream, class Index>
+ size_t FillExtensionsFromStream(ReadStream &stream, Index &index) const {
+ unsigned k = index.k();
+ size_t rl = 0;
+
+ while (!stream.eof()) {
+ typename ReadStream::read_type r;
+ stream >> r;
+ rl = std::max(rl, r.size());
+
+ const Sequence &seq = r.sequence();
+ if (seq.size() < k + 1)
+ continue;
+
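+ //For each consecutive (k+1)-mer of the read, register its last nucleotide as an outgoing extension
+ //of the prefix k-mer and its first nucleotide as an incoming extension of the suffix k-mer.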
+ typename Index::KeyWithHash kwh = index.ConstructKWH(seq.start<RtSeq>(k));
+ for (size_t j = k; j < seq.size(); ++j) {
+ char nnucl = seq[j], pnucl = kwh[0];
+ index.AddOutgoing(kwh, nnucl);
+ kwh <<= nnucl;
+ index.AddIncoming(kwh, pnucl);
+ }
+ }
+
+ return rl;
+ }
+
+ template<class Index>
+ void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename,
+ Index &index) const {
+ unsigned KPlusOne = index.k() + 1;
+
+ typename Index::kmer_iterator it(KPlusOneMersFilename,
+ RtSeq::GetDataSize(KPlusOne));
+ for (; it.good(); ++it) {
+ RtSeq kpomer(KPlusOne, *it);
+
+ char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1];
+ TRACE("processing k+1-mer " << kpomer);
+ index.AddOutgoing(index.ConstructKWH(RtSeq(KPlusOne - 1, kpomer)),
+ nnucl);
+ // FIXME: This is extremely ugly. Needs to add start / end methods to extract first / last N symbols...
+ index.AddIncoming(index.ConstructKWH(RtSeq(KPlusOne - 1, kpomer << 0)),
+ pnucl);
+ }
+ }
+
+public:
+ template<class Index, class Streams>
+ ReadStatistics BuildExtensionIndexFromStream(Index &index, Streams &streams, io::SingleStream* contigs_stream = 0,
+ size_t read_buffer_size = 0) const {
+ unsigned nthreads = (unsigned) streams.size();
+
+ // First, build a k+1-mer index
+ DeBruijnReadKMerSplitter<typename Streams::ReadT,
+ StoringTypeFilter<typename Index::storing_type>>
+ splitter(index.workdir(), index.k() + 1, 0xDEADBEEF, streams,
+ contigs_stream, read_buffer_size);
+ KMerDiskCounter<RtSeq> counter(index.workdir(), splitter);
+ counter.CountAll(nthreads, nthreads, /* merge */false);
+
+ // Now, count unique k-mers from k+1-mers
+ DeBruijnKMerKMerSplitter<StoringTypeFilter<typename Index::storing_type> >
+ splitter2(index.workdir(), index.k(),
+ index.k() + 1, Index::storing_type::IsInvertable(), read_buffer_size);
+ for (unsigned i = 0; i < nthreads; ++i)
+ splitter2.AddKMers(counter.GetMergedKMersFname(i));
+ KMerDiskCounter<RtSeq> counter2(index.workdir(), splitter2);
+
+ BuildIndex(index, counter2, 16, nthreads);
+
+ // Build the kmer extensions
+ INFO("Building k-mer extensions from k+1-mers");
+# pragma omp parallel for num_threads(nthreads)
+ for (unsigned i = 0; i < nthreads; ++i)
+ FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index);
+ INFO("Building k-mer extensions from k+1-mers finished.");
+
+ return splitter.stats();
+ }
+
+private:
+ DECL_LOGGER("DeBruijnExtensionIndexBuilder");
+};
+
+template<class Index>
+struct ExtensionIndexHelper {
+ using IndexT = Index;
+ typedef typename Index::traits_t traits_t;
+ typedef typename Index::KMer Kmer;
+ typedef typename Index::KMerIdx KMerIdx;
+ using DeBruijnExtensionIndexBuilderT = DeBruijnExtensionIndexBuilder;
+};
+
diff --git a/src/common/utils/indices/kmer_splitters.hpp b/src/common/utils/indices/kmer_splitters.hpp
new file mode 100644
index 0000000..4f3b087
--- /dev/null
+++ b/src/common/utils/indices/kmer_splitters.hpp
@@ -0,0 +1,317 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "io/reads/io_helper.hpp"
+#include "storing_traits.hpp"
+
+#include "utils/file_limit.hpp"
+#include "utils/mph_index/kmer_index_builder.hpp"
+
+namespace debruijn_graph {
+
+template<class StoringType>
+struct StoringTypeFilter {
+};
+
+template<>
+struct StoringTypeFilter<SimpleStoring> {
+ template<class Kmer>
+ bool filter(const Kmer &/*kmer*/) const {
+ return true;
+ }
+};
+
+template<>
+struct StoringTypeFilter<InvertableStoring> {
+ template<class Kmer>
+ bool filter(const Kmer &kmer) const {
+ return kmer.IsMinimal();
+ }
+};
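
The InvertableStoring filter above keeps only canonical k-mers via RtSeq::IsMinimal(). A hedged string-based sketch of that idea follows; the real canonical order used by RtSeq may differ, this only illustrates the intent of counting each k-mer and its reverse complement exactly once.

#include <cassert>
#include <string>

static std::string reverse_complement(const std::string &s) {
    std::string rc(s.rbegin(), s.rend());
    for (size_t i = 0; i < rc.size(); ++i) {
        char c = rc[i];
        rc[i] = (c == 'A') ? 'T' : (c == 'T') ? 'A' : (c == 'C') ? 'G' : 'C';
    }
    return rc;
}

static bool is_minimal(const std::string &kmer) {
    return kmer <= reverse_complement(kmer);
}

int main() {
    assert(is_minimal("ACG"));    // ACG <= its reverse complement CGT, so it is kept
    assert(!is_minimal("TTT"));   // TTT >  AAA, so only AAA would be kept
    return 0;
}
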
+
+using RtSeqKMerSplitter = ::KMerSortingSplitter<RtSeq>;
+
+template<class KmerFilter>
+class DeBruijnKMerSplitter : public RtSeqKMerSplitter {
+ private:
+ KmerFilter kmer_filter_;
+ protected:
+ size_t read_buffer_size_;
+ protected:
+ bool FillBufferFromSequence(const Sequence &seq,
+ unsigned thread_id) {
+ if (seq.size() < this->K_)
+ return false;
+
+ RtSeq kmer = seq.start<RtSeq>(this->K_) >> 'A';
+ bool stop = false;
+ for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
+ kmer <<= seq[j];
+ if (!kmer_filter_.filter(kmer))
+ continue;
+
+ stop |= this->push_back_internal(kmer, thread_id);
+ }
+
+ return stop;
+ }
+
+ public:
+ DeBruijnKMerSplitter(const std::string &work_dir,
+ unsigned K, KmerFilter kmer_filter, size_t read_buffer_size = 0, uint32_t seed = 0)
+ : RtSeqKMerSplitter(work_dir, K, seed), kmer_filter_(kmer_filter), read_buffer_size_(read_buffer_size) {
+ }
+ protected:
+ DECL_LOGGER("DeBruijnKMerSplitter");
+};
+
+struct ReadStatistics {
+ size_t reads_;
+ size_t max_read_length_;
+ size_t bases_;
+};
+
+template<class Read, class KmerFilter>
+class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+ io::ReadStreamList<Read> &streams_;
+ io::SingleStream *contigs_;
+
+ template<class ReadStream>
+ ReadStatistics
+ FillBufferFromStream(ReadStream& stream, unsigned thread_id);
+
+ ReadStatistics rs_;
+
+ public:
+ DeBruijnReadKMerSplitter(const std::string &work_dir,
+ unsigned K, uint32_t seed,
+ io::ReadStreamList<Read>& streams,
+ io::SingleStream* contigs_stream = 0,
+ size_t read_buffer_size = 0)
+ : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size, seed),
+ streams_(streams), contigs_(contigs_stream), rs_({0, 0, 0}) {}
+
+ path::files_t Split(size_t num_files) override;
+
+ size_t read_length() const { return rs_.max_read_length_; }
+ ReadStatistics stats() const { return rs_; }
+};
+
+template<class Read, class KmerFilter> template<class ReadStream>
+ReadStatistics
+DeBruijnReadKMerSplitter<Read, KmerFilter>::FillBufferFromStream(ReadStream &stream,
+ unsigned thread_id) {
+ typename ReadStream::ReadT r;
+ size_t reads = 0, rl = 0, bases = 0;
+
+ while (!stream.eof()) {
+ stream >> r;
+ rl = std::max(rl, r.size());
+ reads += 1;
+ bases += r.size();
+
+ if (this->FillBufferFromSequence(r.sequence(), thread_id))
+ break;
+ }
+ return { reads, rl, bases };
+}
+
+template<class Read, class KmerFilter>
+path::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files) {
+ unsigned nthreads = (unsigned) streams_.size();
+
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+ path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
+
+ size_t counter = 0, rl = 0, bases = 0, n = 15;
+ streams_.reset();
+ while (!streams_.eof()) {
+# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) reduction(+ : bases) shared(rl)
+ for (unsigned i = 0; i < nthreads; ++i) {
+ ReadStatistics stats = FillBufferFromStream(streams_[i], i);
+ counter += stats.reads_;
+ bases += stats.bases_;
+
+ // OpenMP before 3.1 has no max reduction for C/C++ (only Fortran), so emulate it with flush + critical
+# pragma omp flush(rl)
+ if (stats.max_read_length_ > rl)
+# pragma omp critical
+ {
+ rl = std::max(rl, stats.max_read_length_);
+ }
+ }
+
+ this->DumpBuffers(out);
+
+ if (counter >> n) {
+ INFO("Processed " << counter << " reads");
+ n += 1;
+ }
+ }
+
+ if (contigs_) {
+ INFO("Adding contigs from previous K");
+ unsigned cnt = 0;
+ contigs_->reset();
+ while (!contigs_->eof()) {
+ FillBufferFromStream(*contigs_, cnt);
+ this->DumpBuffers(out);
+ if (++cnt >= nthreads)
+ cnt = 0;
+ }
+ }
+
+ this->ClearBuffers();
+
+ INFO("Used " << counter << " reads. Maximum read length " << rl);
+ INFO("Average read length " << double(bases) / double(counter));
+ rs_ = { counter, rl, bases };
+
+ return out;
+}
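
A side note on the flush/critical maximum inside Split() above: compilers implementing OpenMP 3.1 or later also accept a native max reduction for C/C++, so an equivalent pattern (a sketch under that assumption, not a drop-in patch) would be:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t lengths[] = {100, 250, 90, 151};
    size_t rl = 0;
#   pragma omp parallel for reduction(max : rl)
    for (int i = 0; i < 4; ++i)
        rl = std::max(rl, lengths[i]);   // each thread keeps a private max, merged at the end
    std::printf("max read length: %zu\n", rl);
    return 0;
}
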
+
+template<class Graph, class KmerFilter>
+class DeBruijnGraphKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+ typedef typename Graph::ConstEdgeIt EdgeIt;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const Graph &g_;
+
+ size_t FillBufferFromEdges(EdgeIt &edge, unsigned thread_id);
+
+ public:
+ DeBruijnGraphKMerSplitter(const std::string &work_dir,
+ unsigned K, const Graph &g, size_t read_buffer_size = 0)
+ : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size), g_(g) {}
+
+ path::files_t Split(size_t num_files) override;
+};
+
+template<class Graph, class KmerFilter>
+size_t
+DeBruijnGraphKMerSplitter<Graph, KmerFilter>::FillBufferFromEdges(EdgeIt &edge,
+ unsigned thread_id) {
+ size_t seqs = 0;
+ for (; !edge.IsEnd(); ++edge) {
+ const Sequence &nucls = g_.EdgeNucls(*edge);
+
+ seqs += 1;
+ if (this->FillBufferFromSequence(nucls, thread_id))
+ break;
+ }
+
+ return seqs;
+}
+
+template<class Graph, class KmerFilter>
+path::files_t DeBruijnGraphKMerSplitter<Graph, KmerFilter>::Split(size_t num_files) {
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ path::files_t out = this->PrepareBuffers(num_files, 1, this->read_buffer_size_);
+
+ size_t counter = 0, n = 10;
+ for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ) {
+ counter += FillBufferFromEdges(it, 0);
+
+ this->DumpBuffers(out);
+
+ if (counter >> n) {
+ INFO("Processed " << counter << " edges");
+ n += 1;
+ }
+ }
+
+ INFO("Used " << counter << " sequences.");
+
+ this->ClearBuffers();
+
+ return out;
+}
+
+
+template<class KmerFilter>
+class DeBruijnKMerKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
+ typedef MMappedFileRecordArrayIterator<RtSeq::DataType> kmer_iterator;
+
+ unsigned K_source_;
+ std::vector<std::string> kmers_;
+ bool add_rc_;
+
+ size_t FillBufferFromKMers(kmer_iterator &kmer,
+ unsigned thread_id);
+
+ public:
+ DeBruijnKMerKMerSplitter(const std::string &work_dir,
+ unsigned K_target, unsigned K_source, bool add_rc, size_t read_buffer_size = 0)
+ : DeBruijnKMerSplitter<KmerFilter>(work_dir, K_target, KmerFilter(), read_buffer_size),
+ K_source_(K_source), add_rc_(add_rc) {}
+
+ void AddKMers(const std::string &file) {
+ kmers_.push_back(file);
+ }
+
+ path::files_t Split(size_t num_files) override;
+};
+
+template<class KmerFilter>
+inline size_t DeBruijnKMerKMerSplitter<KmerFilter>::FillBufferFromKMers(kmer_iterator &kmer,
+ unsigned thread_id) {
+ size_t seqs = 0;
+ for (; kmer.good(); ++kmer) {
+ Sequence nucls(RtSeq(K_source_, *kmer));
+ seqs += 1;
+
+ bool stop = this->FillBufferFromSequence(nucls, thread_id);
+ if (add_rc_)
+ stop |= this->FillBufferFromSequence(!nucls, thread_id);
+
+ if (stop)
+ break;
+ }
+
+ return seqs;
+}
+
+template<class KmerFilter>
+path::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_files) {
+ unsigned nthreads = (unsigned) kmers_.size();
+
+ INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
+
+ path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
+
+ size_t counter = 0, n = 10;
+ std::vector<kmer_iterator> its;
+ its.reserve(nthreads);
+ for (auto it = kmers_.begin(), et = kmers_.end(); it != et; ++it)
+ its.emplace_back(*it, RtSeq::GetDataSize(K_source_));
+
+ while (std::any_of(its.begin(), its.end(),
+ [](const kmer_iterator &it) { return it.good(); })) {
+# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
+ for (unsigned i = 0; i < nthreads; ++i)
+ counter += FillBufferFromKMers(its[i], i);
+
+ this->DumpBuffers(out);
+
+ if (counter >> n) {
+ INFO("Processed " << counter << " kmers");
+ n += 1;
+ }
+ }
+
+ INFO("Used " << counter << " kmers.");
+
+ this->ClearBuffers();
+
+ return out;
+}
+
+
+}
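
All three Split() implementations above report progress with the `counter >> n` test, which fires roughly once per power of two processed items so that the log stays short regardless of input size. A tiny self-contained illustration:

#include <cstddef>
#include <cstdio>

int main() {
    size_t counter = 0, n = 4;        // the real code starts n at 10 or 15
    for (int batch = 0; batch < 100; ++batch) {
        counter += 7;                 // pretend each batch handled 7 reads
        if (counter >> n) {           // becomes true once counter reaches 2^n
            std::printf("Processed %zu reads\n", counter);
            n += 1;                   // next report at the following power of two
        }
    }
    return 0;
}
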
diff --git a/src/common/utils/indices/perfect_hash_map.hpp b/src/common/utils/indices/perfect_hash_map.hpp
new file mode 100644
index 0000000..857efc9
--- /dev/null
+++ b/src/common/utils/indices/perfect_hash_map.hpp
@@ -0,0 +1,339 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/openmp_wrapper.h"
+#include "utils/path_helper.hpp"
+#include "io/kmers/kmer_iterator.hpp"
+
+#include "utils/mph_index/kmer_index.hpp"
+
+#include "key_with_hash.hpp"
+#include "values.hpp"
+#include "storing_traits.hpp"
+
+#include <vector>
+#include <cstdlib>
+#include <cstdint>
+
+namespace debruijn_graph {
+
+template<class K, class traits>
+class IndexWrapper {
+ static const size_t InvalidIdx = size_t(-1);
+public:
+ typedef size_t IdxType;
+ typedef K KeyType;
+ typedef traits traits_t;
+protected:
+ typedef KMerIndex<traits> KMerIndexT;
+ // these fields are protected only to reduce storage in the edge indices' BinWrite
+ std::shared_ptr<KMerIndexT> index_ptr_;
+private:
+ std::string workdir_;
+ unsigned k_;
+
+protected:
+ size_t raw_seq_idx(const typename KMerIndexT::KMerRawReference s) const {
+ return index_ptr_->raw_seq_idx(s);
+ }
+
+ bool valid(const size_t idx) const {
+ return idx != InvalidIdx && idx < index_ptr_->size();
+ }
+public:
+ IndexWrapper(size_t k, const std::string &workdir)
+ : index_ptr_(std::make_shared<KMerIndexT>())
+ , k_((unsigned) k) {
+ //fixme string literal
+ workdir_ = path::make_temp_dir(workdir, "kmeridx");
+ }
+
+ IndexWrapper(size_t k, const std::string &workdir, std::shared_ptr<KMerIndexT> index_ptr)
+ : IndexWrapper(k, workdir) {
+ index_ptr_ = index_ptr;
+ }
+
+ ~IndexWrapper() {
+ path::remove_dir(workdir_);
+ }
+
+ void clear() {
+ index_ptr_->clear();
+ }
+
+ unsigned k() const { return k_; }
+
+public:
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ index_ptr_->serialize(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &) {
+ clear();
+ index_ptr_->deserialize(reader);
+ }
+
+ const std::string &workdir() const {
+ return workdir_;
+ }
+};
+
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
+class PerfectHashMap : public ValueArray<V>, public IndexWrapper<K, traits> {
+public:
+ typedef size_t IdxType;
+ typedef K KeyType;
+ typedef ValueArray<V> ValueBase;
+ typedef IndexWrapper<KeyType, traits> KeyBase;
+ using KeyBase::index_ptr_;
+ typedef typename KeyBase::KMerIndexT KMerIndexT;
+ typedef typename StoringTraits<K, KMerIndexT, StoringType>::KeyWithHash KeyWithHash;
+
+ KeyWithHash ConstructKWH(const KeyType &key) const {
+ return KeyWithHash(key, *index_ptr_);
+ }
+
+ bool valid(const KeyWithHash &kwh) const {
+ return KeyBase::valid(kwh.idx());
+ }
+
+ PerfectHashMap(size_t k, const std::string &workdir) : KeyBase(k, workdir) {
+ }
+
+ PerfectHashMap(size_t k, const std::string &workdir, std::shared_ptr<KMerIndexT> index_ptr)
+ : KeyBase(k, workdir, index_ptr) {
+ ValueBase::resize(index_ptr_->size());
+ }
+
+ ~PerfectHashMap() {
+ }
+
+ void clear() {
+ KeyBase::clear();
+ ValueBase::clear();
+ }
+
+ const V get_value(const KeyWithHash &kwh) const {
+ return StoringType::get_value(*this, kwh);
+ }
+
+ template<typename F>
+ const V get_value(const KeyWithHash &kwh, const F& inverter) const {
+ return StoringType::get_value(*this, kwh, inverter);
+ }
+
+ //Think twice or ask AntonB if you want to use it!
+ V &get_raw_value_reference(const KeyWithHash &kwh) {
+ return ValueBase::operator[](kwh.idx());
+ }
+
+ const V &get_raw_value_reference(const KeyWithHash &kwh) const {
+ return ValueBase::operator[](kwh.idx());
+ }
+
+ void put_value(const KeyWithHash &kwh, const V &value) {
+ StoringType::set_value(*this, kwh, value);
+ }
+
+ template<typename F>
+ void put_value(const KeyWithHash &kwh, const V &value, const F& inverter) {
+ StoringType::set_value(*this, kwh, value, inverter);
+ }
+
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ KeyBase::BinWrite(writer);
+ ValueBase::BinWrite(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &tmp) {
+ KeyBase::BinRead(reader, tmp);
+ ValueBase::BinRead(reader, tmp);
+ }
+
+ friend struct PerfectHashMapBuilder;
+};
+
+
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
+class KeyStoringMap : public PerfectHashMap<K, V, traits, StoringType> {
+private:
+ typedef PerfectHashMap<K, V, traits, StoringType> base;
+
+public:
+ typedef traits traits_t;
+ typedef K KMer;
+ typedef typename base::IdxType KMerIdx;
+ typedef typename traits::FinalKMerStorage::iterator kmer_iterator;
+ typedef typename traits::FinalKMerStorage::const_iterator const_kmer_iterator;
+ typedef typename base::KeyWithHash KeyWithHash;
+ using base::ConstructKWH;
+
+private:
+ std::unique_ptr<typename traits::FinalKMerStorage> kmers_;
+
+ void SortUniqueKMers() const {
+ size_t swaps = 0;
+ INFO("Arranging kmers in hash map order");
+ for (auto I = kmers_->begin(), E = kmers_->end(); I != E; ++I) {
+ size_t cidx = I - kmers_->begin();
+ size_t kidx = this->raw_seq_idx(*I);
+ while (cidx != kidx) {
+ auto J = kmers_->begin() + kidx;
+ using std::swap;
+ swap(*I, *J);
+ swaps += 1;
+ kidx = this->raw_seq_idx(*I);
+ }
+ }
+ INFO("Done. Total swaps: " << swaps);
+ }
+
+protected:
+ template<class Writer>
+ void BinWriteKmers(Writer &writer) const {
+ traits::raw_serialize(writer, this->kmers_);
+ }
+
+ template<class Reader>
+ void BinReadKmers(Reader &reader, const std::string &FileName) {
+ this->kmers_ = traits_t::raw_deserialize(reader, FileName);
+ }
+
+public:
+ template<class Writer>
+ void BinWrite(Writer &writer) const {
+ base::BinWrite(writer);
+ BinWriteKmers(writer);
+ }
+
+ template<class Reader>
+ void BinRead(Reader &reader, const std::string &FileName) {
+ base::BinRead(reader, FileName);
+ BinReadKmers(reader, FileName);
+ }
+
+ KeyStoringMap(size_t k, const std::string &workdir)
+ : base(k, workdir), kmers_(nullptr) {}
+
+ ~KeyStoringMap() {}
+
+ KMer true_kmer(KeyWithHash kwh) const {
+ VERIFY(this->valid(kwh));
+
+ auto it = this->kmers_->begin() + kwh.idx();
+ return (typename traits_t::raw_create()(this->k(), *it));
+ }
+
+ void clear() {
+ base::clear();
+ kmers_ = nullptr;
+ }
+
+ kmer_iterator kmer_begin() {
+ return kmers_->begin();
+ }
+ const_kmer_iterator kmer_begin() const {
+ return kmers_->cbegin();
+ }
+
+ kmer_iterator kmer_end() {
+ return kmers_->end();
+ }
+ const_kmer_iterator kmer_end() const {
+ return kmers_->cend();
+ }
+
+ bool valid(const KeyWithHash &kwh) const {
+ if (!base::valid(kwh))
+ return false;
+
+ auto it = this->kmers_->begin() + kwh.idx();
+ if (!kwh.is_minimal())
+ return (typename traits_t::raw_equal_to()(!kwh.key(), *it));
+ else
+ return (typename traits_t::raw_equal_to()(kwh.key(), *it));
+ }
+
+ /**
+ * Number of edges going out of the end of the edge passed as parameter
+ */
+ unsigned NextEdgeCount(const KeyWithHash &kwh) const {
+ unsigned res = 0;
+ for (char c = 0; c < 4; ++c)
+ if (valid(kwh << c))
+ res += 1;
+
+ return res;
+ }
+
+ KeyWithHash NextEdge(const KeyWithHash &kwh) const { // returns any next edge
+ for (char c = 0; c < 4; ++c) {
+ if (valid(kwh << c))
+ // hack for this code to work with long seqs! (otherwise returning s is totally fine)
+ return ConstructKWH(true_kmer(kwh));//s;
+ }
+
+ VERIFY_MSG(false, "Couldn't find requested edge!");
+ return ConstructKWH(KMer(this->k()));
+ // no next edges (we should request one here).
+ }
+
+ /**
+ * Number of edges coming into the end of the edge passed as parameter
+ */
+ unsigned RivalEdgeCount(const KeyWithHash &kwh) const {
+ KeyWithHash next = kwh << 'A';
+ unsigned res = 0;
+ for (char c = 0; c < 4; ++c)
+ if (valid(next >> c))
+ res += 1;
+
+ return res;
+ }
+
+ friend struct KeyStoringIndexBuilder;
+};
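
A sketch of the lookup-then-verify pattern behind KeyStoringMap::valid(): a perfect hash maps the indexed keys to unique slots but maps unknown keys to arbitrary slots, so the k-mer stored at the slot has to be compared back against the query. mph() below is a placeholder stand-in, not the real MPHF.

#include <cstdio>
#include <functional>
#include <string>
#include <vector>

// Placeholder for the minimal perfect hash; NOT the real MPHF.
static size_t mph(const std::string &kmer, size_t n) {
    return std::hash<std::string>()(kmer) % n;
}

int main() {
    // In the real index, the k-mer storage is permuted (SortUniqueKMers) so that
    // stored[mph(k)] == k holds for every indexed k-mer.
    std::vector<std::string> stored = {"ACG", "CGT", "GTA"};
    std::string query = "TTT";                 // not in the index
    size_t idx = mph(query, stored.size());    // still lands in *some* slot
    bool present = (stored[idx] == query);     // the verify step
    std::printf("query %s -> slot %zu holds %s -> %s\n", query.c_str(), idx,
                stored[idx].c_str(), present ? "present" : "absent");
    return 0;
}
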
+
+template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
+class KeyIteratingMap : public PerfectHashMap<K, V, traits, StoringType> {
+ typedef PerfectHashMap<K, V, traits, StoringType> base;
+
+ std::string KMersFilename_;
+
+public:
+ typedef StoringType storing_type;
+ typedef typename base::traits_t traits_t;
+ typedef typename base::KeyType KMer;
+ typedef typename base::IdxType KMerIdx;
+ using base::ConstructKWH;
+
+public:
+
+ KeyIteratingMap(size_t k, const std::string &workdir)
+ : base(k, workdir), KMersFilename_("") {}
+
+ ~KeyIteratingMap() {}
+
+ typedef MMappedFileRecordArrayIterator<typename KMer::DataType> kmer_iterator;
+
+ kmer_iterator kmer_begin() const {
+ return kmer_iterator(this->KMersFilename_, KMer::GetDataSize(base::k()));
+ }
+
+ std::vector<kmer_iterator> kmer_begin(size_t parts) const {
+ return io::make_kmer_iterator<KMer>(this->KMersFilename_, base::k(), parts);
+ }
+
+ friend struct KeyIteratingIndexBuilder;
+};
+
+}
diff --git a/src/common/utils/indices/perfect_hash_map_builder.hpp b/src/common/utils/indices/perfect_hash_map_builder.hpp
new file mode 100644
index 0000000..c8d6972
--- /dev/null
+++ b/src/common/utils/indices/perfect_hash_map_builder.hpp
@@ -0,0 +1,102 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/mph_index/kmer_index_builder.hpp"
+
+#include "perfect_hash_map.hpp"
+#include "kmer_splitters.hpp"
+
+namespace debruijn_graph {
+
+struct PerfectHashMapBuilder {
+ template<class K, class V, class traits, class StoringType, class Counter>
+ void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) const {
+ using KMerIndex = typename PerfectHashMap<K, V, traits, StoringType>::KMerIndexT;
+
+ KMerIndexBuilder<KMerIndex> builder(index.workdir(),
+ (unsigned) bucket_num,
+ (unsigned) thread_num);
+ size_t sz = builder.BuildIndex(*index.index_ptr_, counter, save_final);
+ index.resize(sz);
+ }
+};
+
+struct KeyStoringIndexBuilder {
+ template<class K, class V, class traits, class StoringType, class Counter>
+ void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) const {
+ phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
+ VERIFY(!index.kmers_.get());
+ index.kmers_ = counter.GetFinalKMers();
+ VERIFY(index.kmers_.get());
+ index.SortUniqueKMers();
+ }
+
+ private:
+ PerfectHashMapBuilder phm_builder_;
+};
+
+struct KeyIteratingIndexBuilder {
+ template<class K, class V, class traits, class StoringType, class Counter>
+ void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) const {
+ phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
+ index.KMersFilename_ = counter.GetFinalKMersFname();
+ }
+
+ private:
+ PerfectHashMapBuilder phm_builder_;
+};
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) {
+ KeyIteratingIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) {
+ KeyStoringIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+template<class K, class V, class traits, class StoringType, class Counter>
+void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
+ Counter& counter, size_t bucket_num,
+ size_t thread_num, bool save_final = true) {
+ PerfectHashMapBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
+}
+
+template<class Index, class Streams>
+size_t BuildIndexFromStream(Index &index,
+ Streams &streams,
+ io::SingleStream* contigs_stream = 0) {
+ DeBruijnReadKMerSplitter<typename Streams::ReadT,
+ StoringTypeFilter<typename Index::storing_type>>
+ splitter(index.workdir(), index.k(), 0, streams, contigs_stream);
+ KMerDiskCounter<RtSeq> counter(index.workdir(), splitter);
+ BuildIndex(index, counter, 16, streams.size());
+ return 0;
+}
+
+template<class Index, class Graph>
+void BuildIndexFromGraph(Index &index, const Graph &g, size_t read_buffer_size = 0) {
+ DeBruijnGraphKMerSplitter<Graph,
+ StoringTypeFilter<typename Index::storing_type>>
+ splitter(index.workdir(), index.k(), g, read_buffer_size);
+ KMerDiskCounter<RtSeq> counter(index.workdir(), splitter);
+ BuildIndex(index, counter, 16, 1);
+}
+
+}
diff --git a/src/common/utils/indices/storing_traits.hpp b/src/common/utils/indices/storing_traits.hpp
new file mode 100644
index 0000000..0904cd4
--- /dev/null
+++ b/src/common/utils/indices/storing_traits.hpp
@@ -0,0 +1,81 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+/*
+ * storing_traits.hpp
+ *
+ * Created on: Nov 7, 2013
+ * Author: anton
+ */
+
+#include "values.hpp"
+
+namespace debruijn_graph {
+
+
+struct SimpleStoring {
+ template<class K, class V>
+ static V get_value(const ValueArray<V> &values, const K& key) {
+ return values[key.idx()];
+ }
+
+ template<class K, class V>
+ static void set_value(ValueArray<V> &values, const K& key, const V& value) {
+ values[key.idx()] = value;
+ }
+
+ static bool IsInvertable() {
+ return false;
+ }
+};
+
+struct InvertableStoring {
+ template<typename V>
+ struct default_inverter {
+ template<typename K>
+ V operator()(const V& v, const K& k) const {
+ return v.conjugate(k);
+ }
+ };
+
+ template<typename V>
+ struct trivial_inverter {
+ template<typename K>
+ V operator()(const V& v, const K& /*k*/) const {
+ return v;
+ }
+ };
+
+ template<class K, class V, class F = default_inverter<V>>
+ static V get_value(const ValueArray<V> &values, const K& key,
+ const F& inverter = F()) {
+ if (key.is_minimal())
+ return values[key.idx()];
+ else
+ return inverter(values[key.idx()], key);
+ }
+
+ template<class K, class V, class F = default_inverter<V>>
+ static void set_value(ValueArray<V>& values, const K& key, const V& value,
+ const F& inverter = F()) {
+ VERIFY(key.idx() < values.size());
+ if (key.is_minimal()) {
+ values[key.idx()] = value;
+ } else {
+ values[key.idx()] = inverter(value, key);
+ }
+ }
+
+ static bool IsInvertable() {
+ return true;
+ }
+};
+
+typedef InvertableStoring DefaultStoring;
+
+}
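
A toy sketch of the InvertableStoring contract above: the value is physically stored once, under the canonical (minimal) k-mer, and a query through the non-minimal key passes the stored value through an inverter. ToyPosition and its argument-free conjugate() are made up for illustration; the real values implement conjugate(key) themselves.

#include <cassert>

struct ToyPosition {
    int offset;
    // toy "conjugate": position counted from the other end of a length-10 edge
    ToyPosition conjugate() const { return ToyPosition{10 - offset}; }
};

int main() {
    ToyPosition stored{3};        // kept once, under the canonical k-mer
    bool key_is_minimal = false;  // pretend the query used the reverse complement

    ToyPosition result = key_is_minimal ? stored : stored.conjugate();
    assert(result.offset == 7);   // the caller sees the inverted view
    return 0;
}
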
diff --git a/src/modules/data_structures/indices/values.hpp b/src/common/utils/indices/values.hpp
similarity index 100%
rename from src/modules/data_structures/indices/values.hpp
rename to src/common/utils/indices/values.hpp
diff --git a/src/common/utils/levenshtein.hpp b/src/common/utils/levenshtein.hpp
new file mode 100644
index 0000000..9fad614
--- /dev/null
+++ b/src/common/utils/levenshtein.hpp
@@ -0,0 +1,241 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "utils/simple_tools.hpp"
+
+/*
+ * Slightly modified copy-paste from http://www.merriampark.com/ldcpp.htm
+ */
+inline size_t edit_distance(const std::string &source, const std::string &target) {
+
+ // Step 1
+
+ const size_t n = source.length();
+ const size_t m = target.length();
+ if (n == 0) {
+ return m;
+ }
+ if (m == 0) {
+ return n;
+ }
+
+ // Good form to declare a TYPEDEF
+
+ typedef std::vector<std::vector<size_t> > Tmatrix;
+
+ Tmatrix matrix(n + 1);
+
+ // Size the vectors in the 2nd dimension. Unfortunately C++ doesn't
+ // allow allocating the 2nd dimension of a vector of vectors at declaration
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i].resize(m + 1);
+ }
+
+ // Step 2
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i][0] = i;
+ }
+
+ for (size_t j = 0; j <= m; j++) {
+ matrix[0][j] = j;
+ }
+
+ // Step 3
+
+ for (size_t i = 1; i <= n; i++) {
+
+ const char s_i = source[i - 1];
+
+ // Step 4
+
+ for (size_t j = 1; j <= m; j++) {
+
+ const char t_j = target[j - 1];
+
+ // Step 5
+
+ size_t cost;
+ if (s_i == t_j) {
+ cost = 0;
+ }
+ else {
+ cost = 1;
+ }
+
+ // Step 6
+
+ const size_t above = matrix[i - 1][j];
+ const size_t left = matrix[i][j - 1];
+ const size_t diag = matrix[i - 1][j - 1];
+ size_t cell = std::min(above + 1, std::min(left + 1, diag + cost));
+
+ // Step 6A: Cover transposition, in addition to deletion,
+ // insertion and substitution. This step is taken from:
+ // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
+ // Enhanced Dynamic Programming ASM Algorithm"
+ // (http://www.acm.org/~hlb/publications/asm/asm.html)
+
+ if (i > 2 && j > 2) {
+ size_t trans = matrix[i - 2][j - 2] + 1;
+ if (source[i - 2] != t_j) trans++;
+ if (s_i != target[j - 2]) trans++;
+ if (cell > trans) cell = trans;
+ }
+
+ matrix[i][j] = cell;
+ }
+ }
+
+ // Step 7
+
+ return matrix[n][m];
+}
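
A usage sketch for edit_distance(), assuming the SPAdes include paths introduced by this patch. Because of the Step 6A transposition, this is the restricted Damerau-Levenshtein (optimal string alignment) distance: an adjacent swap costs 1 rather than 2. Note that, as written, the `i > 2 && j > 2` guard means a swap of the very first two characters is still charged as 2.

#include <cassert>
#include "utils/levenshtein.hpp"

int main() {
    assert(edit_distance("kitten", "sitting") == 3); // 2 substitutions + 1 insertion
    assert(edit_distance("abcd", "abdc") == 1);      // adjacent swap counted once
    assert(edit_distance("ba", "ab") == 2);          // first-pair swap not credited
    return 0;
}
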
+
+inline std::pair<std::pair<int, int>, std::string> best_edit_distance_cigar(const std::string &source,
+ const std::string &target) {
+
+ // Step 1
+
+ const size_t n = source.length();
+ const size_t m = target.length();
+// if (n == 0) {
+// return m;
+// }
+// if (m == 0) {
+// return n;
+// }
+
+ // Good form to declare a TYPEDEF
+
+ typedef std::vector<std::vector<int> > Tmatrix;
+
+ Tmatrix matrix(n + 1);
+
+ // Size the vectors in the 2nd dimension. Unfortunately C++ doesn't
+ // allow allocating the 2nd dimension of a vector of vectors at declaration
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i].resize(m + 1);
+ }
+
+ // Step 2
+
+ for (size_t i = 0; i <= n; i++) {
+ matrix[i][0] = (int) i;
+ }
+
+ for (size_t j = 0; j <= m; j++) {
+ matrix[0][j] = 0; //free inserts in front
+ }
+
+ // Step 3
+
+ for (size_t i = 1; i <= n; i++) {
+
+ const char s_i = source[i - 1];
+
+ // Step 4
+
+ for (size_t j = 1; j <= m; j++) {
+
+ const char t_j = target[j - 1];
+
+ // Step 5
+
+ int cost;
+ if (s_i == t_j) {
+ cost = 0;
+ }
+ else {
+ cost = 1;
+ }
+
+ // Step 6
+
+ const int above = matrix[i - 1][j];
+ const int left = matrix[i][j - 1];
+ const int diag = matrix[i - 1][j - 1];
+ int cell = std::min(above + 1, std::min(left + 1, diag + cost));
+
+ // Step 6A: Cover transposition, in addition to deletion,
+ // insertion and substitution. This step is taken from:
+ // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
+ // Enhanced Dynamic Programming ASM Algorithm"
+ // (http://www.acm.org/~hlb/publications/asm/asm.html)
+
+// if (i>2 && j>2) {
+// int trans=matrix[i-2][j-2]+1;
+// if (source[i-2]!=t_j) trans++;
+// if (s_i!=target[j-2]) trans++;
+// if (cell>trans) cell=trans;
+// }
+
+ matrix[i][j] = cell;
+ }
+ }
+
+ // Step 7
+ int min = matrix[n][m];
+ size_t min_m = m;
+
+ for (size_t j = 0; j <= m; j++) {
+ if (min > matrix[n][j]) {
+ min = matrix[n][j];
+ min_m = j;
+ }
+ }
+
+// INFO("min = "<<min<< " min_m = "<< min_m);
+ std::string res = "";
+ char last_operation = 0;
+ int cnt_last_operation = 0;
+ size_t cur_pos_i = n;
+ size_t cur_pos_j = min_m;
+ char cur_operation = 0;
+
+
+// if (min > 0) {
+// for (int i = 0; i <= n; i++) {
+// INFO(ToString(matrix[i]));
+// }
+// }
+
+ while ((cur_pos_i > 0) && (cur_pos_j > 0)) {
+ if (matrix[cur_pos_i - 1][cur_pos_j] < matrix[cur_pos_i][cur_pos_j]) {
+ cur_operation = 'I';
+ cur_pos_i--;
+ }
+ else {
+ if (matrix[cur_pos_i][cur_pos_j - 1] < matrix[cur_pos_i][cur_pos_j]) {
+ cur_operation = 'D';
+ cur_pos_j--;
+ }
+ else {
+ cur_operation = 'M';
+ cur_pos_i--;
+ cur_pos_j--;
+ }
+ }
+ if (cur_operation != last_operation) {
+ if (last_operation != 0)
+ res = ToString(cnt_last_operation) + last_operation + res;
+ last_operation = cur_operation;
+ cnt_last_operation = 1;
+ }
+ else {
+ cnt_last_operation++;
+ }
+ }
+ res = ToString(cnt_last_operation) + last_operation + res;
+ return std::make_pair(std::make_pair(cur_pos_j, min_m), res);
+}
diff --git a/src/modules/dev_support/log.hpp b/src/common/utils/log.hpp
similarity index 100%
rename from src/modules/dev_support/log.hpp
rename to src/common/utils/log.hpp
diff --git a/src/common/utils/logger/log_writers.hpp b/src/common/utils/logger/log_writers.hpp
new file mode 100644
index 0000000..666c03f
--- /dev/null
+++ b/src/common/utils/logger/log_writers.hpp
@@ -0,0 +1,43 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/path_helper.hpp"
+#include "logger.hpp"
+
+#include <iostream>
+
+#include "config.hpp"
+
+namespace logging {
+
+struct console_writer : public writer {
+#ifdef SPADES_USE_JEMALLOC
+
+ void write_msg(double time, size_t cmem, size_t max_rss, level l, const char *file, size_t line_num,
+ const char *source, const char *msg) {
+ std::cout << fmt::format("{:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
+ human_readable_time(time), human_readable_memory(cmem),
+ human_readable_memory(max_rss), logging::level_name(l),
+ source, path::filename(file), int(line_num), msg)
+ << std::endl;
+ }
+
+#else
+ void write_msg(double time, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) {
+ std::cout << fmt::format("{:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
+ human_readable_time(time), human_readable_memory(max_rss), logging::level_name(l),
+ source, path::filename(file), int(line_num), msg)
+ << std::endl;
+ }
+#endif
+};
+
+} // logging
diff --git a/src/common/utils/logger/logger.hpp b/src/common/utils/logger/logger.hpp
new file mode 100644
index 0000000..c088aed
--- /dev/null
+++ b/src/common/utils/logger/logger.hpp
@@ -0,0 +1,149 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "utils/perfcounter.hpp"
+
+#include <vector>
+#include <unordered_map>
+#include <string>
+#include <sstream>
+#include <memory>
+
+#include "config.hpp"
+
+namespace logging
+{
+
+/////////////////////////////////////////////////////
+enum level
+{
+ L_TRACE,
+ L_DEBUG,
+ L_INFO,
+ L_WARN,
+ L_ERROR
+};
+
+inline std::string level_name(level l)
+{
+ static std::string names [] =
+ {
+ "TRACE",
+ "DEBUG",
+ "INFO" ,
+ "WARN" ,
+ "ERROR"
+ };
+
+ return names[l];
+}
+
+
+/////////////////////////////////////////////////////
+struct writer
+{
+#ifdef SPADES_USE_JEMALLOC
+ virtual void write_msg(double time_in_sec, size_t cmem, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
+#else
+ virtual void write_msg(double time_in_sec, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
+#endif
+ virtual ~writer(){}
+};
+
+typedef std::shared_ptr<writer> writer_ptr;
+
+/////////////////////////////////////////////////////
+struct properties
+{
+ /* Reading logger properties from file
+ *
+ * The file should contain lines like those below.
+ * Use a leading # for comments.
+ * The file may contain a line describing the default behavior; if no 'default' entry is found, the default level is INFO.
+ * Valid levels: TRACE, DEBUG, INFO, WARN, ERROR
+ *
+ * default=INFO
+ * AbraCaDabra=TRACE
+ * #BubaZuba=WARN
+ * HariKrishna=INFO
+ *
+ */
+
+ properties(std::string filename = "", level default_level = L_INFO);
+ properties(level default_level = L_INFO);
+
+ std::unordered_map<std::string, level> levels;
+ level def_level;
+ bool all_default;
+};
+
+////////////////////////////////////////////////////
+struct logger
+{
+ logger(properties const& props);
+
+ //
+ bool need_log(level desired_level, const char* source) const;
+ void log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg);
+
+ //
+ void add_writer(writer_ptr ptr);
+
+private:
+ properties props_ ;
+ std::vector<writer_ptr> writers_;
+ perf_counter timer_ ;
+};
+
+std::shared_ptr<logger>& __logger();
+logger* create_logger(std::string filename = "", level default_level = L_INFO);
+
+void attach_logger(logger *lg);
+void detach_logger();
+
+} // logging
+
+inline const char* __scope_source_name() {
+ return " General ";
+}
+
+#define DECL_LOGGER(source) \
+ static const char* __scope_source_name() { \
+ return source; \
+ }
+
+#define LOG_MSG(l, msg) \
+ do { \
+ std::shared_ptr<logging::logger> &__lg__ = logging::__logger(); \
+ if (__lg__.get() == NULL) \
+ break; \
+ \
+ if (__lg__->need_log((l), __scope_source_name())) { \
+ std::stringstream __logger__str__; \
+ __logger__str__ << msg; /* don't use brackets here! */ \
+ __lg__->log((l), __FILE__, __LINE__, __scope_source_name(), __logger__str__.str().c_str()); \
+ } \
+ } while(0);
+
+#ifdef SPADES_DEBUG_LOGGING
+# define DEBUG(message) LOG_MSG(logging::L_DEBUG, message)
+# define TRACE(message) LOG_MSG(logging::L_TRACE, message)
+#else
+# define DEBUG(message) /* No trace */
+# define TRACE(message) /* No trace */
+#endif
+#define INFO(message) LOG_MSG(logging::L_INFO , message)
+#define VERBOSE_T(n, T, message) {size_t n_copy = (n); if (n_copy % (T) == 0 && n_copy > 0) INFO(n_copy << message)}
+#define VERBOSE(n, message) VERBOSE_T((n), 10000, message)
+#define VERBOSE_POWER_T(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(n_copy << message)}
+#define VERBOSE_POWER(n, message) VERBOSE_POWER_T((n), 10000, message)
+#define VERBOSE_POWER_T2(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(message)}
+#define VERBOSE_POWER2(n, message) VERBOSE_POWER_T2((n), 10000, message)
+#define WARN(message) LOG_MSG(logging::L_WARN, message)
+#define ERROR(message) LOG_MSG(logging::L_ERROR, message)
+#define FATAL_ERROR(message) {ERROR(message); exit(-1);}
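
A minimal wiring example for these macros, assuming the include layout of this patch: create a logger, attach a console_writer from log_writers.hpp, and give a class its own source name with DECL_LOGGER. GraphBuilder is an arbitrary example class, not part of the codebase.

#include <memory>

#include "utils/logger/logger.hpp"
#include "utils/logger/log_writers.hpp"

class GraphBuilder {
    DECL_LOGGER("GraphBuilder");
public:
    void Run() {
        INFO("Building graph");   // reported under the "GraphBuilder" source
    }
};

int main() {
    logging::logger *lg = logging::create_logger("", logging::L_INFO);
    lg->add_writer(std::make_shared<logging::console_writer>());
    logging::attach_logger(lg);   // __logger() takes ownership from here on
    GraphBuilder().Run();
    return 0;
}
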
diff --git a/src/common/utils/logger/logger_impl.cpp b/src/common/utils/logger/logger_impl.cpp
new file mode 100644
index 0000000..4b8ce6b
--- /dev/null
+++ b/src/common/utils/logger/logger_impl.cpp
@@ -0,0 +1,148 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <boost/algorithm/string.hpp>
+#include <cppformat/format.h>
+
+#include <string>
+#include <map>
+#include <fstream>
+#include <vector>
+
+#include "utils/logger/logger.hpp"
+
+#include "config.hpp"
+
+#ifdef SPADES_USE_JEMALLOC
+# include <jemalloc/jemalloc.h>
+#endif
+
+namespace logging {
+
+properties::properties(level default_level)
+ : def_level(default_level), all_default(true) {}
+
+properties::properties(std::string filename, level default_level)
+ : def_level(default_level), all_default(true) {
+ if (filename.empty())
+ return;
+
+ std::ifstream in(filename.c_str());
+
+ std::map<std::string, level> remap = {
+ {"TRACE", L_TRACE},
+ {"DEBUG", L_DEBUG},
+ {"INFO" , L_INFO },
+ {"WARN" , L_WARN },
+ {"ERROR", L_ERROR}
+ };
+
+ while (!in.eof()) {
+ using namespace boost;
+
+ char buf [0x400] = {};
+ in.getline(buf, sizeof buf);
+
+ std::string str(buf);
+ trim(str);
+
+ if (str.empty() || boost::starts_with(str, "#"))
+ continue;
+
+ std::vector<std::string> entry;
+ split(entry, str, is_any_of("="));
+
+ if(entry.size() != 2)
+ throw std::runtime_error("invalid log file property entry: " + str);
+
+ trim (entry[0]);
+ trim (entry[1]);
+ to_upper(entry[1]);
+
+ auto it = remap.find(entry[1]);
+ if(it == remap.end())
+ throw std::runtime_error("invalid log file level description: " + entry[1]);
+
+ levels[entry[0]] = it->second;
+ }
+
+ auto def = levels.find("default");
+ if (def != levels.end())
+ def_level = def->second;
+
+ for (auto I = levels.begin(), E = levels.end(); I != E; ++I) {
+ if (I->second != def_level) {
+ all_default = false;
+ break;
+ }
+ }
+}
+
+
+logger::logger(properties const& props)
+ : props_(props) { }
+
+bool logger::need_log(level desired_level, const char* source) const {
+ level source_level = props_.def_level;
+
+ if (!props_.all_default) {
+ auto it = props_.levels.find(source);
+ if (it != props_.levels.end())
+ source_level = it->second;
+ }
+
+ return desired_level >= source_level;
+}
+
+#ifdef SPADES_USE_JEMALLOC
+
+void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
+ double time = timer_.time();
+ const size_t *cmem = 0, *cmem_max = 0;
+ size_t clen = sizeof(cmem);
+
+ je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
+ je_mallctl("stats.cactive_max", &cmem_max, &clen, NULL, 0);
+
+ for (auto it = writers_.begin(); it != writers_.end(); ++it)
+ (*it)->write_msg(time, (*cmem) / 1024, (*cmem_max) / 1024, desired_level, file, line_num, source, msg);
+}
+#else
+void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
+ double time = timer_.time();
+ size_t max_rss = get_max_rss();
+
+ for (auto it = writers_.begin(); it != writers_.end(); ++it)
+ (*it)->write_msg(time, max_rss, desired_level, file, line_num, source, msg);
+}
+#endif
+
+//
+void logger::add_writer(writer_ptr ptr)
+{
+ writers_.push_back(ptr);
+}
+
+////////////////////////////////////////////////////
+std::shared_ptr<logger> &__logger() {
+ static std::shared_ptr<logger> l;
+ return l;
+}
+
+logger *create_logger(std::string filename, level default_level) {
+ return new logger(properties(filename, default_level));
+}
+
+void attach_logger(logger *lg) {
+ __logger().reset(lg);
+}
+
+void detach_logger() {
+ __logger().reset();
+}
+
+
+} // logging
diff --git a/src/modules/dev_support/md5.h b/src/common/utils/md5.h
similarity index 100%
rename from src/modules/dev_support/md5.h
rename to src/common/utils/md5.h
diff --git a/src/modules/dev_support/memory.hpp b/src/common/utils/memory.hpp
similarity index 100%
rename from src/modules/dev_support/memory.hpp
rename to src/common/utils/memory.hpp
diff --git a/src/modules/dev_support/memory_limit.hpp b/src/common/utils/memory_limit.hpp
similarity index 100%
rename from src/modules/dev_support/memory_limit.hpp
rename to src/common/utils/memory_limit.hpp
diff --git a/src/modules/data_structures/mph_index/CMakeLists.txt b/src/common/utils/mph_index/CMakeLists.txt
similarity index 100%
rename from src/modules/data_structures/mph_index/CMakeLists.txt
rename to src/common/utils/mph_index/CMakeLists.txt
diff --git a/src/modules/data_structures/mph_index/base_hash.hpp b/src/common/utils/mph_index/base_hash.hpp
similarity index 100%
rename from src/modules/data_structures/mph_index/base_hash.hpp
rename to src/common/utils/mph_index/base_hash.hpp
diff --git a/src/modules/data_structures/mph_index/bitpair_vector.cpp b/src/common/utils/mph_index/bitpair_vector.cpp
similarity index 100%
rename from src/modules/data_structures/mph_index/bitpair_vector.cpp
rename to src/common/utils/mph_index/bitpair_vector.cpp
diff --git a/src/modules/data_structures/mph_index/bitpair_vector.hpp b/src/common/utils/mph_index/bitpair_vector.hpp
similarity index 100%
rename from src/modules/data_structures/mph_index/bitpair_vector.hpp
rename to src/common/utils/mph_index/bitpair_vector.hpp
diff --git a/src/modules/data_structures/mph_index/common.hpp b/src/common/utils/mph_index/common.hpp
similarity index 100%
rename from src/modules/data_structures/mph_index/common.hpp
rename to src/common/utils/mph_index/common.hpp
diff --git a/src/modules/data_structures/mph_index/emphf_config.hpp b/src/common/utils/mph_index/emphf_config.hpp
similarity index 100%
rename from src/modules/data_structures/mph_index/emphf_config.hpp
rename to src/common/utils/mph_index/emphf_config.hpp
diff --git a/src/modules/data_structures/mph_index/hypergraph.hpp b/src/common/utils/mph_index/hypergraph.hpp
similarity index 100%
rename from src/modules/data_structures/mph_index/hypergraph.hpp
rename to src/common/utils/mph_index/hypergraph.hpp
diff --git a/src/common/utils/mph_index/hypergraph_sorter_seq.hpp b/src/common/utils/mph_index/hypergraph_sorter_seq.hpp
new file mode 100644
index 0000000..9adfdc3
--- /dev/null
+++ b/src/common/utils/mph_index/hypergraph_sorter_seq.hpp
@@ -0,0 +1,130 @@
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+#include <cmath>
+#include <vector>
+#include <iterator>
+#include <algorithm>
+#include <stdexcept>
+
+#include "common.hpp"
+#include "hypergraph.hpp"
+
+#include "utils/logger/logger.hpp"
+
+namespace emphf {
+
+ template <typename HypergraphType>
+ class hypergraph_sorter_seq {
+ public:
+ typedef HypergraphType hg;
+ typedef typename hg::node_t node_t;
+ typedef typename hg::hyperedge hyperedge;
+ typedef typename hg::xored_adj_list xored_adj_list;
+
+ hypergraph_sorter_seq()
+ {}
+
+ template <typename Range, typename EdgeGenerator>
+ bool try_generate_and_sort(Range const& input_range,
+ EdgeGenerator const& edge_gen,
+ size_t n,
+ size_t hash_domain,
+ bool verbose = true)
+ {
+ using std::get;
+ std::vector<xored_adj_list> adj_lists;
+
+ size_t m = hash_domain * 3;
+
+ // do all the allocations upfront
+ m_peeling_order.clear();
+ m_peeling_order.reserve(n);
+ adj_lists.resize(m);
+
+ // generate edges
+ if (verbose) {
+ //logger() << "Generating hyperedges and populating adjacency lists"
+ // << std::endl;
+ }
+
+ for (auto const& val: input_range) {
+ auto edge = edge_gen(val);
+ // canonical by construction
+ assert(orientation(edge) == 0);
+
+ adj_lists[edge.v0].add_edge(edge);
+
+ std::swap(edge.v0, edge.v1);
+ adj_lists[edge.v0].add_edge(edge);
+
+ std::swap(edge.v0, edge.v2);
+ adj_lists[edge.v0].add_edge(edge);
+ }
+
+ // peel
+ if (verbose) {
+ // logger() << "Peeling" << std::endl;
+ }
+
+ auto visit = [&](node_t v0) {
+ if (adj_lists[v0].degree == 1) {
+ auto edge = adj_lists[v0].edge_from(v0);
+ m_peeling_order.push_back(edge);
+
+ edge = canonicalize_edge(edge);
+ adj_lists[edge.v0].delete_edge(edge);
+
+ std::swap(edge.v0, edge.v1);
+ adj_lists[edge.v0].delete_edge(edge);
+
+ std::swap(edge.v0, edge.v2);
+ adj_lists[edge.v0].delete_edge(edge);
+ }
+ };
+
+ size_t queue_position = 0;
+ for (node_t v0 = 0; v0 < m; ++v0) {
+ visit(v0);
+
+ while (queue_position < m_peeling_order.size()) {
+ auto const& cur_edge = m_peeling_order[queue_position];
+
+ visit(cur_edge.v1);
+ visit(cur_edge.v2);
+ queue_position += 1;
+ }
+ }
+
+ if (m_peeling_order.size() < n) {
+ if (verbose) {
+ // logger() << "Hypergraph is not peelable: "
+ // << (n - m_peeling_order.size()) << " edges remaining"
+ // << std::endl;
+ }
+ return false;
+ }
+
+ assert(m_peeling_order.size() == n);
+
+ return true;
+ }
+
+ typedef typename std::vector<hyperedge>::const_reverse_iterator
+ peeling_iterator;
+
+ std::pair<peeling_iterator, peeling_iterator>
+ get_peeling_order() const
+ {
+ return std::make_pair(m_peeling_order.crbegin(),
+ m_peeling_order.crend());
+ }
+
+ private:
+
+ size_t m_hash_domain;
+ std::vector<hyperedge> m_peeling_order;
+ };
+}
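
try_generate_and_sort() above succeeds exactly when the 3-uniform hypergraph can be fully peeled: repeatedly remove an edge that still has a vertex of degree 1. A toy standalone peeler is sketched below; the real code also records the peeling order so the MPHF assignment step can later walk it in reverse.

#include <array>
#include <cstdio>
#include <vector>

int main() {
    // Three hyperedges over six vertices; every edge touches exactly 3 vertices.
    std::vector<std::array<size_t, 3>> edges = {{0, 1, 2}, {1, 2, 3}, {3, 4, 5}};
    std::vector<int> degree(6, 0);
    for (const auto &e : edges)
        for (size_t v : e) degree[v]++;

    std::vector<bool> peeled(edges.size(), false);
    size_t n_peeled = 0;
    bool progress = true;
    while (progress) {
        progress = false;
        for (size_t i = 0; i < edges.size(); ++i) {
            if (peeled[i]) continue;
            bool has_degree_one = false;
            for (size_t v : edges[i]) has_degree_one |= (degree[v] == 1);
            if (!has_degree_one) continue;
            peeled[i] = true;                      // this edge can be resolved later
            ++n_peeled;
            for (size_t v : edges[i]) degree[v]--; // peeling may expose new degree-1 vertices
            progress = true;
        }
    }
    std::printf("peeled %zu of %zu edges -> construction %s\n", n_peeled,
                edges.size(), n_peeled == edges.size() ? "succeeds" : "fails");
    return 0;
}
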
diff --git a/src/modules/data_structures/mph_index/kmer_index.hpp b/src/common/utils/mph_index/kmer_index.hpp
similarity index 100%
rename from src/modules/data_structures/mph_index/kmer_index.hpp
rename to src/common/utils/mph_index/kmer_index.hpp
diff --git a/src/common/utils/mph_index/kmer_index_builder.hpp b/src/common/utils/mph_index/kmer_index_builder.hpp
new file mode 100644
index 0000000..1d72db1
--- /dev/null
+++ b/src/common/utils/mph_index/kmer_index_builder.hpp
@@ -0,0 +1,486 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "kmer_index.hpp"
+
+#include "io/kmers/mmapped_reader.hpp"
+#include "io/kmers/mmapped_writer.hpp"
+#include "common/adt/pointer_iterator.hpp"
+#include "common/adt/kmer_vector.hpp"
+
+#include "utils/openmp_wrapper.h"
+
+#include "utils/logger/logger.hpp"
+#include "utils/path_helper.hpp"
+
+#include "utils/memory_limit.hpp"
+#include "utils/file_limit.hpp"
+
+#include "adt/iterator_range.hpp"
+#include "adt/loser_tree.hpp"
+
+#include "mphf.hpp"
+#include "base_hash.hpp"
+#include "hypergraph.hpp"
+#include "hypergraph_sorter_seq.hpp"
+
+#include <libcxx/sort.hpp>
+
+#include <algorithm>
+#ifdef USE_GLIBCXX_PARALLEL
+#include <parallel/algorithm>
+#endif
+
+#include "config.hpp"
+
+#ifdef SPADES_USE_JEMALLOC
+# include <jemalloc/jemalloc.h>
+#endif
+
+#include <fstream>
+#include <vector>
+#include <cmath>
+
+template<class Seq>
+class KMerSplitter {
+ public:
+ typedef typename Seq::hash hash_function;
+
+ KMerSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
+ : work_dir_(work_dir), K_(K), seed_(seed) {}
+
+ virtual ~KMerSplitter() {}
+
+ virtual path::files_t Split(size_t num_files) = 0;
+
+ size_t kmer_size() const {
+ return Seq::GetDataSize(K_) * sizeof(typename Seq::DataType);
+ }
+
+ unsigned K() const { return K_; }
+
+ protected:
+ const std::string &work_dir_;
+ hash_function hash_;
+ unsigned K_;
+ uint32_t seed_;
+
+ DECL_LOGGER("K-mer Splitting");
+};
+
+template<class Seq>
+class KMerSortingSplitter : public KMerSplitter<Seq> {
+ public:
+ KMerSortingSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
+ : KMerSplitter<Seq>(work_dir, K, seed), cell_size_(0), num_files_(0) {}
+
+ protected:
+ using SeqKMerVector = KMerVector<Seq>;
+ using KMerBuffer = std::vector<SeqKMerVector>;
+
+ std::vector<KMerBuffer> kmer_buffers_;
+ size_t cell_size_;
+ size_t num_files_;
+
+ path::files_t PrepareBuffers(size_t num_files, unsigned nthreads, size_t reads_buffer_size) {
+ num_files_ = num_files;
+
+ // Determine the set of output files
+ path::files_t out;
+ for (unsigned i = 0; i < num_files_; ++i)
+ out.push_back(this->GetRawKMersFname(i));
+
+ size_t file_limit = num_files_ + 2*nthreads;
+ size_t res = limit_file(file_limit);
+ if (res < file_limit) {
+ WARN("Failed to set up the necessary limit for the number of open files. The process might crash later on.");
+ WARN("Run 'ulimit -n " << file_limit << "' in the console to raise the limit");
+ }
+
+ if (reads_buffer_size == 0) {
+ reads_buffer_size = 536870912ull;
+ size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
+ INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " GiB");
+ reads_buffer_size = std::min(reads_buffer_size, mem_limit);
+ }
+ cell_size_ = reads_buffer_size / (num_files_ * this->kmer_size());
+ // Set sane minimum cell size
+ if (cell_size_ < 16384)
+ cell_size_ = 16384;
+
+ INFO("Using cell size of " << cell_size_);
+ kmer_buffers_.resize(nthreads);
+ for (unsigned i = 0; i < nthreads; ++i) {
+ KMerBuffer &entry = kmer_buffers_[i];
+ entry.resize(num_files_, KMerVector<Seq>(this->K_, (size_t) (1.1 * (double) cell_size_)));
+ }
+
+ return out;
+ }
+
+ bool push_back_internal(const Seq &seq, unsigned thread_id) {
+ KMerBuffer &entry = kmer_buffers_[thread_id];
+
+ size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files_);
+ entry[idx].push_back(seq);
+ return entry[idx].size() > cell_size_;
+ }
+
+ void DumpBuffers(const path::files_t &ostreams) {
+ VERIFY(ostreams.size() == num_files_ && kmer_buffers_[0].size() == num_files_);
+
+# pragma omp parallel for
+ for (unsigned k = 0; k < num_files_; ++k) {
+ // Below k is thread id!
+
+ size_t sz = 0;
+ for (size_t i = 0; i < kmer_buffers_.size(); ++i)
+ sz += kmer_buffers_[i][k].size();
+
+ KMerVector<Seq> SortBuffer(this->K_, sz);
+ for (auto & entry : kmer_buffers_) {
+ const auto &buffer = entry[k];
+ for (size_t j = 0; j < buffer.size(); ++j)
+ SortBuffer.push_back(buffer[j]);
+ }
+ libcxx::sort(SortBuffer.begin(), SortBuffer.end(), typename KMerVector<Seq>::less2_fast());
+ auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), typename KMerVector<Seq>::equal_to());
+
+# pragma omp critical
+ {
+ size_t cnt = it - SortBuffer.begin();
+
+ // Write k-mers
+ FILE *f = fopen(ostreams[k].c_str(), "ab");
+ VERIFY_MSG(f, "Cannot open temporary file to write");
+ fwrite(SortBuffer.data(), SortBuffer.el_data_size(), cnt, f);
+ fclose(f);
+
+ // Write index
+ f = fopen((ostreams[k] + ".idx").c_str(), "ab");
+ VERIFY_MSG(f, "Cannot open temporary file to write");
+ fwrite(&cnt, sizeof(cnt), 1, f);
+ fclose(f);
+ }
+ }
+
+ for (auto & entry : kmer_buffers_)
+ for (auto & eentry : entry)
+ eentry.clear();
+ }
+
+ void ClearBuffers() {
+ for (auto & entry : kmer_buffers_)
+ for (auto & eentry : entry) {
+ eentry.clear();
+ eentry.shrink_to_fit();
+ }
+ }
+
+ std::string GetRawKMersFname(unsigned suffix) const {
+ return path::append_path(this->work_dir_, "kmers.raw." + std::to_string(suffix));
+ }
+
+ unsigned GetFileNumForSeq(const Seq &s, unsigned total) const {
+ return (unsigned)(this->hash_(s, this->seed_) % total);
+ }
+
+};
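
The splitter above routes each k-mer to a file by GetFileNumForSeq() (hash modulo the number of files) and sorts and deduplicates every buffer before it is dumped. The same scheme with plain strings, using std::hash as a stand-in for Seq::hash:

#include <algorithm>
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> kmers = {"ACG", "CGT", "ACG", "GGA", "CGT"};
    const size_t num_buckets = 2;
    std::vector<std::vector<std::string>> buckets(num_buckets);
    for (const auto &k : kmers)
        buckets[std::hash<std::string>{}(k) % num_buckets].push_back(k);

    for (size_t b = 0; b < num_buckets; ++b) {
        auto &v = buckets[b];
        std::sort(v.begin(), v.end());                        // sort within the bucket
        v.erase(std::unique(v.begin(), v.end()), v.end());    // drop duplicates
        std::printf("bucket %zu: %zu unique k-mers\n", b, v.size());
    }
    return 0;
}
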
+
+template<class Seq, class traits = kmer_index_traits<Seq> >
+class KMerCounter {
+ public:
+ typedef typename traits::raw_data_iterator iterator;
+ typedef typename traits::raw_data_const_iterator const_iterator;
+ typedef typename traits::RawKMerStorage RawKMerStorage;
+ typedef typename traits::FinalKMerStorage FinalKMerStorage;
+
+ virtual size_t kmer_size() const = 0;
+
+ virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0;
+ virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0;
+ virtual void MergeBuckets(unsigned num_buckets) = 0;
+
+ virtual std::unique_ptr<RawKMerStorage> GetBucket(size_t idx, bool unlink = true) = 0;
+ virtual std::unique_ptr<FinalKMerStorage> GetFinalKMers() = 0;
+
+ virtual ~KMerCounter() {}
+
+protected:
+ DECL_LOGGER("K-mer Counting");
+};
+
+template<class Seq, class traits = kmer_index_traits<Seq> >
+class KMerDiskCounter : public KMerCounter<Seq> {
+ typedef KMerCounter<Seq, traits> __super;
+ typedef typename traits::RawKMerStorage BucketStorage;
+public:
+ KMerDiskCounter(const std::string &work_dir, KMerSplitter<Seq> &splitter)
+ : work_dir_(work_dir), splitter_(splitter) {
+ std::string prefix = path::append_path(work_dir, "kmers_XXXXXX");
+ char *tempprefix = strcpy(new char[prefix.length() + 1], prefix.c_str());
+ VERIFY_MSG(-1 != (fd_ = ::mkstemp(tempprefix)), "Cannot create temporary file");
+ kmer_prefix_ = tempprefix;
+ delete[] tempprefix;
+ }
+
+ ~KMerDiskCounter() {
+ ::close(fd_);
+ ::unlink(kmer_prefix_.c_str());
+ }
+
+ size_t kmer_size() const override {
+ return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType);
+ }
+
+ std::unique_ptr<BucketStorage> GetBucket(size_t idx, bool unlink = true) override {
+ unsigned K = splitter_.K();
+ return std::unique_ptr<BucketStorage>(new BucketStorage(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink));
+ }
+
+ size_t Count(unsigned num_buckets, unsigned num_threads) override {
+ unsigned K = splitter_.K();
+
+ // Split k-mers into buckets.
+ path::files_t raw_kmers = splitter_.Split(num_buckets * num_threads);
+
+ INFO("Starting k-mer counting.");
+ size_t kmers = 0;
+# pragma omp parallel for shared(raw_kmers) num_threads(num_threads) schedule(dynamic) reduction(+:kmers)
+ for (unsigned iFile = 0; iFile < raw_kmers.size(); ++iFile) {
+ kmers += MergeKMers(raw_kmers[iFile], GetUniqueKMersFname(iFile), K);
+ }
+ INFO("K-mer counting done. There are " << kmers << " kmers in total. ");
+
+ INFO("Merging temporary buckets.");
+ for (unsigned i = 0; i < num_buckets; ++i) {
+ std::string ofname = GetMergedKMersFname(i);
+ std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
+ for (unsigned j = 0; j < num_threads; ++j) {
+ BucketStorage ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
+ ofs.write((const char*)ins.data(), ins.data_size());
+ }
+ }
+
+ return kmers;
+ }
+
+ void MergeBuckets(unsigned num_buckets) override {
+ unsigned K = splitter_.K();
+
+ INFO("Merging final buckets.");
+
+ MMappedRecordArrayWriter<typename Seq::DataType> os(GetFinalKMersFname(), Seq::GetDataSize(K));
+ std::string ofname = GetFinalKMersFname();
+ std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
+ for (unsigned j = 0; j < num_buckets; ++j) {
+ auto bucket = GetBucket(j, /* unlink */ true);
+ ofs.write((const char*)bucket->data(), bucket->data_size());
+ }
+ ofs.close();
+ }
+
+ size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) override {
+ size_t kmers = Count(num_buckets, num_threads);
+ if (merge)
+ MergeBuckets(num_buckets);
+
+ return kmers;
+ }
+
+ std::unique_ptr<typename __super::FinalKMerStorage> GetFinalKMers() override {
+ unsigned K = splitter_.K();
+ return std::unique_ptr<typename __super::FinalKMerStorage>(new typename __super::FinalKMerStorage(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true));
+ }
+
+ std::string GetMergedKMersFname(unsigned suffix) const {
+ return kmer_prefix_ + ".merged." + std::to_string(suffix);
+ }
+
+ std::string GetFinalKMersFname() const {
+ return kmer_prefix_ + ".final";
+ }
+
+private:
+ std::string work_dir_;
+ KMerSplitter<Seq> &splitter_;
+ int fd_;
+ std::string kmer_prefix_;
+
+ std::string GetUniqueKMersFname(unsigned suffix) const {
+ return kmer_prefix_ + ".unique." + std::to_string(suffix);
+ }
+
+ size_t MergeKMers(const std::string &ifname, const std::string &ofname,
+ unsigned K) {
+ MMappedRecordArrayReader<typename Seq::DataType> ins(ifname, Seq::GetDataSize(K), /* unlink */ true);
+
+ std::string IdxFileName = ifname + ".idx";
+ if (FILE *f = fopen(IdxFileName.c_str(), "rb")) {
+ fclose(f);
+ MMappedRecordReader<size_t> index(ifname + ".idx", true, -1ULL);
+
+ // INFO("Total runs: " << index.size());
+
+ // Prepare runs
+ std::vector<adt::iterator_range<decltype(ins.begin())>> ranges;
+ auto beg = ins.begin();
+ for (size_t sz : index) {
+ auto end = std::next(beg, sz);
+ ranges.push_back(adt::make_range(beg, end));
+ VERIFY(std::is_sorted(beg, end, array_less<typename Seq::DataType>()));
+ beg = end;
+ }
+
+ // Construct tree on top entries of runs
+ adt::loser_tree<decltype(beg),
+ array_less<typename Seq::DataType>> tree(ranges);
+
+ if (tree.empty()) {
+ FILE *g = fopen(ofname.c_str(), "ab");
+ VERIFY_MSG(g, "Cannot open temporary file to write");
+ fclose(g);
+ return 0;
+ }
+
+ // Write it down!
+ KMerVector<Seq> buf(K, 1024*1024);
+ auto pval = tree.pop();
+ size_t total = 0;
+ while (!tree.empty()) {
+ buf.clear();
+ for (size_t cnt = 0; cnt < buf.capacity() && !tree.empty(); ) {
+ auto cval = tree.pop();
+ if (!array_equal_to<typename Seq::DataType>()(pval, cval)) {
+ buf.push_back(pval);
+ pval = cval;
+ cnt += 1;
+ }
+ }
+ total += buf.size();
+
+ FILE *g = fopen(ofname.c_str(), "ab");
+ VERIFY_MSG(g, "Cannot open temporary file to write");
+ fwrite(buf.data(), buf.el_data_size(), buf.size(), g);
+ fclose(g);
+ }
+
+ // Handle very last value
+ {
+ FILE *g = fopen(ofname.c_str(), "ab");
+ VERIFY_MSG(g, "Cannot open temporary file to write");
+ fwrite(pval.data(), pval.data_size(), 1, g);
+ fclose(g);
+ total += 1;
+ }
+
+ return total;
+ } else {
+ // Sort the stuff
+ libcxx::sort(ins.begin(), ins.end(), array_less<typename Seq::DataType>());
+
+ // FIXME: Use something like parallel version of unique_copy but with explicit
+ // resizing.
+ auto it = std::unique(ins.begin(), ins.end(), array_equal_to<typename Seq::DataType>());
+
+ MMappedRecordArrayWriter<typename Seq::DataType> os(ofname, Seq::GetDataSize(K));
+ os.resize(it - ins.begin());
+ std::copy(ins.begin(), it, os.begin());
+
+ return it - ins.begin();
+ }
+ }
+};
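For reference, the deduplicating k-way merge that MergeKMers() above performs over the sorted runs can be sketched in isolation. The snippet below uses a std::priority_queue over plain ints instead of the loser tree over KMerVector records, purely as an illustration.

// Deduplicating k-way merge sketch (illustrative stand-in for MergeKMers above).
#include <functional>
#include <iostream>
#include <queue>
#include <utility>
#include <vector>

std::vector<int> merge_unique(const std::vector<std::vector<int>> &runs) {
    typedef std::pair<int, size_t> Item;                       // (value, run index)
    std::priority_queue<Item, std::vector<Item>, std::greater<Item> > pq;
    std::vector<size_t> pos(runs.size(), 0);
    for (size_t r = 0; r < runs.size(); ++r)
        if (!runs[r].empty()) pq.push(std::make_pair(runs[r][0], r));
    std::vector<int> out;
    while (!pq.empty()) {
        Item top = pq.top(); pq.pop();
        if (out.empty() || out.back() != top.first)            // keep the first copy only
            out.push_back(top.first);
        size_t r = top.second;
        if (++pos[r] < runs[r].size())
            pq.push(std::make_pair(runs[r][pos[r]], r));
    }
    return out;
}

int main() {
    std::vector<std::vector<int>> runs = {{1, 3, 5}, {1, 2, 3}, {5, 7}};
    for (int x : merge_unique(runs)) std::cout << x << " ";    // prints: 1 2 3 5 7
    std::cout << "\n";
}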
+
+template<class Index>
+class KMerIndexBuilder {
+ typedef typename Index::KMerSeq Seq;
+ typedef typename Index::kmer_index_traits kmer_index_traits;
+
+ std::string work_dir_;
+ unsigned num_buckets_;
+ unsigned num_threads_;
+
+ public:
+ KMerIndexBuilder(const std::string &workdir,
+ unsigned num_buckets, unsigned num_threads)
+ : work_dir_(workdir), num_buckets_(num_buckets), num_threads_(num_threads) {}
+ size_t BuildIndex(Index &out, KMerCounter<Seq> &counter,
+ bool save_final = false);
+
+ unsigned num_buckets() const { return num_buckets_; }
+
+ private:
+
+ DECL_LOGGER("K-mer Index Building");
+};
+
+template<class Index>
+size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &counter,
+ bool save_final) {
+ index.clear();
+
+ INFO("Building kmer index ");
+
+ // First, count the unique k-mers
+ size_t kmers = counter.Count(num_buckets_, num_threads_);
+
+ index.num_buckets_ = num_buckets_;
+ index.bucket_starts_.resize(num_buckets_ + 1);
+ index.index_ = new typename KMerIndex<kmer_index_traits>::KMerDataIndex[num_buckets_];
+
+ INFO("Building perfect hash indices");
+
+ // Index building requires roughly 36 bytes of overhead plus the k-mer data itself (up to ~40 bytes per k-mer), so limit the number of threads according to the memory limit.
+ unsigned num_threads = num_threads_;
+# ifdef SPADES_USE_JEMALLOC
+ const size_t *cmem = 0;
+ size_t clen = sizeof(cmem);
+
+ je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
+ size_t bucket_size = (36 * kmers + kmers * counter.kmer_size()) / num_buckets_;
+ num_threads = std::min<unsigned>((unsigned) ((get_memory_limit() - *cmem) / bucket_size), num_threads);
+ if (num_threads < 1)
+ num_threads = 1;
+ if (num_threads < num_threads_)
+ WARN("Number of threads was limited down to " << num_threads << " in order to fit the memory limits during the index construction");
+# endif
+
+# pragma omp parallel for shared(index) num_threads(num_threads)
+ for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) {
+ typename KMerIndex<kmer_index_traits>::KMerDataIndex &data_index = index.index_[iFile];
+ auto bucket = counter.GetBucket(iFile, !save_final);
+ size_t sz = bucket->end() - bucket->begin();
+ index.bucket_starts_[iFile + 1] = sz;
+ typename kmer_index_traits::KMerRawReferenceAdaptor adaptor;
+ size_t max_nodes = (size_t(std::ceil(double(sz) * 1.23)) + 2) / 3 * 3;
+ if (max_nodes >= uint64_t(1) << 32) {
+ emphf::hypergraph_sorter_seq<emphf::hypergraph<uint64_t> > sorter;
+ typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
+ sz, emphf::range(bucket->begin(), bucket->end()),
+ adaptor).swap(data_index);
+ } else {
+ emphf::hypergraph_sorter_seq<emphf::hypergraph<uint32_t> > sorter;
+ typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
+ sz, emphf::range(bucket->begin(), bucket->end()),
+ adaptor).swap(data_index);
+ }
+ }
+
+ // Finally, record the sizes of buckets.
+ for (unsigned iFile = 1; iFile < num_buckets_; ++iFile)
+ index.bucket_starts_[iFile] += index.bucket_starts_[iFile - 1];
+
+ if (save_final)
+ counter.MergeBuckets(num_buckets_);
+
+ double bits_per_kmer = 8.0 * (double)index.mem_size() / (double)kmers;
+ INFO("Index built. Total " << index.mem_size() << " bytes occupied (" << bits_per_kmer << " bits per kmer).");
+ index.count_size();
+ return kmers;
+}
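The jemalloc branch in BuildIndex() above caps the number of worker threads based on the available memory. A minimal standalone sketch of that arithmetic follows; only the 36-byte overhead constant and the bucket_size formula come from the code here, while the function name and the free_memory argument are illustrative stand-ins for get_memory_limit() minus jemalloc's stats.cactive.

// Standalone sketch of the memory-based thread capping in BuildIndex() above.
#include <algorithm>
#include <cstddef>
#include <iostream>

unsigned CapThreads(size_t kmers, size_t kmer_bytes, unsigned num_buckets,
                    unsigned requested_threads, size_t free_memory) {
    // Each bucket needs ~36 bytes of PHF-construction overhead plus the k-mer
    // data itself for every k-mer it holds (assumes kmers, num_buckets > 0).
    size_t bucket_size = (36 * kmers + kmers * kmer_bytes) / num_buckets;
    unsigned threads = std::min<unsigned>((unsigned)(free_memory / bucket_size),
                                          requested_threads);
    return std::max(threads, 1u);   // never drop below one thread
}

int main() {
    // e.g. 10^9 k-mers of 8 bytes each, 512 buckets, 16 requested threads, 16 GiB free
    std::cout << CapThreads(1000000000ULL, 8, 512, 16, size_t(16) << 30) << "\n";
}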
diff --git a/src/common/utils/mph_index/kmer_index_traits.hpp b/src/common/utils/mph_index/kmer_index_traits.hpp
new file mode 100644
index 0000000..4656720
--- /dev/null
+++ b/src/common/utils/mph_index/kmer_index_traits.hpp
@@ -0,0 +1,92 @@
+#pragma once
+//***************************************************************************
+//* Copyright (c) 2016 Saint Petersburg State University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/kmers/mmapped_reader.hpp"
+#include "mphf.hpp"
+
+template<class Seq>
+struct kmer_index_traits {
+ typedef Seq SeqType;
+ typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
+ typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
+ typedef typename RawKMerStorage::iterator raw_data_iterator;
+ typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
+ typedef typename RawKMerStorage::iterator::value_type KMerRawData;
+ typedef typename RawKMerStorage::iterator::reference KMerRawReference;
+ typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
+
+ struct raw_equal_to {
+ bool operator()(const Seq &lhs, const KMerRawReference rhs) {
+ return (array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
+ }
+ };
+
+ struct raw_create {
+ Seq operator()(unsigned K, const KMerRawReference kmer) {
+ return Seq(K, kmer.data());
+ }
+ Seq operator()(unsigned K, const KMerRawConstReference kmer) {
+ return Seq(K, kmer.data());
+ }
+ };
+
+ struct hash_function {
+ uint64_t operator()(const Seq &k) const{
+ return typename Seq::hash()(k);
+ }
+ uint64_t operator()(const KMerRawReference k) const {
+ return typename Seq::hash()(k.data(), k.size());
+ }
+ };
+
+ struct KMerRawReferenceAdaptor {
+ emphf::byte_range_t operator()(const KMerRawReference k) const {
+ const uint8_t * data = (const uint8_t*)k.data();
+ return std::make_pair(data, data + k.data_size());
+ }
+ };
+
+ struct KMerSeqAdaptor {
+ emphf::byte_range_t operator()(const Seq &k) const {
+ const uint8_t * data = (const uint8_t*)k.data();
+ return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType));
+ }
+ };
+
+ template<class Writer>
+ static void raw_serialize(Writer &writer, RawKMerStorage *data) {
+ size_t sz = data->data_size(), elcnt = data->elcnt();
+ unsigned PageSize = getpagesize();
+ writer.write((char*)&sz, sizeof(sz));
+ writer.write((char*)&elcnt, sizeof(elcnt));
+ // Make sure data is aligned to the page boundary
+ size_t cpos = writer.tellp();
+ size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
+ size_t off = pos - writer.tellp();
+ writer.write((char*)&off, sizeof(off));
+ writer.seekp(pos);
+ writer.write((char*)data->data(), data->data_size());
+ }
+
+ template<class Writer>
+ static void raw_serialize(Writer &writer, const std::unique_ptr<RawKMerStorage> &data) {
+ raw_serialize(writer, data.get());
+ }
+
+ template<class Reader>
+ static std::unique_ptr<RawKMerStorage> raw_deserialize(Reader &reader, const std::string &FileName) {
+ size_t sz, off, elcnt;
+ reader.read((char*)&sz, sizeof(sz));
+ reader.read((char*)&elcnt, sizeof(elcnt));
+ reader.read((char*)&off, sizeof(off));
+ off -= sizeof(off);
+ off += reader.tellg();
+
+ return std::unique_ptr<RawKMerStorage>(new RawKMerStorage(FileName, elcnt, false, off, sz));
+ }
+
+};
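The page-alignment arithmetic used by raw_serialize() above can be checked in isolation: after the sz/elcnt header, the payload is pushed to the next page boundary that still leaves room for the size_t offset field, presumably so it can later be memory-mapped at a page-aligned file offset. In the sketch below the cpos value is made up; only the rounding formula is taken from the code.

// Standalone sketch of the raw_serialize()/raw_deserialize() offset arithmetic.
#include <cstddef>
#include <iostream>
#include <unistd.h>

int main() {
    size_t page = (size_t)getpagesize();
    size_t cpos = 4096 + 16 + 100;   // hypothetical stream position after the header
    // Same formula as raw_serialize(): round up so that pos is page-aligned and
    // pos >= cpos + sizeof(size_t) (the offset field written next).
    size_t pos = (cpos + page - 1 + sizeof(size_t)) / page * page;
    size_t off = pos - cpos;         // written to the file; raw_deserialize() adds it back
    std::cout << "payload at " << pos << ", offset field value " << off << "\n";
}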
diff --git a/src/common/utils/mph_index/mphf.hpp b/src/common/utils/mph_index/mphf.hpp
new file mode 100644
index 0000000..3327fef
--- /dev/null
+++ b/src/common/utils/mph_index/mphf.hpp
@@ -0,0 +1,136 @@
+#pragma once
+
+#include <random>
+
+#include "bitpair_vector.hpp"
+#include "ranked_bitpair_vector.hpp"
+
+#include "utils/logger/logger.hpp"
+
+namespace emphf {
+
+ template <typename BaseHasher>
+ class mphf {
+ public:
+ mphf()
+ {}
+
+ template <typename HypergraphSorter, typename Range, typename Adaptor>
+ mphf(HypergraphSorter& sorter, size_t n,
+ Range const& input_range, Adaptor adaptor,
+ double gamma = 1.23)
+ : m_n(n)
+ , m_hash_domain(std::max((size_t(std::ceil(double(m_n) * gamma)) + 2) / 3, size_t(2)))
+ {
+ typedef typename HypergraphSorter::node_t node_t;
+ typedef typename HypergraphSorter::hyperedge hyperedge;
+ typedef decltype(*std::begin(input_range)) value_type;
+
+ size_t nodes_domain = m_hash_domain * 3;
+
+ if (nodes_domain >= std::numeric_limits<node_t>::max()) {
+ throw std::invalid_argument("Too many nodes for node_t");
+ }
+
+ auto edge_gen = [&](value_type s) {
+ using std::get;
+ auto hashes = m_hasher(adaptor(s));
+ return hyperedge((node_t)(get<0>(hashes) % m_hash_domain),
+ (node_t)(m_hash_domain +
+ (get<1>(hashes) % m_hash_domain)),
+ (node_t)(2 * m_hash_domain +
+ (get<2>(hashes) % m_hash_domain)));
+ };
+
+ std::mt19937_64 rng(37); // deterministic seed
+
+ for (size_t trial = 0; ; ++trial) {
+ //logger() << "Hypergraph generation: trial " << trial << std::endl;
+
+ m_hasher = BaseHasher::generate(rng);
+ if (sorter.try_generate_and_sort(input_range, edge_gen,
+ m_n, m_hash_domain)) break;
+ }
+
+ auto peeling_order = sorter.get_peeling_order();
+ bitpair_vector bv(nodes_domain);
+
+ //logger() << "Assigning values" << std::endl;
+
+ for (auto edge = peeling_order.first;
+ edge != peeling_order.second;
+ ++edge) {
+
+ uint64_t target = orientation(*edge);
+ uint64_t assigned = bv[edge->v1] + bv[edge->v2];
+
+ // "assigned values" must be nonzeros to be ranked, so
+ // if the result is 0 we assign 3
+ bv.set(edge->v0, ((target - assigned + 9) % 3) ?: 3);
+ }
+
+ m_bv.build(std::move(bv));
+ }
+
+ uint64_t size() const
+ {
+ return m_n;
+ }
+
+ size_t mem_size() const {
+ return m_bv.mem_size();
+ }
+
+ BaseHasher const& base_hasher() const
+ {
+ return m_hasher;
+ }
+
+ template <typename T, typename Adaptor>
+ uint64_t lookup(const T &val, Adaptor adaptor)
+ {
+ using std::get;
+ auto hashes = m_hasher(adaptor(val));
+ uint64_t nodes[3] = {get<0>(hashes) % m_hash_domain,
+ m_hash_domain + (get<1>(hashes) % m_hash_domain),
+ 2 * m_hash_domain + (get<2>(hashes) % m_hash_domain)};
+
+ uint64_t hidx = (m_bv[nodes[0]] + m_bv[nodes[1]] + m_bv[nodes[2]]) % 3;
+ return m_bv.rank(nodes[hidx]);
+ }
+
+ void swap(mphf& other)
+ {
+ std::swap(m_n, other.m_n);
+ std::swap(m_hash_domain, other.m_hash_domain);
+ m_hasher.swap(other.m_hasher);
+ m_bv.swap(other.m_bv);
+ }
+
+ void save(std::ostream& os) const
+ {
+ os.write(reinterpret_cast<char const*>(&m_n), sizeof(m_n));
+ os.write(reinterpret_cast<char const*>(&m_hash_domain),
+ sizeof(m_hash_domain));
+ m_hasher.save(os);
+ m_bv.save(os);
+ }
+
+ void load(std::istream& is)
+ {
+ is.read(reinterpret_cast<char*>(&m_n), sizeof(m_n));
+ is.read(reinterpret_cast<char*>(&m_hash_domain),
+ sizeof(m_hash_domain));
+ m_hasher.load(is);
+ m_bv.load(is);
+ }
+
+
+ private:
+
+ uint64_t m_n;
+ uint64_t m_hash_domain;
+ BaseHasher m_hasher;
+ ranked_bitpair_vector m_bv;
+ };
+}
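The constructor above picks a nonzero 2-bit value for the peeled node of every hyperedge so that the sum of the three node values, taken modulo 3, equals the peeled node's position within the edge; lookup() simply recomputes that sum. A small numeric sketch of the congruence, with made-up values, follows.

// Numeric sketch of the assignment/lookup congruence in emphf::mphf above.
#include <cstdint>
#include <iostream>

int main() {
    uint64_t target = 1;            // peeled node sits at position 1 of its hyperedge
    uint64_t v1 = 1, v2 = 3;        // values already assigned to the other two nodes
    uint64_t assigned = v1 + v2;
    // Same formula as in the constructor; "+ 9" keeps the unsigned expression
    // equal to its true non-negative value before taking the remainder.
    uint64_t r = (target - assigned + 9) % 3;
    uint64_t v0 = r ? r : 3;        // 3 is still 0 modulo 3, but nonzero for ranking
    std::cout << "stored value " << v0 << ", lookup selects node "
              << (v0 + v1 + v2) % 3 << "\n";   // prints node 1, i.e. the target
}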
diff --git a/src/modules/data_structures/mph_index/ranked_bitpair_vector.hpp b/src/common/utils/mph_index/ranked_bitpair_vector.hpp
similarity index 100%
rename from src/modules/data_structures/mph_index/ranked_bitpair_vector.hpp
rename to src/common/utils/mph_index/ranked_bitpair_vector.hpp
diff --git a/src/modules/dev_support/openmp_wrapper.h b/src/common/utils/openmp_wrapper.h
similarity index 100%
rename from src/modules/dev_support/openmp_wrapper.h
rename to src/common/utils/openmp_wrapper.h
diff --git a/src/modules/dev_support/parallel_wrapper.hpp b/src/common/utils/parallel_wrapper.hpp
similarity index 100%
rename from src/modules/dev_support/parallel_wrapper.hpp
rename to src/common/utils/parallel_wrapper.hpp
diff --git a/src/common/utils/path_helper.cpp b/src/common/utils/path_helper.cpp
new file mode 100644
index 0000000..4225f7e
--- /dev/null
+++ b/src/common/utils/path_helper.cpp
@@ -0,0 +1,249 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/path_helper.hpp"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <unistd.h>
+
+#include <boost/tokenizer.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include <string>
+#include <vector>
+
+namespace path {
+
+bool make_dir(std::string const& folder) {
+ return mkdir(folder.c_str(), 0755) == 0;
+}
+
+std::string make_temp_dir(std::string const& prefix,
+ std::string const& suffix) {
+ std::string name = append_path(prefix, suffix + "_XXXXXX");
+ char* actual;
+ if ((actual = ::mkdtemp(strcpy(new char[name.length() + 1], name.c_str())))
+ == NULL)
+ throw std::runtime_error("Cannot create temporary dir " + name);
+
+ std::string result(actual);
+ if (result == name)
+ throw std::runtime_error("Cannot create temporary dir " + name);
+
+ delete[] actual;
+
+ return result;
+}
+
+void remove_dir(std::string const& folder) {
+ DIR *dp;
+ if ((dp = opendir(folder.c_str())) == NULL)
+ throw std::runtime_error("can not open folder " + folder);
+
+ struct dirent *dirp;
+ while ((dirp = readdir(dp)) != NULL) {
+ std::string full_path = folder + "/" + dirp->d_name;
+
+ if (dirp->d_type == DT_DIR) {
+ if (std::string(".") != dirp->d_name
+ && std::string("..") != dirp->d_name) {
+ remove_dir(full_path);
+ }
+ } else
+ remove(full_path.c_str());
+ }
+
+ closedir(dp);
+ remove(folder.c_str());
+}
+
+bool is_regular_file(std::string const& path) {
+ struct stat st;
+ return (stat(path.c_str(), &st) == 0) && (S_ISREG(st.st_mode));
+}
+
+std::string append_path(std::string const& prefix, std::string const& suffix) {
+ std::string delimiter = "";
+
+ if (!boost::ends_with(prefix, "/") && !boost::starts_with(suffix, "/")
+ && !prefix.empty()) {
+ delimiter = "/";
+ }
+
+ return prefix + delimiter + suffix;
+}
+
+std::string current_dir() {
+ char* cwd = getcwd(NULL, 0);
+ std::string result = cwd;
+
+ free(cwd);
+ return result;
+}
+
+void make_full_path(std::string& path) {
+ if (!boost::starts_with(path, "/")) // relative path
+ path = append_path(current_dir(), path);
+}
+
+std::string filename(std::string const& path) {
+ size_t pos = path.find_last_of('/');
+ return pos != std::string::npos ? path.substr(pos + 1) : path;
+}
+
+std::string basename(std::string const& path) {
+ size_t slash = path.find_last_of('/');
+ size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
+
+ size_t dot = path.find_last_of('.');
+ if (dot < after_slash)
+ dot = std::string::npos;
+
+ return path.substr(after_slash, dot - after_slash);
+}
+
+std::string extension(std::string const& path) {
+ size_t slash = path.find_last_of('/');
+ size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
+ size_t dot = path.find_last_of('.');
+
+ if (dot < after_slash || dot == std::string::npos || dot + 1 == path.size())
+ return std::string();
+
+ return path.substr(dot);
+}
+
+std::string parent_path(std::string const& path) {
+ std::string cpath(path);
+
+ make_full_path(cpath);
+ size_t slash_pos = cpath.find_last_of('/');
+
+ return (slash_pos == 0 ? std::string("/") : cpath.substr(0, slash_pos));
+}
+
+bool check_existence(std::string const& path) {
+ struct stat st_buf;
+ return stat(path.c_str(), &st_buf) == 0
+ && (S_ISREG(st_buf.st_mode) || S_ISDIR(st_buf.st_mode)); // exists and (file or dir)
+}
+
+void remove_if_exists(std::string const& path) {
+ if (check_existence(path)) {
+ if (is_regular_file(path)) // file
+ remove(path.c_str());
+ else // dir
+ remove_dir(path);
+ }
+}
+
+//TODO do we need to screen anything but whitespace?
+std::string screen_whitespaces(std::string const &path) {
+ std::string res = "";
+ for (size_t i = 0; i < path.size(); i++) {
+ if ((i == 0) || (path[i] != ' ') || (path[i - 1] == '\\')) {
+ res += path[i];
+ } else {
+ // escape an unescaped space with a backslash
+ res += '\\';
+ res += ' ';
+ }
+ }
+ return res;
+}
+
+//todo reduce code duplication!!!
+bool FileExists(std::string const &filename) {
+ struct stat st_buf;
+ return stat(filename.c_str(), &st_buf) == 0 && S_ISREG(st_buf.st_mode);
+}
+
+void CheckFileExistenceFATAL(std::string const &filename) {
+ if (!FileExists(filename)) FATAL_ERROR("File " << filename << " doesn't exist or can't be read!");
+}
+
+void make_dirs(std::string const &path) {
+ VERIFY(!path.empty());
+
+ size_t slash_pos = 0;
+ while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) {
+ make_dir(path.substr(0, slash_pos));
+ }
+ if (path[path.size() - 1] != '/') {
+ make_dir(path);
+ }
+}
+
+// doesn't support symlinks
+std::string resolve(std::string const& path) {
+ typedef boost::char_delimiters_separator<char> separator_t;
+ typedef boost::tokenizer<separator_t> tokenizer_t;
+
+ tokenizer_t tok(path, separator_t(false, "", "/"));
+
+ std::string result = "/";
+ for (auto it = tok.begin(); it != tok.end(); ++it) {
+ if (*it == "..")
+ result = parent_path(result);
+
+ else if (*it == ".")
+ ; // Ignore
+
+ else
+ // Just cat other path entries
+ result = append_path(result, *it);
+ }
+
+ return result;
+}
+
+std::string make_relative_path(std::string p, std::string base) {
+ p = resolve(p);
+ base = resolve(base);
+
+ std::string pp = parent_path(p);
+
+ typedef boost::char_delimiters_separator<char> separator_t;
+ typedef boost::tokenizer<separator_t> tokenizer_t;
+
+ tokenizer_t pp_tok(pp, separator_t(false, "", "/"));
+ tokenizer_t base_tok(base, separator_t(false, "", "/"));
+
+ auto i = pp_tok.begin();
+ auto j = base_tok.begin();
+
+ while (i != pp_tok.end() && j != base_tok.end() && *i == *j) {
+ ++i;
+ ++j;
+ }
+
+ std::string result;
+ for (; j != base_tok.end(); ++j)
+ result = append_path("..", result);
+
+ for (; i != pp_tok.end(); ++i)
+ result = append_path(result, *i);
+
+ return append_path(result, filename(p));
+}
+
+std::string MakeLaunchTimeDirName() {
+ time_t rawtime;
+ struct tm * timeinfo;
+ char buffer[80];
+
+ time(&rawtime);
+ timeinfo = localtime(&rawtime);
+
+ strftime(buffer, 80, "%m.%d_%H.%M.%S", timeinfo);
+ return std::string(buffer);
+}
+
+}
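make_dirs() above behaves like 'mkdir -p': every '/'-separated prefix of the path is created in turn with mode 0755. A self-contained sketch of the same loop, using only POSIX mkdir() and a hypothetical path, is below; errors are deliberately ignored.

// Standalone sketch of the make_dirs() logic above.
#include <string>
#include <sys/stat.h>
#include <sys/types.h>

void make_dirs_sketch(const std::string &path) {
    size_t slash_pos = 0;
    while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos)
        mkdir(path.substr(0, slash_pos).c_str(), 0755);   // create intermediate dirs
    if (!path.empty() && path[path.size() - 1] != '/')
        mkdir(path.c_str(), 0755);                        // create the final component
}

int main() {
    make_dirs_sketch("tmp/spades_sketch/output");         // hypothetical directory tree
}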
diff --git a/src/common/utils/path_helper.hpp b/src/common/utils/path_helper.hpp
new file mode 100644
index 0000000..73b2ab5
--- /dev/null
+++ b/src/common/utils/path_helper.hpp
@@ -0,0 +1,74 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+#include "utils/logger/logger.hpp"
+#include "utils/verify.hpp"
+
+namespace path {
+//todo review and make names consistent!
+
+typedef std::vector<std::string> files_t;
+
+bool make_dir(std::string const &folder);
+
+std::string make_temp_dir(std::string const &prefix, std::string const &suffix);
+
+void remove_dir(std::string const &folder);
+
+bool is_regular_file(std::string const &path);
+
+std::string append_path(std::string const &prefix, std::string const &suffix);
+
+std::string current_dir();
+
+//todo why non-const argument?!
+void make_full_path(std::string &path);
+
+std::string filename(std::string const &path);
+
+std::string basename(std::string const &path);
+
+std::string extension(std::string const &path);
+
+std::string parent_path(std::string const &path);
+
+bool check_existence(std::string const &path);
+
+void remove_if_exists(std::string const &path);
+
+std::string screen_whitespaces(std::string const &path);
+
+/**
+* Checks if file exists.
+* Analogs: http://www.techbytes.ca/techbyte103.html , http://www.gamedev.net/topic/211918-determining-if-a-file-exists-c/
+*/
+bool FileExists(std::string const &filename);
+
+/**
+* Exits with code 1 and writes a FATAL log message if the file doesn't exist.
+*/
+void CheckFileExistenceFATAL(std::string const &filename);
+
+void make_dirs(std::string const &path);
+
+// doesn't support symlinks
+std::string resolve(std::string const &path);
+
+std::string make_relative_path(std::string p, std::string base = current_dir());
+
+std::string MakeLaunchTimeDirName();
+
+}
diff --git a/src/modules/dev_support/perfcounter.hpp b/src/common/utils/perfcounter.hpp
similarity index 100%
rename from src/modules/dev_support/perfcounter.hpp
rename to src/common/utils/perfcounter.hpp
diff --git a/src/common/utils/range.hpp b/src/common/utils/range.hpp
new file mode 100644
index 0000000..2e05bed
--- /dev/null
+++ b/src/common/utils/range.hpp
@@ -0,0 +1,92 @@
+#pragma once
+
+#include "utils/verify.hpp"
+
+namespace omnigraph {
+
+struct Range {
+private:
+ bool inside(size_t left, size_t right, size_t point) const {
+ return left <= point && point <= right;
+ }
+
+public:
+ //inclusive
+ size_t start_pos;
+ //exclusive
+ size_t end_pos;
+
+ size_t size() const {
+ VERIFY(end_pos >= start_pos);
+ return end_pos - start_pos;
+ }
+
+ void shift(int shift) {
+ VERIFY(shift > 0 || size_t(-shift) <= start_pos);
+ start_pos += shift;
+ end_pos += shift;
+ }
+
+ Range(): start_pos(0), end_pos(0) {
+ VERIFY(end_pos >= start_pos);
+ }
+
+ Range(size_t start_pos, size_t end_pos)
+ : start_pos(start_pos),
+ end_pos(end_pos) {
+ VERIFY(end_pos >= start_pos);
+ }
+
+ bool operator<(const Range &other) const {
+ if (start_pos != other.start_pos)
+ return start_pos < other.start_pos;
+ return end_pos < other.end_pos;
+ }
+
+ bool contains(const Range& that) const {
+ return start_pos <= that.start_pos && end_pos >= that.end_pos;
+ }
+
+ Range Merge(const Range &other) const {
+ return Range(this->start_pos, other.end_pos);
+ }
+
+ Range Invert(size_t base_length) const {
+ VERIFY(base_length >= end_pos);
+ return Range(base_length - end_pos, base_length - start_pos);
+ }
+
+ Range& operator=(const Range& other) {
+ start_pos = other.start_pos;
+ end_pos = other.end_pos;
+ return *this;
+ }
+
+ bool empty() const {
+ return start_pos == end_pos;
+ }
+
+ bool Intersect(const Range &other) const {
+ return inside(start_pos, end_pos, other.start_pos) || inside(start_pos, end_pos, other.end_pos) ||
+ inside(other.start_pos, other.end_pos, start_pos);
+ }
+
+ bool IntersectLeftOf(const Range &other) const {
+ return inside(start_pos, end_pos, other.start_pos) && inside(other.start_pos, other.end_pos, end_pos);
+ }
+
+ bool operator==(const Range &that) const {
+ return start_pos == that.start_pos && end_pos == that.end_pos;
+ }
+
+ bool operator!=(const Range &that) const {
+ return !(*this == that);
+ }
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Range& range) {
+ os << "[" << (range.start_pos + 1) << " - " << range.end_pos << "]";
+ return os;
+}
+
+}
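Range::Intersect() above tests whether one range contains an endpoint of the other. Note that although end_pos is documented as exclusive, the test is inclusive at both ends, so ranges that merely touch are reported as intersecting. A standalone illustration with plain integers:

// Standalone illustration of the overlap test behind Range::Intersect() above.
#include <cstddef>
#include <iostream>

static bool inside(size_t l, size_t r, size_t p) { return l <= p && p <= r; }

static bool intersect(size_t s1, size_t e1, size_t s2, size_t e2) {
    return inside(s1, e1, s2) || inside(s1, e1, e2) || inside(s2, e2, s1);
}

int main() {
    std::cout << intersect(0, 10, 5, 20) << " "    // 1: overlapping
              << intersect(0, 10, 10, 20) << " "   // 1: touching at 10
              << intersect(0, 10, 11, 20) << "\n"; // 0: disjoint
}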
diff --git a/src/common/utils/segfault_handler.hpp b/src/common/utils/segfault_handler.hpp
new file mode 100644
index 0000000..2512ba5
--- /dev/null
+++ b/src/common/utils/segfault_handler.hpp
@@ -0,0 +1,58 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#pragma once
+
+#include "utils/stacktrace.hpp"
+#include "boost/noncopyable.hpp"
+
+#include <signal.h>
+
+struct segfault_handler : boost::noncopyable {
+ typedef std::function<void()> callback_t;
+
+ typedef void (*seg_handler_t)(int);
+
+ segfault_handler(callback_t const &cb = 0) {
+ if (callback() != 0)
+ throw std::runtime_error("failed to initialize segfault_handler, it has been already initialized");
+
+ callback() = cb;
+ old_func_ = signal(SIGSEGV, &segfault_handler::handler);
+ }
+
+ ~segfault_handler() {
+ callback() = 0;
+ signal(SIGSEGV, old_func_);
+ }
+
+private:
+ static callback_t &callback() {
+ static callback_t cb = 0;
+ return cb;
+ }
+
+ static void handler(int signum) {
+ if (signum == SIGSEGV) {
+ std::cerr << "The program was terminated by segmentation fault" << std::endl;
+ print_stacktrace();
+
+ if (callback())
+ callback()();
+ }
+
+ //TEST!! NOTE: while this exit(1) is in place, the re-raise below is unreachable
+ exit(1);
+
+ signal(signum, SIG_DFL);
+ kill(getpid(), signum);
+ }
+
+private:
+ seg_handler_t old_func_;
+};
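The handler above prints a message, runs an optional callback, and (behind the TEST exit) re-raises the signal with the previous disposition. A self-contained sketch of that pattern without the SPAdes stacktrace/boost dependencies follows; like the original, it is not async-signal-safe.

// Sketch of the install-callback-and-re-raise pattern used by segfault_handler above.
#include <csignal>
#include <cstdio>
#include <functional>

static std::function<void()> g_cleanup;
static void (*g_old_handler)(int) = 0;

static void on_segv(int signum) {
    std::fprintf(stderr, "terminated by a segmentation fault\n");
    if (g_cleanup) g_cleanup();                    // e.g. remove temporary files
    std::signal(signum, g_old_handler ? g_old_handler : SIG_DFL);
    std::raise(signum);                            // re-raise with the old disposition
}

int main() {
    g_cleanup = [] { std::fprintf(stderr, "cleanup callback ran\n"); };
    g_old_handler = std::signal(SIGSEGV, on_segv);
    // ... program body ...
    return 0;
}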
diff --git a/src/common/utils/simple_tools.hpp b/src/common/utils/simple_tools.hpp
new file mode 100644
index 0000000..c47f70f
--- /dev/null
+++ b/src/common/utils/simple_tools.hpp
@@ -0,0 +1,189 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * simple_tools.hpp
+ *
+ * Created on: 27.05.2011
+ * Author: vyahhi
+ */
+
+#ifndef SIMPLE_TOOLS_HPP_
+#define SIMPLE_TOOLS_HPP_
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "utils/verify.hpp"
+#include "io/reads/ireader.hpp"
+#include "utils/path_helper.hpp"
+#include <memory>
+#include <string>
+#include <set>
+#include <vector>
+
+/**
+ * Converts anything to string (using ostringstream).
+ */
+template <typename T>
+std::string ToString(const T& t) {
+ std::ostringstream ss;
+ ss << t;
+ return ss.str();
+}
+
+template <typename T>
+std::string ToString(const T& t, size_t length) {
+ std::ostringstream ss;
+ ss << t;
+ std::string result = ss.str();
+ while(result.size() < length)
+ result = "0" + result;
+ return result;
+}
+
+template <typename T>
+std::string ToString(std::vector<T>& t) {
+ std::ostringstream ss;
+ ss << "Size "<<t.size()<<": [";
+ for (auto it = t.begin(); it != t.end(); ++it)
+ ss<<*it<<", ";
+ ss<<"]";
+ return ss.str();
+}
+
+template <typename T>
+std::string ToString(std::set<T>& t) {
+ std::ostringstream ss;
+ ss << "Size "<<t.size()<<": [";
+ for (auto it = t.begin(); it != t.end(); ++it)
+ ss<<*it<<", ";
+ ss<<"]";
+ return ss.str();
+}
+
+template<typename T>
+inline const std::pair<T, T> ReversePair(std::pair<T, T> ep) {
+ return std::pair<T, T>(ep.second, ep.first);
+}
+
+template <class ContainerT1, class ContainerT2>
+void push_back_all(ContainerT1& target, const ContainerT2& to_insert) {
+ target.insert(target.end(), to_insert.begin(), to_insert.end());
+}
+
+template <class ContainerT1, class ContainerT2>
+void insert_all(ContainerT1& target, const ContainerT2& to_insert) {
+ target.insert(to_insert.begin(), to_insert.end());
+}
+
+template<class MapT>
+std::set<typename MapT::key_type> key_set(const MapT& m) {
+ std::set<typename MapT::key_type> answer;
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ answer.insert(it->first);
+ }
+ return answer;
+}
+
+template<class MapT>
+std::set<typename MapT::mapped_type> value_set(const MapT& m) {
+ std::set<typename MapT::mapped_type> answer;
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ answer.insert(it->second);
+ }
+ return answer;
+}
+
+template <class MapT>
+const typename MapT::mapped_type& get(const MapT& from, const typename MapT::key_type& key) {
+ auto it = from.find(key);
+ VERIFY(it != from.end());
+ return it->second;
+}
+
+template <class MapT>
+typename MapT::mapped_type& get(MapT& from, const typename MapT::key_type& key) {
+ auto it = from.find(key);
+ VERIFY(it != from.end());
+ return it->second;
+}
+
+template <class MMapT>
+const std::vector<typename MMapT::mapped_type> get_all(const MMapT& from, const typename MMapT::key_type& key) {
+ std::vector<typename MMapT::mapped_type> answer;
+ for (auto it = from.lower_bound(key); it != from.upper_bound(key); ++it) {
+ answer.push_back(it->second);
+ }
+ return answer;
+}
+
+class TmpFolderFixture
+{
+ std::string tmp_folder_;
+
+public:
+ TmpFolderFixture(std::string tmp_folder = "tmp") :
+ tmp_folder_(tmp_folder)
+ {
+ path::make_dirs(tmp_folder_);
+ }
+
+ ~TmpFolderFixture()
+ {
+ path::remove_dir(tmp_folder_);
+ }
+};
+
+namespace std
+{
+template<class T1, class T2>
+std::ostream& operator<< (std::ostream& os, std::pair<T1, T2> const& pair)
+{
+ return os << "(" << pair.first << ", " << pair.second << ")";
+}
+//}
+
+//namespace omnigraph
+//{
+template<class T>
+std::ostream& operator<< (std::ostream& os, const std::vector<T>& v)
+{
+ os << "[";
+ std::string delim = "";
+ for (auto it = v.begin(); it != v.end(); ++it) {
+ os << delim << *it;
+ delim = ", ";
+ }
+// std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, ", "));
+ os << "]";
+ return os;
+}
+
+template<class T>
+std::ostream& operator<< (std::ostream& os, const std::set<T>& set)
+{
+ os << "{";
+ bool delim = false;
+ for (const auto& i : set) {
+ if (delim) os << ", ";
+ os << i;
+ delim = true;
+ }
+ os << "}";
+ return os;
+}
+
+}
+
+template<typename Base, typename T>
+inline bool instanceof(const T *ptr) {
+ return dynamic_cast<const Base *>(ptr) != nullptr;
+}
+
+#endif /* SIMPLE_TOOLS_HPP_ */
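The padded ToString(t, length) overload above left-pads short values with '0'. A quick standalone check, with the helper re-declared under a different name so the snippet compiles on its own:

// Standalone check of the zero-padding behaviour of ToString(t, length) above.
#include <iostream>
#include <sstream>
#include <string>

template <typename T>
std::string ToStringPadded(const T &t, size_t length) {
    std::ostringstream ss;
    ss << t;
    std::string result = ss.str();
    while (result.size() < length)
        result = "0" + result;
    return result;
}

int main() {
    std::cout << ToStringPadded(42, 5) << "\n";   // prints 00042
}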
diff --git a/src/modules/dev_support/stacktrace.hpp b/src/common/utils/stacktrace.hpp
similarity index 100%
rename from src/modules/dev_support/stacktrace.hpp
rename to src/common/utils/stacktrace.hpp
diff --git a/src/common/utils/standard_base.hpp b/src/common/utils/standard_base.hpp
new file mode 100644
index 0000000..fac6fcf
--- /dev/null
+++ b/src/common/utils/standard_base.hpp
@@ -0,0 +1,140 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+/*
+ * standart.hpp
+ *
+ * Created on: 1 Sep 2011
+ * Author: valery
+ */
+
+#pragma once
+
+//==crt and stl
+#include <memory>
+#include <cstdlib>
+#include <cstdio>
+#include <time.h>
+#include <signal.h>
+#include <execinfo.h>
+
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <set>
+#include <string>
+#include <sstream>
+#include <utility>
+#include <array>
+#include <unordered_map>
+#include <unordered_set>
+#include <deque>
+#include <cmath>
+#include <limits>
+
+using std::cin;
+using std::cout;
+using std::cerr;
+using std::endl;
+using std::map;
+using std::multimap;
+using std::unordered_map;
+using std::unordered_set;
+using std::vector;
+using std::array;
+using std::set;
+using std::string;
+using std::pair;
+using std::make_pair;
+using std::ifstream;
+using std::istream;
+using std::ofstream;
+using std::ostream;
+using std::min;
+using std::max;
+using std::abs;
+using std::stringstream;
+using std::numeric_limits;
+using std::ostream_iterator;
+using std::copy;
+
+using std::shared_ptr;
+using std::make_shared;
+
+//==boost
+
+#ifndef NDEBUG
+#define BOOST_ENABLE_ASSERT_HANDLER
+#endif
+
+#include <boost/optional.hpp>
+
+#include <boost/noncopyable.hpp>
+
+using boost::optional;
+using boost::make_optional;
+using boost::none;
+
+using boost::noncopyable;
+
+// err handling
+#include "utils/stacktrace.hpp"
+
+// path manipulation instead of boost filesystem
+#include "utils/path_helper.hpp"
+using path::make_dir;
+using path::remove_dir;
+
+#ifndef NDEBUG
+namespace boost {
+inline void assertion_failed(char const * expr, char const * function,
+ char const * file, long line) {
+ std::cerr << "Aborted by assert: " << std::endl;
+ print_stacktrace();
+#if __DARWIN_UNIX03
+ __assert_rtn (expr, file, (int)line, function);
+#elif __DARWIN
+ __assert (expr, file, (int)line, function);
+#else
+ __assert_fail (expr, file, (unsigned)line, function);
+#endif
+}
+
+inline void assertion_failed_msg(char const * expr, char const * msg,
+ char const * function, char const * file,
+ long line) {
+ std::cerr << "Aborted by assert: " << msg << std::endl;
+ print_stacktrace();
+#if __DARWIN_UNIX03
+ __assert_rtn (expr, file, (int)line, function);
+#elif __DARWIN
+ __assert (expr, file, (int)line, function);
+#else
+ __assert_fail (expr, file, (unsigned)line, function);
+#endif
+}
+
+} // namespace boost
+
+#endif // NDEBUG
+
+//==sys
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+//our
+//math
+#include "math/xmath.h"
+#include "func/func.hpp"
+#include "utils/verify.hpp"
+// log
+#include "utils/logger/logger.hpp"
+
+
diff --git a/src/common/utils/verify.hpp b/src/common/utils/verify.hpp
new file mode 100644
index 0000000..b677a3e
--- /dev/null
+++ b/src/common/utils/verify.hpp
@@ -0,0 +1,33 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+#include "utils/stacktrace.hpp"
+#include "boost/current_function.hpp"
+#include <sstream>
+#include <iostream>
+#include <cassert>
+
+#define VERIFY(expr) \
+ do { \
+ if(!(expr)) \
+ print_stacktrace(); \
+ assert(expr); \
+ } while(0)
+
+#define VERIFY_MSG(expr, msg) \
+ if (!(expr)) { \
+ std::stringstream ss; \
+ print_stacktrace();\
+ ss << "Verification of expression '" << #expr << "' failed in function '" << BOOST_CURRENT_FUNCTION << \
+ "'. In file '" << __FILE__ << "' on line " << __LINE__ << ". Message '" << msg << "'." ; \
+ std::cout << ss.str() << std::endl; \
+ std::cerr << ss.str() << std::endl; \
+ fflush(stdout); \
+ fflush(stderr); \
+ assert(expr); \
+ }
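VERIFY above relies on the do { ... } while(0) idiom, which is what lets a multi-statement macro be used as a single statement inside an unbraced if/else. A toy illustration (CHECK below is not the SPAdes macro):

// Why the do { ... } while(0) wrapper matters for statement-like macros.
#include <cassert>
#include <iostream>

#define CHECK(expr)                                                              \
    do {                                                                         \
        if (!(expr)) { std::cerr << "check failed: " #expr "\n"; assert(expr); } \
    } while (0)

int main(int argc, char **) {
    if (argc > 1)
        CHECK(argc < 100);                 // expands to exactly one statement...
    else
        std::cout << "no arguments\n";     // ...so this else still binds correctly
    return 0;
}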
diff --git a/src/common/visualization/graph_colorer.hpp b/src/common/visualization/graph_colorer.hpp
new file mode 100644
index 0000000..2a77d89
--- /dev/null
+++ b/src/common/visualization/graph_colorer.hpp
@@ -0,0 +1,355 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "assembly_graph/components/graph_component.hpp"
+#include "assembly_graph/paths/mapping_path.hpp"
+#include "visualization/printing_parameter_storage.hpp"
+//#include "edges_position_handler.hpp"
+
+using namespace omnigraph;
+
+namespace visualization {
+
+namespace graph_colorer {
+
+template<typename ElementId>
+class ElementColorer : public virtual printing_parameter_storage::ParameterStorage<ElementId, string> {
+public:
+ template<typename Iter>
+ set<ElementId> ColoredWith(Iter begin, Iter end, const string &color) {
+ set<ElementId> result;
+ for (Iter it = begin; it != end; ++it) {
+ if (this->GetValue(*it) == color)
+ result.insert(*it);
+ }
+ return result;
+ }
+};
+
+//TODO remove all default color parameters!
+
+template<typename ElementId>
+class MapColorer : public ElementColorer<ElementId>, public printing_parameter_storage::MapParameterStorage<ElementId, string> {
+public:
+ MapColorer(const string &default_color) : printing_parameter_storage::MapParameterStorage<ElementId, string>(default_color) {
+ }
+
+ MapColorer(const map<ElementId, string> &color_map) : printing_parameter_storage::MapParameterStorage<ElementId, string>(color_map) {
+ }
+
+ MapColorer(const map<ElementId, string> &color_map, const string &default_color)
+ : printing_parameter_storage::MapParameterStorage<ElementId, string>(color_map, default_color) {
+ }
+
+ template<class It>
+ MapColorer(It begin, It end, const string &color, const string &default_color)
+ : printing_parameter_storage::MapParameterStorage<ElementId, string>(begin, end, color, default_color) {
+ }
+
+ virtual ~MapColorer() {
+ }
+};
+
+template<typename ElementId>
+class FixedColorer : public MapColorer<ElementId> {
+public:
+ FixedColorer(const string &default_color) : MapColorer<ElementId>(default_color) {
+ }
+};
+
+template<class Graph>
+class SetColorer : public MapColorer<typename Graph::EdgeId> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &graph_;
+
+ template<class It>
+ map<EdgeId, string> ConstructColorMap(It begin, It end, const string &color) {
+ map<EdgeId, string> result;
+ for (auto it = begin; it != end; ++it) {
+ result[*it] = color;
+ }
+ return result;
+ }
+
+public:
+ template<class It>
+ SetColorer(const Graph &graph, It begin, It end, const string &color) :
+ MapColorer<typename Graph::EdgeId>(ConstructColorMap(begin, end, color), "black"), graph_(graph) {
+ }
+
+ template<class Collection>
+ SetColorer(const Graph &graph, const Collection &c, const string &color) :
+ MapColorer<typename Graph::EdgeId>(ConstructColorMap(c.begin(), c.end(), color), "black"),
+ graph_(graph) {
+ }
+
+};
+//
+//template<class Graph>
+//class PositionsEdgeColorer: public ElementColorer<typename Graph::EdgeId> {
+//private:
+// typedef typename Graph::VertexId VertexId;
+// typedef typename Graph::EdgeId EdgeId;
+// const Graph &graph_;
+// EdgesPositionHandler<Graph> &positions_;
+//public:
+// PositionsEdgeColorer(const Graph &graph, EdgesPositionHandler<Graph> &positions):
+// graph_(graph), positions_(positions) {
+// }
+// string GetValue(EdgeId element) const {
+// std::vector<EdgeId> path;
+// path.push_back(element);
+// if (positions_.GetEdgePositions(element).size() == 0) return "black";
+// else {
+// if (positions_.IsConsistentWithGenome(path)) return "green";
+// else return "orange";
+// }
+// }
+//};
+
+
+template<class Graph>
+class CompositeEdgeColorer : public ElementColorer<typename Graph::EdgeId> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ string default_color_;
+ vector<shared_ptr<ElementColorer<typename Graph::EdgeId>>> colorers_;
+
+ vector<string> CollectColors(EdgeId edge) const {
+ vector<string> result = {default_color_};
+ for (auto it = colorers_.begin(); it != colorers_.end(); ++it) {
+ string next_color = (*it)->GetValue(edge);
+ if (std::find(result.begin(), result.end(), next_color) == result.end())
+ result.push_back(next_color);
+ }
+ return result;
+ }
+
+ string ConstructColorString(const vector<string> &colors) const {
+ if (colors.size() == 1)
+ return default_color_;
+ string result = "";
+ for (size_t i = 1; i < colors.size(); i++)
+ result += ":" + colors[i];
+ return result.substr(1, result.size());
+ }
+
+public:
+ CompositeEdgeColorer(const string &default_color) : default_color_(default_color) {
+ }
+
+ CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer,
+ const string &default_color) : default_color_(default_color) {
+ AddColorer(colorer);
+ }
+
+ CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer1,
+ shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer2,
+ const string &default_color) : default_color_(default_color) {
+ AddColorer(colorer1);
+ AddColorer(colorer2);
+ }
+
+ void AddColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer) {
+ colorers_.push_back(colorer);
+ }
+
+ string GetValue(EdgeId edge) const {
+ return ConstructColorString(CollectColors(edge));
+ }
+};
+
+template<class Graph>
+class GraphColorer
+ : public ElementColorer<typename Graph::VertexId>, public ElementColorer<typename Graph::EdgeId> {
+public:
+ string GetValue(typename Graph::VertexId) const = 0;
+
+ string GetValue(typename Graph::EdgeId) const = 0;
+
+ template<typename Iter>
+ set<typename Iter::value_type> ColoredWith(Iter begin, Iter end, const string &color) {
+ return ElementColorer<typename Iter::value_type>::ColoredWith(begin, end, color);
+ }
+};
+
+template<class Graph>
+class DelegatingGraphColorer : public GraphColorer<Graph> {
+private:
+ const GraphColorer<Graph> &inner_colorer_;
+public:
+ DelegatingGraphColorer(const GraphColorer<Graph> &inner_colorer) : inner_colorer_(inner_colorer) {
+ }
+
+ string GetValue(typename Graph::VertexId v) const {
+ return inner_colorer_.GetValue(v);
+ }
+
+ string GetValue(typename Graph::EdgeId e) const {
+ return inner_colorer_.GetValue(e);
+ }
+};
+
+template<typename Graph>
+class BorderDecorator : public GraphColorer<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const GraphComponent<Graph> &component_;
+// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
+// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
+ const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
+ const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
+ const string border_color_;
+public:
+// BorderDecorator(const GraphComponent<Graph> &component,
+// const shared_ptr<const GraphColorer<Graph>> colorer,
+// const string &border_color) :
+// component_(component), vertex_colorer_ptr_(colorer), edge_colorer_ptr_(
+// colorer), vertex_colorer_(*colorer), edge_colorer_(
+// *colorer), border_color_(border_color) {
+// }
+
+ BorderDecorator(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer, const string &border_color = "yellow") :
+ component_(component), vertex_colorer_(colorer), edge_colorer_(colorer),
+ border_color_(border_color) {
+ }
+
+ string GetValue(VertexId v) const {
+ if (component_.IsBorder(v)) {
+ return border_color_;
+ } else {
+ return vertex_colorer_.GetValue(v);
+ }
+ }
+
+ string GetValue(EdgeId e) const {
+ return edge_colorer_.GetValue(e);
+ }
+
+ static shared_ptr<BorderDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer,
+ const string &border_color = "yellow") {
+ return make_shared<BorderDecorator<Graph>>(component, colorer, border_color);
+ }
+};
+
+
+template<typename Graph>
+class SinkSourceDecorator : public GraphColorer<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const GraphComponent<Graph> &component_;
+// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
+// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
+ const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
+ const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
+ const string sink_color_;
+ const string source_color_;
+ const string sinksource_color_;
+public:
+
+ SinkSourceDecorator(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer, const string &sink_color = "red",
+ const string &source_color = "orange", const string &sinksource_color = "green") :
+ component_(component), vertex_colorer_(colorer), edge_colorer_(colorer), sink_color_(sink_color),
+ source_color_(source_color), sinksource_color_(sinksource_color) {
+ }
+
+ string GetValue(VertexId v) const {
+ if (component_.exits().count(v) && !component_.entrances().count(v)) {
+ return sink_color_;
+ }
+ if (component_.entrances().count(v) && !component_.exits().count(v)) {
+ return source_color_;
+ }
+ if (component_.entrances().count(v) && component_.exits().count(v)) {
+ return sinksource_color_;
+ }
+
+ return vertex_colorer_.GetValue(v);
+ }
+
+ string GetValue(EdgeId e) const {
+ return edge_colorer_.GetValue(e);
+ }
+
+ static shared_ptr<SinkSourceDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
+ const GraphColorer<Graph> &colorer,
+ const string &sink_color = "red",
+ const string &source_color = "orange") {
+ return make_shared<SinkSourceDecorator<Graph>>(component, colorer, sink_color, source_color);
+ }
+};
+
+template<class Graph>
+class CompositeGraphColorer : public GraphColorer<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ const shared_ptr<ElementColorer<VertexId>> vertex_colorer_;
+ const shared_ptr<ElementColorer<EdgeId>> edge_colorer_;
+public:
+ CompositeGraphColorer(shared_ptr<ElementColorer<VertexId>> vertex_colorer,
+ shared_ptr<ElementColorer<EdgeId>> edge_colorer) :
+ vertex_colorer_(vertex_colorer),
+ edge_colorer_(edge_colorer) {
+ }
+
+// explicit CompositeGraphColorer(shared_ptr<ElementColorer<EdgeId>> edge_colorer = make_shared<FixedColorer<EdgeId>>("black")) :
+// vertex_colorer_(shared_ptr<ElementColorer<VertexId>>(new FixedColorer<VertexId>("white"))),
+// edge_colorer_(edge_colorer) {
+// }
+
+ string GetValue(VertexId v) const {
+ return vertex_colorer_->GetValue(v);
+ }
+
+ string GetValue(EdgeId e) const {
+ return edge_colorer_->GetValue(e);
+ }
+
+};
+
+
+// edge_colorer management is passed here
+//TODO check all usages
+template<class Graph>
+shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph & /*g*/,
+ shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer) {
+ return shared_ptr<GraphColorer<Graph>>(
+ new CompositeGraphColorer<Graph>(make_shared<FixedColorer<typename Graph::VertexId>>("white"),
+ edge_colorer));
+}
+
+template<class Graph>
+shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph &g,
+ const Path<typename Graph::EdgeId> &path1,
+ const Path<typename Graph::EdgeId> &path2) {
+ shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer =
+ make_shared<CompositeEdgeColorer<Graph>>(
+ make_shared<SetColorer<Graph>>(g, path1.sequence(), "red"),
+ make_shared<SetColorer<Graph>>(g, path2.sequence(), "blue"), "black");
+ return DefaultColorer(g, edge_colorer);
+}
+
+template<class Graph>
+shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph & /*g*/) {
+ return shared_ptr<GraphColorer<Graph>>(new CompositeGraphColorer<Graph>(
+ make_shared<FixedColorer<typename Graph::VertexId>>("white"),
+ make_shared<FixedColorer<typename Graph::EdgeId>>("black")));
+}
+}
+}
\ No newline at end of file
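CompositeEdgeColorer above collects one color per registered colorer, drops duplicates, and joins everything except the default with ':' (a Graphviz-style color list). A standalone sketch of that string construction, with plain strings in place of the per-edge colorers:

// Sketch of CompositeEdgeColorer's color-list construction above.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

std::string combine(const std::string &default_color,
                    const std::vector<std::string> &reported) {
    std::vector<std::string> colors(1, default_color);
    for (size_t i = 0; i < reported.size(); ++i)
        if (std::find(colors.begin(), colors.end(), reported[i]) == colors.end())
            colors.push_back(reported[i]);
    if (colors.size() == 1)
        return default_color;             // nothing but the default was reported
    std::string result;
    for (size_t i = 1; i < colors.size(); i++)
        result += ":" + colors[i];
    return result.substr(1);              // strip the leading ':'
}

int main() {
    std::vector<std::string> reported;
    reported.push_back("red"); reported.push_back("black");
    reported.push_back("blue"); reported.push_back("red");
    std::cout << combine("black", reported) << "\n";   // prints red:blue
}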
diff --git a/src/common/visualization/graph_labeler.hpp b/src/common/visualization/graph_labeler.hpp
new file mode 100644
index 0000000..8690af7
--- /dev/null
+++ b/src/common/visualization/graph_labeler.hpp
@@ -0,0 +1,308 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/simple_tools.hpp"
+#include "utils/standard_base.hpp"
+#include "common/assembly_graph/handlers/edges_position_handler.hpp"
+
+namespace visualization {
+
+namespace graph_labeler {
+
+/**
+* (Interface)
+* Provides string labels for vertices and edges of some graph.
+* Used with GraphPrinter to visualize graphs.
+*/
+template<class Graph>
+class GraphLabeler {
+public:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+
+ virtual ~GraphLabeler() {
+ }
+
+ virtual string label(VertexId v) const = 0;
+
+ virtual string label(EdgeId e) const = 0;
+
+};
+
+//template<class Graph>
+//class MapGraphLabeler {
+// typedef typename Graph::EdgeId EdgeId;
+// typedef typename Graph::VertexId VertexId;
+// map<EdgeId, string> edge_map_;
+// map<VertexId, string> vertex_map_;
+//
+//public:
+//
+// string label(VertexId v) const {
+// auto it = vertex_map_.find(v);
+// if (it == vertex_map_.end())
+// return "";
+// else
+// return it->second;
+// }
+//
+// string label(EdgeId e) const {
+// auto it = edge_map_.find(e);
+// if (it == edge_map_.end())
+// return "";
+// else
+// return it->second;
+// }
+//
+//};
+
+template<class Graph>
+class AbstractGraphLabeler : public GraphLabeler<Graph> {
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+ const Graph &g_;
+protected:
+ AbstractGraphLabeler(const Graph &g) : g_(g) {
+
+ }
+
+ const Graph &graph() const {
+ return g_;
+ }
+
+public:
+ /*virtual*/ std::string label(VertexId /*v*/) const {
+ return "";
+ }
+
+ /*virtual*/ std::string label(EdgeId /*e*/) const {
+ return "";
+ }
+
+};
+
+/**
+* Trivial implementation of GraphLabeler.
+* All labels are "".
+*/
+template<class Graph>
+class EmptyGraphLabeler : public GraphLabeler<Graph> {
+ typedef GraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ EmptyGraphLabeler() {}
+
+ std::string label(VertexId /*v*/) const {
+ return "";
+ }
+
+ std::string label(EdgeId /*e*/) const {
+ return "";
+ }
+};
+
+/**
+* Implementation of GraphLabeler for Graphs that have methods
+* str(VertexId) and str(EdgeId), such as AbstractGraph.
+*/
+template<class Graph>
+class StrGraphLabeler : public AbstractGraphLabeler<Graph> {
+ typedef AbstractGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ StrGraphLabeler(const Graph &g) : base(g) {}
+
+ /*virtual*/ std::string label(VertexId v) const {
+ return this->graph().str(v);
+ }
+
+ /*virtual*/ std::string label(EdgeId e) const {
+ return this->graph().str(e);
+ }
+
+ /*virtual*/ ~StrGraphLabeler() {
+
+ }
+};
+
+template<class Graph>
+shared_ptr<GraphLabeler<Graph>> StrGraphLabelerInstance(const Graph &g) {
+ return make_shared<StrGraphLabeler<Graph>>(g);
+}
+
+template<class Graph>
+class LengthIdGraphLabeler : public StrGraphLabeler<Graph> {
+ typedef StrGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ LengthIdGraphLabeler(const Graph &g) : base(g) {}
+
+ /*virtual*/ std::string label(EdgeId e) const {
+ std::stringstream ss;
+ ss << this->graph().length(e) << " (id: " << this->graph().int_id(e) << ")";
+ return ss.str();
+ }
+
+};
+
+template<class Graph>
+class LengthGraphLabeler : public StrGraphLabeler<Graph> {
+ typedef StrGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ LengthGraphLabeler(const Graph &g) : base(g) {}
+
+ /*virtual*/ std::string label(EdgeId e) const {
+ return ToString(this->graph().length(e));
+ }
+
+};
+
+template<class Graph>
+class CoverageGraphLabeler : public AbstractGraphLabeler<Graph> {
+ typedef AbstractGraphLabeler<Graph> base;
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ CoverageGraphLabeler(const Graph &g) : base(g) {}
+
+ std::string label(EdgeId e) const {
+ double coverage = this->graph().coverage(e);
+ return " {Cov:" + ToString(coverage) + "}";
+ }
+};
+
+template<class Graph>
+class CompositeLabeler : public GraphLabeler<Graph> {
+private:
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ vector<GraphLabeler<Graph> *> list_;
+
+ template<typename ElementId>
+ string ConstructLabel(ElementId id) const {
+ vector<string> to_print;
+ for (size_t i = 0; i < list_.size(); i++) {
+ string next = list_[i]->label(id);
+ if (next.size() != 0) {
+ to_print.push_back(next);
+ }
+ }
+ string result = "";
+ for (size_t i = 0; i < to_print.size(); i++) {
+ result += to_print[i];
+ if (i + 1 < to_print.size())
+ result += "\\n";
+ }
+ return result;
+ }
+
+public:
+ CompositeLabeler() {
+ }
+
+ CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2,
+ GraphLabeler<Graph> &labeler3,
+ GraphLabeler<Graph> &labeler4) {
+ AddLabeler(labeler1);
+ AddLabeler(labeler2);
+ AddLabeler(labeler3);
+ AddLabeler(labeler4);
+ }
+
+ CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2,
+ GraphLabeler<Graph> &labeler3) {
+ AddLabeler(labeler1);
+ AddLabeler(labeler2);
+ AddLabeler(labeler3);
+ }
+
+ CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2) {
+ AddLabeler(labeler1);
+ AddLabeler(labeler2);
+ }
+
+ virtual ~CompositeLabeler() {
+ }
+
+ void AddLabeler(GraphLabeler<Graph> &labeler) {
+ list_.push_back(&labeler);
+ }
+
+ virtual string label(VertexId vertexId) const {
+ return ConstructLabel<VertexId>(vertexId);
+ }
+
+ virtual string label(EdgeId edgeId) const {
+ return ConstructLabel<EdgeId>(edgeId);
+ }
+};
+
+template<class Graph>
+class EdgePosGraphLabeler : public AbstractGraphLabeler<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+public:
+ const omnigraph::EdgesPositionHandler<Graph> &edge_pos_;
+
+ EdgePosGraphLabeler(const Graph &g, const omnigraph::EdgesPositionHandler<Graph> &edge_pos) :
+ AbstractGraphLabeler<Graph>(g), edge_pos_(edge_pos) {
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ return "Positions: " + edge_pos_.str(edgeId);
+ }
+
+ virtual ~EdgePosGraphLabeler() {
+// TRACE("~EdgePosGraphLabeler");
+ }
+
+private:
+ DECL_LOGGER("EdgePosGraphLabeler")
+};
+
+template<class Graph>
+class DefaultLabeler : public GraphLabeler<Graph> {
+private:
+ const Graph &g_;
+ const omnigraph::EdgesPositionHandler<Graph> &edges_positions_;
+protected:
+ typedef GraphLabeler<Graph> super;
+ typedef typename super::EdgeId EdgeId;
+ typedef typename super::VertexId VertexId;
+public:
+
+ DefaultLabeler(const Graph &g, const omnigraph::EdgesPositionHandler<Graph> &position_handler) :
+ g_(g), edges_positions_(position_handler) {
+ }
+
+ virtual std::string label(VertexId vertexId) const {
+ return ToString(vertexId.int_id());
+ }
+
+ virtual std::string label(EdgeId edgeId) const {
+ std::string ret_label;
+ ret_label += "Id " + g_.str(edgeId) + "\\n";
+ ret_label += "Positions:\\n" + edges_positions_.str(edgeId);
+ size_t len = g_.length(edgeId);
+ double cov = g_.coverage(edgeId);
+ ret_label += "Len(cov): " + ToString(len) + "(" + ToString(cov) + ")";
+ return ret_label;
+ }
+
+ virtual ~DefaultLabeler() {
+ }
+};
+}
+}
+
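CompositeLabeler::ConstructLabel() above joins the non-empty labels produced by each registered labeler with the two-character sequence \n, which dot renders as a line break inside the node label. A standalone sketch with plain strings in place of the individual labelers:

// Sketch of CompositeLabeler's label joining above.
#include <iostream>
#include <string>
#include <vector>

std::string construct_label(const std::vector<std::string> &parts) {
    std::vector<std::string> to_print;
    for (size_t i = 0; i < parts.size(); ++i)
        if (!parts[i].empty())
            to_print.push_back(parts[i]);
    std::string result;
    for (size_t i = 0; i < to_print.size(); i++) {
        result += to_print[i];
        if (i + 1 < to_print.size())
            result += "\\n";              // escaped, not an actual newline
    }
    return result;
}

int main() {
    std::vector<std::string> parts;
    parts.push_back("Id 42"); parts.push_back(""); parts.push_back("Len(cov): 100(12.5)");
    std::cout << construct_label(parts) << "\n";   // Id 42\nLen(cov): 100(12.5)
}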
diff --git a/src/common/visualization/graph_print_utils.hpp b/src/common/visualization/graph_print_utils.hpp
new file mode 100755
index 0000000..0c2f978
--- /dev/null
+++ b/src/common/visualization/graph_print_utils.hpp
@@ -0,0 +1,327 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef GRAPH_PRINTER_HPP_
+#define GRAPH_PRINTER_HPP_
+
+#include "utils/standard_base.hpp"
+
+namespace visualization {
+
+template<class VertexId>
+struct BaseVertex {
+ VertexId id_;
+ string label_;
+ string href_;
+ string fill_color_;
+ BaseVertex(VertexId id, string label, string reference, string fill_color) :id_(id), label_(label), href_(reference), fill_color_(fill_color) {
+ }
+};
+
+template<class VertexId>
+struct BaseEdge {
+ VertexId from;
+ VertexId to;
+ string label;
+ string color;
+ BaseEdge(VertexId _from, VertexId _to, string _label, string _color) {
+ from = _from;
+ to = _to;
+ label = _label;
+ color = _color;
+ }
+};
+
+class StreamRecorder {
+private:
+ ostream &os_;
+protected:
+ virtual ostream &os() {
+ return os_;
+ }
+public:
+ StreamRecorder(ostream &os) : os_(os) {
+ }
+
+ virtual ~StreamRecorder() {
+ }
+};
+
+template<class Vertex, class Edge>
+class GraphRecorder {
+public:
+ virtual void recordVertex(Vertex vertex) = 0;
+
+ virtual void recordEdge(Edge edge) = 0;
+
+ virtual inline void startGraphRecord(const string &name) = 0;
+
+ virtual inline void endGraphRecord() = 0;
+
+ virtual ~GraphRecorder(){
+ }
+};
+
+template<class VertexId>
+class SingleGraphRecorder : public GraphRecorder<BaseVertex<VertexId>, BaseEdge<VertexId>> {
+protected:
+ typedef BaseVertex<VertexId> Vertex;
+ typedef BaseEdge<VertexId> Edge;
+};
+
+template<class VertexId>
+class PairedGraphRecorder : public GraphRecorder<pair<BaseVertex<VertexId>, BaseVertex<VertexId>>, BaseEdge<pair<VertexId, VertexId>>> {
+protected:
+ typedef pair<BaseVertex<VertexId>, BaseVertex<VertexId>> Vertex;
+ typedef BaseEdge<pair<VertexId, VertexId>> Edge;
+};
+
+template<class VertexId>
+class DotGraphRecorder : public StreamRecorder {
+public:
+ DotGraphRecorder(ostream &os) : StreamRecorder(os) {
+ }
+
+protected:
+ template<class vid>
+ void recordVertexId(vid id) {
+ this->os() << "vertex_" << id;
+ }
+
+ string IdToStr(VertexId u) {
+ stringstream ss;
+ ss << u;
+ return ss.str();
+ }
+
+ string constructNodeId(VertexId v) {
+ return constructNodePairId(v, v);
+ }
+
+ inline void recordParameter(ostream &os, const string &name, const string &value) {
+ os << name << "=" << "<" << value << "> ";
+ }
+
+ inline void recordParameter(const string &name, const string &value) {
+ recordParameter(this->os(), name, value);
+ }
+
+ inline void recordParameterInQuotes(ostream &os, const string &name, const string &value) {
+ os << name << "=" << "\"" << value << "\" ";
+ }
+
+ inline void recordParameterInQuotes(const string &name, const string &value) {
+ recordParameterInQuotes(this->os(), name, value);
+ }
+
+ inline double getColorParameter(int l, int r, double perc) {
+ return l * perc + r * (1 - perc);
+ }
+
+ inline string getColor(int currentLength, int approximateLength) {
+ currentLength %= approximateLength;
+ int points[8][3] = {{0, 0, 1}, {0, 1, 1}, {1, 1, 1}, {0, 1, 0}, {1, 1, 0}, {1, 0, 1}, {0, 0, 1}};
+ stringstream ss;
+ int bound = approximateLength / 6;
+ int num = currentLength / bound;
+ double perc = (currentLength % bound) * 1. / bound;
+ for (int i = 0; i < 3; i++) {
+ ss << getColorParameter(points[num][i], points[num + 1][i], perc);
+ if (i != 2)
+ ss << ",";
+ }
+ return ss.str();
+ }
+
+};
+
+
+template<class SingleVertexId>
+class DotSingleGraphRecorder: public SingleGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
+private:
+ typedef BaseVertex<SingleVertexId> Vertex;
+ typedef BaseEdge<SingleVertexId> Edge;
+
+public:
+ DotSingleGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
+ }
+
+ void recordVertex(Vertex vertex) {
+ this->recordVertexId(vertex.id_);
+ this->os() << "[";
+ this->recordParameterInQuotes("label", vertex.label_);
+ this->os() << ",";
+ this->recordParameter("style", "filled");
+ this->os() << ",";
+ this->recordParameter("color", "black");
+ this->os() << ",";
+ if(vertex.href_ != "") {
+ this->recordParameterInQuotes("href", vertex.href_);
+ this->os() << ",";
+ }
+ this->recordParameter("fillcolor", vertex.fill_color_);
+ this->os() << "]" << endl;
+ }
+
+ void recordEdge(Edge edge) {
+ this->recordVertexId(edge.from);
+ this->os() << "->";
+ this->recordVertexId(edge.to);
+ this->os() << "[";
+ this->recordParameterInQuotes("label", edge.label);
+ this->os() << ",";
+ this->recordParameter("color", edge.color);
+ this->os() << "]" << endl;
+ }
+
+ inline void startGraphRecord(const string &name) {
+ this->os() << "digraph " << name << " {" << endl;
+ this->os() << "node" << "[";
+ this->recordParameter("fontname", "Courier");
+ this->recordParameter("penwidth", "1.8");
+ this->os() << "]" << endl;
+ }
+
+ inline void endGraphRecord() {
+ this->os() << "}" << endl;
+ }
+};
+
+template<class SingleVertexId>
+class DotPairedGraphRecorder: public PairedGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
+private:
+ typedef BaseVertex<SingleVertexId> SingleVertex;
+ typedef BaseEdge<SingleVertexId> SingleEdge;
+ typedef typename PairedGraphRecorder<SingleVertexId>::Vertex Vertex;
+ typedef typename PairedGraphRecorder<SingleVertexId>::Edge Edge;
+
+
+ string constructNodePairId(SingleVertexId u, SingleVertexId v) {
+ stringstream ss;
+ string u_str = this->IdToStr(u);
+ string v_str = this->IdToStr(v);
+ if (u == v)
+ ss << u;
+ else if (u_str > v_str)
+ ss << v_str << "_" << u_str;
+ else
+ ss << u_str << "_" << v_str;
+ return ss.str();
+ }
+
+ inline string constructPortCell(const string &port, string href, const string &color) {
+ stringstream ss;
+ ss << "<TD BORDER=\"0\" PORT = \"port_" << port << "\" ";
+ this->recordParameterInQuotes(ss, "color", color);
+ this->recordParameterInQuotes(ss, "bgcolor", color);
+ if(href != "") {
+ ss <<"href=\"" << href << "\"";
+ }
+ ss << "></TD>";
+ return ss.str();
+ }
+
+ inline string constructLabelCell(const string &label, const string &href, const string &color) {
+ stringstream ss;
+ ss << "<TD BORDER=\"0\" ";
+ this->recordParameterInQuotes(ss, "color", color);
+ this->recordParameterInQuotes(ss, "bgcolor", color);
+ if(href != "") {
+ ss <<"href=\"" << href << "\"";
+ }
+ ss << ">"
+ << label << "</TD>";
+ return ss.str();
+ }
+
+ string constructComplexNodeId(string pairId, SingleVertexId v) {
+ stringstream ss;
+ ss << pairId << ":port_" << v;
+ return ss.str();
+ }
+
+ string constructTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
+ stringstream ss;
+ ss << "<TR>";
+ ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
+ ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
+ ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
+ ss << "</TR>\n";
+ return ss.str();
+ }
+
+ string constructReverceTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
+ stringstream ss;
+ ss << "<TR>";
+ ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
+ ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
+ ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
+ ss << "</TR>\n";
+ return ss.str();
+ }
+
+ string constructComplexNodeLabel(Vertex v) {
+ return "<TABLE BORDER=\"1\" CELLSPACING=\"0\" >\n" + constructTableEntry(v.first)
+ + constructReverceTableEntry(v.second) + "</TABLE>";
+ }
+
+ string constructVertexInPairId(SingleVertexId v, SingleVertexId rc) {
+ return constructComplexNodeId(constructNodePairId(v, rc), v);
+ }
+
+
+public:
+ DotPairedGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
+ }
+
+ void recordPairedVertexId(SingleVertexId id1, SingleVertexId id2) {
+ this->os() << "vertex_" << constructNodePairId(id1, id2);
+ }
+
+ void recordVertex(Vertex vertex) {
+ string pairLabel = constructComplexNodeLabel(vertex);
+ recordPairedVertexId(vertex.first.id_, vertex.second.id_);
+ this->os() << "[";
+ this->recordParameter("label", constructComplexNodeLabel(vertex));
+ this->os() << ",";
+ this->recordParameter("color", "black");
+ this->os() << ",";
+ this->recordParameter("URL", "/vertex/" + std::to_string(vertex.first.id_) + ".svg");
+ this->os() << "]" << endl;
+ }
+
+ void recordEdge(Edge edge) {
+ this->recordVertexId(constructVertexInPairId(edge.from.first, edge.from.second));
+ this->os() << "_out";
+ this->os() << "->";
+ this->recordVertexId(constructVertexInPairId(edge.to.first, edge.to.second));
+ this->os() << "_in";
+ this->os() << "[";
+ this->recordParameterInQuotes("label", edge.label);
+ this->os() << ",";
+ this->recordParameter("color", edge.color);
+ this->os() << "]" << endl;
+ }
+
+ inline void startGraphRecord(const string &name) {
+ this->os() << "digraph " << name << " {" << endl;
+ this->os() << "node" << "[";
+ this->recordParameter("fontname", "Courier");
+ this->os() << ",";
+ this->recordParameter("penwidth", "1.8");
+ this->os() << ",";
+ this->recordParameter("shape", "plaintext");
+ this->os() << "]" << endl;
+ }
+
+ inline void endGraphRecord() {
+ this->os() << "}" << endl;
+ }
+};
+
+}
+#endif //GRAPH_PRINTER_HPP_//
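For orientation, a minimal sketch of driving the DOT recorder defined above on its own: it assumes only the SPAdes include paths (so that utils/standard_base.hpp resolves) plus a std::ostream and size_t vertex ids; the toy vertices and labels are arbitrary.

    #include <iostream>
    #include "common/visualization/graph_print_utils.hpp"

    int main() {
        using namespace visualization;
        // Emit a two-vertex toy digraph in DOT format to stdout.
        DotSingleGraphRecorder<size_t> rec(std::cout);
        rec.startGraphRecord("toy_graph");
        rec.recordVertex(BaseVertex<size_t>(1, "v1", /*href*/ "", "white"));
        rec.recordVertex(BaseVertex<size_t>(2, "v2", /*href*/ "", "white"));
        rec.recordEdge(BaseEdge<size_t>(1, 2, "e12", "black"));
        rec.endGraphRecord();
        return 0;
    }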
diff --git a/src/common/visualization/graph_printer.hpp b/src/common/visualization/graph_printer.hpp
new file mode 100644
index 0000000..d1f9b67
--- /dev/null
+++ b/src/common/visualization/graph_printer.hpp
@@ -0,0 +1,186 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "utils/standard_base.hpp"
+#include "graph_print_utils.hpp"
+#include "graph_labeler.hpp"
+#include "graph_colorer.hpp"
+#include "vertex_linker.hpp"
+
+using namespace omnigraph;
+
+namespace visualization {
+
+namespace graph_printer {
+
+template<class Graph>
+class GraphPrinter {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+// ostream& os_;
+ const Graph &graph_;
+protected:
+ const graph_labeler::GraphLabeler<Graph> &labeler_;
+ const graph_colorer::GraphColorer<Graph> &colorer_;
+ const vertex_linker::VertexLinker<Graph> &linker_;
+
+protected:
+// ostream& os() {
+// return os_;
+// }
+
+
+ const Graph &graph() {
+ return graph_;
+ }
+
+ template<class GvisVertexId>
+ BaseVertex<GvisVertexId> CreateBaseVertex(GvisVertexId id, VertexId v) {
+ return BaseVertex<GvisVertexId>(id, labeler_.label(v), linker_.GetValue(v), colorer_.GetValue(v));
+ }
+
+ template<class GvisVertexId>
+ BaseEdge<GvisVertexId> CreateBaseEdge(GvisVertexId from, GvisVertexId to, EdgeId e){
+ return BaseEdge<GvisVertexId>(from, to, this->labeler_.label(e), this->colorer_.GetValue(e));
+ }
+
+ virtual void ManageDrawn(VertexId v, set<VertexId> &visited) {
+ visited.insert(v);
+ }
+
+public:
+ GraphPrinter(const Graph &graph, /*ostream &os,*/
+ const graph_labeler::GraphLabeler<Graph> &labeler,
+ const graph_colorer::GraphColorer<Graph> &colorer,
+ const vertex_linker::VertexLinker<Graph> &linker) :
+ /*os_(os), */graph_(graph), labeler_(labeler), colorer_(colorer), linker_(
+ linker) {
+ }
+
+ virtual void open() = 0;
+
+ virtual void close() = 0;
+
+ virtual void AddVertex(VertexId v1) = 0;
+
+ template<class iter>
+ void AddVertices(iter vbegin, iter vend) {
+ set<VertexId> drawn;
+ for (; vbegin != vend; ++vbegin) {
+ if (drawn.count(*vbegin) == 0) {
+ AddVertex(*vbegin);
+ ManageDrawn(*vbegin, drawn);
+ }
+ }
+ }
+
+ virtual void AddEdge(EdgeId e) = 0;
+
+ template<class iter>
+ void AddEdges(iter ebegin, iter eend) {
+ for (; ebegin != eend; ++ebegin) {
+ AddEdge(*ebegin);
+ }
+ }
+
+ virtual ~GraphPrinter() {
+ }
+};
+
+template<typename Graph>
+class SingleGraphPrinter : public GraphPrinter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ DotSingleGraphRecorder<size_t> recorder_;
+
+public:
+ SingleGraphPrinter(const Graph &graph, ostream &os,
+ const graph_labeler::GraphLabeler<Graph> &labeler,
+ const graph_colorer::GraphColorer<Graph> &colorer,
+ const vertex_linker::VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler,
+ colorer, linker),
+ recorder_(os) {
+ }
+
+ void open() {
+ recorder_.startGraphRecord("graph_picture");
+ }
+
+ void close() {
+ recorder_.endGraphRecord();
+ }
+
+ void AddVertex(VertexId v) {
+ recorder_.recordVertex(this->CreateBaseVertex((size_t) this->graph().int_id(v), v));
+ }
+
+ void AddEdge(EdgeId edge) {
+ recorder_.recordEdge(this->CreateBaseEdge((size_t) this->graph().int_id(this->graph().EdgeStart(edge)),
+ (size_t) this->graph().int_id(this->graph().EdgeEnd(edge)),
+ edge));
+ }
+};
+
+template<typename Graph>
+class PairedGraphPrinter : public GraphPrinter<Graph> {
+private:
+ typedef typename Graph::VertexId VertexId;
+ typedef typename Graph::EdgeId EdgeId;
+
+ DotPairedGraphRecorder<size_t> recorder_;
+
+ pair<BaseVertex<size_t>, BaseVertex<size_t>> CreateDoubleVertex(VertexId v) {
+ BaseVertex<size_t> u1 = this->CreateBaseVertex((size_t)this->graph().int_id(v), v);
+ BaseVertex<size_t> u2 = this->CreateBaseVertex((size_t)this->graph().int_id(this->graph().conjugate(v)), this->graph().conjugate(v));
+ return make_pair(u1, u2);
+ }
+
+ pair<size_t, size_t> CreateDoubleVertexId(VertexId v) {
+ return make_pair(this->graph().int_id(v), this->graph().int_id(this->graph().conjugate(v)));
+ }
+
+protected:
+ /*virtual */void ManageDrawn(VertexId v, set<VertexId> &visited) {
+ visited.insert(v);
+ visited.insert(this->graph().conjugate(v));
+ }
+
+public:
+ PairedGraphPrinter(const Graph &graph, ostream &os,
+ const graph_labeler::GraphLabeler<Graph> &labeler,
+ const graph_colorer::GraphColorer<Graph> &colorer,
+ const vertex_linker::VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler,
+ colorer, linker),
+ recorder_(os) {
+ }
+
+ void open() {
+ recorder_.startGraphRecord("graph_picture");
+ }
+
+ void close() {
+ recorder_.endGraphRecord();
+ }
+
+ void AddVertex(VertexId v) {
+ recorder_.recordVertex(CreateDoubleVertex(v));
+ }
+
+ void AddEdge(EdgeId edge) {
+ auto vid1 = CreateDoubleVertexId(this->graph().EdgeStart(edge));
+ auto vid2 = CreateDoubleVertexId(this->graph().EdgeEnd(edge));
+ recorder_.recordEdge(BaseEdge<pair<size_t, size_t>>(vid1, vid2, this->labeler_.label(edge), this->colorer_.GetValue(edge)));
+ }
+};
+
+}
+}
diff --git a/src/common/visualization/position_filler.hpp b/src/common/visualization/position_filler.hpp
new file mode 100644
index 0000000..e0e61b3
--- /dev/null
+++ b/src/common/visualization/position_filler.hpp
@@ -0,0 +1,96 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "common/modules/alignment/sequence_mapper.hpp"
+#include "assembly_graph/handlers/edges_position_handler.hpp"
+#include "io/reads/wrapper_collection.hpp"
+#include "io/reads/io_helper.hpp"
+
+namespace visualization {
+
+namespace position_filler {
+
+template<class Graph>
+class PosFiller {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef std::shared_ptr<debruijn_graph::SequenceMapper < Graph>> MapperPtr;
+ const Graph &g_;
+ MapperPtr mapper_;
+ omnigraph::EdgesPositionHandler<Graph> &edge_pos_;
+
+public:
+ PosFiller(const Graph &g, MapperPtr mapper,
+ omnigraph::EdgesPositionHandler<Graph> &edge_pos) :
+ g_(g), mapper_(mapper), edge_pos_(edge_pos) {
+
+ }
+
+ void Process(const Sequence &s, string name) const {
+ //todo stupid conversion!
+ return Process(io::SingleRead(name, s.str()));
+ }
+
+ void Process(const io::SingleRead &read) const {
+ omnigraph::MappingPath<EdgeId> path = mapper_->MapRead(read);
+ const string name = read.name();
+ int cur_pos = 0;
+ TRACE("Contig " << name << " mapped on " << path.size()
+ << " fragments.");
+ for (size_t i = 0; i < path.size(); i++) {
+ EdgeId ei = path[i].first;
+ omnigraph::MappingRange mr = path[i].second;
+ int len = (int) (mr.mapped_range.end_pos - mr.mapped_range.start_pos);
+ if (i > 0 && path[i - 1].first != ei && g_.EdgeStart(ei) != g_.EdgeEnd(path[i - 1].first)) {
+ TRACE(
+ "Contig " << name
+ << " mapped on not adjacent edge. Position in contig is "
+ << path[i - 1].second.initial_range.start_pos
+ + 1
+ << "--"
+ << path[i - 1].second.initial_range.end_pos
+ << " and "
+ << mr.initial_range.start_pos + 1
+ << "--" << mr.initial_range.end_pos);
+ }
+ edge_pos_.AddEdgePosition(ei, name, mr.initial_range.start_pos,
+ mr.initial_range.end_pos,
+ mr.mapped_range.start_pos,
+ mr.mapped_range.end_pos);
+ cur_pos += len;
+ }
+ }
+
+ void Process(io::SingleStream &stream) const {
+ io::SingleRead read;
+ while (!stream.eof()) {
+ stream >> read;
+ Process(read);
+ }
+ }
+
+private:
+ DECL_LOGGER("PosFiller");
+};
+
+template<class gp_t>
+void FillPos(gp_t &gp, const string &contig_file, string prefix, bool with_rc = false) {
+ PosFiller<typename gp_t::graph_t> pos_filler(gp.g, debruijn_graph::MapperInstance(gp), gp.edge_pos);
+ auto irs = std::make_shared<io::PrefixAddingReaderWrapper>(io::EasyStream(contig_file, with_rc, false),
+ prefix);
+ pos_filler.Process(*irs);
+}
+
+template<class gp_t>
+void FillPos(gp_t &gp, const Sequence &s, string name) {
+ PosFiller<typename gp_t::graph_t> pos_filler(gp.g, debruijn_graph::MapperInstance(gp), gp.edge_pos);
+ pos_filler.Process(s, name);
+}
+
+}
+}
\ No newline at end of file
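A rough usage sketch for the helper above: only FillPos is taken from this file; the wrapper function, the "ref0_" prefix and the GraphPack placeholder (any debruijn graph pack providing gp.g, gp.edge_pos and debruijn_graph::MapperInstance(gp)) are illustrative assumptions.

    #include <string>
    #include "common/visualization/position_filler.hpp"

    // Hypothetical glue: attach positions of reference contigs to graph edges
    // so that the labelers can show them in .dot output.
    template<class GraphPack>
    void AnnotateWithReference(GraphPack &gp, const std::string &reference_path) {
        visualization::position_filler::FillPos(gp, reference_path, "ref0_", /*with_rc*/ true);
    }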
diff --git a/src/common/visualization/printing_parameter_storage.hpp b/src/common/visualization/printing_parameter_storage.hpp
new file mode 100644
index 0000000..2d4d500
--- /dev/null
+++ b/src/common/visualization/printing_parameter_storage.hpp
@@ -0,0 +1,88 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#include "utils/standard_base.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+
+using namespace omnigraph;
+
+namespace visualization {
+
+namespace printing_parameter_storage {
+
+template<typename ElementId, typename Value>
+class ParameterStorage {
+public:
+ virtual Value GetValue(ElementId element) const = 0;
+
+ virtual ~ParameterStorage() {
+ }
+};
+
+template<typename ElementId, typename Value>
+class MapParameterStorage : public virtual ParameterStorage<ElementId, Value> {
+private:
+ template<class It>
+ static map<ElementId, string> ConstructMap(It begin, It end, const string &color) {
+ map<ElementId, string> result;
+ for (auto it = begin; it != end; ++it) {
+ result.insert(make_pair(*it, color));
+ }
+ return result;
+ }
+
+protected:
+ map<ElementId, Value> storage_;
+private:
+ boost::optional<Value> default_value_;
+public:
+ MapParameterStorage(const string &default_value) : default_value_(default_value) {
+ }
+
+ MapParameterStorage(map<ElementId, Value> storage, Value default_value) : storage_(storage),
+ default_value_(default_value) {
+ }
+
+ MapParameterStorage(map<ElementId, Value> storage) : storage_(storage) {
+ }
+
+ template<class It>
+ MapParameterStorage(It begin, It end, const Value &value, const string &default_value) : storage_(
+ ConstructMap(begin, end, value)), default_value_(default_value) {
+ }
+
+
+ Value GetValue(ElementId element) const {
+ auto it = storage_.find(element);
+ if (it == storage_.end()) {
+ VERIFY(default_value_);
+ return default_value_.get();
+ }
+ return it->second;
+ }
+};
+
+template<typename ElementId, typename Value>
+class DecoratorParameterStorage : public virtual ParameterStorage<ElementId, Value> {
+private:
+ // ParameterStorage is abstract, so the wrapped storage is held by reference;
+ // holding it by value would be ill-formed (and would slice anyway).
+ const ParameterStorage<ElementId, Value> &inner_storage_;
+public:
+ DecoratorParameterStorage(const ParameterStorage<ElementId, Value> &inner_storage) :
+ inner_storage_(inner_storage) {
+ }
+
+ Value GetInnerValue(ElementId element) const {
+ return inner_storage_.GetValue(element);
+ }
+};
+
+}
+}
\ No newline at end of file
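A small sketch of using MapParameterStorage on its own: it answers "red" for the listed edges and falls back to the default "black" passed as the second constructor argument. EdgeId stands for the graph's edge identifier type (assumed less-than comparable so it can key a std::map); the colors and the helper name are arbitrary.

    #include <map>
    #include <string>
    #include <vector>
    #include "common/visualization/printing_parameter_storage.hpp"

    template<class EdgeId>
    visualization::printing_parameter_storage::MapParameterStorage<EdgeId, std::string>
    MakeHighlightStorage(const std::vector<EdgeId> &highlighted) {
        std::map<EdgeId, std::string> colors;
        for (const auto &e : highlighted)
            colors[e] = "red";                     // explicit per-edge value
        // Unlisted edges get the default value via GetValue().
        return visualization::printing_parameter_storage::MapParameterStorage<EdgeId, std::string>(colors, "black");
    }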
diff --git a/src/common/visualization/vertex_linker.hpp b/src/common/visualization/vertex_linker.hpp
new file mode 100644
index 0000000..b85ab76
--- /dev/null
+++ b/src/common/visualization/vertex_linker.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/standard_base.hpp"
+#include "printing_parameter_storage.hpp"
+
+namespace visualization {
+
+namespace vertex_linker {
+
+template<class Graph>
+class VertexLinker
+ : public virtual printing_parameter_storage::ParameterStorage<typename Graph::VertexId, string> {
+};
+
+template<class Graph>
+class MapVertexLinker : public VertexLinker<Graph>,
+ public printing_parameter_storage::MapParameterStorage<typename Graph::VertexId, string> {
+public:
+ MapVertexLinker() : printing_parameter_storage::MapParameterStorage<typename Graph::VertexId, string>("") {
+ }
+
+ MapVertexLinker(const map<typename Graph::VertexId, string> &link_map) :
+ printing_parameter_storage::MapParameterStorage<typename Graph::VertexId, string>(link_map, "") {
+ }
+
+ virtual ~MapVertexLinker() {
+ }
+};
+
+template<class Graph>
+class EmptyGraphLinker : public MapVertexLinker<Graph> {
+public:
+ EmptyGraphLinker() {
+ }
+};
+
+}
+
+}
\ No newline at end of file
diff --git a/src/modules/visualization/visualization.hpp b/src/common/visualization/visualization.hpp
similarity index 100%
rename from src/modules/visualization/visualization.hpp
rename to src/common/visualization/visualization.hpp
diff --git a/src/common/visualization/visualization_utils.hpp b/src/common/visualization/visualization_utils.hpp
new file mode 100644
index 0000000..34ec334
--- /dev/null
+++ b/src/common/visualization/visualization_utils.hpp
@@ -0,0 +1,223 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "graph_printer.hpp"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/components/splitters.hpp"
+#include "assembly_graph/components/graph_component.hpp"
+#include "visualizers.hpp"
+#include "vertex_linker.hpp"
+
+#include <fstream>
+
+namespace visualization {
+
+namespace visualization_utils {
+
+template<class Graph>
+void WriteComponents(const Graph &g,
+ const string &folder_name,
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<graph_colorer::GraphColorer<Graph>> colorer,
+ const graph_labeler::GraphLabeler<Graph> &labeler) {
+ vertex_linker::EmptyGraphLinker<Graph> linker;
+// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
+ auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
+ shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(
+ inner_splitter, filter);
+ visualization::visualizers::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(*splitter,
+ folder_name);
+}
+
+template<class Graph>
+void DrawComponentsOfShortEdges(const Graph &g, const string &output_dir, size_t min_length, size_t sinks,
+ size_t sources) {
+ vector<typename Graph::EdgeId> short_edges;
+ std::string pics_folder_ =
+ output_dir + ToString(min_length) + "_" + ToString(sinks) + "_" + ToString(sources) + "_" +
+ "pics_polymorphic/";
+ make_dir(pics_folder_);
+ INFO("Writing pics with components consisting of short edges to " + pics_folder_);
+ shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g, min_length);
+ while (splitter->HasNext()) {
+ GraphComponent<Graph> component = splitter->Next();
+ if (component.v_size() > 3 && component.exits().size() == sinks &&
+ component.entrances().size() == sources) {
+ bool fail = false;
+ for (auto v : component.entrances()) {
+ if (component.g().IncomingEdgeCount(v) != 1) {
+ fail = true;
+ }
+ }
+ for (auto v : component.exits()) {
+ if (component.g().OutgoingEdgeCount(v) != 1) {
+ fail = true;
+ }
+ }
+
+ if (fail) {
+ continue;
+ }
+
+ graph_labeler::StrGraphLabeler<Graph> labeler(component.g());
+ graph_labeler::CoverageGraphLabeler<Graph> labeler2(component.g());
+ graph_labeler::CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
+ WriteComponentSinksSources(component,
+ pics_folder_ + ToString(g.int_id(*component.vertices().begin()))
+ + ".dot", visualization::graph_colorer::DefaultColorer(g),
+ compositeLabeler);
+ INFO("Component is written to " + ToString(g.int_id(*component.vertices().begin())) + ".dot");
+
+ // PrintComponent(component,
+// pics_folder_ + "ShortComponents/"
+// + ToString(gp.g.int_id(component.vertices_[0]))
+// + ".dot");
+ }
+ }
+}
+
+
+template<class Graph>
+void WriteSizeLimitedComponents(const Graph &g,
+ const string &folder_name,
+ shared_ptr<GraphSplitter<Graph>> inner_splitter,
+ shared_ptr<graph_colorer::GraphColorer<Graph>> colorer,
+ const graph_labeler::GraphLabeler<Graph> &labeler, int min_component_size,
+ int max_component_size, size_t max_components) {
+ vertex_linker::EmptyGraphLinker<Graph> linker;
+
+ auto filter = make_shared<omnigraph::ComponentSizeFilter<Graph>>(g, 1000000000, (size_t) min_component_size,
+ (size_t) max_component_size);
+ shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(
+ inner_splitter, filter);
+ visualization::visualizers::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker, false,
+ max_components).SplitAndVisualize(*splitter, folder_name);
+}
+
+template<class Graph>
+void WriteComponent(const GraphComponent<Graph> &gc,
+ const string &file_name, shared_ptr<graph_colorer::GraphColorer<Graph>> colorer,
+ const graph_labeler::GraphLabeler<Graph> &labeler) {
+ vertex_linker::EmptyGraphLinker<Graph> linker;
+ graph_colorer::BorderDecorator<Graph> component_colorer(gc, *colorer, "yellow");
+ std::ofstream os;
+ os.open(file_name);
+ visualization::visualizers::ComponentVisualizer<Graph>(gc.g(), true).
+ Visualize(gc, os, labeler, component_colorer, linker);
+ os.close();
+}
+
+template<class Graph>
+void WriteComponentSinksSources(const GraphComponent<Graph> &gc,
+ const string &file_name, shared_ptr<graph_colorer::GraphColorer<Graph>> colorer,
+ const graph_labeler::GraphLabeler<Graph> &labeler) {
+ vertex_linker::EmptyGraphLinker<Graph> linker;
+ graph_colorer::SinkSourceDecorator<Graph> component_colorer(gc, *colorer);
+ std::ofstream os;
+ os.open(file_name);
+ visualization::visualizers::ComponentVisualizer<Graph>(gc.g(), true).
+ Visualize(gc, os, labeler, component_colorer, linker);
+ os.close();
+}
+
+template<class Graph>
+void WriteComponentSinksSources(const GraphComponent<Graph> &gc,
+ const string &file_name) {
+
+ graph_labeler::StrGraphLabeler<Graph> labeler(gc.g());
+ graph_labeler::CoverageGraphLabeler<Graph> labeler2(gc.g());
+ graph_labeler::CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
+ vertex_linker::EmptyGraphLinker<Graph> linker;
+ WriteComponentSinksSources(gc, file_name, graph_colorer::DefaultColorer(gc.g()),
+ compositeLabeler);
+}
+
+template<class Graph>
+void WriteSimpleComponent(const GraphComponent<Graph> &gc,
+ const string &file_name, shared_ptr<graph_colorer::GraphColorer<Graph>> colorer,
+ const graph_labeler::GraphLabeler<Graph> &labeler) {
+ vertex_linker::EmptyGraphLinker<Graph> linker;
+ std::ofstream os;
+ os.open(file_name);
+ visualization::visualizers::ComponentVisualizer<Graph>(gc.g(), false).
+ Visualize(gc, os, labeler, *colorer, linker);
+ os.close();
+}
+
+template<class Graph>
+void WriteComponentsAlongPath(const Graph &g, const vector<typename Graph::EdgeId> &path,
+ const string &prefix_path, shared_ptr<graph_colorer::GraphColorer<Graph>> colorer,
+ const graph_labeler::GraphLabeler<Graph> &labeler, bool color_path = true) {
+ auto edge_colorer = make_shared<graph_colorer::CompositeEdgeColorer<Graph>>("black");
+ edge_colorer->AddColorer(colorer);
+ if (color_path) {
+ edge_colorer->AddColorer(make_shared<graph_colorer::SetColorer<Graph>>(g, path, "green"));
+ }
+ shared_ptr<graph_colorer::GraphColorer<Graph>> resulting_colorer = make_shared<graph_colorer::CompositeGraphColorer<Graph>>(
+ colorer, edge_colorer);
+ shared_ptr<GraphSplitter<Graph>> rs = ReliableSplitterAlongPath<Graph>(g, path);
+ auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
+ shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CondensingSplitterWrapper<Graph>>(rs,
+ filter);
+ WriteComponents<Graph>(g, prefix_path, splitter, resulting_colorer, labeler);
+}
+
+template<class Graph>
+class LocalityPrintingRH {
+ typedef typename Graph::EdgeId EdgeId;
+ typedef typename Graph::VertexId VertexId;
+ const Graph &g_;
+ const graph_labeler::GraphLabeler<Graph> &labeler_;
+ std::shared_ptr<graph_colorer::GraphColorer<Graph>> colorer_;
+ const string output_folder_;
+public:
+ LocalityPrintingRH(const Graph &g, const graph_labeler::GraphLabeler<Graph> &labeler,
+ std::shared_ptr<graph_colorer::GraphColorer<Graph>> colorer, const string &output_folder)
+ :
+ g_(g),
+ labeler_(labeler),
+ colorer_(colorer),
+ output_folder_(output_folder) {
+// path::make_dirs(output_folder_);
+ }
+
+ void HandleDelete(EdgeId e, const string &add_label = "") {
+ //todo magic constant
+// map<EdgeId, string> empty_coloring;
+ auto edge_colorer = make_shared<graph_colorer::CompositeEdgeColorer<Graph>>("black");
+ edge_colorer->AddColorer(colorer_);
+ edge_colorer->AddColorer(
+ make_shared<graph_colorer::SetColorer<Graph>>(g_, vector<EdgeId>(1, e), "green"));
+ shared_ptr<graph_colorer::GraphColorer<Graph>> resulting_colorer = make_shared<graph_colorer::CompositeGraphColorer<Graph>>(
+ colorer_, edge_colorer);
+
+ string fn = output_folder_ + "/edge_" + ToString(g_.int_id(e)) + add_label + ".dot";
+ visualization::visualization_utils::WriteComponent(omnigraph::EdgeNeighborhood<Graph>(g_, e, 50, 250), fn, resulting_colorer,
+ labeler_);
+ }
+
+private:
+ DECL_LOGGER("LocalityPrintingRH");
+};
+
+//static void WriteFilteredComponents(const Graph& g,
+// const string& folder_name,
+// shared_ptr<GraphComponentFilter<Graph>> filter,
+// shared_ptr<GraphSplitter<Graph>> splitter,
+// shared_ptr<graph_colorer::GraphColorer<Graph>> colorer,
+// const GraphLabeler<Graph> &labeler) {
+// vertex_linker::EmptyGraphLinker<Graph> linker;
+//// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
+// omnigraph::FilteringSplitterWrapper<Graph> filtered_splitter(splitter, filter);
+// visualization::visualizers::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(filtered_splitter, folder_name);
+//}
+
+}
+
+}
\ No newline at end of file
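As a hedged sketch of the typical call sequence: take an edge neighbourhood (using the same 50/250 constants that LocalityPrintingRH::HandleDelete uses above) and write it with the two-argument WriteComponentSinksSources overload. The wrapper itself is hypothetical; omnigraph::EdgeNeighborhood and the Graph type come from the assembly_graph headers included above.

    #include <string>
    #include "common/visualization/visualization_utils.hpp"

    template<class Graph>
    void DumpEdgeNeighbourhood(const Graph &g, typename Graph::EdgeId e,
                               const std::string &file_name) {
        // GraphComponent around e, bounded as in LocalityPrintingRH.
        auto component = omnigraph::EdgeNeighborhood<Graph>(g, e, 50, 250);
        visualization::visualization_utils::WriteComponentSinksSources(component, file_name);
    }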
diff --git a/src/common/visualization/visualizers.hpp b/src/common/visualization/visualizers.hpp
new file mode 100644
index 0000000..50819fe
--- /dev/null
+++ b/src/common/visualization/visualizers.hpp
@@ -0,0 +1,180 @@
+#pragma once
+
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+
+#include "utils/standard_base.hpp"
+#include "graph_printer.hpp"
+#include <fstream>
+
+using namespace omnigraph;
+
+namespace visualization {
+
+namespace visualizers {
+
+//DECL_LOGGER("omg.gvis")
+
+template<class Graph>
+class ComponentVisualizer {
+ const Graph &graph_;
+ const bool paired_;
+
+private:
+ void Visualize(const GraphComponent<Graph> &component, graph_printer::GraphPrinter <Graph> &printer) {
+ printer.open();
+ printer.AddVertices(component.vertices().begin(), component.vertices().end());
+ for (auto e_it = component.e_begin(); e_it != component.e_end();
+ ++e_it) {
+ printer.AddEdge(*e_it);
+ }
+ printer.close();
+ }
+
+public:
+ ComponentVisualizer(const Graph &graph, bool paired = true) :
+ graph_(graph), paired_(paired) {
+ }
+
+ void Visualize(const GraphComponent<Graph> &component, ostream &os,
+ const graph_labeler::GraphLabeler<Graph> &labeler,
+ const graph_colorer::GraphColorer<Graph> &colorer,
+ const vertex_linker::VertexLinker<Graph> &linker) {
+ if (paired_) {
+ graph_printer::PairedGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
+ Visualize(component, printer);
+ } else {
+ graph_printer::SingleGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
+ Visualize(component, printer);
+ }
+ }
+
+ void Visualize(ostream &os,
+ const graph_labeler::GraphLabeler<Graph> &labeler,
+ const graph_colorer::GraphColorer<Graph> &colorer,
+ const vertex_linker::VertexLinker<Graph> &linker) {
+ Visualize(GraphComponent<Graph>::WholeGraph(graph_), os, labeler, colorer, linker);
+ }
+};
+
+
+template<class Graph>
+class ComponentNameGenerator {
+public:
+ virtual string ComponentName(const GraphComponent<Graph> &component) = 0;
+
+ virtual ~ComponentNameGenerator() {
+ }
+};
+
+template<class Graph>
+class SimpleCountingComponentNameGenerator : public ComponentNameGenerator<Graph> {
+private:
+ string name_;
+ string extension_;
+ size_t cnt_;
+public:
+ SimpleCountingComponentNameGenerator(string name, string extension) : name_(name), extension_(extension),
+ cnt_(0) {
+ }
+
+ string ComponentName(const GraphComponent<Graph> &component) {
+ cnt_++;
+ stringstream ss;
+ ss << name_ << "_" << cnt_;
+ if (component.name().size() > 0)
+ ss << "_" << component.name();
+ ss << "." << extension_;
+ return ss.str();
+ }
+};
+
+template<class Graph>
+class CountingSizeComponentNameGenerator : public ComponentNameGenerator<Graph> {
+private:
+ string name_;
+ string extension_;
+ size_t cnt_;
+public:
+ CountingSizeComponentNameGenerator(string name, string extension) : name_(name), extension_(extension),
+ cnt_(0) {
+ }
+
+ string ComponentName(const GraphComponent<Graph> &component) {
+ cnt_++;
+ stringstream ss;
+ ss << name_ << "_" << cnt_;
+ if (component.name().size() > 0)
+ ss << "_" << component.name();
+ ss << "_size_" << component.size();
+ ss << "." << extension_;
+
+ return ss.str();
+ }
+};
+
+
+template<class Graph>
+class SplittingGraphVisualizer {
+private:
+ const Graph &graph_;
+ const graph_labeler::GraphLabeler <Graph> &labeler_;
+ const graph_colorer::GraphColorer <Graph> &colorer_;
+ const vertex_linker::VertexLinker <Graph> &linker_;
+ const bool paired_;
+ const size_t max_component_number_;
+ static const size_t DEFAULT_MAX_COMPONENT_NUMBER = 500;
+
+ string ComponentFileName(size_t cnt, const string &folder, const GraphComponent<Graph> &component) {
+ stringstream ss;
+ ss << folder << cnt;
+ if (component.name().size() > 0)
+ ss << "graph_" << component.name();
+ ss << ".dot";
+ return ss.str();
+ }
+
+public:
+ SplittingGraphVisualizer(const Graph &graph,
+ const graph_labeler::GraphLabeler <Graph> &labeler,
+ const graph_colorer::GraphColorer <Graph> &colorer,
+ const vertex_linker::VertexLinker <Graph> &linker,
+ bool paired = true,
+ size_t max_component_number = DEFAULT_MAX_COMPONENT_NUMBER) :
+ graph_(graph), labeler_(labeler), colorer_(colorer), linker_(linker), paired_(paired),
+ max_component_number_(max_component_number) {
+ }
+
+ size_t SplitAndVisualize(GraphSplitter<Graph> &splitter, const string &folder) {
+ INFO("Writing components to folder " << folder);
+ ComponentVisualizer<Graph> visualizer(graph_, paired_);
+ size_t cnt = 0;
+ while (splitter.HasNext()) {
+ if (cnt > max_component_number_) {
+ INFO("The number of graph components exceeded " << max_component_number_
+ << ". Aborting current visualization.");
+ break;
+ }
+ cnt++;
+ GraphComponent<Graph> component = splitter.Next();
+ graph_colorer::BorderDecorator<Graph> border_colorer(component, colorer_, "yellow");
+ ofstream os(ComponentFileName(cnt, folder, component));
+ visualizer.Visualize(component, os, labeler_, border_colorer, linker_);
+ os.close();
+ }
+ return cnt;
+ }
+
+private:
+ DECL_LOGGER("SplittingGraphVisualizer");
+};
+
+}
+}
+
+
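For reference, a minimal whole-graph dump built from the pieces above. StrGraphLabeler, DefaultColorer and EmptyGraphLinker are taken from the sibling headers (DefaultColorer is assumed to return a shared_ptr colorer, as its other call sites suggest); the wrapper function and include paths are illustrative assumptions.

    #include <fstream>
    #include <string>
    #include "common/visualization/visualizers.hpp"
    #include "common/visualization/graph_labeler.hpp"
    #include "common/visualization/graph_colorer.hpp"
    #include "common/visualization/vertex_linker.hpp"

    template<class Graph>
    void DumpWholeGraph(const Graph &g, const std::string &file_name) {
        visualization::graph_labeler::StrGraphLabeler<Graph> labeler(g);
        auto colorer = visualization::graph_colorer::DefaultColorer(g);
        visualization::vertex_linker::EmptyGraphLinker<Graph> linker;
        std::ofstream os(file_name);
        // paired = true pairs each vertex with its conjugate in the DOT output.
        visualization::visualizers::ComponentVisualizer<Graph>(g, true)
                .Visualize(os, labeler, *colorer, linker);
    }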
diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
deleted file mode 100644
index 280629f..0000000
--- a/src/modules/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(spades_modules CXX)
-
-add_subdirectory(pipeline)
-add_subdirectory(assembly_graph)
-add_subdirectory(data_structures/sequence)
-add_subdirectory(math)
-add_subdirectory(algorithms/path_extend)
-add_subdirectory(algorithms)
-add_subdirectory(paired_info)
-add_subdirectory(stages)
-add_subdirectory(dev_support)
-add_subdirectory(io)
-add_subdirectory(data_structures/mph_index)
-
-add_library(spades_modules STATIC empty.cpp)
-
-target_link_libraries(spades_modules graph_support input sequence pipeline math_module path_extend paired_info stages dev_support mph_index algorithms)
diff --git a/src/modules/algorithms/CMakeLists.txt b/src/modules/algorithms/CMakeLists.txt
deleted file mode 100644
index a4b8d60..0000000
--- a/src/modules/algorithms/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(algorithms CXX)
-
-add_library(algorithms STATIC genome_consistance_checker.cpp)
-
diff --git a/src/modules/algorithms/dijkstra/dijkstra_algorithm.hpp b/src/modules/algorithms/dijkstra/dijkstra_algorithm.hpp
deleted file mode 100644
index 11c32d8..0000000
--- a/src/modules/algorithms/dijkstra/dijkstra_algorithm.hpp
+++ /dev/null
@@ -1,288 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-#pragma once
-
-#include "dev_support/simple_tools.hpp"
-#include "dijkstra_settings.hpp"
-
-#include <queue>
-#include <vector>
-#include <set>
-#include <map>
-
-namespace omnigraph {
-
-template<typename Graph, typename distance_t = size_t>
-struct element_t{
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- distance_t distance;
- VertexId curr_vertex;
- VertexId prev_vertex;
- EdgeId edge_between;
-
- element_t(distance_t new_distance, VertexId new_cur_vertex, VertexId new_prev_vertex,
- EdgeId new_edge_between) : distance(new_distance), curr_vertex(new_cur_vertex),
- prev_vertex(new_prev_vertex), edge_between(new_edge_between) { }
-};
-
-template<typename T>
-class ReverseDistanceComparator {
-public:
- ReverseDistanceComparator() {
- }
-
- bool operator()(T obj1, T obj2){
- if(obj1.distance != obj2.distance)
- return obj2.distance < obj1.distance;
- if(obj2.curr_vertex != obj1.curr_vertex)
- return obj2.curr_vertex < obj1.curr_vertex;
- if(obj2.prev_vertex != obj1.prev_vertex)
- return obj2.prev_vertex < obj1.prev_vertex;
- return obj2.edge_between < obj1.edge_between;
- }
-};
-
-template<class Graph, class DijkstraSettings, typename distance_t = size_t>
-class Dijkstra {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef distance_t DistanceType;
-
- typedef std::map<VertexId, distance_t> distances_map;
- typedef typename distances_map::const_iterator distances_map_ci;
- typedef typename std::priority_queue<element_t<Graph, distance_t>, std::vector<element_t<Graph, distance_t>>,
- ReverseDistanceComparator<element_t<Graph, distance_t>>> queue_t;
-
- // constructor parameters
- const Graph& graph_;
- DijkstraSettings settings_;
- const size_t max_vertex_number_;
-
- // changeable parameters
- bool finished_;
- size_t vertex_number_;
- bool vertex_limit_exceeded_;
-
- // accumulative structures
- distances_map distances_;
- std::set<VertexId> processed_vertices_;
- std::map<VertexId, pair<VertexId, EdgeId>> prev_vert_map_;
-
- void Init(VertexId start, queue_t &queue) {
- vertex_number_ = 0;
- distances_.clear();
- processed_vertices_.clear();
- prev_vert_map_.clear();
- set_finished(false);
- settings_.Init(start);
- queue.push(element_t<Graph, distance_t>(0, start, VertexId(0), EdgeId(0)));
- prev_vert_map_[start] = std::pair<VertexId, EdgeId>(VertexId(0), EdgeId(0));
- }
-
- void set_finished(bool state) {
- finished_ = state;
- }
-
- bool CheckPutVertex(VertexId vertex, EdgeId edge, distance_t length) const {
- return settings_.CheckPutVertex(vertex, edge, length);
- }
-
- bool CheckProcessVertex(VertexId vertex, distance_t distance) {
- ++vertex_number_;
- if (vertex_number_ > max_vertex_number_) {
- vertex_limit_exceeded_ = true;
- return false;
- }
- return (vertex_number_ < max_vertex_number_) && settings_.CheckProcessVertex(vertex, distance);
- }
-
- distance_t GetLength(EdgeId edge) const {
- return settings_.GetLength(edge);
- }
-
- void AddNeighboursToQueue(VertexId cur_vertex, distance_t cur_dist, queue_t& queue) {
- auto neigh_iterator = settings_.GetIterator(cur_vertex);
- while (neigh_iterator.HasNext()) {
- TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " started");
- auto cur_pair = neigh_iterator.Next();
- if (!DistanceCounted(cur_pair.vertex)) {
- TRACE("Adding new entry to queue");
- distance_t new_dist = GetLength(cur_pair.edge) + cur_dist;
- TRACE("Entry: vertex " << graph_.str(cur_vertex) << " distance " << new_dist);
- if (CheckPutVertex(cur_pair.vertex, cur_pair.edge, new_dist)) {
- TRACE("CheckPutVertex returned true and new entry is added");
- queue.push(element_t<Graph, distance_t>(new_dist, cur_pair.vertex,
- cur_vertex, cur_pair.edge));
- }
- }
- TRACE("Checking new neighbour of vertex " << graph_.str(cur_vertex) << " finished");
- }
- TRACE("All neighbours of vertex " << graph_.str(cur_vertex) << " processed");
- }
-
-public:
- Dijkstra(const Graph &graph, DijkstraSettings settings, size_t max_vertex_number = size_t(-1)) :
- graph_(graph),
- settings_(settings),
- max_vertex_number_(max_vertex_number),
- finished_(false),
- vertex_number_(0),
- vertex_limit_exceeded_(false) {}
-
- Dijkstra(Dijkstra&& /*other*/) = default;
-
- Dijkstra& operator=(Dijkstra&& /*other*/) = default;
-
- Dijkstra(const Dijkstra& /*other*/) = delete;
-
- Dijkstra& operator=(const Dijkstra& /*other*/) = delete;
-
- bool finished() const {
- return finished_;
- }
-
- bool DistanceCounted(VertexId vertex) const {
- return distances_.find(vertex) != distances_.end();
- }
-
- distance_t GetDistance(VertexId vertex) const {
- VERIFY(DistanceCounted(vertex));
- return distances_.find(vertex)->second;
- }
-
- std::pair<distances_map_ci, distances_map_ci> GetDistances() const {
- distances_map_ci begin = distances_.begin();
- distances_map_ci end = distances_.end();
- return make_pair(begin, end);
- }
-
- void Run(VertexId start) {
- TRACE("Starting dijkstra run from vertex " << graph_.str(start));
- queue_t queue;
- Init(start, queue);
- TRACE("Priority queue initialized. Starting search");
-
- while (!queue.empty() && !finished()) {
- TRACE("Dijkstra iteration started");
- const element_t<Graph, distance_t>& next = queue.top();
- distance_t distance = next.distance;
- VertexId vertex = next.curr_vertex;
-
- prev_vert_map_[vertex] = std::pair<VertexId, EdgeId>(next.prev_vertex, next.edge_between);
- queue.pop();
- TRACE("Vertex " << graph_.str(vertex) << " with distance " << distance << " fetched from queue");
-
- if (DistanceCounted(vertex)) {
- TRACE("Distance to vertex " << graph_.str(vertex) << " already counted. Proceeding to next queue entry.");
- continue;
- }
- distances_.insert(make_pair(vertex, distance));
-
- TRACE("Vertex " << graph_.str(vertex) << " is found to be at distance "
- << distance << " from vertex " << graph_.str(start));
- if (!CheckProcessVertex(vertex, distance)) {
- TRACE("Check for processing vertex failed. Proceeding to the next queue entry.");
- continue;
- }
- processed_vertices_.insert(vertex);
- AddNeighboursToQueue(vertex, distance, queue);
- }
- set_finished(true);
- TRACE("Finished dijkstra run from vertex " << graph_.str(start));
- }
-
- std::vector<EdgeId> GetShortestPathTo(VertexId vertex) {
- std::vector<EdgeId> path;
- if (prev_vert_map_.find(vertex) == prev_vert_map_.end())
- return path;
-
- VertexId curr_vertex = vertex;
- VertexId prev_vertex = get(prev_vert_map_, vertex).first;
- EdgeId edge = get(prev_vert_map_, curr_vertex).second;
-
- while (prev_vertex != VertexId(0)) {
- if (graph_.EdgeStart(edge) == prev_vertex)
- path.insert(path.begin(), edge);
- else
- path.push_back(edge);
- curr_vertex = prev_vertex;
- const auto& prev_v_e = get(prev_vert_map_, curr_vertex);
- prev_vertex = prev_v_e.first;
- edge = prev_v_e.second;
- }
- return path;
- }
-
- vector<VertexId> ReachedVertices() const {
- vector<VertexId> result;
- for (auto it = distances_.begin(); it != distances_.end(); ++it) {
- result.push_back(it->first);
- }
- return result;
- }
-
- const set<VertexId>& ProcessedVertices() const {
- return processed_vertices_;
- }
-
- bool VertexLimitExceeded() const {
- return vertex_limit_exceeded_;
- }
-
-private:
- DECL_LOGGER("Dijkstra");
-};
-
-template<class Graph>
-class DistanceCounter {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph>> BaseDijkstraSettings;
-
-public:
- DistanceCounter(const Graph& graph) :
- graph_(graph),
- dijkstra_(graph, BaseDijkstraSettings(
- LengthCalculator<Graph>(),
- VertexProcessChecker<Graph>(),
- VertexPutChecker<Graph>(),
- ForwardNeighbourIteratorFactory<Graph>())),
- ready_(false) {
- }
-
- bool IsReachable(VertexId from, VertexId to) {
- EnsureFrom(from);
- return dijkstra_.DistanceCounted(to);
- }
-
- size_t Distance(VertexId from, VertexId to) {
- EnsureFrom(from);
- return dijkstra_.GetDistance(to);
- }
-
-private:
- void EnsureFrom(VertexId from) {
- if (!ready_ || prev_ != from) {
- dijkstra_.run(from);
- ready_ = true;
- prev_ = from;
- }
- }
-
- const Graph& graph_;
- Dijkstra<Graph, BaseDijkstraSettings> dijkstra_;
- VertexId prev_;
- bool ready_;
-};
-
-}
diff --git a/src/modules/algorithms/dijkstra/dijkstra_helper.hpp b/src/modules/algorithms/dijkstra/dijkstra_helper.hpp
deleted file mode 100644
index 756f2af..0000000
--- a/src/modules/algorithms/dijkstra/dijkstra_helper.hpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "dijkstra_algorithm.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-class DijkstraHelper {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > UnorientedDijkstra;
-
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- BackwardNeighbourIteratorFactory<Graph> > > BackwardDijkstra;
-
- //------------------------------
- // bounded dijkstra
- //------------------------------
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- BoundProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph> > BoundedDijkstraSettings;
-
- typedef Dijkstra<Graph, BoundedDijkstraSettings> BoundedDijkstra;
-
- static BoundedDijkstra CreateBoundedDijkstra(const Graph &graph, size_t length_bound,
- size_t max_vertex_number = -1ul){
- return BoundedDijkstra(graph, BoundedDijkstraSettings(
- LengthCalculator<Graph>(graph),
- BoundProcessChecker<Graph>(length_bound),
- BoundPutChecker<Graph>(length_bound),
- ForwardNeighbourIteratorFactory<Graph>(graph)),
- max_vertex_number);
- }
-
- //------------------------------
- // bounded backward dijkstra
- //------------------------------
-
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- BoundProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- BackwardNeighbourIteratorFactory<Graph> > BackwardBoundedDijkstraSettings;
-
- typedef Dijkstra<Graph, BackwardBoundedDijkstraSettings> BackwardBoundedDijkstra;
-
- static BackwardBoundedDijkstra CreateBackwardBoundedDijkstra(const Graph &graph,
- size_t bound, size_t max_vertex_number = size_t(-1)){
- return BackwardBoundedDijkstra(graph, BackwardBoundedDijkstraSettings(
- LengthCalculator<Graph>(graph),
- BoundProcessChecker<Graph>(bound),
- BoundPutChecker<Graph>(bound),
- BackwardNeighbourIteratorFactory<Graph>(graph)), max_vertex_number);
- }
-
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- EdgeComponentPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > ComponentFinder;
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- ComponentLenCalculator<Graph>,
- BoundProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > NeighbourhoodFinder;
- //------------------------------
-
- typedef Dijkstra<Graph, ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- VertexProcessChecker<Graph>,
- SubgraphPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > > SubgraphDijkstra;
-
- typedef ComposedDijkstraSettings<Graph,
- PathIgnoringLengthCalculator<Graph>,
- BoundProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph> > PathIgnoringDijkstraSettings;
-
-
- //------------------------------
- // short edge dijkstra settings
- //------------------------------
- typedef ComposedDijkstraSettings<Graph,
- BoundedEdgeLenCalculator<Graph>,
- ZeroLengthProcessChecker<Graph>,
- VertexPutChecker<Graph>,
- UnorientedNeighbourIteratorFactory<Graph> > ShortEdgeDijkstraSettings;
-
- typedef Dijkstra<Graph, ShortEdgeDijkstraSettings> ShortEdgeDijkstra;
-
- static ShortEdgeDijkstra CreateShortEdgeDijkstra(const Graph &graph, size_t edge_length_bound,
- size_t max_vertex_number = size_t(-1)){
- return ShortEdgeDijkstra(graph, ShortEdgeDijkstraSettings(
- BoundedEdgeLenCalculator<Graph>(graph, edge_length_bound),
- ZeroLengthProcessChecker<Graph>(),
- VertexPutChecker<Graph>(),
- UnorientedNeighbourIteratorFactory<Graph>(graph)),
- max_vertex_number);
- }
-
- //------------------------------
- // counting dijkstra
- //------------------------------
- typedef CountingDijkstraSettings<Graph,
- UnorientedNeighbourIteratorFactory<Graph> > UnorientCountingDijkstraSettings;
-
- typedef Dijkstra<Graph, UnorientCountingDijkstraSettings> CountingDijkstra;
-
- static CountingDijkstra CreateCountingDijkstra(const Graph &graph, size_t max_size,
- size_t edge_length_bound, size_t max_vertex_number = size_t(-1)){
- return CountingDijkstra(graph, UnorientCountingDijkstraSettings(graph,
- UnorientedNeighbourIteratorFactory<Graph>(graph),
- max_size, edge_length_bound), max_vertex_number);
- }
-
-
- //------------------------------
- // targeted bounded dijkstra
- //------------------------------
-
- typedef ComposedDijkstraSettings<Graph,
- LengthCalculator<Graph>,
- BoundedVertexTargetedProcessChecker<Graph>,
- BoundPutChecker<Graph>,
- ForwardNeighbourIteratorFactory<Graph> > TargeredBoundedDijkstraSettings;
-
- typedef Dijkstra<Graph, TargeredBoundedDijkstraSettings> TargeredBoundedDijkstra;
-
- static TargeredBoundedDijkstra CreateTargeredBoundedDijkstra(const Graph &graph,
- VertexId target_vertex, size_t bound, size_t max_vertex_number = size_t(-1)){
- return TargeredBoundedDijkstra(graph,
- TargeredBoundedDijkstraSettings(LengthCalculator<Graph>(graph),
- BoundedVertexTargetedProcessChecker<Graph>(target_vertex, bound),
- BoundPutChecker<Graph>(bound),
- ForwardNeighbourIteratorFactory<Graph>(graph)),
- max_vertex_number);
- }
-};
-
-}
diff --git a/src/modules/algorithms/dijkstra/length_calculator.hpp b/src/modules/algorithms/dijkstra/length_calculator.hpp
deleted file mode 100644
index ec29690..0000000
--- a/src/modules/algorithms/dijkstra/length_calculator.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "dev_support/standard_base.hpp"
-
-namespace omnigraph {
-
-template<class Graph, typename distance_t = size_t>
-class LengthCalculator {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-protected:
- const Graph &graph_;
-public:
- LengthCalculator(const Graph &graph) : graph_(graph) { }
- virtual distance_t GetLength(EdgeId edge) const{
- return distance_t(graph_.length(edge));
- }
- virtual ~LengthCalculator() { }
-};
-
-template<class Graph, typename distance_t = size_t>
-class ComponentLenCalculator : public LengthCalculator<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- set<EdgeId> &component_;
-public:
- ComponentLenCalculator(const Graph &graph, set<EdgeId> &component) :
- LengthCalculator<Graph, distance_t>(graph), component_(component) { }
-
- distance_t GetLength(EdgeId edge) const{
- if (component_.count(edge) != 0)
- return 0;
- return this->graph_.length(edge);
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class BoundedEdgeLenCalculator : public LengthCalculator<Graph, distance_t> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- distance_t bound_;
-public:
- BoundedEdgeLenCalculator(const Graph &graph, distance_t bound) :
- LengthCalculator<Graph, distance_t>(graph), bound_(bound) { }
-
- distance_t GetLength(EdgeId edge) const{
- if(this->graph_.length(edge) <= bound_)
- return 0;
- return 1;
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class AlongPathLengthCalculator : public LengthCalculator<Graph, distance_t> {
- typedef LengthCalculator<Graph, distance_t> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- set<VertexId> vertex_path_;
- distance_t bound_;
-
- set<VertexId> CollectVertices(vector<EdgeId> &edge_path){
- set<VertexId> result;
- for(auto e = edge_path.begin(); e != edge_path.end(); e++){
- result.insert(this->graph_.EdgeStart(*e));
- result.insert(this->graph_.EdgeEnd(*e));
- }
- return result;
- }
-
-public:
- AlongPathLengthCalculator(const Graph &graph, vector<EdgeId> &edge_path, distance_t bound) :
- LengthCalculator<Graph, distance_t>(graph),
- vertex_path_(CollectVertices(edge_path)),
- bound_(bound) { }
-
- distance_t GetLength(EdgeId edge) const{
- if (vertex_path_.count(this->graph_.EdgeStart(edge))
- && vertex_path_.count(this->graph_.EdgeEnd(edge)))
- return min(int(base::GetLength(edge)), 200);
- return base::GetLength(edge);
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class PathIgnoringLengthCalculator : public LengthCalculator<Graph, distance_t> {
- typedef LengthCalculator<Graph, distance_t> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- set<EdgeId> path_;
- distance_t bound_;
-
-public:
- PathIgnoringLengthCalculator(const Graph &graph, const vector<EdgeId> &edge_path) :
- LengthCalculator<Graph, distance_t>(graph), path_(edge_path.begin(), edge_path.end())
- { }
-
- distance_t GetLength(EdgeId edge) const {
- if (path_.find(edge) != path_.end()) {
- return 0;
- }
- return base::GetLength(edge);
- }
-};
-
-
-}
diff --git a/src/modules/algorithms/genome_consistance_checker.cpp b/src/modules/algorithms/genome_consistance_checker.cpp
deleted file mode 100644
index f3009ad..0000000
--- a/src/modules/algorithms/genome_consistance_checker.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-#include "algorithms/genome_consistance_checker.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-#include <algorithm>
-#include <limits>
-namespace debruijn_graph {
-using omnigraph::MappingRange;
-using namespace std;
-
-//gap or overlap size. WITHOUT SIGN!
-static size_t gap(const Range &a, const Range &b) {
- return max(a.end_pos, b.start_pos) - min (a.end_pos, b.start_pos);
-}
-bool GenomeConsistenceChecker::consequent(const Range &mr1, const Range &mr2) const{
- if (mr1.end_pos > mr2.start_pos + absolute_max_gap_)
- return false;
- if (mr1.end_pos + absolute_max_gap_ < mr2.start_pos)
- return false;
- return true;
-
-}
-bool GenomeConsistenceChecker::consequent(const MappingRange &mr1, const MappingRange &mr2) const {
- //do not want to think about handling gaps near 0 position.
- if (!consequent(mr1.initial_range, mr2.initial_range) || !consequent(mr1.mapped_range, mr2.mapped_range))
- return false;
- size_t initial_gap = gap(mr1.initial_range, mr2.initial_range);
- size_t mapped_gap = gap(mr1.mapped_range, mr2.mapped_range);
- size_t max_gap = max(initial_gap, mapped_gap);
- if ( max_gap > relative_max_gap_* double (max (min(mr1.initial_range.size(), mr1.mapped_range.size()), min(mr2.initial_range.size(), mr2.mapped_range.size()))))
- return false;
- return true;
-}
-
-PathScore GenomeConsistenceChecker::CountMisassemblies(const BidirectionalPath &path) const {
- PathScore straight = CountMisassembliesWithStrand(path, "0");
- PathScore reverse = CountMisassembliesWithStrand(path, "1");
- size_t total_length = path.LengthAt(0);
-//TODO: constant;
- if (total_length > std::max(straight.mapped_length, reverse.mapped_length) * 2) {
- if (total_length > 10000) {
- INFO ("For path length " << total_length <<" mapped less than half of the path, skipping");
- }
- return PathScore(0,0,0);
- } else {
- if (straight.mapped_length > reverse.mapped_length) {
- return straight;
- } else {
- return reverse;
- }
- }
-}
-
-void GenomeConsistenceChecker::SpellGenome() {
- vector<pair<EdgeId, MappingRange> > to_sort;
- for(auto e: storage_) {
- if (excluded_unique_.find(e) == excluded_unique_.end() ) {
- set<MappingRange> mappings = gp_.edge_pos.GetEdgePositions(e, "fxd0");
- if (mappings.size() > 1) {
- INFO("edge " << e << "smth strange");
- } else if (mappings.size() == 0) {
- continue;
- } else {
- to_sort.push_back(make_pair(e, *mappings.begin()));
- }
- }
- }
- sort(to_sort.begin(), to_sort.end(), [](const pair<EdgeId, MappingRange> & a, const pair<EdgeId, MappingRange> & b) -> bool
- {
- return a.second.initial_range.start_pos < b.second.initial_range.start_pos;
- }
- );
- size_t count = 0;
- for(auto p: to_sort) {
- INFO("edge " << gp_.g.int_id(p.first) << " length "<< gp_.g.length(p.first) << " coverage " << gp_.g.coverage(p.first) << " mapped to " << p.second.mapped_range.start_pos << " - " << p.second.mapped_range.end_pos << " init_range " << p.second.initial_range.start_pos << " - " << p.second.initial_range.end_pos );
- genome_spelled_[p.first] = count;
- count++;
- }
-}
-
-PathScore GenomeConsistenceChecker::CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const {
- if (strand == "1") {
- return (CountMisassembliesWithStrand(*path.GetConjPath(), "0"));
- }
- PathScore res(0, 0, 0);
- EdgeId prev;
- size_t prev_in_genome = std::numeric_limits<std::size_t>::max();
- size_t prev_in_path = std::numeric_limits<std::size_t>::max();
- MappingRange prev_range;
- for (int i = 0; i < (int) path.Size(); i++) {
- if (genome_spelled_.find(path.At(i)) != genome_spelled_.end()) {
- size_t cur_in_genome = genome_spelled_[path.At(i)];
- MappingRange cur_range = *gp_.edge_pos.GetEdgePositions(path.At(i), "fxd0").begin();
- if (prev_in_genome != std::numeric_limits<std::size_t>::max()) {
- if (cur_in_genome == prev_in_genome + 1) {
- int dist_in_genome = (int) cur_range.initial_range.start_pos - (int) prev_range.initial_range.end_pos;
- int dist_in_path = (int) path.LengthAt(prev_in_path) - (int) path.LengthAt(i) + (int) cur_range.mapped_range.start_pos - (int) prev_range.mapped_range.end_pos;
- DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
- DEBUG("Gap in genome / gap in path: " << dist_in_genome << " / " << dist_in_path);
- if (size_t(abs(dist_in_genome - dist_in_path)) > absolute_max_gap_ && (dist_in_genome * (1 + relative_max_gap_) < dist_in_path || dist_in_path * (1 + relative_max_gap_) < dist_in_genome)) {
-
- res.wrong_gap_size ++;
- }
- } else {
- if (path.At(i) != circular_edge_ && path.At(prev_in_path) != circular_edge_)
- res.misassemblies++;
- else
- INFO("Skipping fake(circular) misassembly");
- }
- }
- res.mapped_length += cur_range.mapped_range.size();
- prev = path.At(i);
- prev_in_genome = cur_in_genome;
- prev_range = cur_range;
- prev_in_path = i;
- }
- }
- if (prev_in_path != std::numeric_limits<std::size_t>::max())
- DEBUG("Edge " << prev.int_id() << " position in genome ordering: " << prev_in_genome);
- return res;
-}
-void GenomeConsistenceChecker::RefillPos() {
- RefillPos("0");
- RefillPos("1");
-}
-
-
-void GenomeConsistenceChecker::RefillPos(const string &strand) {
- for (auto e: storage_) {
- RefillPos(strand, e);
- }
-}
-
-void GenomeConsistenceChecker::FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const {
- vector<MappingRange> to_process (old_mappings.begin(), old_mappings.end());
- sort(to_process.begin(), to_process.end(), [](const MappingRange & a, const MappingRange & b) -> bool
- {
- return a.mapped_range.start_pos < b.mapped_range.start_pos;
- } );
- size_t sz = to_process.size();
-//maximum-weight path in the directed graph of mappings
- TRACE("constructing mapping graph with " << sz << " vertices");
- vector<vector<size_t>> consecutive_mappings(sz);
- for(size_t i = 0; i < sz; i++) {
- for (size_t j = i + 1; j < sz; j++) {
- if (consequent(to_process[i], to_process[j])) {
- consecutive_mappings[i].push_back(j);
- } else {
- if (to_process[j].mapped_range.start_pos > to_process[i].mapped_range.end_pos + absolute_max_gap_) {
- break;
- }
- }
- }
- }
- vector<size_t> scores(sz), prev(sz);
- for(size_t i = 0; i < sz; i++) {
- scores[i] = to_process[i].initial_range.size();
- prev[i] = std::numeric_limits<std::size_t>::max();
- }
- for(size_t i = 0; i < sz; i++) {
- for (size_t j = 0; j < consecutive_mappings[i].size(); j++) {
- TRACE(consecutive_mappings[i][j]);
- if (scores[consecutive_mappings[i][j]] < scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size()) {
- scores[consecutive_mappings[i][j]] = scores[i] + to_process[consecutive_mappings[i][j]].initial_range.size();
- prev[consecutive_mappings[i][j]] = i;
- }
- }
- }
- size_t cur_max = 0;
- size_t cur_i = 0;
- for(size_t i = 0; i < sz; i++) {
- if (scores[i] > cur_max) {
- cur_max = scores[i];
- cur_i = i;
- }
- }
- used_mappings.clear();
- while (cur_i != std::numeric_limits<std::size_t>::max()) {
- used_mappings.push_back(to_process[cur_i]);
- cur_i = prev[cur_i];
- }
- reverse(used_mappings.begin(), used_mappings.end());
-};
-
-void GenomeConsistenceChecker::RefillPos(const string &strand, const EdgeId &e) {
- set<MappingRange> old_mappings = gp_.edge_pos.GetEdgePositions(e, strand);
- TRACE("old mappings sz " << old_mappings.size() );
- size_t total_mapped = 0;
- for (auto mp:old_mappings) {
- total_mapped += mp.initial_range.size();
- }
- if (total_mapped > (double) gp_.g.length(e) * 1.5) {
- INFO ("Edge " << gp_.g.int_id(e) << "is not unique, excluding");
- excluded_unique_.insert(e);
- return;
- }
-//TODO: support non-unique edges;
- if (total_mapped < (double) gp_.g.length(e) * 0.5) {
- DEBUG ("Edge " << gp_.g.int_id(e) << "is not mapped on strand "<< strand <<", not used");
- return;
- }
- TRACE(total_mapped << " " << gp_.g.length(e));
- string new_strand = "fxd" + strand;
- vector<MappingRange> used_mappings;
- FindBestRangeSequence(old_mappings, used_mappings);
-
- size_t cur_i = 0;
- MappingRange new_mapping;
- new_mapping = used_mappings[cur_i];
- size_t used_mapped = new_mapping.initial_range.size();
- TRACE ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e));
- TRACE ("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
- << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
- while (cur_i < used_mappings.size() - 1) {
- cur_i ++;
- used_mapped += used_mappings[cur_i].initial_range.size();
- new_mapping = new_mapping.Merge(used_mappings[cur_i]);
- TRACE("new_mapping mp_range "<< new_mapping.mapped_range.start_pos << " - " << new_mapping.mapped_range.end_pos
- << " init_range " << new_mapping.initial_range.start_pos << " - " << new_mapping.initial_range.end_pos );
- }
-//used less than 0.9 of the aligned length
- if (total_mapped * 10 >= used_mapped * 10 + gp_.g.length(e)) {
- INFO ("Edge " << gp_.g.int_id(e) << " length "<< gp_.g.length(e) << "is potentially misassembled! mappings: ");
- for (auto mp:old_mappings) {
- INFO("mp_range "<< mp.mapped_range.start_pos << " - " << mp.mapped_range.end_pos << " init_range " << mp.initial_range.start_pos << " - " << mp.initial_range.end_pos );
- if (mp.initial_range.start_pos < absolute_max_gap_) {
- INFO ("Fake(linear order) misassembly on edge "<< e.int_id());
- if (strand == "0") {
- circular_edge_ = e;
- }
- }
- }
-
- }
- gp_.edge_pos.AddEdgePosition(e, new_strand, new_mapping);
-}
-
-
-
-}
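
For context on FindBestRangeSequence above: it links compatible mapping ranges into a directed graph and extracts a maximum-weight chain with an O(n^2) dynamic program (scores[], prev[], backtracking). The sketch below reproduces the same scheme on a hypothetical Interval type with a simplified no-overlap compatibility rule instead of the MappingRange/consequent() pair used in the real code.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <limits>
    #include <vector>

    // Hypothetical stand-in for MappingRange: one aligned block, weighted by its length.
    struct Interval {
        std::size_t start, end;                        // coordinates, end exclusive
        std::size_t weight() const { return end - start; }
    };

    // Simplified compatibility rule: b must start at or after a ends.
    // The real code uses consequent(), which also tolerates small gaps/overlaps.
    static bool Compatible(const Interval &a, const Interval &b) {
        return b.start >= a.end;
    }

    // Maximum-weight chain of pairwise compatible intervals, mirroring the
    // scores[]/prev[] dynamic program with backtracking in FindBestRangeSequence.
    std::vector<Interval> BestChain(std::vector<Interval> ivs) {
        if (ivs.empty()) return {};
        const std::size_t NONE = std::numeric_limits<std::size_t>::max();
        std::sort(ivs.begin(), ivs.end(),
                  [](const Interval &a, const Interval &b) { return a.start < b.start; });
        const std::size_t n = ivs.size();
        std::vector<std::size_t> score(n), prev(n, NONE);
        for (std::size_t i = 0; i < n; ++i)
            score[i] = ivs[i].weight();
        for (std::size_t i = 0; i < n; ++i)
            for (std::size_t j = i + 1; j < n; ++j)
                if (Compatible(ivs[i], ivs[j]) && score[i] + ivs[j].weight() > score[j]) {
                    score[j] = score[i] + ivs[j].weight();
                    prev[j] = i;
                }
        // Backtrack from the highest-scoring interval.
        std::size_t best = 0;
        for (std::size_t i = 1; i < n; ++i)
            if (score[i] > score[best]) best = i;
        std::vector<Interval> chain;
        for (std::size_t cur = best; cur != NONE; cur = prev[cur])
            chain.push_back(ivs[cur]);
        std::reverse(chain.begin(), chain.end());
        return chain;
    }

    int main() {
        for (const auto &iv : BestChain({{0, 50}, {40, 60}, {60, 200}, {10, 30}}))
            std::cout << iv.start << "-" << iv.end << "\n";   // prints: 0-50 then 60-200
    }
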
diff --git a/src/modules/algorithms/genome_consistance_checker.hpp b/src/modules/algorithms/genome_consistance_checker.hpp
deleted file mode 100644
index 7c106f3..0000000
--- a/src/modules/algorithms/genome_consistance_checker.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#pragma once
-#include "visualization/graph_labeler.hpp"
-#include "assembly_graph/handlers/edges_position_handler.hpp"
-#include "assembly_graph/paths/mapping_path.hpp"
-#include "data_structures/sequence/sequence.hpp"
-#include "pipeline/graph_pack.hpp"
-#include "visualization/position_filler.hpp"
-#include "assembly_graph/paths/bidirectional_path.hpp"
-#include "assembly_graph/graph_support/scaff_supplementary.hpp"
-
-namespace debruijn_graph {
-
-
-using path_extend::BidirectionalPath;
-using path_extend::ScaffoldingUniqueEdgeStorage;
-
-struct PathScore{
- size_t misassemblies;
- size_t wrong_gap_size;
- size_t mapped_length;
- PathScore(size_t m, size_t w, size_t ml): misassemblies(m), wrong_gap_size(w), mapped_length(ml) {}
-};
-class GenomeConsistenceChecker {
-
-private:
- const conj_graph_pack &gp_;
- const Graph &graph_;
- //EdgesPositionHandler<Graph> &position_handler_;
- Sequence genome_;
- ScaffoldingUniqueEdgeStorage storage_;
- size_t absolute_max_gap_;
- double relative_max_gap_;
- set<EdgeId> excluded_unique_;
- EdgeId circular_edge_;
-//map from unique edges to their order in genome spelling;
- mutable map<EdgeId, size_t> genome_spelled_;
- bool consequent(const Range &mr1, const Range &mr2) const;
- bool consequent(const MappingRange &mr1, const MappingRange &mr2) const ;
-
- PathScore CountMisassembliesWithStrand(const BidirectionalPath &path, const string strand) const;
-//constructs the longest sequence of consecutive ranges, stores the result in used_mappings
- void FindBestRangeSequence(const set<MappingRange>& old_mappings, vector<MappingRange>& used_mappings) const;
-//Refills genomic positions, merging alignments separated by small gaps
- void RefillPos();
- void RefillPos(const string &strand);
- void RefillPos(const string &strand, const EdgeId &e);
-DECL_LOGGER("GenomeConsistenceChecker");
-
-
-public:
- GenomeConsistenceChecker(const conj_graph_pack &gp, ScaffoldingUniqueEdgeStorage &storage, size_t max_gap, double relative_max_gap /*= 0.2*/) : gp_(gp),
- graph_(gp.g), /*position_handler_(gp.edge_pos),*/ genome_(gp.genome.GetSequence()), storage_(storage),
- absolute_max_gap_(max_gap), relative_max_gap_(relative_max_gap), excluded_unique_(), circular_edge_() {
- if (!gp.edge_pos.IsAttached()) {
- gp.edge_pos.Attach();
- }
- gp.edge_pos.clear();
- FillPos(gp_, gp_.genome.GetSequence(), "0");
- FillPos(gp_, !gp_.genome.GetSequence(), "1");
- RefillPos();
- }
- PathScore CountMisassemblies(const BidirectionalPath &path) const;
-//spells genome in language of long unique edges from storage;
- void SpellGenome();
-
-};
-
-
-}
diff --git a/src/modules/algorithms/graph_construction.hpp b/src/modules/algorithms/graph_construction.hpp
deleted file mode 100644
index d7034e6..0000000
--- a/src/modules/algorithms/graph_construction.hpp
+++ /dev/null
@@ -1,180 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * graph_construction.hpp
- *
- * Created on: Aug 12, 2011
- * Author: sergey
- */
-#pragma once
-
-#include "pipeline/graph_pack.hpp"
-
-#include "io/reads_io/io_helper.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-
-#include "data_structures/debruijn_graph/debruijn_graph_constructor.hpp"
-#include "data_structures/debruijn_graph/early_simplification.hpp"
-
-#include "dev_support/perfcounter.hpp"
-#include "io/dataset_support/read_converter.hpp"
-
-#include "assembly_graph/handlers/edges_position_handler.hpp"
-#include "assembly_graph/graph_support/detail_coverage.hpp"
-#include "data_structures/indices/storing_traits.hpp"
-#include "data_structures/indices/edge_index_builders.hpp"
-#include "dev_support/openmp_wrapper.h"
-
-namespace debruijn_graph {
-
-template<class StoringType>
-struct CoverageCollector {
-};
-
-template<>
-struct CoverageCollector<SimpleStoring> {
- template<class Info>
- static void CollectCoverage(Info edge_info) {
- edge_info.edge_id->IncCoverage(edge_info.count);
- }
-};
-
-template<>
-struct CoverageCollector<InvertableStoring> {
- template<class Info>
- static void CollectCoverage(Info edge_info) {
- edge_info.edge_id->IncCoverage(edge_info.count);
- edge_info.edge_id->conjugate()->IncCoverage(edge_info.count);
- }
-};
-
-
-template<class Index>
-void FillCoverageFromIndex(const Index &index) {
- for (auto I = index.value_cbegin(), E = index.value_cend();
- I != E; ++I) {
- const auto& edge_info = *I;
- VERIFY(edge_info.offset != -1u);
-// VERIFY(edge_info.edge_id.get() != NULL);
- if(edge_info.offset != -1u) {
- CoverageCollector<typename Index::storing_type>::CollectCoverage(edge_info);
- }
- }
- DEBUG("Coverage counted");
-}
-
-template<class Graph, class Readers, class Index>
-size_t ConstructGraphUsingOldIndex(Readers& streams, Graph& g,
- Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- INFO("Constructing DeBruijn graph");
-
- TRACE("Filling indices");
- size_t rl = 0;
- VERIFY_MSG(streams.size(), "No input streams specified");
-
- TRACE("... in parallel");
- typedef typename Index::InnerIndexT InnerIndex;
- typedef typename EdgeIndexHelper<InnerIndex>::CoverageFillingEdgeIndexBuilderT IndexBuilder;
- InnerIndex& debruijn = index.inner_index();
- //fixme hack
- rl = IndexBuilder().BuildIndexFromStream(debruijn, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream));
-
- VERIFY(g.k() + 1 == debruijn.k());
- // FIXME: output_dir here is damn ugly!
-
- TRACE("Filled indices");
-
- INFO("Condensing graph");
- DeBruijnGraphConstructor<Graph, InnerIndex> g_c(g, debruijn);
- TRACE("Constructor ok");
- VERIFY(!index.IsAttached());
- index.Attach();
- g_c.ConstructGraph(100, 10000, 1.2); // TODO: move magic constants to config
- INFO("Graph condensed");
-
- return rl;
-}
-
-template<class ExtensionIndex>
-void EarlyClipTips(size_t k, const config::debruijn_config::construction& params, size_t rl, ExtensionIndex& ext) {
- if (params.early_tc.enable) {
- size_t length_bound = rl - k;
- if (params.early_tc.length_bound)
- length_bound = params.early_tc.length_bound.get();
- AlternativeEarlyTipClipper(ext, length_bound).ClipTips();
- }
-}
-
-#include "data_structures/indices/kmer_extension_index_builder.hpp"
-
-template<class Graph, class Read, class Index>
-ReadStatistics ConstructGraphUsingExtentionIndex(const config::debruijn_config::construction params,
- io::ReadStreamList<Read>& streams, Graph& g,
- Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- size_t k = g.k();
- INFO("Constructing DeBruijn graph for k=" << k);
-
- TRACE("Filling indices");
- VERIFY_MSG(streams.size(), "No input streams specified");
-
- TRACE("... in parallel");
- // FIXME: output_dir here is damn ugly!
- typedef DeBruijnExtensionIndex<> ExtensionIndex;
- typedef typename ExtensionIndexHelper<ExtensionIndex>::DeBruijnExtensionIndexBuilderT ExtensionIndexBuilder;
- ExtensionIndex ext((unsigned) k, index.inner_index().workdir());
-
- //fixme hack
- ReadStatistics stats = ExtensionIndexBuilder().BuildExtensionIndexFromStream(ext, streams, (contigs_stream == 0) ? 0 : &(*contigs_stream), params.read_buffer_size);
-
- EarlyClipTips(k, params, stats.max_read_length_, ext);
-
- INFO("Condensing graph");
- VERIFY(!index.IsAttached());
- DeBruijnGraphExtentionConstructor<Graph> g_c(g, ext);
- g_c.ConstructGraph(100, 10000, 1.2, params.keep_perfect_loops);//TODO move these parameters to config
-
- INFO("Building index with from graph")
- //todo pass buffer size
- index.Refill();
- index.Attach();
-
- return stats;
-}
-
-template<class Graph, class Index, class Streams>
-ReadStatistics ConstructGraph(const config::debruijn_config::construction &params,
- Streams& streams, Graph& g,
- Index& index, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- if (params.con_mode == config::construction_mode::extention) {
- return ConstructGraphUsingExtentionIndex(params, streams, g, index, contigs_stream);
-// } else if(params.con_mode == construction_mode::con_old){
-// return ConstructGraphUsingOldIndex(k, streams, g, index, contigs_stream);
- } else {
- INFO("Invalid construction mode")
- VERIFY(false);
- return {0,0,0};
- }
-}
-
-template<class Graph, class Index, class Streams>
-ReadStatistics ConstructGraphWithCoverage(const config::debruijn_config::construction &params,
- Streams& streams, Graph& g,
- Index& index, FlankingCoverage<Graph>& flanking_cov,
- io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- ReadStatistics rs = ConstructGraph(params, streams, g, index, contigs_stream);
-
- typedef typename Index::InnerIndex InnerIndex;
- typedef typename EdgeIndexHelper<InnerIndex>::CoverageAndGraphPositionFillingIndexBuilderT IndexBuilder;
- INFO("Filling coverage index")
- IndexBuilder().ParallelFillCoverage(index.inner_index(), streams);
- INFO("Filling coverage and flanking coverage from index");
- FillCoverageAndFlanking(index.inner_index(), g, flanking_cov);
- return rs;
-}
-
-}
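
The CoverageCollector specializations in the removed header are a small tag-dispatch pattern: the index's storing policy decides whether a k-mer's count is added to one edge only (SimpleStoring) or to the edge and its conjugate (InvertableStoring). A minimal standalone illustration of the same pattern, with toy Edge and EdgeInfo types rather than the SPAdes index API:

    #include <iostream>

    // Toy edge: keeps its own coverage and a pointer to its conjugate twin.
    struct Edge {
        long coverage = 0;
        Edge *conjugate = nullptr;
        void IncCoverage(long c) { coverage += c; }
    };

    struct EdgeInfo { Edge *edge; long count; };

    // Policy tags, mirroring SimpleStoring / InvertableStoring.
    struct SimpleStoring {};
    struct InvertableStoring {};

    template <class StoringPolicy> struct CoverageCollector;

    template <> struct CoverageCollector<SimpleStoring> {
        static void Collect(const EdgeInfo &info) { info.edge->IncCoverage(info.count); }
    };

    template <> struct CoverageCollector<InvertableStoring> {
        static void Collect(const EdgeInfo &info) {
            info.edge->IncCoverage(info.count);
            info.edge->conjugate->IncCoverage(info.count);  // keep the twin in sync
        }
    };

    int main() {
        Edge fwd, rc;
        fwd.conjugate = &rc;
        rc.conjugate = &fwd;
        CoverageCollector<InvertableStoring>::Collect({&fwd, 7});
        std::cout << fwd.coverage << " " << rc.coverage << "\n";  // prints: 7 7
    }
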
diff --git a/src/modules/algorithms/graph_read_correction.hpp b/src/modules/algorithms/graph_read_correction.hpp
deleted file mode 100644
index 311891d..0000000
--- a/src/modules/algorithms/graph_read_correction.hpp
+++ /dev/null
@@ -1,187 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "assembly_graph/paths/path_utils.hpp"
-#include "assembly_graph/paths/mapping_path.hpp"
-#include "assembly_graph/paths/path_finders.hpp"
-#include "assembly_graph/paths/path_processor.hpp"
-#include "io/reads_io/modifying_reader_wrapper.hpp"
-#include "assembly_graph/graph_core/order_and_law.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-
-namespace debruijn_graph {
-
-template<class gp_t>
-class TipsProjector {
- typedef typename gp_t::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
-
- gp_t& gp_;
-
- const omnigraph::UniquePathFinder<Graph> unique_path_finder_;
-
- optional<EdgeId> UniqueAlternativeEdge(EdgeId tip, bool outgoing_tip) {
- vector<EdgeId> edges;
- if (outgoing_tip) {
- push_back_all(edges, gp_.g.OutgoingEdges(gp_.g.EdgeStart(tip)));
- } else {
- push_back_all(edges, gp_.g.IncomingEdges(gp_.g.EdgeEnd(tip)));
- }
- restricted::set<EdgeId> edges_set(edges.begin(), edges.end());
- edges_set.erase(tip);
- if (edges_set.size() == 1)
- return optional < EdgeId > (*edges_set.begin());
- else
- return boost::none;
- }
-
- vector<EdgeId> UniqueAlternativePath(EdgeId tip, bool outgoing_tip) {
- optional<EdgeId> alt_edge = UniqueAlternativeEdge(tip, outgoing_tip);
- if (alt_edge) {
- if (outgoing_tip) {
- return unique_path_finder_.UniquePathForward(*alt_edge);
- } else {
- return unique_path_finder_.UniquePathBackward(*alt_edge);
- }
- }
- return vector<EdgeId>();
- }
-
- void AlignAndProject(const Sequence& tip_seq, const Sequence& alt_seq,
- bool outgoing_tip) {
- //todo refactor
- Sequence aligned_tip = tip_seq;
- Sequence aligned_alt = alt_seq;
- if (outgoing_tip) {
- if (tip_seq.size() >= alt_seq.size()) {
- aligned_tip = tip_seq.Subseq(0, alt_seq.size());
- } else {
- aligned_alt = alt_seq.Subseq(0, tip_seq.size());
- }
- } else {
- if (tip_seq.size() >= alt_seq.size()) {
- aligned_tip = tip_seq.Subseq(tip_seq.size() - alt_seq.size());
- } else {
- aligned_alt = alt_seq.Subseq(alt_seq.size() - tip_seq.size());
- }
- }
-
- INFO(
- "Remapping " << aligned_tip.size()
- << " kmers of aligned_tip to aligned_alt");
- gp_.kmer_mapper.RemapKmers(aligned_tip, aligned_alt);
- }
-
-public:
- TipsProjector(gp_t& gp) :
- gp_(gp), unique_path_finder_(gp.g) {
-
- }
-
- void ProjectTip(EdgeId tip) {
- TRACE("Trying to project tip " << gp_.g.str(tip));
- bool outgoing_tip = gp_.g.IsDeadEnd(gp_.g.EdgeEnd(tip));
- Sequence tip_seq = gp_.g.EdgeNucls(tip);
- vector<EdgeId> alt_path = UniqueAlternativePath(tip, outgoing_tip);
- if (alt_path.empty()) {
- TRACE(
- "Failed to find unique alt path for tip " << gp_.g.str(tip)
- << ". Wasn't projected!!!");
- } else {
- Sequence alt_seq = MergeSequences(gp_.g, alt_path);
- if (tip_seq.size() > alt_seq.size()) {
- TRACE(
- "Can't fully project tip " << gp_.g.str(tip)
- << " with seq length " << tip_seq.size()
- << " because alt path length is "
- << alt_seq.size()
- << ". Trying to project partially");
- }
- AlignAndProject(tip_seq, alt_seq, outgoing_tip);
- AlignAndProject(!tip_seq, !alt_seq, !outgoing_tip);
- TRACE("Tip projected");
- }
- }
-private:
- DECL_LOGGER("TipsProjector")
- ;
-};
-
-//todo improve logging
-template<class Graph, class Mapper>
-class GraphReadCorrector: public io::SequenceModifier {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& graph_;
- const Mapper mapper_;
- const MappingPathFixer<Graph> path_fixer_;
-
-public:
- /*virtual*/
- Sequence Modify(const Sequence& s) {
-// if(s < !s)
-// return !Refine(!s);
- omnigraph::MappingPath<EdgeId> mapping_path = mapper_.MapSequence(s);
-
- if (mapping_path.size() == 0 || s.size() < graph_.k() + 1
- || mapping_path.front().second.initial_range.start_pos != 0
- || mapping_path.back().second.initial_range.end_pos
- != s.size() - graph_.k()) {
- //todo reduce concat unmapped beginning and end in future???
- TRACE(
- "Won't fix because wasn't mapped or start/end fell on unprojected tip/erroneous connection");
-// TRACE(
-// "For sequence of length " << s.size()
-// << " returning empty sequence");
- return s;
-// return Sequence();
- }
-
- Path<EdgeId> path = path_fixer_.TryFixPath(mapping_path.path());
-// TRACE("Mapped sequence to path " << graph_.str(path.sequence()));
-
- if (!path_fixer_.CheckContiguous(path.sequence())) {
- TRACE("Even fixed path wasn't contiguous");
- return s;
- } else {
- TRACE("Fixed path is contiguous");
- Sequence answer = PathSequence(graph_, path);
-// if (answer != s) {
-// if (answer.size() < 1000) {
-// TRACE(
-// "Initial sequence modified, edit distance= "
-// << EditDistance(answer, s));
-// } else {
-// TRACE("Sequence too large, won't count edit distance");
-// }
-// }
- return answer;
- }
-
-// else {
-// TRACE("Initial sequence unmodified!");
-// }
- }
-
- GraphReadCorrector(const Graph& graph, const Mapper& mapper) :
- graph_(graph), mapper_(mapper), path_fixer_(graph) {
- }
-
-private:
- DECL_LOGGER("ContigRefiner");
-};
-
-template<class Graph, class Mapper>
-shared_ptr<GraphReadCorrector<Graph, Mapper>> GraphReadCorrectorInstance(
- const Graph& graph, const Mapper& mapper) {
- return std::make_shared<GraphReadCorrector<Graph, Mapper>>(graph, mapper);
-}
-
-}
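
AlignAndProject in the removed TipsProjector trims the tip and its alternative path to a common length before remapping k-mers: an outgoing tip keeps the shared prefix, an incoming tip the shared suffix. A self-contained sketch of just that trimming step on plain std::string (the real code works on the Sequence class and then calls kmer_mapper.RemapKmers):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <utility>

    // Trim two sequences to the length of the shorter one.
    // For an outgoing tip the prefixes are compared, for an incoming tip the suffixes.
    std::pair<std::string, std::string>
    TrimForProjection(const std::string &tip, const std::string &alt, bool outgoing_tip) {
        const auto len = std::min(tip.size(), alt.size());
        if (outgoing_tip)
            return {tip.substr(0, len), alt.substr(0, len)};
        return {tip.substr(tip.size() - len), alt.substr(alt.size() - len)};
    }

    int main() {
        auto fwd = TrimForProjection("ACGTTCG", "ACGTA", /*outgoing_tip=*/true);
        std::cout << fwd.first << " -> " << fwd.second << "\n";   // ACGTT -> ACGTA
        auto bwd = TrimForProjection("TAACG", "GGGGTTACG", /*outgoing_tip=*/false);
        std::cout << bwd.first << " -> " << bwd.second << "\n";   // TAACG -> TTACG
    }
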
diff --git a/src/modules/algorithms/mismatch_shall_not_pass.hpp b/src/modules/algorithms/mismatch_shall_not_pass.hpp
deleted file mode 100644
index 0451adb..0000000
--- a/src/modules/algorithms/mismatch_shall_not_pass.hpp
+++ /dev/null
@@ -1,339 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "algorithms/simplification/compressor.hpp"
-#include "assembly_graph/handlers/id_track_handler.hpp"
-#include "dev_support/logger/logger.hpp"
-
-#include "io/reads_io/read_stream_vector.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-
-#include "pipeline/config_struct.hpp"
-
-namespace debruijn_graph {
-
-namespace mismatches {
-struct NuclCount {
- size_t counts_[4];
-
- NuclCount() {
- memset(counts_, 0, sizeof(counts_));
- }
-
- size_t &operator[](size_t nucl) {
- return counts_[nucl];
- }
-
- NuclCount &operator+=(const NuclCount &other) {
- counts_[0] += other.counts_[0];
- counts_[1] += other.counts_[1];
- counts_[2] += other.counts_[2];
- counts_[3] += other.counts_[3];
- return *this;
- }
-};
-
-struct MismatchEdgeInfo {
- NuclCount operator[](size_t i) const {
- auto it = info_.find(i);
- if (it == info_.end())
- return NuclCount();
- else
- return it->second;
- }
-
- void operator+=(const MismatchEdgeInfo &other) {
- for (auto it = other.info_.begin(); it != other.info_.end(); ++it) {
- info_[it->first] += it->second;
- }
- }
-
- void IncIfContains(size_t position, size_t nucl) {
- auto it = info_.find(position);
- if (it != info_.end()) {
- it->second[nucl]++;
- }
- }
-
- void AddPosition(size_t position) {
- info_[position]; //creates an entry with a default value if the map did not already contain this key
- }
-
-public:
- map<size_t, NuclCount> info_;
-};
-
-template<typename EdgeId>
-class MismatchStatistics {
-private:
- typedef typename map<EdgeId, MismatchEdgeInfo>::const_iterator const_iterator;
- map<EdgeId, MismatchEdgeInfo> statistics_;
-
- template<class graph_pack>
- void CollectPotensialMismatches(const graph_pack &gp) {
- auto &kmer_mapper = gp.kmer_mapper;
- for (auto it = kmer_mapper.begin(); it != kmer_mapper.end(); ++it) {
- // Kmer mapper iterator dereferences to pair (KMer, KMer), not to the reference!
- const auto mentry = *it;
- const runtime_k::RtSeq &from = mentry.first;
- const runtime_k::RtSeq &to = mentry.second;
- size_t cnt = 0;
- size_t cnt_arr[4];
- for (size_t i = 0; i < 4; i++)
- cnt_arr[i] = 0;
- for (size_t i = 0; i < from.size(); i++) {
- if (from[i] != to[i]) {
- cnt++;
- cnt_arr[(i * 4) / from.size()]++;
- }
- }
- //last two conditions - to avoid excessive indels.
- //if two thirds of the nucleotides in the first/last quarter are mismatches, the mapping is considered erroneous
-
- if (cnt >= 1 && cnt <= from.size() / 3 && cnt_arr[0] <= from.size() / 6 &&
- cnt_arr[3] <= from.size() / 6) {
- for (size_t i = 0; i < from.size(); i++) {
- if (from[i] != to[i] && gp.index.contains(to)) {
- pair<EdgeId, size_t> position = gp.index.get(to);
- statistics_[position.first].AddPosition(position.second + i);
- }
- }
- }
- }
- }
-
- void operator+=(const MismatchStatistics<EdgeId> &other) {
- for (auto it = other.statistics_.begin(); it != other.statistics_.end(); ++it) {
- statistics_[it->first] += it->second;
- }
- }
-
-public:
- template<class graph_pack>
- MismatchStatistics(const graph_pack &gp) {
- CollectPotensialMismatches(gp);
- }
-
- const_iterator begin() const {
- return statistics_.begin();
- }
-
- const_iterator end() const {
- return statistics_.end();
- }
-
- const_iterator find(const EdgeId &edge) const {
- return statistics_.find(edge);
- }
-
- template<class graph_pack, class read_type>
- void Count(io::ReadStream<read_type> &stream, const graph_pack &gp) {
- stream.reset();
- DEBUG("count started");
- auto sm = MapperInstance(gp);
- DEBUG("seq mapper created");
- while (!stream.eof()) {
- read_type read;
- stream >> read;
- const Sequence &s_read = read.sequence();
- omnigraph::MappingPath<EdgeId> path = sm->MapSequence(s_read);
- TRACE("read mapped");
- if (path.size() == 1 && path[0].second.initial_range.size() == path[0].second.mapped_range.size()) {
- Range initial_range = path[0].second.initial_range;
- Range mapped_range = path[0].second.mapped_range;
- const Sequence &s_edge = gp.g.EdgeNucls(path[0].first);
- size_t len = initial_range.size() + gp.g.k();
- size_t cnt = 0;
- for (size_t i = 0; i < len; i++) {
- if (s_read[initial_range.start_pos + i] != s_edge[mapped_range.start_pos + i]) {
- cnt++;
- }
- }
- if (cnt <= gp.g.k() / 3) {
- TRACE("statistics changing");
- auto it = statistics_.find(path[0].first);
- if (it == statistics_.end()) {
- // if (gp.g.length(path[0].first) < 4000)
- // WARN ("id "<< gp.g.length(path[0].first)<<" " << len);
- continue;
- }
- for (size_t i = 0; i < len; i++) {
- size_t nucl_code = s_read[initial_range.start_pos + i];
- it->second.IncIfContains(mapped_range.start_pos + i, nucl_code);
- }
- }
- }
- }
- }
-
- template<class graph_pack, class read_type>
- void ParallelCount(io::ReadStreamList<read_type> &streams, const graph_pack &gp) {
- size_t nthreads = streams.size();
- std::vector<MismatchStatistics<EdgeId> *> statistics(nthreads);
-#pragma omp parallel for num_threads(nthreads) shared(streams, statistics)
- for (size_t i = 0; i < nthreads; ++i) {
- statistics[i] = new MismatchStatistics<EdgeId>(*this);
- DEBUG("statistics created thread " << i);
- statistics[i]->Count(streams[i], gp);
- DEBUG("count finished thread " << i);
- }
-
- INFO("Finished collecting potential mismatches positions");
- for (size_t i = 0; i < statistics.size(); i++) {
- *this += *statistics[i];
- delete statistics[i];
- }
- }
-};
-}
-
-template<class graph_pack, class read_type>
-class MismatchShallNotPass {
-private:
- typedef typename graph_pack::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- graph_pack &gp_;
- double relative_threshold_;
-
- EdgeId CorrectNucl(EdgeId edge, size_t position, char nucl) {
- VERIFY(position >= gp_.g.k());
- if (position + 1 < gp_.g.length(edge)) {
- edge = gp_.g.SplitEdge(edge, position + 1).first;
- }
- EdgeId mismatch = edge;
- if (position > gp_.g.k()) {
- auto tmp = gp_.g.SplitEdge(edge, position - gp_.g.k());
- edge = tmp.first;
- mismatch = tmp.second;
- }
- const Sequence &s_mm = gp_.g.EdgeNucls(mismatch);
- Sequence correct = s_mm.Subseq(0, gp_.g.k()) + Sequence(string(1, nucl)) +
- s_mm.Subseq(gp_.g.k() + 1, gp_.g.k() * 2 + 1);
- if (!gp_.kmer_mapper.CheckCanRemap(s_mm, correct)) {
- return edge;
- }
- VERIFY(nucl != s_mm[gp_.g.k()]);
- EdgeId correct_edge = gp_.g.AddEdge(gp_.g.EdgeStart(mismatch), gp_.g.EdgeEnd(mismatch), correct);
- if (position > gp_.g.k()) {
- gp_.g.GlueEdges(mismatch, correct_edge);
- return edge;
- } else {
- return gp_.g.GlueEdges(mismatch, correct_edge);
- }
- }
-
- EdgeId CorrectNucls(EdgeId edge, const std::vector<pair<size_t, char>> &mismatches) {
- for (auto it = mismatches.rbegin(); it != mismatches.rend(); ++it) {
- edge = CorrectNucl(edge, it->first, it->second);
- }
- EdgeId tmp = Compressor<Graph>(gp_.g).CompressVertexEdgeId(gp_.g.EdgeEnd(edge));
- if (tmp == EdgeId(0))
- return edge;
- else
- return tmp;
- }
-
- vector<pair<size_t, char>> FindMismatches(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
- vector<pair<size_t, char>> to_correct;
- const Sequence &s_edge = gp_.g.EdgeNucls(edge);
- for (size_t i = gp_.g.k(); i < gp_.g.length(edge); i++) {
- size_t cur_best = 0;
- mismatches::NuclCount nc = statistics[i];
- for (size_t j = 1; j < 4; j++) {
- if (nc[j] > nc[cur_best]) {
- cur_best = j;
- }
- }
- size_t nucl_code = s_edge[i];
- if ((double) nc[cur_best] > relative_threshold_ * (double) nc[nucl_code] + 1.) {
- to_correct.push_back(make_pair(i, cur_best));
- i += gp_.g.k();
- }
-
- }
- return to_correct;
- }
-
- size_t CorrectEdge(EdgeId edge, const mismatches::MismatchEdgeInfo &statistics) {
- vector<pair<size_t, char>> to_correct = FindMismatches(edge, statistics);
- EdgeId new_edge = CorrectNucls(edge, to_correct);
- if (new_edge == EdgeId(0))
- new_edge = edge;
-
- return to_correct.size();
- }
-
- size_t CorrectAllEdges(const mismatches::MismatchStatistics<typename Graph::EdgeId> &statistics) {
- size_t res = 0;
- set<EdgeId> conjugate_fix;
- for (auto it = gp_.g.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (conjugate_fix.find(gp_.g.conjugate(*it)) == conjugate_fix.end()) {
- conjugate_fix.insert(*it);
- }
- }
- for (auto it = conjugate_fix.begin(); it != conjugate_fix.end(); ++it) {
- DEBUG("processing edge" << gp_.g.int_id(*it));
-
- if (statistics.find(*it) != statistics.end()) {
- if (!gp_.g.RelatedVertices(gp_.g.EdgeStart(*it), gp_.g.EdgeEnd(*it)))
- res += CorrectEdge(*it, statistics.find(*it)->second);
- }
- }
- INFO("All edges processed");
- return res;
- }
-
- size_t StopMismatchIteration(io::ReadStream<read_type> &stream) {
- mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
- statistics.Count(stream, gp_);
- return CorrectAllEdges(statistics);
- }
-
- size_t ParallelStopMismatchIteration(io::ReadStreamList<read_type> &streams) {
- mismatches::MismatchStatistics<typename Graph::EdgeId> statistics(gp_);
- statistics.ParallelCount(streams, gp_);
- return CorrectAllEdges(statistics);
- }
-
-public:
- MismatchShallNotPass(graph_pack &gp, double relative_threshold = 1.5) : gp_(gp), relative_threshold_(
- relative_threshold) {
- VERIFY(relative_threshold >= 1);
- }
-
-
- size_t StopAllMismatches(io::ReadStream<read_type> &stream, size_t max_iterations = 1) {
- size_t res = 0;
- while (max_iterations > 0) {
- size_t last = StopMismatchIteration(stream);
- res += last;
- if (last == 0)
- break;
- max_iterations--;
- }
- return res;
- }
-
- size_t ParallelStopAllMismatches(io::ReadStreamList<read_type> &streams, size_t max_iterations = 1) {
- size_t res = 0;
- while (max_iterations > 0) {
- size_t last = ParallelStopMismatchIteration(streams);
- res += last;
- if (last == 0)
- break;
- max_iterations--;
- }
- return res;
- }
-};
-
-}
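
FindMismatches in the removed header takes a per-position vote over the four nucleotide counts and schedules a correction only when the winning nucleotide outweighs the one currently on the edge by relative_threshold (1.5 by default above) plus one pseudo-count. A minimal standalone sketch of that decision rule; the function name and the 0..3 nucleotide encoding are assumptions for illustration:

    #include <array>
    #include <cstddef>
    #include <iostream>

    // Decide whether a position should be corrected and to which nucleotide code.
    // counts[n] is how many reads voted for nucleotide n at this position;
    // ref is the code of the nucleotide currently spelled by the edge.
    // Returns -1 when no correction is warranted.
    int BestCorrection(const std::array<std::size_t, 4> &counts, std::size_t ref,
                       double relative_threshold = 1.5) {
        std::size_t best = 0;
        for (std::size_t n = 1; n < 4; ++n)
            if (counts[n] > counts[best]) best = n;
        // Same rule as in FindMismatches: the winner must dominate the
        // reference nucleotide by the relative threshold plus one pseudo-count.
        if (static_cast<double>(counts[best]) >
            relative_threshold * static_cast<double>(counts[ref]) + 1.0)
            return static_cast<int>(best);
        return -1;
    }

    int main() {
        std::cout << BestCorrection({{2, 30, 1, 0}}, /*ref=*/0) << "\n";  // 1: decisive, correct to C
        std::cout << BestCorrection({{20, 25, 1, 0}}, /*ref=*/0) << "\n"; // -1: not decisive, keep A
    }
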
diff --git a/src/modules/algorithms/path_extend/CMakeLists.txt b/src/modules/algorithms/path_extend/CMakeLists.txt
deleted file mode 100644
index 03b447b..0000000
--- a/src/modules/algorithms/path_extend/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(path_extend CXX)
-
-add_library(path_extend STATIC pe_config_struct.cpp
- scaffolder2015/extension_chooser2015.cpp
- scaffolder2015/scaffold_graph.cpp
- scaffolder2015/scaffold_graph_constructor.cpp
- scaffolder2015/scaffold_graph_visualizer.cpp
- scaffolder2015/connection_condition2015.cpp)
-
-target_link_libraries(path_extend graph_support)
-
diff --git a/src/modules/algorithms/path_extend/extension_chooser.hpp b/src/modules/algorithms/path_extend/extension_chooser.hpp
deleted file mode 100644
index b0a989a..0000000
--- a/src/modules/algorithms/path_extend/extension_chooser.hpp
+++ /dev/null
@@ -1,1555 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * extension.hpp
- *
- * Created on: Mar 5, 2012
- * Author: andrey
- */
-
-#ifndef EXTENSION_HPP_
-#define EXTENSION_HPP_
-
-#include <cfloat>
-#include <iostream>
-#include <fstream>
-#include "weight_counter.hpp"
-#include "pe_utils.hpp"
-#include "next_path_searcher.hpp"
-
-//#include "scaff_supplementary.hpp"
-
-namespace path_extend {
-
-typedef std::multimap<double, EdgeWithDistance> AlternativeContainer;
-
-
-class PathAnalyzer {
-protected:
- const Graph& g_;
-
-public:
- PathAnalyzer(const Graph& g): g_(g) {
- }
-
- void RemoveTrivial(const BidirectionalPath& path, std::set<size_t>& to_exclude, bool exclude_bulges = true) const {
- if (exclude_bulges) {
- ExcludeTrivialWithBulges(path, to_exclude);
- } else {
- ExcludeTrivial(path, to_exclude);
- }
- }
-
-protected:
- virtual int ExcludeTrivial(const BidirectionalPath& path, std::set<size_t>& edges, int from = -1) const {
- int edgeIndex = (from == -1) ? (int) path.Size() - 1 : from;
- if ((int) path.Size() <= from) {
- return edgeIndex;
- }
- VertexId currentVertex = g_.EdgeEnd(path[edgeIndex]);
- while (edgeIndex >= 0 && g_.CheckUniqueIncomingEdge(currentVertex)) {
- EdgeId e = g_.GetUniqueIncomingEdge(currentVertex);
- currentVertex = g_.EdgeStart(e);
-
- edges.insert((size_t) edgeIndex);
- --edgeIndex;
- }
- return edgeIndex;
- }
-
- virtual int ExcludeTrivialWithBulges(const BidirectionalPath& path, std::set<size_t>& edges) const {
-
- if (path.Empty()) {
- return 0;
- }
-
- int lastEdge = (int) path.Size() - 1;
- do {
- lastEdge = ExcludeTrivial(path, edges, lastEdge);
- bool bulge = true;
-
- if (lastEdge >= 0) {
- VertexId v = g_.EdgeEnd(path[lastEdge]);
- VertexId u = g_.EdgeStart(path[lastEdge]);
- auto bulgeCandidates = g_.IncomingEdges(v);
-
- for (const auto& candidate: bulgeCandidates) {
- if (g_.EdgeStart(candidate) != u) {
- bulge = false;
- break;
- }
- }
-
- if (!bulge) {
- break;
- }
- --lastEdge;
- }
- } while (lastEdge >= 0);
-
- return lastEdge;
- }
-
-protected:
- DECL_LOGGER("PathAnalyzer")
-};
-
-
-class PreserveSimplePathsAnalyzer: public PathAnalyzer {
-
-public:
- PreserveSimplePathsAnalyzer(const Graph &g) : PathAnalyzer(g) {
- }
-
- int ExcludeTrivial(const BidirectionalPath& path, std::set<size_t>& edges, int from = -1) const override {
- int edgeIndex = PathAnalyzer::ExcludeTrivial(path, edges, from);
-
- //Preserving simple path
- if (edgeIndex == -1) {
- edges.clear();
- return (from == -1) ? (int) path.Size() - 1 : from;
- }
- return edgeIndex;
- }
-
- int ExcludeTrivialWithBulges(const BidirectionalPath& path, std::set<size_t>& edges) const override {
-
- if (path.Empty()) {
- return 0;
- }
-
- int lastEdge = (int) path.Size() - 1;
- bool has_bulge = false;
- do {
- lastEdge = PathAnalyzer::ExcludeTrivial(path, edges, lastEdge);
-
- if (lastEdge >= 0) {
- VertexId v = g_.EdgeEnd(path[lastEdge]);
- VertexId u = g_.EdgeStart(path[lastEdge]);
- auto bulgeCandidates = g_.IncomingEdges(v);
- has_bulge = true;
-
- for (auto iter = bulgeCandidates.begin(); iter != bulgeCandidates.end(); ++iter) {
- if (g_.EdgeStart(*iter) != u) {
- has_bulge = false;
- break;
- }
- }
-
- --lastEdge;
- }
- } while (lastEdge >= 0);
-
- //Preserving simple path
- if (!has_bulge && lastEdge == -1) {
- edges.clear();
- lastEdge = (int) path.Size() - 1;
- }
-
- return lastEdge;
- }
-
-protected:
- DECL_LOGGER("PathAnalyzer")
-
-};
-
-
-class ExtensionChooserListener {
-
-public:
-
- virtual void ExtensionChosen(double weight) = 0;
-
- virtual void ExtensionChosen(const AlternativeContainer& alts) = 0;
-
- virtual ~ExtensionChooserListener() {
-
- }
-};
-
-
-class ExtensionChooser {
-
-public:
- typedef std::vector<EdgeWithDistance> EdgeContainer;
-
-protected:
- const Graph& g_;
- shared_ptr<WeightCounter> wc_;
- //FIXME memory leak?!
- std::vector<ExtensionChooserListener *> listeners_;
-
- double weight_threshold_;
-
-public:
- ExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc = nullptr, double weight_threshold = -1.):
- g_(g), wc_(wc),
- weight_threshold_(weight_threshold) {
- }
-
- virtual ~ExtensionChooser() {
-
- }
-
- virtual EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const = 0;
-
- bool CheckThreshold(double weight) const {
- return math::ge(weight, weight_threshold_);
- }
-
- void Subscribe(ExtensionChooserListener * listener) {
- listeners_.push_back(listener);
- }
-
- void NotifyAll(double weight) const {
- for (auto listener_ptr : listeners_) {
- listener_ptr->ExtensionChosen(weight);
- }
- }
-
- void NotifyAll(const AlternativeContainer& alts) const {
- for (auto listener_ptr : listeners_) {
- listener_ptr->ExtensionChosen(alts);
- }
- }
-
- bool WeightCounterBased() const {
- return wc_ != nullptr;
- }
-
- const WeightCounter& wc() const {
- VERIFY(wc_);
- return *wc_;
- }
-
-protected:
- bool HasIdealInfo(EdgeId e1, EdgeId e2, size_t dist) const {
- return math::gr(wc_->lib().IdealPairedInfo(e1, e2, (int) dist), 0.);
- }
-
- bool HasIdealInfo(const BidirectionalPath& p, EdgeId e, size_t gap) const {
- for (int i = (int) p.Size() - 1; i >= 0; --i)
- if (HasIdealInfo(p[i], e, gap + p.LengthAt(i)))
- return true;
- return false;
- }
-
-private:
- DECL_LOGGER("ExtensionChooser");
-};
-
-
-class JointExtensionChooser: public ExtensionChooser {
-
-protected:
- shared_ptr<ExtensionChooser> first_;
-
- shared_ptr<ExtensionChooser> second_;
-
-public:
- JointExtensionChooser(const Graph& g, shared_ptr<ExtensionChooser> first, shared_ptr<ExtensionChooser> second): ExtensionChooser(g),
- first_(first), second_(second)
- {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
- EdgeContainer e1 = first_->Filter(path, edges);
- return second_->Filter(path, e1);
- }
-};
-
-
-class TrivialExtensionChooser: public ExtensionChooser {
-
-public:
- TrivialExtensionChooser(Graph& g): ExtensionChooser(g) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& /*path*/, const EdgeContainer& edges) const override {
- if (edges.size() == 1) {
- return edges;
- }
- return EdgeContainer();
- }
-};
-
-
-class TrivialExtensionChooserWithPI: public ExtensionChooser {
-
-public:
- TrivialExtensionChooserWithPI(Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold):
- ExtensionChooser(g, wc, weight_threshold) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
- if (edges.size() == 1) {
- double weight = wc_->CountWeight(path, edges.back().e_, std::set<size_t>());
- NotifyAll(weight);
-
- if (CheckThreshold(weight)) {
- return edges;
- }
- }
- return EdgeContainer();
- }
-};
-
-class ExcludingExtensionChooser: public ExtensionChooser {
- //FIXME what is the logic behind it?
-protected:
- PathAnalyzer analyzer_;
- double prior_coeff_;
-
- AlternativeContainer FindWeights(const BidirectionalPath& path, const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
- AlternativeContainer weights;
- for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
- double weight = wc_->CountWeight(path, iter->e_, to_exclude);
- weights.insert(std::make_pair(weight, *iter));
- DEBUG("Candidate " << g_.int_id(iter->e_) << " weight " << weight << " length " << g_.length(iter->e_));
- }
- NotifyAll(weights);
- return weights;
- }
-
- EdgeContainer FindPossibleEdges(const AlternativeContainer& weights,
- double max_weight) const {
- EdgeContainer top;
- auto possible_edge = weights.lower_bound(max_weight / prior_coeff_);
- for (auto iter = possible_edge; iter != weights.end(); ++iter) {
- top.push_back(iter->second);
- }
- return top;
- }
-
- EdgeContainer FindFilteredEdges(const BidirectionalPath& path,
- const EdgeContainer& edges, const std::set<size_t>& to_exclude) const {
- AlternativeContainer weights = FindWeights(path, edges, to_exclude);
- auto max_weight = (--weights.end())->first;
- EdgeContainer top = FindPossibleEdges(weights, max_weight);
- EdgeContainer result;
- if (top.size() >= 1 && CheckThreshold(max_weight)) {
- result = top;
- }
- return result;
- }
-
-protected:
-
- virtual void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const = 0;
-
-public:
- ExcludingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, PathAnalyzer analyzer, double weight_threshold, double priority) :
- ExtensionChooser(g, wc, weight_threshold), analyzer_(analyzer), prior_coeff_(priority) {
-
- }
-
- virtual EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& edges) const {
- DEBUG("Paired-end extension chooser");
- if (edges.empty()) {
- return edges;
- }
- std::set<size_t> to_exclude;
- analyzer_.RemoveTrivial(path, to_exclude);
- path.Print();
- EdgeContainer result = edges;
- ExcludeEdges(path, result, to_exclude);
- result = FindFilteredEdges(path, result, to_exclude);
- if (result.size() == 1) {
- DEBUG("Paired-end extension chooser helped");
- }
- return result;
- }
-
-private:
- DECL_LOGGER("ExcludingExtensionChooser");
-
-};
-
-class SimpleExtensionChooser: public ExcludingExtensionChooser {
-protected:
- void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const override {
- if (edges.size() < 2) {
- return;
- }
- //excluding based on absence of ideal info
- int index = (int) path.Size() - 1;
- while (index >= 0) {
- if (to_exclude.count(index)) {
- index--;
- continue;
- }
- EdgeId path_edge = path[index];
-
- for (size_t i = 0; i < edges.size(); ++i) {
- if (!HasIdealInfo(path_edge,
- edges.at(i).e_,
- path.LengthAt(index))) {
- to_exclude.insert((size_t) index);
- }
- }
-
- index--;
- }
-
- //excluding based on presence of ambiguous paired info
- map<size_t, unsigned> edge_2_extension_cnt;
- for (size_t i = 0; i < edges.size(); ++i) {
- for (size_t e : wc_->PairInfoExist(path, edges.at(i).e_)) {
- edge_2_extension_cnt[e] += 1;
- }
- }
-
- for (auto e_w_ec : edge_2_extension_cnt) {
- if (e_w_ec.second == edges.size()) {
- to_exclude.insert(e_w_ec.first);
- }
- }
- }
-
-public:
-
- SimpleExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
- ExcludingExtensionChooser(g, wc, PathAnalyzer(g), weight_threshold, priority) {
- }
-
-private:
- DECL_LOGGER("SimpleExtensionChooser");
-};
-
-
-class RNAExtensionChooser: public ExcludingExtensionChooser {
-protected:
- void ExcludeEdges(const BidirectionalPath& /*path*/, const EdgeContainer& /*edges*/, std::set<size_t>& /*to_exclude*/) const override {
- }
-
-public:
-
- RNAExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
- ExcludingExtensionChooser(g, wc, PreserveSimplePathsAnalyzer(g), weight_threshold, priority) {
- }
-
-private:
- DECL_LOGGER("SimpleExtensionChooser");
-};
-
-class LongEdgeExtensionChooser: public ExcludingExtensionChooser {
-protected:
- virtual void ExcludeEdges(const BidirectionalPath& path, const EdgeContainer& edges, std::set<size_t>& to_exclude) const {
- if (edges.size() < 2) {
- return;
- }
- int index = (int) path.Size() - 1;
- while (index >= 0) {
- if (to_exclude.count(index)) {
- index--;
- continue;
- }
- EdgeId path_edge = path[index];
- //FIXME configure!
- if (path.graph().length(path_edge) < 200)
- to_exclude.insert((size_t) index);
- index--;
- }
- }
-public:
- LongEdgeExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc, double weight_threshold, double priority) :
- ExcludingExtensionChooser(g, wc, PathAnalyzer(g), weight_threshold, priority) {
- }
-};
-
-class ScaffoldingExtensionChooser : public ExtensionChooser {
-
-protected:
- typedef ExtensionChooser base;
- double raw_weight_threshold_;
- double cl_weight_threshold_;
- const double is_scatter_coeff_ = 3.0;
-
- void AddInfoFromEdge(const std::vector<int>& distances, const std::vector<double>& weights,
- std::vector<pair<int, double>>& histogram, size_t len_to_path_end) const {
- for (size_t l = 0; l < distances.size(); ++l) {
- //todo commented out condition seems unnecessary and should be library dependent! do we need "max(0" there?
- if (/*distances[l] > max(0, (int) len_to_path_end - int(1000)) && */math::ge(weights[l], raw_weight_threshold_)) {
- histogram.push_back(make_pair(distances[l] - (int) len_to_path_end, weights[l]));
- }
- }
- }
-
- int CountMean(const vector<pair<int, double> >& histogram) const {
- double dist = 0.0;
- double sum = 0.0;
- for (size_t i = 0; i < histogram.size(); ++i) {
- dist += histogram[i].first * histogram[i].second;
- sum += histogram[i].second;
- }
- dist /= sum;
- return (int) round(dist);
- }
-
- void GetDistances(EdgeId e1, EdgeId e2, std::vector<int>& dist,
- std::vector<double>& w) const {
- wc_->lib().CountDistances(e1, e2, dist, w);
- }
-
- void CountAvrgDists(const BidirectionalPath& path, EdgeId e, std::vector<pair<int, double>> & histogram) const {
- for (size_t j = 0; j < path.Size(); ++j) {
- std::vector<int> distances;
- std::vector<double> weights;
- GetDistances(path.At(j), e, distances, weights);
- if (distances.size() > 0) {
- AddInfoFromEdge(distances, weights, histogram, path.LengthAt(j));
- }
- }
- }
-
- void FindBestFittedEdgesForClustered(const BidirectionalPath& path, const set<EdgeId>& edges, EdgeContainer& result) const {
- for (EdgeId e : edges) {
- std::vector<pair<int, double>> histogram;
- CountAvrgDists(path, e, histogram);
- double sum = 0.0;
- for (size_t j = 0; j < histogram.size(); ++j) {
- sum += histogram[j].second;
- }
- DEBUG("Weight for scaffolding = " << sum << ", threshold = " << cl_weight_threshold_)
- if (math::ls(sum, cl_weight_threshold_)) {
- continue;
- }
-
- int gap = CountMean(histogram);
- if (HasIdealInfo(path, e, gap)) {
- DEBUG("scaffolding " << g_.int_id(e) << " gap " << gap);
- result.push_back(EdgeWithDistance(e, gap));
- }
- }
- }
-
- bool IsTip(EdgeId e) const {
- return g_.IncomingEdgeCount(g_.EdgeStart(e)) == 0;
- }
-
- set<EdgeId> FindCandidates(const BidirectionalPath& path) const {
- set<EdgeId> jumping_edges;
- const auto& lib = wc_->lib();
- //todo lib (and FindJumpEdges) knows its var so it can be counted there
- int is_scatter = int(math::round(double(lib.GetIsVar()) * is_scatter_coeff_));
- for (int i = (int) path.Size() - 1; i >= 0 && path.LengthAt(i) - g_.length(path.At(i)) <= lib.GetISMax(); --i) {
- set<EdgeId> jump_edges_i;
- lib.FindJumpEdges(path.At(i), jump_edges_i,
- std::max(0, (int)path.LengthAt(i) - is_scatter),
- //FIXME do we need is_scatter here?
- int((path.LengthAt(i) + lib.GetISMax() + is_scatter)),
- 0);
- for (EdgeId e : jump_edges_i) {
- if (IsTip(e)) {
- jumping_edges.insert(e);
- }
- }
- }
- return jumping_edges;
- }
-
-public:
-
-
- ScaffoldingExtensionChooser(const Graph& g, shared_ptr<WeightCounter> wc,
- double cl_weight_threshold,
- double is_scatter_coeff) :
- ExtensionChooser(g, wc), raw_weight_threshold_(0.0),
- cl_weight_threshold_(cl_weight_threshold),
- is_scatter_coeff_(is_scatter_coeff) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override {
- if (edges.empty()) {
- return edges;
- }
- set<EdgeId> candidates = FindCandidates(path);
- EdgeContainer result;
- FindBestFittedEdgesForClustered(path, candidates, result);
- return result;
- }
-
-private:
- DECL_LOGGER("ScaffoldingExtensionChooser");
-};
-
-inline bool EdgeWithWeightCompareReverse(const pair<EdgeId, double>& p1,
- const pair<EdgeId, double>& p2) {
- return p1.second > p2.second;
-}
-
-class LongReadsUniqueEdgeAnalyzer {
-private:
- DECL_LOGGER("LongReadsUniqueEdgeAnalyzer")
-public:
- LongReadsUniqueEdgeAnalyzer(const Graph& g, const GraphCoverageMap& cov_map,
- double filter_threshold, double prior_threshold,
- size_t max_repeat_length, bool uneven_depth)
- : g_(g),
- cov_map_(cov_map),
- filter_threshold_(filter_threshold),
- prior_threshold_(prior_threshold),
- max_repeat_length_(max_repeat_length),
- uneven_depth_(uneven_depth) {
-
- FindAllUniqueEdges();
- }
-
- bool IsUnique(EdgeId e) const {
- return unique_edges_.count(e) > 0;
- }
-
-private:
- bool UniqueEdge(EdgeId e) const {
- if (g_.length(e) > max_repeat_length_)
- return true;
- DEBUG("Analyze unique edge " << g_.int_id(e));
- if (cov_map_.size() == 0) {
- return false;
- }
- auto cov_paths = cov_map_.GetCoveringPaths(e);
- for (auto it1 = cov_paths.begin(); it1 != cov_paths.end(); ++it1) {
- auto pos1 = (*it1)->FindAll(e);
- if (pos1.size() > 1) {
- DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
- return false;
- }
- for (auto it2 = it1; it2 != cov_paths.end(); it2++) {
- auto pos2 = (*it2)->FindAll(e);
- if (pos2.size() > 1) {
- DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
- return false;
- }
- if (!ConsistentPath(**it1, pos1[0], **it2, pos2[0])) {
- DEBUG("Checking inconsistency");
- if (CheckInconsistence(**it1, pos1[0], **it2, pos2[0],
- cov_paths)) {
- DEBUG("***not unique " << g_.int_id(e) << " len " << g_.length(e) << "***");
- return false;
- }
- }
- }
- }
- DEBUG("***edge " << g_.int_id(e) << " is unique.***");
- return true;
- }
-
- bool ConsistentPath(const BidirectionalPath& path1, size_t pos1,
- const BidirectionalPath& path2, size_t pos2) const {
- return EqualBegins(path1, pos1, path2, pos2, false)
- && EqualEnds(path1, pos1, path2, pos2, false);
- }
- bool SignificantlyDiffWeights(double w1, double w2) const {
- if (w1 > filter_threshold_ and w2 > filter_threshold_) {
- if (w1 > w2 * prior_threshold_ or w2 > w1 * prior_threshold_) {
- return true;
- }
- return false;
- }
- return true;
- }
-
- bool CheckInconsistence(
- const BidirectionalPath& path1, size_t pos1,
- const BidirectionalPath& path2, size_t pos2,
- const BidirectionalPathSet& cov_paths) const {
- size_t first_diff_pos1 = FirstNotEqualPosition(path1, pos1, path2, pos2, false);
- size_t first_diff_pos2 = FirstNotEqualPosition(path2, pos2, path1, pos1, false);
- if (first_diff_pos1 != -1UL && first_diff_pos2 != -1UL) {
- const BidirectionalPath cand1 = path1.SubPath(first_diff_pos1,
- pos1 + 1);
- const BidirectionalPath cand2 = path2.SubPath(first_diff_pos2,
- pos2 + 1);
- std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
- cov_paths);
- DEBUG("Not equal begin " << g_.int_id(path1.At(first_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(first_diff_pos2)) << " weight " << weights.second);
- if (!SignificantlyDiffWeights(weights.first, weights.second)) {
- DEBUG("not significantly different");
- return true;
- }
- }
- size_t last_diff_pos1 = LastNotEqualPosition(path1, pos1, path2, pos2, false);
- size_t last_diff_pos2 = LastNotEqualPosition(path2, pos2, path1, pos1, false);
- if (last_diff_pos1 != -1UL) {
- const BidirectionalPath cand1 = path1.SubPath(pos1,
- last_diff_pos1 + 1);
- const BidirectionalPath cand2 = path2.SubPath(pos2,
- last_diff_pos2 + 1);
- std::pair<double, double> weights = GetSubPathsWeights(cand1, cand2,
- cov_paths);
- DEBUG("Not equal end " << g_.int_id(path1.At(last_diff_pos1)) << " weight " << weights.first << "; " << g_.int_id(path2.At(last_diff_pos2)) << " weight " << weights.second);
- if (!SignificantlyDiffWeights(weights.first, weights.second)) {
- DEBUG("not significantly different");
- return true;
- }
- }
- return false;
- }
-
- std::pair<double, double> GetSubPathsWeights(
- const BidirectionalPath& cand1, const BidirectionalPath& cand2,
- const BidirectionalPathSet& cov_paths) const {
- double weight1 = 0.0;
- double weight2 = 0.0;
- for (auto iter = cov_paths.begin(); iter != cov_paths.end(); ++iter) {
- BidirectionalPath* path = *iter;
- if (ContainSubPath(*path, cand1)) {
- weight1 += path->GetWeight();
- } else if (ContainSubPath(*path, cand2)) {
- weight2 += path->GetWeight();
- }
- }
- return std::make_pair(weight1, weight2);
- }
-
- bool ContainSubPath(const BidirectionalPath& path,
- const BidirectionalPath& subpath) const {
- for (size_t i = 0; i < path.Size(); ++i) {
- if (path.CompareFrom(i, subpath))
- return true;
- }
- return false;
- }
-
- void FindAllUniqueCoverageEdges() {
- VERIFY(!uneven_depth_);
- double sum_cov = 0;
- size_t sum_len = 0;
- size_t total_len = 0;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- total_len += g_.length(*iter);
- if (g_.length(*iter) >= max_repeat_length_) {
- sum_cov += g_.coverage(*iter) * (double)g_.length(*iter);
- sum_len += g_.length(*iter);
- }
- }
- if (sum_len * 4 < total_len) return;
- sum_cov /= (double)sum_len;
- DEBUG("average coverage of long edges: " << sum_cov) ;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.length(*iter) > 500 && (double)g_.coverage(*iter) < 1.2 * sum_cov) {
- if (unique_edges_.find(*iter) == unique_edges_.end()) {
- unique_edges_.insert(*iter);
- unique_edges_.insert(g_.conjugate(*iter));
- DEBUG("Added coverage based unique edge " << g_.int_id(*iter) << " len "<< g_.length(*iter) << " " << g_.coverage(*iter));
- }
- }
- }
- }
-
-
- void FindAllUniqueEdges() {
- DEBUG("Looking for unique edges");
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (UniqueEdge(*iter)) {
- unique_edges_.insert(*iter);
- unique_edges_.insert(g_.conjugate(*iter));
- }
- }
- DEBUG("coverage based uniqueness started");
- if (!uneven_depth_)
- FindAllUniqueCoverageEdges();
- DEBUG("Unique edges are found");
- }
-
- const Graph& g_;
- const GraphCoverageMap& cov_map_;
- double filter_threshold_;
- double prior_threshold_;
- std::set<EdgeId> unique_edges_;
- size_t max_repeat_length_;
- bool uneven_depth_;
-};
-
-class SimpleScaffolding {
-public:
- SimpleScaffolding(const Graph& g) : g_(g) {}
-
- BidirectionalPath FindMaxCommonPath(const vector<BidirectionalPath*>& paths,
- size_t max_diff_len) const {
- BidirectionalPath max_end(g_);
- for (auto it1 = paths.begin(); it1 != paths.end(); ++it1) {
- BidirectionalPath* p1 = *it1;
- for (size_t i = 0; i < p1->Size(); ++i) {
- if (p1->Length() - p1->LengthAt(i) > max_diff_len) {
- break;
- }
- bool contain_all = true;
- for (size_t i1 = i + 1; i1 <= p1->Size() && contain_all; ++i1) {
- BidirectionalPath subpath = p1->SubPath(i, i1);
- for (auto it2 = paths.begin(); it2 != paths.end() && contain_all; ++it2) {
- BidirectionalPath* p2 = *it2;
- vector<size_t> positions2 = p2->FindAll(subpath.At(0));
- bool contain = false;
- for (size_t ipos2 = 0; ipos2 < positions2.size(); ++ipos2) {
- size_t pos2 = positions2[ipos2];
- if (p2->Length() - p2->LengthAt(pos2) <= max_diff_len
- && EqualEnds(subpath, 0, *p2, pos2, false)) {
- contain = true;
- break;
- }
- }
- if (!contain) {
- contain_all = false;
- }
- }
- if (contain_all && (i1 - i) >= max_end.Size()) {
- max_end.Clear();
- max_end.PushBack(subpath);
- }
- }
- }
- }
- return max_end;
- }
-
-private:
- const Graph& g_;
-};
-
-class LongReadsExtensionChooser : public ExtensionChooser {
-public:
- LongReadsExtensionChooser(const Graph& g, PathContainer& pc,
- double filtering_threshold,
- double weight_priority_threshold,
- double unique_edge_priority_threshold,
- size_t min_significant_overlap,
- size_t max_repeat_length,
- bool uneven_depth)
- : ExtensionChooser(g),
- filtering_threshold_(filtering_threshold),
- weight_priority_threshold_(weight_priority_threshold),
- min_significant_overlap_(min_significant_overlap),
- cov_map_(g, pc),
- unique_edge_analyzer_(g, cov_map_, filtering_threshold,
- unique_edge_priority_threshold,
- max_repeat_length, uneven_depth),
- simple_scaffolding_(g) {
-
- }
-
- /* Choose an extension as correct only if there are reads that traverse a unique edge of the path and this extension.
- * An edge is unique if all reads mapped to it are consistent.
- * Two reads are consistent if they can form a single path in the graph.
- */
- EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& edges) const override {
- if (edges.empty()) {
- return edges;
- }DEBUG("We in Filter of LongReadsExtensionChooser");
- path.Print();
- map<EdgeId, double> weights_cands;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- weights_cands.insert(make_pair(it->e_, 0.0));
- }
- set<EdgeId> filtered_cands;
- map<EdgeId, BidirectionalPathSet > support_paths_ends;
- auto support_paths = cov_map_.GetCoveringPaths(path.Back());
- DEBUG("Found " << support_paths.size() << " covering paths!!!");
- for (auto it = support_paths.begin(); it != support_paths.end(); ++it) {
- auto positions = (*it)->FindAll(path.Back());
- (*it)->Print();
- for (size_t i = 0; i < positions.size(); ++i) {
- if ((int) positions[i] < (int) (*it)->Size() - 1
- && EqualBegins(path, (int) path.Size() - 1, **it,
- positions[i], false)) {
- DEBUG("Checking unique path_back for " << (*it)->GetId());
-
- if (UniqueBackPath(**it, positions[i])) {
- DEBUG("Success");
-
- EdgeId next = (*it)->At(positions[i] + 1);
- weights_cands[next] += (*it)->GetWeight();
- filtered_cands.insert(next);
- if (support_paths_ends.count(next) == 0){
- support_paths_ends[next] = BidirectionalPathSet();
- }
- support_paths_ends[next].insert(new BidirectionalPath((*it)->SubPath(positions[i] + 1)));
- }
- }
- }
- }
- DEBUG("Candidates");
- for (auto iter = weights_cands.begin(); iter != weights_cands.end(); ++iter) {
- DEBUG("Candidate " << g_.int_id(iter->first) << " weight " << iter->second);
- }
- vector<pair<EdgeId, double> > sort_res = MapToSortVector(weights_cands);
- DEBUG("sort res " << sort_res.size() << " tr " << weight_priority_threshold_);
- if (sort_res.size() < 1 || sort_res[0].second < filtering_threshold_) {
- filtered_cands.clear();
- } else if (sort_res.size() > 1
- && sort_res[0].second > weight_priority_threshold_ * sort_res[1].second) {
- filtered_cands.clear();
- filtered_cands.insert(sort_res[0].first);
- } else if (sort_res.size() > 1) {
- for (size_t i = 0; i < sort_res.size(); ++i) {
- if (sort_res[i].second * weight_priority_threshold_ < sort_res[0].second) {
- filtered_cands.erase(sort_res[i].first);
- }
- }
- }
- EdgeContainer result;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- if (filtered_cands.find(it->e_) != filtered_cands.end()) {
- result.push_back(*it);
- }
- }
- if (result.size() != 1) {
- DEBUG("Long reads doesn't help =(");
- }
- return result;
- }
-
-private:
- bool UniqueBackPath(const BidirectionalPath& path, size_t pos) const {
- int int_pos = (int) pos;
- while (int_pos >= 0) {
- if (unique_edge_analyzer_.IsUnique(path.At(int_pos)) && g_.length(path.At(int_pos)) >= min_significant_overlap_)
- return true;
- int_pos--;
- }
- return false;
- }
-
- vector<pair<EdgeId, double> > MapToSortVector(const map<EdgeId, double>& map) const {
- vector<pair<EdgeId, double> > result1(map.begin(), map.end());
- std::sort(result1.begin(), result1.end(), EdgeWithWeightCompareReverse);
- return result1;
- }
-
- double filtering_threshold_;
- double weight_priority_threshold_;
- size_t min_significant_overlap_;
- const GraphCoverageMap cov_map_;
- LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_;
- SimpleScaffolding simple_scaffolding_;
-
- DECL_LOGGER("LongReadsExtensionChooser");
-};
-
-class MatePairExtensionChooser : public ExtensionChooser {
-public:
- MatePairExtensionChooser(const Graph& g, shared_ptr<PairedInfoLibrary> lib,
- const PathContainer& paths, size_t max_number_of_paths_to_search,
- bool uneven_depth)
- : ExtensionChooser(g),
- g_(g),
- lib_(lib),
- search_dist_(lib->GetISMax()),
- weight_counter_(g, lib, 10),
- cov_map_(g_, paths),
- path_searcher_(g_, cov_map_, lib_->GetISMax(), PathsWeightCounter(g, lib, (size_t) lib->GetSingleThreshold()), max_number_of_paths_to_search),
- //TODO params
- unique_edge_analyzer_(g, cov_map_, 0., 1000., 8000., uneven_depth),
- simple_scaffolder_(g) {
- }
-
- //Attention! Uses const_cast to modify path!!!
- EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& init_edges) const override {
- DEBUG("mp chooser");
- path.Print();
- if (path.Length() < lib_->GetISMin()) {
- return EdgeContainer();
- }
- EdgeContainer edges = TryResolveBulge(path, init_edges);
- map<EdgeId, BidirectionalPath*> best_paths;
- for (size_t iedge = 0; iedge < edges.size(); ++iedge) {
- BidirectionalPathSet following_paths = path_searcher_.FindNextPaths(path, edges[iedge].e_);
- vector<BidirectionalPath*> max_weighted = MaxWeightedPath(path, following_paths);
- if (max_weighted.size() == 0) {
- DEBUG("too much paths or tip");
- DeleteMapWithPaths(best_paths);
- DeletePaths(following_paths);
- best_paths.clear();
- break;
- } else {
- best_paths[edges[iedge].e_] = new BidirectionalPath(*max_weighted[0]);
- }
- DeletePaths(following_paths);
- }
-
- BidirectionalPathSet next_paths;
- if (edges.size() == 0) {
- DEBUG("scaffolding edges size " << edges.size())
- next_paths = path_searcher_.FindNextPaths(path, path.Back());
- } else if (best_paths.size() == edges.size()) {
- for (size_t iedge = 0; iedge < edges.size(); ++iedge) {
- if (best_paths.count(edges[iedge].e_) > 0){
- next_paths.insert(best_paths[edges[iedge].e_]);
- }
- }
- }
- EdgeContainer result = ChooseBest(path, next_paths);
- if (result.size() != 1) {
- DEBUG("scaffold tree");
- result = ScaffoldTree(const_cast<BidirectionalPath&>(path));
- }
- DeletePaths(next_paths);
- if (result.size() != 1) {
- DEBUG("nobody can extend " << g_.int_id(path.Back()));
- }
- return result;
- }
-
-private:
- EdgeContainer ScaffoldTree(BidirectionalPath& path) const {
- DEBUG("try scaffold tree");
- vector<BidirectionalPath*> next_paths = path_searcher_.ScaffoldTree(path);
- VERIFY(next_paths.size() <= 1);
- EdgeContainer result;
- if (!next_paths.empty() && next_paths.back()->Size() > 0) {
- BidirectionalPath* res = next_paths.back();
- for (size_t i = 0; i < res->Size() - 1; ++i) {
- path.PushBack(res->At(i), res->GapAt(i), res->TrashPreviousAt(i), res->TrashCurrentAt(i));
- }
- result = EdgeContainer(1, EdgeWithDistance(res->Back(), res->GapAt(res->Size() - 1)));
- }
- DeletePaths(next_paths);
- return result;
- }
-
- bool IsBulge(const EdgeContainer& edges) const {
- if (edges.size() == 0)
- return false;
- for (EdgeWithDistance e : edges) {
- if (!InBuble(e.e_, g_))
- return false;
- }
- return true;
- }
-
- map<EdgeId, double> FindBulgeWeights(const BidirectionalPath& p, const EdgeContainer& edges) const {
- map<EdgeId, double> result;
- for (size_t i = 0; i < edges.size(); ++i) {
- result[edges[i].e_] = 0.0;
- }
- for (size_t i = 0; i < p.Size(); ++i) {
- bool common = true;
- bool common_ideal = true;
- for (EdgeWithDistance e : edges) {
- common_ideal = common_ideal && weight_counter_.HasIdealPI(p.At(i), e.e_, (int) p.LengthAt(i));
- common = common && weight_counter_.HasPI(p.At(i), e.e_, (int) p.LengthAt(i));
- }
- if (!common_ideal || common) {
- continue;
- }
- for (size_t j = 0; j < edges.size(); ++j) {
- result[edges[j].e_] += weight_counter_.PI(p.At(i), edges[j].e_, (int) p.LengthAt(i));
- }
- }
- return result;
- }
-
- EdgeContainer TryResolveBulge(const BidirectionalPath& p, const EdgeContainer& edges) const {
- if (!IsBulge(edges))
- return edges;
- map<EdgeId, double> weights = FindBulgeWeights(p, edges);
- double max_w = 0.0;
- EdgeContainer result;
- for (EdgeWithDistance e : edges) {
- double w = weights[e.e_];
- DEBUG("bulge " << g_.int_id(e.e_) << " w = " << w);
- if (math::gr(w, max_w)) {
- max_w = w;
- result.clear();
- result.push_back(e);
- } else if (math::eq(w, max_w)) {
- result.push_back(e);
- }
- }
- if (result.size() != 1) {
- result = edges;
- }
- return result;
- }
-
- EdgeContainer ChooseBest(const BidirectionalPath& path, const BidirectionalPathSet& next_paths) const {
- DEBUG("Try to choose from best paths...");
- vector<BidirectionalPath*> best_path = MaxWeightedPath(path, next_paths);
- EdgeContainer result;
- if (best_path.size() == 1) {
- result.push_back(EdgeWithDistance((*best_path.begin())->At(0), (*best_path.begin())->GapAt(0)));
- } else if (best_path.size() > 1) {
- result = TryToScaffold(path, best_path);
- }
- return result;
- }
-
- bool HasPIFromUniqueEdges(const BidirectionalPath& p1, const BidirectionalPath& p2, const set<size_t>& p1_unique_edges) const {
- for (size_t i1 = 0; i1 < p1.Size(); ++i1) {
- if (p1_unique_edges.find(i1) == p1_unique_edges.end()) {
- continue;
- }
- for (size_t i2 = 0; i2 < p2.Size(); ++i2) {
- int gap = (int) p1.LengthAt(i1) + (int) p2.Length() - (int) p2.LengthAt(i2);
- if (unique_edge_analyzer_.IsUnique(p2.At(i2)) && weight_counter_.HasPI(p1.At(i1), p2.At(i2), gap)) {
- DEBUG("has unique edge " << g_.int_id(p1.At(i1)) << " " << g_.int_id(p2.At(i2)));
- return true;
- }
- }
- }
- return false;
- }
-
- bool SignificallyDifferentEdges(const BidirectionalPath& init_path, const BidirectionalPath& path1, const map<size_t, double>& pi1,
- const BidirectionalPath& path2, const map<size_t, double>& pi2, const set<size_t>& unique_init_edges) const {
- double not_common_w1 = 0.0;
- double common_w = 0.0;
- for (auto iter = pi1.begin(); iter != pi1.end(); ++iter) {
- auto iter2 = pi2.find(iter->first);
- double w = 0.0;
- if (iter2 != pi2.end() && !math::eq(iter2->second, 0.0)) {
- w = min(iter2->second, iter->second);
- }
- not_common_w1 += iter->second - w;
- common_w += w;
- }
- if (common_w < 0.8 * (not_common_w1 + common_w)
- || (HasPIFromUniqueEdges(init_path, path1, unique_init_edges) && !HasPIFromUniqueEdges(init_path, path2, unique_init_edges))) {
- DEBUG("common_w " << common_w << " sum * 0.8 = " << 0.8 * (not_common_w1 + common_w))
- return true;
- }
- return false;
- }
-
- set<size_t> FindNotCommonEdges(const BidirectionalPath& path, const BidirectionalPathMap< map<size_t, double> >& all_pi) const {
- set<size_t> res;
- for (size_t i = 0; i < path.Size(); ++i) {
- if (!unique_edge_analyzer_.IsUnique(path.At(i))) {
- continue;
- }
- size_t pi_count = 0;
- for (auto iter = all_pi.begin(); iter != all_pi.end(); ++iter) {
- const map<size_t, double>& info = iter->second;
- if (info.count(i) > 0 && math::gr(info.at(i), 0.0)) {
- pi_count++;
- }
- }
- if (pi_count == 1)
- res.insert(i);
- }
- return res;
- }
-
- void DeleteSmallWeights(const BidirectionalPath& path, BidirectionalPathSet& paths, BidirectionalPathMap< map<size_t, double> >& all_pi) const {
- double max_weight = 0.0;
- BidirectionalPath* max_path = NULL;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if ((*iter)->GetWeight() >= max_weight) {
- max_weight = max(max_weight, (*iter)->GetWeight());
- max_path = *iter;
- }
- }
- BidirectionalPathSet to_del;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if (math::gr(max_weight, (*iter)->GetWeight() * 1.5) //TODO: move 1.5 to config
- && SignificallyDifferentEdges(path, *max_path, all_pi.find(max_path)->second, **iter, all_pi.find(*iter)->second,
- FindNotCommonEdges(path, all_pi)))
- to_del.insert(*iter);
- }
- for (BidirectionalPath* p : to_del) {
- paths.erase(p);
- all_pi.erase(p);
- }
- }
-
- void DeleteCommonPi(const BidirectionalPath& p, BidirectionalPathMap< map<size_t, double> >& all_pi) const {
- weight_counter_.ClearCommonWeight();
- for (size_t i = 0; i < p.Size(); ++i) {
- double common = DBL_MAX;
- for (auto iter = all_pi.begin(); iter != all_pi.end(); ++iter) {
- common = iter->second.count(i) == 0 ? 0.0 : min(common, iter->second.at(i));
- }
- weight_counter_.SetCommonWeightFrom(i, common);
- }
- }
-
- size_t FindCommonBegin(const BidirectionalPathSet& paths) const {
- if (paths.size() == 0) {
- return 0;
- }
- size_t common_begin = 0;
- BidirectionalPath* p = *paths.begin();
- while (common_begin < p->Size()) {
- EdgeId e = p->At(common_begin);
- for (BidirectionalPath* next : paths) {
- if (common_begin >= next->Size() || next->At(common_begin) != e) {
- return common_begin;
- }
- }
- common_begin++;
- }
- return common_begin;
- }
-
- void CountAllPairInfo(const BidirectionalPath& path, const BidirectionalPathSet& next_paths,
- BidirectionalPathMap<map<size_t, double>>& result) const {
- result.clear();
- size_t common_begin = FindCommonBegin(next_paths);
- DEBUG("common begin " << common_begin);
- for (BidirectionalPath* next : next_paths) {
- result[next] = weight_counter_.FindPairInfoFromPath(path, 0, path.Size(), *next, common_begin, next->Size());
- }
- }
-
- void CountWeightsAndFilter(const BidirectionalPath& path, BidirectionalPathSet& next_paths, bool delete_small_w) const {
- BidirectionalPathMap<map<size_t, double> > all_pi;
- CountAllPairInfo(path, next_paths, all_pi);
- DeleteCommonPi(path, all_pi);
- for (BidirectionalPath* next : next_paths) {
- next->SetWeight((float) weight_counter_.CountPairInfo(path, 0, path.Size(), *next, 0, next->Size()));
- }
- if (delete_small_w) {
- DeleteSmallWeights(path, next_paths, all_pi);
- }
- }
-
- struct PathWithWeightSort {
- PathWithWeightSort(const MatePairExtensionChooser& mp_chooser, const BidirectionalPath& path, BidirectionalPathMap< map<size_t, double> >& all_pi)
- : mp_chooser_(mp_chooser),
- path_(path),
- not_common_(mp_chooser_.FindNotCommonEdges(path_, all_pi)) {
- }
-
- bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) {
- if (mp_chooser_.HasPIFromUniqueEdges(path_, *p1, not_common_) && !mp_chooser_.HasPIFromUniqueEdges(path_, *p2, not_common_)) {
- return true;
- }
- if (mp_chooser_.HasPIFromUniqueEdges(path_, *p2, not_common_) && !mp_chooser_.HasPIFromUniqueEdges(path_, *p1, not_common_)) {
- return false;
- }
- if (!math::eq(p1->GetWeight(), p2->GetWeight())) {
- return math::gr(p1->GetWeight(), p2->GetWeight());
- }
- if (p1->Length() != p2->Length()) {
- return p1->Length() > p2->Length();
- }
- return p1->Size() > p2->Size();
- }
- const MatePairExtensionChooser& mp_chooser_;
- const BidirectionalPath& path_;
- const set<size_t> not_common_;
- };
-
- vector<BidirectionalPath*> SortResult(const BidirectionalPath& path, BidirectionalPathSet& next_paths) const {
- BidirectionalPathMap< map<size_t, double> > all_pi;
- CountAllPairInfo(path, next_paths, all_pi);
- CountWeightsAndFilter(path, next_paths, false);
- vector<BidirectionalPath*> to_sort(next_paths.begin(), next_paths.end());
- PathWithWeightSort comparator(*this, path, all_pi);
- std::sort(to_sort.begin(), to_sort.end(), comparator);
- return to_sort;
- }
-
- vector<BidirectionalPath*> MaxWeightedPath(const BidirectionalPath& path, const BidirectionalPathSet& following_paths) const {
- BidirectionalPathSet result(following_paths);
- BidirectionalPathSet prev_result;
- while (prev_result.size() != result.size()) {
- prev_result = result;
- DEBUG("iteration with paths " << result.size());
- CountWeightsAndFilter(path, result, true);
- if (result.size() == 0)
- result = prev_result;
- if (result.size() == 1)
- break;
- }
- if (result.size() == 0) {
- DEBUG("bad case");
- return vector<BidirectionalPath*>();
- }
- return SortResult(path, result);
- }
-
- BidirectionalPath ChooseFromEnds(const BidirectionalPath& path, const vector<BidirectionalPath*>& paths, const BidirectionalPath& end) const { //TODO: rewrite
- DEBUG("choose from ends " << paths.size());
- end.Print();
- vector<BidirectionalPath*> new_paths;
- vector<BidirectionalPath*> paths_to_cover;
- for (BidirectionalPath* p : paths) {
- int from = 0;
- int pos = p->FindFirst(end, from);
- while (pos > -1) {
- BidirectionalPath* new_p = new BidirectionalPath(path);
- BidirectionalPath* new_end = new BidirectionalPath(p->SubPath(0, pos + end.Size()));
- new_p->PushBack(*new_end);
- new_paths.push_back(new_p);
- paths_to_cover.push_back(new_end);
- from = pos + 1;
- pos = p->FindFirst(end, from);
- }
- }
- BidirectionalPath max = **new_paths.begin();
- size_t covered_edges_max = 0;
- size_t min_size = max.Size();
- for (BidirectionalPath* p : new_paths) {
- size_t cov_edges = 0;
- for (BidirectionalPath* e : paths_to_cover) {
- vector<size_t> poses = p->FindAll(e->Back());
- for (size_t pos : poses) {
- if (EqualBegins(*p, pos, *e, e->Size() - 1, true)) {
- cov_edges++;
- break;
- }
- }
- }
- if (cov_edges > covered_edges_max || (cov_edges == covered_edges_max && min_size > p->Size())) {
- DEBUG("cov_e " << cov_edges << " s " << p->Size());
- max.Clear();
- max.PushBack(*p);
- covered_edges_max = cov_edges;
- min_size = max.Size();
- }
- }
- for (BidirectionalPath* p : new_paths) {
- delete p;
- }
- for (BidirectionalPath* p : paths_to_cover) {
- delete p;
- }
- BidirectionalPath result = max.SubPath(path.Size());
- DEBUG("res");
- result.Print();
- return result;
- }
-
- int CheckPairInfo(const BidirectionalPath& path, const BidirectionalPath& result_end, int to_add) const {
- while (to_add < (int)result_end.Size()) {
- map<size_t, double> weights = weight_counter_.FindPairInfoFromPath(path, 0, path.Size(), result_end, to_add, to_add + 1);
- double weight_to_edge = 0.0;
- for (auto iter = weights.begin(); iter != weights.end(); ++iter) {
- weight_to_edge += iter->second;
- }
- if (math::gr(weight_to_edge, 0.0)) {
- break;
- }
- to_add++;
- }
- return to_add;
- }
-
- EdgeContainer TryToScaffold(const BidirectionalPath& path, const vector<BidirectionalPath*>& paths) const {
- if (paths.size() == 0) {
- return EdgeContainer();
- }
- DEBUG("Simple Scaffolding")
- for (BidirectionalPath* p : paths) {
- p->Print();
- }
- BidirectionalPath max_end = simple_scaffolder_.FindMaxCommonPath(paths, search_dist_);
- if (max_end.Size() == 0) {
- return EdgeContainer();
- }
- BidirectionalPath result_end = ChooseFromEnds(path, paths, max_end);
- int to_add = result_end.FindFirst(max_end);
- result_end.Print();
- EdgeContainer result;
- to_add = CheckPairInfo(path, result_end, to_add);
- if (to_add < 0 || to_add >= (int) result_end.Size()) {
- return EdgeContainer();
- }
- size_t gap_length = result_end.Length() - result_end.LengthAt(to_add);
- DEBUG(" edge to add " << g_.int_id(result_end.At(to_add)) << " with length " << gap_length);
- result.push_back(EdgeWithDistance(result_end.At(to_add), gap_length));
- return result;
- }
-
- const Graph& g_;
- shared_ptr<PairedInfoLibrary> lib_;
- size_t search_dist_;
- mutable PathsWeightCounter weight_counter_;
- const GraphCoverageMap cov_map_;
- NextPathSearcher path_searcher_;
- LongReadsUniqueEdgeAnalyzer unique_edge_analyzer_;
- SimpleScaffolding simple_scaffolder_;
-
- DECL_LOGGER("MatePairExtensionChooser");
-};
-
-class CoordinatedCoverageExtensionChooser: public ExtensionChooser {
-public:
- CoordinatedCoverageExtensionChooser(const Graph& g,
- CoverageAwareIdealInfoProvider& coverage_provider,
- size_t max_edge_length_in_repeat, double delta, size_t min_path_len) :
- ExtensionChooser(g), provider_(coverage_provider),
- max_edge_length_in_repeat_(max_edge_length_in_repeat), delta_(delta), min_path_len_(min_path_len) {
- }
-
- EdgeContainer Filter(const BidirectionalPath& path,
- const EdgeContainer& edges) const override {
-
- if(path.Length() < min_path_len_) {
- DEBUG("Path is too short");
- return EdgeContainer();
- }
-
- double path_coverage = provider_.EstimatePathCoverage(path);
- if (math::eq(path_coverage, -1.0)) {
- DEBUG("Path coverage can't be calculated");
- return EdgeContainer();
- }
- DEBUG("Path coverage is " << path_coverage);
-
- for (auto e_d : edges) {
- if (path.Contains(g_.EdgeEnd(e_d.e_))) {
- DEBUG("Avoid to create loops");
- return EdgeContainer();
- }
- }
- return FindExtensionTroughRepeat(edges, path_coverage);
- }
-
-private:
-
- void UpdateCanBeProcessed(VertexId v,
- std::queue<VertexId>& can_be_processed, double path_coverage) const {
- DEBUG("Updating can be processed");
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VertexId neighbour_v = g_.EdgeEnd(e);
- if (g_.length(e) <= max_edge_length_in_repeat_ && CompatibleEdge(e, path_coverage)) {
- DEBUG("Adding vertex " << neighbour_v.int_id()
- << "through edge " << g_.str(e));
- can_be_processed.push(neighbour_v);
- }
- }
- }
-
- GraphComponent<Graph> GetRepeatComponent(const VertexId start, double path_coverage) const {
- set<VertexId> vertices_of_component;
- vertices_of_component.insert(start);
- std::queue<VertexId> can_be_processed;
- UpdateCanBeProcessed(start, can_be_processed, path_coverage);
- while (!can_be_processed.empty()) {
- VertexId v = can_be_processed.front();
- can_be_processed.pop();
- if (vertices_of_component.count(v) != 0) {
- DEBUG("Component is too complex");
- return GraphComponent<Graph>(g_, false);
- }
- DEBUG("Adding vertex " << g_.str(v) << " to component set");
- vertices_of_component.insert(v);
- UpdateCanBeProcessed(v, can_be_processed, path_coverage);
- }
-
- GraphComponent<Graph> gc(g_, vertices_of_component.begin(),
- vertices_of_component.end());
- return gc;
- }
-
- EdgeContainer FinalFilter(const EdgeContainer& edges,
- EdgeId edge_to_extend) const {
- EdgeContainer result;
- for (auto e_with_d : edges) {
- if (e_with_d.e_ == edge_to_extend) {
- result.push_back(e_with_d);
- }
- }
- return result;
- }
-
- bool CompatibleEdge(EdgeId e, double path_coverage) const {
- return math::ge(g_.coverage(e), path_coverage * delta_);
- }
-
- //returns lowest coverage among long compatible edges ahead of e
- //if std::numeric_limits<double>::max() -- no such edges were detected
- //if negative -- abort at once
- double AnalyzeExtension(EdgeId ext, double path_coverage) const {
- double answer = std::numeric_limits<double>::max();
-
- if (!CompatibleEdge(ext, path_coverage)) {
- DEBUG("Extension coverage too low");
- return answer;
- }
-
- if (g_.length(ext) > max_edge_length_in_repeat_) {
- DEBUG("Long extension");
- return g_.coverage(ext);
- }
-
- DEBUG("Short extension, launching repeat component analysis");
- GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(ext), path_coverage);
- if (gc.v_size() == 0) {
- DEBUG("Component search failed");
- return -1.;
- }
-
- for (auto e : gc.edges()) {
- if (g_.length(e) > max_edge_length_in_repeat_) {
- DEBUG("Repeat component contains long edges");
- return -1.;
- }
- }
-
- DEBUG("Checking long sinks");
- for (auto v : gc.sinks()) {
- for (auto e : g_.OutgoingEdges(v)) {
- if (g_.length(e) > max_edge_length_in_repeat_ &&
- CompatibleEdge(e, path_coverage) &&
- math::ls(g_.coverage(e), answer)) {
- DEBUG("Updating answer to coverage of edge " << g_.str(e));
- answer = g_.coverage(e);
- }
- }
- }
-
- return answer;
- }
-
- EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
- static EdgeContainer EMPTY_CONTAINER;
-
- map<EdgeId, double> good_extension_to_ahead_cov;
-
- for (auto edge : edges) {
- DEBUG("Processing candidate extension " << g_.str(edge.e_));
- double analysis_res = AnalyzeExtension(edge.e_, path_coverage);
-
- if (analysis_res == std::numeric_limits<double>::max()) {
- DEBUG("Ignoring extension");
- } else if (math::ls(analysis_res, 0.)) {
- DEBUG("Troubles detected, abort mission");
- return EMPTY_CONTAINER;
- } else {
- good_extension_to_ahead_cov[edge.e_] = analysis_res;
- DEBUG("Extension mapped to ahead coverage of " << analysis_res);
- }
- }
-
- DEBUG("Number of good extensions is " << good_extension_to_ahead_cov.size());
-
- if (good_extension_to_ahead_cov.size() == 1) {
- auto extension_info = *good_extension_to_ahead_cov.begin();
- DEBUG("Single extension candidate " << g_.str(extension_info.first));
- if (math::le(extension_info.second, path_coverage / delta_)) {
- DEBUG("Extending");
- return FinalFilter(edges, extension_info.first);
- } else {
- DEBUG("Predicted ahead coverage is too high");
- }
- } else {
- DEBUG("Multiple extension candidates");
- }
-
- return EMPTY_CONTAINER;
- }
-
- CoverageAwareIdealInfoProvider provider_;
- const size_t max_edge_length_in_repeat_;
- const double delta_;
- const size_t min_path_len_;
- DECL_LOGGER("CoordCoverageExtensionChooser");
-};
-
-}
-#endif /* EXTENSION_HPP_ */
diff --git a/src/modules/algorithms/path_extend/loop_traverser.hpp b/src/modules/algorithms/path_extend/loop_traverser.hpp
deleted file mode 100644
index 57eda57..0000000
--- a/src/modules/algorithms/path_extend/loop_traverser.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * loop_traverser.hpp
- *
- * Created on: Jan 28, 2013
- * Author: ira
- */
-
-#ifndef LOOP_TRAVERSER_H_
-#define LOOP_TRAVERSER_H_
-
-#include "path_extender.hpp"
-#include "pe_resolver.hpp"
-#include "path_visualizer.hpp"
-
-namespace path_extend {
-
-class LoopTraverser {
-
- const Graph& g_;
- GraphCoverageMap& covMap_;
- shared_ptr<ContigsMaker> extender_;
- static const size_t MAX_EDGE_LENGTH = 1000;
-private:
- EdgeId FindStart(const set<VertexId>& component_set) const{
- EdgeId result;
- for (auto it = component_set.begin(); it != component_set.end(); ++it) {
- for (auto eit = g_.in_begin(*it); eit != g_.in_end(*it); ++eit) {
- if (component_set.count(g_.EdgeStart(*eit)) == 0) {
- if (result != EdgeId()) {
- return EdgeId();
- }
- result = *eit;
- }
- }
- }
- return result;
- }
-
- EdgeId FindFinish(const set<VertexId>& component_set) {
- EdgeId result;
- for (auto it = component_set.begin(); it != component_set.end(); ++it) {
- for (auto I = g_.out_begin(*it), E = g_.out_end(*it);
- I != E; ++I) {
- if (component_set.count(g_.EdgeEnd(*I)) == 0) {
- if (result != EdgeId()) {
- return EdgeId();
- }
- result = *I;
- }
- }
- }
- return result;
- }
-
- void TryToGrow(BidirectionalPath* path, EdgeId component_entrance) {
- BidirectionalPath clone = *path;
- extender_->GrowPathSimple(*path);
- if (!path->Contains(component_entrance)) {
- DEBUG("Grown paths do not contain initial edges, rolling back");
- path->Clear();
- path->PushBack(clone);
- }
- }
-
- bool IsEndInsideComponent(const BidirectionalPath &path,
- const set <VertexId> &component_set) {
- if (component_set.count(g_.EdgeStart(path.Front())) == 0) {
- return false;
- }
- for (size_t i = 0; i < path.Size(); ++i) {
- if (component_set.count(g_.EdgeEnd(path.At(i))) == 0)
- return false;
- }
- return true;
- }
-
-
- bool IsEndInsideComponent(const BidirectionalPath &path, EdgeId component_entrance,
- const set <VertexId> &component_set,
- bool conjugate = false) {
- int i = path.FindLast(component_entrance);
- VERIFY_MSG(i != -1, "Component edge is not found in the path")
-
- if ((size_t) i == path.Size() - 1) {
- if (conjugate)
- return component_set.count(g_.conjugate(g_.EdgeEnd(path.Back()))) > 0;
- else
- return component_set.count(g_.EdgeEnd(path.Back())) > 0;
- }
-
- if (conjugate)
- return IsEndInsideComponent(path.SubPath((size_t) i + 1).Conjugate(), component_set);
- else
- return IsEndInsideComponent(path.SubPath((size_t) i + 1), component_set);
- }
-
- void TraverseLoop(EdgeId start, EdgeId end, const set<VertexId>& component_set) {
- DEBUG("start " << g_.int_id(start) << " end " << g_.int_id(end));
- BidirectionalPathSet coveredStartPaths =
- covMap_.GetCoveringPaths(start);
- BidirectionalPathSet coveredEndPaths =
- covMap_.GetCoveringPaths(end);
-
- for (auto it_path = coveredStartPaths.begin();
- it_path != coveredStartPaths.end(); ++it_path) {
- if ((*it_path)->FindAll(end).size() > 0) {
- return;
- }
- }
- if (coveredStartPaths.size() < 1 or coveredEndPaths.size() < 1) {
- DEBUG("TraverseLoop STRANGE SITUATION: start " << coveredStartPaths.size() << " end " << coveredEndPaths.size());
- return;
- }
-
- if (coveredStartPaths.size() > 1 or coveredEndPaths.size() > 1) {
- DEBUG("Ambiguous situation in path joining, quitting");
- return;
- }
-
- BidirectionalPath* startPath = *coveredStartPaths.begin();
- BidirectionalPath* endPath = *coveredEndPaths.begin();
- if ((*startPath) == endPath->Conjugate()){
- return;
- }
-
- //TryToGrow(startPath, start);
- //TryToGrow(endPath->GetConjPath(), g_.conjugate(end));
-
- //Checking that paths ends are within component
- if (!IsEndInsideComponent(*startPath, start, component_set) ||
- !IsEndInsideComponent(*endPath->GetConjPath(), g_.conjugate(end), component_set, true)) {
- DEBUG("Some path goes outside of the component")
- return;
- }
-
- size_t commonSize = startPath->CommonEndSize(*endPath);
- size_t nLen = 0;
- DEBUG("Str " << startPath->Size() << ", end" << endPath->Size());
- if (commonSize == 0 && !startPath->Empty() > 0 && !endPath->Empty()) {
- DEBUG("Estimating gap size");
- VertexId lastVertex = g_.EdgeEnd(startPath->Back());
- VertexId firstVertex = g_.EdgeStart(endPath->Front());
-
- if (firstVertex == lastVertex) {
- nLen = 0;
- } else {
- DijkstraHelper<Graph>::BoundedDijkstra dijkstra(DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, 1000, 3000));
- dijkstra.Run(lastVertex);
- vector<EdgeId> shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart(endPath->Front()));
-
- if (shortest_path.size() == 0) {
- DEBUG("Failed to find closing path");
- return;
- } else if (!IsEndInsideComponent(BidirectionalPath(g_, shortest_path), component_set)) {
- DEBUG("Closing path is outside the component");
- return;
- } else {
- for (size_t i = 0; i < shortest_path.size(); ++i) {
- nLen += g_.length(shortest_path[i]);
- }
- }
- }
- }
- if (commonSize < endPath->Size()){
- startPath->PushBack(endPath->At(commonSize), (int) nLen);
- }
- for (size_t i = commonSize + 1; i < endPath->Size(); ++i) {
- startPath->PushBack(endPath->At(i), endPath->GapAt(i), endPath->TrashPreviousAt(i), endPath->TrashCurrentAt(i));
- }
- DEBUG("travers");
- startPath->Print();
- endPath->Print();
- DEBUG("conj");
- endPath->GetConjPath()->Print();
- endPath->Clear();
- }
-
- bool ContainsLongEdges(const GraphComponent<Graph>& component) const {
- for(auto e : component.edges()) {
- if(g_.length(e) > MAX_EDGE_LENGTH) {
- return true;
- }
- }
- return false;
- }
-
-public:
- LoopTraverser(const Graph& g, GraphCoverageMap& coverageMap, shared_ptr<ContigsMaker> extender) :
- g_(g), covMap_(coverageMap), extender_(extender) {
- }
-
- void TraverseAllLoops() {
- DEBUG("TraverseAllLoops");
- shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, MAX_EDGE_LENGTH);
- while (splitter->HasNext()) {
- GraphComponent<Graph> component = splitter->Next();
- if (component.v_size() > 10)
- continue;
- if(ContainsLongEdges(component))
- continue;
- set<VertexId> component_set(component.v_begin(), component.v_end());
- EdgeId start = FindStart(component_set);
- EdgeId finish = FindFinish(component_set);
- if (start == EdgeId() || finish == EdgeId()) {
- continue;
- }
- TraverseLoop(start, finish, component_set);
- }
-
- }
-protected:
- DECL_LOGGER("LoopTraverser");
-};
-
-}
-
-#endif /* LOOP_TRAVERSER_H_ */
diff --git a/src/modules/algorithms/path_extend/next_path_searcher.hpp b/src/modules/algorithms/path_extend/next_path_searcher.hpp
deleted file mode 100644
index e332805..0000000
--- a/src/modules/algorithms/path_extend/next_path_searcher.hpp
+++ /dev/null
@@ -1,1031 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * next_path_searcher.hpp
- *
- * Created on: Sep 27, 2013
- * Author: ira
- */
-#pragma once
-
-#include <set>
-#include <vector>
-#include <map>
-
-#include "pipeline/graph_pack.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-#include "assembly_graph/paths/bidirectional_path.hpp"
-#include "pe_utils.hpp"
-
-namespace path_extend {
-using debruijn_graph::Graph;
-using std::set;
-using std::vector;
-using std::multimap;
-
-class Edge {
-public:
- Edge(const Graph& g, EdgeId id, Edge* prev_e, size_t dist, int gap = 0)
- : g_(g),
- id_(id),
- prev_edge_(prev_e),
- dist_(dist),
- gap_(gap) {
- }
- ~Edge() {
- for (size_t i = 0; i < out_edges_.size(); ++i) {
- delete out_edges_[i];
- }
- for (size_t i = 0; i < not_out_edges_.size(); ++i) {
- delete not_out_edges_[i];
- }
- }
- Edge* AddOutEdge(EdgeId edge, int gap = 0) {
- return AddIfNotExist(edge, gap, out_edges_);
- }
- Edge* AddIncorrectOutEdge(EdgeId edge, int gap = 0) {
- for (size_t i = 0; i < out_edges_.size(); ++i) {
- if (out_edges_[i]->GetId() == edge) {
- not_out_edges_.push_back(out_edges_[i]);
- out_edges_.erase(out_edges_.begin() + i);
- break;
- }
- }
- return AddIfNotExist(edge, gap, not_out_edges_);
- }
- Edge* AddPath(const BidirectionalPath& path, size_t from) {
- Edge* e = this;
- for (size_t i = from; i < path.Size(); ++i) {
- e = e->AddOutEdge(path.At(i), path.GapAt(i));
- }
- return e;
- }
-
- int GetOutEdgeIndex(EdgeId edge) const {
- return GetEdgeIndex(edge, out_edges_);
- }
-
- int GetIncorrectEdgeIndex(EdgeId edge) const {
- return GetEdgeIndex(edge, not_out_edges_);
- }
-
- size_t OutSize() const {
- return out_edges_.size();
- }
-
- Edge* GetOutEdge(size_t i) const {
- return out_edges_[i];
- }
-
- BidirectionalPath GetPrevPath(size_t from) const {
- BidirectionalPath result(g_);
- vector<pair<EdgeId, int> > edges_wgaps;
- const Edge* e = this;
- edges_wgaps.push_back(make_pair(e->GetId(), e->Gap()));
- while (e->prev_edge_) {
- e = e->prev_edge_;
- edges_wgaps.push_back(make_pair(e->GetId(), e->Gap()));
- }
- for (int i = (int) edges_wgaps.size() - 1 - (int) from; i >= 0; i--) {
- result.PushBack(edges_wgaps[i].first, edges_wgaps[i].second);
- }
- return result;
- }
-
- bool IsCorrect() {
- Edge* e = this;
- while (e->prev_edge_) {
- if (e->prev_edge_->GetOutEdgeIndex(e->GetId()) == -1) {
- TRACE("after " << g_.int_id(e->prev_edge_->GetId()) << " souldn't go " << g_.int_id(e->GetId()));
- return false;
- }
- e = e->prev_edge_;
- }
- return true;
- }
-
- bool EqualBegins(const BidirectionalPath& path, int pos) {
- BidirectionalPath p = this->GetPrevPath(0);
- return path_extend::EqualBegins(path, (size_t) pos, p, p.Size() - 1, true);
- }
- size_t Length() const {
- return dist_;
- }
- set<Edge*> GetPrevEdges(size_t dist) {
- size_t init_len = Length();
- Edge* e = this;
- set<Edge*> result;
- while (e && init_len - e->Length() < dist) {
- result.insert(e);
- e = e->prev_edge_;
- }
- return result;
- }
- EdgeId GetId() const {
- return id_;
- }
- int Gap() const {
- return gap_;
- }
-private:
- Edge* AddIfNotExist(EdgeId e, int gap, vector<Edge*>& vect) {
- int i = GetEdgeIndex(e, vect);
- if (i != -1) {
- return vect[i];
- }
- size_t dist = dist_ + gap + g_.length(e);
- vect.push_back(new Edge(g_, e, this, dist, gap));
- return vect.back();
- }
- int GetEdgeIndex(EdgeId e, const vector<Edge*>& vect) const {
- for (size_t i = 0; i < vect.size(); ++i) {
- if (vect[i]->GetId() == e)
- return (int) i;
- }
- return -1;
- }
- const Graph& g_;
- EdgeId id_;
- vector<Edge*> out_edges_;
- vector<Edge*> not_out_edges_;
- Edge* prev_edge_;
- size_t dist_;
- int gap_;
-
-protected:
- DECL_LOGGER("NextPathSearcher")
-};
-struct PathWithDistance {
- PathWithDistance(BidirectionalPath p, int dist)
- : p_(p),
- dist_(dist) {
-
- }
- BidirectionalPath p_;
- int dist_;
-};
-class NextPathSearcher {
-public:
- typedef set<EdgeWithDistance, EdgeWithDistance::DistanceComparator> EdgeSet;
- typedef multimap<EdgeId, PathWithDistance> ConstructedPathT;
-
- NextPathSearcher(const Graph& g, const GraphCoverageMap& cover_map, size_t search_dist, PathsWeightCounter weight_counter, size_t max_number_of_paths_to_search);
- BidirectionalPathSet FindNextPaths(const BidirectionalPath& path, EdgeId begin_edge, bool jump = true) const ;
- vector<BidirectionalPath*> ScaffoldTree(const BidirectionalPath& path) const;
-private:
- bool IsOutTip(VertexId v) const;
- bool IsInTip(VertexId v) const;
- vector<Edge*> GrowPath(const BidirectionalPath& init_path, Edge* e) const;
- Edge* AddEdge(const BidirectionalPath& init_path, Edge* prev_e, EdgeId e_to_add, int gap) const;
- bool AnalyzeBubble(const BidirectionalPath& p, EdgeId buldge_edge, size_t gap, Edge* prev_edge) const;
-
- void ScaffoldTip(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& result_edges, vector<Edge*>& stopped_paths, vector<Edge*>& to_add,
- bool jump) const;
- void ScaffoldChristmasTree(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& to_add, size_t min_length_from) const;
- void Scaffold(const BidirectionalPath& init_path, Edge* current_path, ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const;
- void FindScaffoldingCandidates(const BidirectionalPath& init_path, Edge* current_path, EdgeSet& candidate_set, size_t min_length_from) const;
- void FindScaffoldingCandidates(EdgeId e, size_t distance_to_tip, vector<EdgeWithDistance>& jump_edges) const;
- void OrderScaffoldingCandidates(EdgeSet& candidate_set, const BidirectionalPath& init_path, Edge* current_path, ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const;
- void RemoveRedundant(ConstructedPathT& constructed_paths) const;
- void ConvertPaths(const ConstructedPathT& constructed_paths, Edge* current_path, vector<Edge*>& to_add) const;
- void ProcessScaffoldingCandidate(EdgeWithDistance& e, EdgeSet& candidate_set, Edge* current_path, size_t grown_path_len,
- ConstructedPathT& constructed_paths, bool is_gap) const;
- int EstimateGapForPath(EdgeSet& candidate_set, const BidirectionalPath& p) const;
- void AddConstructedPath(const BidirectionalPath& cp, size_t from, int gap, ConstructedPathT& constructed_paths) const;
- void FilterBackPaths(BidirectionalPathSet& back_paths, EdgeId edge_to_reach, BidirectionalPathSet& reached_paths, size_t max_len = -1UL) const;
- void JoinPathsByGraph(ConstructedPathT& constructed_paths) const;
- void JoinPathsByPI(ConstructedPathT& constructed_paths) const;
- void JoinPathsByDejikstra(const BidirectionalPath& init_path, ConstructedPathT& constructed_paths) const;
- map<PathWithDistance*, size_t> FindDistances(const BidirectionalPath& p, vector<PathWithDistance*>& paths) const;
- void FindConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const;
- vector<vector<PathWithDistance*> > FilterConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const;
- void ConnectPaths(const BidirectionalPath& init_path, vector<vector<PathWithDistance*> >& variants) const;
-
- const Graph& g_;
- const GraphCoverageMap& cover_map_;
- size_t search_dist_;
- PathsWeightCounter weight_counter_;
- size_t long_edge_len_;
- size_t max_paths_;
-
-protected:
- DECL_LOGGER("NextPathSearcher")
-};
-
-inline NextPathSearcher::NextPathSearcher(const Graph& g, const GraphCoverageMap& cover_map, size_t search_dist, PathsWeightCounter weight_counter, size_t max_number_of_paths_to_search)
- : g_(g),
- cover_map_(cover_map),
- search_dist_(search_dist),
- weight_counter_(weight_counter),
- long_edge_len_(500),
- max_paths_(max_number_of_paths_to_search) {
-
-}
-
-inline vector<BidirectionalPath*> NextPathSearcher::ScaffoldTree(const BidirectionalPath& path) const {
- Edge* start_e = new Edge(g_, path.At(0), NULL, g_.length(path.At(0)) + path.GapAt(0), path.GapAt(0));
- Edge* e = start_e->AddPath(path, 1);
- //jump forward when there are too many paths
- DEBUG("Scaffolding tree for edge " << g_.int_id(start_e->GetId()));
- path.Print();
- vector<Edge*> result_edges;
- ScaffoldChristmasTree(path, e, result_edges, 0);
- std::vector<BidirectionalPath*> result_paths;
- for (size_t i = 0; i < result_edges.size(); ++i) {
- BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
- if (!result_path.Empty())
- result_paths.push_back(new BidirectionalPath(result_path));
- }
- if (result_paths.size() != 1) {
- for (size_t i = 0; i < result_paths.size(); ++i) {
- delete result_paths[i];
- }
- result_paths.clear();
- result_edges.clear();
- ScaffoldChristmasTree(path, e, result_edges, long_edge_len_);
- for (size_t i = 0; i < result_edges.size(); ++i) {
- BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
- if (!result_path.Empty())
- result_paths.push_back(new BidirectionalPath(result_path));
- }
- }
- delete start_e;
- DEBUG( "for path " << path.GetId() << " several extension " << result_paths.size());
- return result_paths;
-}
-
-inline BidirectionalPathSet NextPathSearcher::FindNextPaths(const BidirectionalPath& path, EdgeId begin_edge, bool jump) const {
- TRACE("begin find next paths");
- vector<Edge*> grow_paths;
- vector<Edge*> result_edges;
- vector<Edge*> stopped_paths;
- size_t max_len = search_dist_ + path.Length();
- std::set<Edge*> used_edges;
- int count_to_grow = 1;
-
- Edge* start_e = new Edge(g_, path.At(0), NULL, g_.length(path.At(0)) + path.GapAt(0), path.GapAt(0));
- Edge* e = start_e->AddPath(path, 1);
- if (begin_edge != path.Back()) {
- e = e->AddOutEdge(begin_edge);
- DEBUG( "Try to find next path for path with edge " << g_.int_id(begin_edge));
- } else {
- DEBUG( "Try to search for path with last edge " << g_.int_id(path.Back()) << " Scaffolding: " << jump << ", next edges " << g_.OutgoingEdgeCount(g_.EdgeEnd(path.Back())));
- }
- grow_paths.push_back(e);
-
- size_t ipath = 0;
- DEBUG("Processing paths");
- while (ipath < grow_paths.size()) {
- DEBUG("Processing path " << ipath << " of " << grow_paths.size() << " need to grow " << count_to_grow);
- Edge* current_path = grow_paths[ipath++];
- DEBUG(" edge " << g_.int_id(current_path->GetId()));
- if (used_edges.count(current_path) > 0) {
- count_to_grow--;
- continue;
- }
- used_edges.insert(current_path);
- if (current_path->Length() >= max_len && current_path->IsCorrect()) {
- result_edges.push_back(current_path);
- count_to_grow--;
- continue;
- }
- DEBUG("Growing path");
- vector<Edge*> to_add = GrowPath(path, current_path);
- DEBUG("Path grown");
- if (to_add.empty() && current_path->IsCorrect()) {
- DEBUG("scaffold tip");
- ScaffoldTip(path, current_path, result_edges, stopped_paths, to_add, jump);
- }
- count_to_grow--;
- for (Edge* e_to_add : to_add) {
- grow_paths.push_back(e_to_add);
- count_to_grow++;
- }
-
- if (count_to_grow > (int) max_paths_ || ipath > max_paths_ * 10) {
- DEBUG("too many paths");
- delete start_e;
- return BidirectionalPathSet();
- }
- }
- DEBUG("Paths processed");
-
- BidirectionalPathSet result_paths;
- TRACE("adding paths " << result_edges.size());
- for (size_t i = 0; i < result_edges.size(); ++i) {
- BidirectionalPath result_path = result_edges[i]->GetPrevPath(path.Size());
- if (!result_path.Empty()) {
- result_paths.insert(new BidirectionalPath(result_path));
- }
- }
- delete start_e;
- DEBUG( "for path " << path.GetId() << " several extension " << result_paths.size());
- return result_paths;
-}
-
-inline bool NextPathSearcher::AnalyzeBubble(const BidirectionalPath& p, EdgeId buldge_edge, size_t gap, Edge* prev_edge) const {
- EdgeId max_edge = buldge_edge;
- if (prev_edge->GetOutEdgeIndex(buldge_edge) != -1 || prev_edge->GetIncorrectEdgeIndex(buldge_edge) != -1) {
- return prev_edge->GetOutEdgeIndex(buldge_edge) != -1;
- }
- double max_w = 0.0;
- for (EdgeId e : g_.OutgoingEdges(g_.EdgeStart(buldge_edge))) {
- double w = weight_counter_.CountPairInfo(p, 0, p.Size(), e, gap);
- if (math::gr(w, max_w) || (math::eq(w, max_w) && g_.int_id(e) < g_.int_id(max_edge))) {
- max_w = w;
- max_edge = e;
- }
- }
- for (EdgeId e : g_.OutgoingEdges(g_.EdgeStart(buldge_edge))) {
- if (e == max_edge) {
- prev_edge->AddOutEdge(e);
- } else {
- prev_edge->AddIncorrectOutEdge(e);
- }
- }
- return max_edge == buldge_edge;
-}
-
-inline Edge* NextPathSearcher::AddEdge(const BidirectionalPath& init_path, Edge* prev_e, EdgeId e_to_add, int gap) const {
- Edge* e = prev_e;
- if (e->GetIncorrectEdgeIndex(e_to_add) != -1) {
- return e;
- }
- int inext = e->GetOutEdgeIndex(e_to_add);
- if (inext != -1) {
- return e->GetOutEdge(inext);
- }
- if (InBuble(e_to_add, g_)) {
- if (AnalyzeBubble(init_path, e_to_add, gap, e)) {
- return e->AddOutEdge(e_to_add);
- }
- } else if (e->GetId() != e_to_add) {
- return e->AddOutEdge(e_to_add);
- }
- return e;
-}
-
-inline vector<Edge*> NextPathSearcher::GrowPath(const BidirectionalPath& init_path, Edge* e) const {
- TRACE("in growing path");
- vector<Edge*> to_add;
- if (!e->IsCorrect()) {
- TRACE("incorrect");
- return to_add;
- }
- for (EdgeId next_edge : g_.OutgoingEdges(g_.EdgeEnd(e->GetId()))) {
- TRACE("Analyze outgoing edge " << g_.int_id(next_edge));
- BidirectionalPathSet cov_paths = cover_map_.GetCoveringPaths(next_edge);
- TRACE("cov_map size " << cov_paths.size());
- bool already_added = false;
- for (auto inext_path = cov_paths.begin(); inext_path != cov_paths.end() && !already_added; ++inext_path) {
- vector<size_t> positions = (*inext_path)->FindAll(next_edge);
- for (size_t pos : positions) {
- if (pos == 0 || e->EqualBegins(**inext_path, (int) pos - 1)) {
- TRACE("Found equal begin");
- Edge* new_edge = AddEdge(init_path, e, (*inext_path)->At(pos), (*inext_path)->GapAt(pos));
- if (new_edge && new_edge != e) {
- TRACE("Add edge")
- to_add.push_back(new_edge);
- already_added = true;
- break;
- }
- }
- }
- }
- }
- if (to_add.size() == 0) {
- for (EdgeId next_edge : g_.OutgoingEdges(g_.EdgeEnd(e->GetId()))) {
- if (next_edge != e->GetId()) {
- to_add.push_back(e->AddOutEdge(next_edge));
- }
- }
- }
- stringstream str;
- str << " for edge " << g_.int_id(e->GetId()) << " add ";
- for (Edge* e1 : to_add) {
- str << " " << g_.int_id(e1->GetId());
- }
- TRACE(str.str());
- return to_add;
-}
-
-inline void NextPathSearcher::ScaffoldTip(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& result_edges, vector<Edge*>& stopped_paths,
- vector<Edge*>& to_add, bool jump) const {
-
- if (jump) {
- //jump forward when tip
- DEBUG("Scaffolding");
- ConstructedPathT constructed_paths;
- set<EdgeId> seeds;
- Scaffold(path, current_path, constructed_paths, seeds, true);
- if (constructed_paths.empty()) {
- stopped_paths.push_back(current_path);
- } else {
- DEBUG("Jumped! " << to_add.size());
- ConvertPaths(constructed_paths, current_path, to_add);
- }
- } else {
- DEBUG("Not scaffolding because going back");
- result_edges.push_back(current_path);
- }
-}
-
-inline void NextPathSearcher::ScaffoldChristmasTree(const BidirectionalPath& path, Edge * current_path, vector<Edge*>& to_add, size_t min_length_from) const {
- //jump forward when there are too many paths
- DEBUG("========= Scaffolding when too many paths =========");
- ConstructedPathT constructed_paths;
- set<EdgeId> seeds;
- //Scaffold(path, current_path, constructed_paths, seeds, false);
- EdgeSet candidate_set;
- FindScaffoldingCandidates(path, current_path, candidate_set, min_length_from);
- for (EdgeWithDistance e : candidate_set) {
- constructed_paths.insert(make_pair(e.e_,PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
- }
- RemoveRedundant(constructed_paths);
- JoinPathsByDejikstra(path, constructed_paths);
-
- RemoveRedundant(constructed_paths);
- DEBUG("Scafolding candidates");
- for (EdgeWithDistance e : candidate_set) {
- DEBUG( "Edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
- }
-
- DEBUG("scaffolding candidates for tree " << constructed_paths.size());
- for (auto iter = constructed_paths.begin(); iter != constructed_paths.end(); ++iter){
- iter->second.p_.Print();
- }
-
- if (constructed_paths.size() > 0 && constructed_paths.upper_bound(constructed_paths.begin()->first) == constructed_paths.end()) {
- DEBUG("All paths from one seed");
- int first_seed_pos = 0;
- auto p = constructed_paths.begin();
- if (constructed_paths.size() > 1) {
- //Searching for path with max number of seeds
- DEBUG("Many paths from one seed " << constructed_paths.size());
- int max_seeds = 0;
- for (auto it = constructed_paths.begin(); it != constructed_paths.end(); ++it) {
- int seed_count = 0;
- for (EdgeId e : seeds) {
- if (it->second.p_.Contains(e)) {
- ++seed_count;
- }
- }
- if (seed_count > max_seeds) {
- max_seeds = seed_count;
- p = it;
- }
- }
- DEBUG("Max seed containing contains " << max_seeds << " seeds");
- //Looking for first seed in that path
- PathWithDistance& winner(p->second);
- first_seed_pos = (int) winner.p_.Size() + 1;
- for (EdgeId e : seeds) {
- int pos = winner.p_.FindFirst(e);
- if (pos != -1)
- first_seed_pos = min(pos, first_seed_pos);
- }
- VERIFY(first_seed_pos != (int) winner.p_.Size() + 1);
- DEBUG("First seed position " << first_seed_pos << " seeds");
- }
- PathWithDistance& path_to_add(p->second);
- int distance = path_to_add.dist_ + (int) path_to_add.p_.Length() - (int) path_to_add.p_.LengthAt(first_seed_pos);
- to_add.push_back(current_path->AddOutEdge(path_to_add.p_[first_seed_pos], distance));
- to_add.back() = to_add.back()->AddPath(path_to_add.p_, first_seed_pos + 1);
- }
- DEBUG("========= Done scaffolding when too many paths =========");
-}
-
-inline void NextPathSearcher::Scaffold(const BidirectionalPath& init_path, Edge* current_path,
- ConstructedPathT& constructed_paths, set<EdgeId>& seeds, bool is_gap) const {
-
- EdgeSet candidate_set;
- FindScaffoldingCandidates(init_path, current_path, candidate_set, 0);
-
- DEBUG("Scafolding candidates");
- for (EdgeWithDistance e : candidate_set) {
- DEBUG( "Edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
- }
-
- OrderScaffoldingCandidates(candidate_set, init_path, current_path, constructed_paths, seeds, is_gap);
-}
-
-inline void NextPathSearcher::FindScaffoldingCandidates(const BidirectionalPath& init_path, Edge* current_path, EdgeSet& candidate_set, size_t min_length_from) const {
- set<EdgeId> path_end;
- set<Edge*> prev_edges = current_path->GetPrevEdges(search_dist_);
- for (Edge* e : prev_edges) {
- path_end.insert(e->GetId());
- path_end.insert(g_.conjugate(e->GetId()));
- }
- map<EdgeId, vector<int> > candidates;
- //current_path->GetPrevPath(0).Print();
- TRACE(current_path->Length() << " " << init_path.Length());
- VERIFY(current_path->Length() >= init_path.Length());
- size_t grown_path_len = current_path->Length() - init_path.Length();
- TRACE("Path already grown to " << grown_path_len);
-
- for (size_t i = 0; i < init_path.Size(); ++i) {
- if (g_.length(init_path[i]) <= min_length_from) {
- continue;
- }
- vector<EdgeWithDistance> jump_edges;
- size_t distance_to_tip = init_path.LengthAt(i) + grown_path_len;
- FindScaffoldingCandidates(init_path[i], distance_to_tip, jump_edges);
- for (EdgeWithDistance e : jump_edges) {
- if (candidates.find(e.e_) == candidates.end()) {
- candidates[e.e_] = vector<int>();
- }
- DEBUG("ADD JUMP EDGE FROM " << g_.int_id(init_path[i]) << " TO " << g_.int_id(e.e_))
- candidates[e.e_].push_back(/*max(e.d_ - (int) distance_to_tip, 100)*/100);
- }
- }
-
- for (std::pair<EdgeId, vector<int> > e : candidates) {
- if (path_end.count(e.first) > 0) {
- continue;
- }
- int avg_distance = 0;
- TRACE( "All distances for edge " << g_.int_id(e.first) << " (" << g_.length(e.first) << ")");
- for (int dist : e.second) {
- TRACE(dist);
- avg_distance += dist;
- }
- avg_distance /= (int) e.second.size();
- candidate_set.insert(EdgeWithDistance(e.first, avg_distance));
- }
-}
-
-inline void NextPathSearcher::FindScaffoldingCandidates(EdgeId e, size_t distance_to_tip, vector<EdgeWithDistance>& jump_edges) const {
- if (g_.length(e) < long_edge_len_ || distance_to_tip - g_.length(e) >= search_dist_)
- return;
-
- TRACE("Edge " << g_.int_id(e) << ", length " << g_.length(e));
- TRACE( distance_to_tip << " " << distance_to_tip - g_.length(e) << " " << search_dist_);
-
- set<EdgeId> candidate_edges;
- int min_distance = std::max((int) distance_to_tip - (int) weight_counter_.GetLib()->GetLeftVar(), 0);
- int max_distance = (int) search_dist_ + (int) g_.length(e);
- TRACE("Looking in range " << min_distance << " " << max_distance);
- weight_counter_.FindJumpCandidates(e, min_distance, max_distance, long_edge_len_, candidate_edges);
- weight_counter_.FindJumpEdges(e, candidate_edges, min_distance, max_distance, jump_edges);
- TRACE("Found " << jump_edges.size() << " candidate(s) from this edge");
-}
-
-inline void NextPathSearcher::OrderScaffoldingCandidates(EdgeSet& candidate_set, const BidirectionalPath& init_path,
- Edge* current_path, ConstructedPathT& constructed_paths,
- set<EdgeId>& seeds, bool is_gap) const {
- size_t grown_path_len = current_path->Length() - init_path.Length();
-
- TRACE("Order Scaffolding Candidates, is gap " << is_gap);
- for (EdgeWithDistance e : candidate_set) {
- TRACE("e " << g_.int_id(e.e_));
- if (constructed_paths.count(e.e_) > 0) {
- TRACE("visited");
- continue;
- }
- ProcessScaffoldingCandidate(e, candidate_set, current_path, grown_path_len, constructed_paths, is_gap);
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- TRACE("current constructed paths " << g_.int_id(p1->first));
- //p1->second.p_.Print();
- }
-
- }
- RemoveRedundant(constructed_paths);
- for (auto it = constructed_paths.begin(); it != constructed_paths.end(); ++it) {
- seeds.insert(it->first);
- }
- JoinPathsByGraph(constructed_paths);
- JoinPathsByPI(constructed_paths);
-
- RemoveRedundant(constructed_paths);
-}
-
-inline void NextPathSearcher::ConvertPaths(const ConstructedPathT& constructed_paths, Edge* current_path, vector<Edge*>& to_add) const {
- for (auto edge = constructed_paths.begin(); edge != constructed_paths.end(); ++edge) {
- to_add.push_back(current_path->AddOutEdge(edge->second.p_[0], edge->second.dist_));
- to_add.back() = to_add.back()->AddPath(edge->second.p_, 1);
- }
-}
-
-inline void NextPathSearcher::RemoveRedundant(ConstructedPathT& constructed_paths) const {
- for (auto edge = constructed_paths.begin(); edge != constructed_paths.end();) {
- if (edge->second.p_.Empty()) {
- edge = constructed_paths.erase(edge);
- } else {
- ++edge;
- }
- }
-}
-
-inline void NextPathSearcher::ProcessScaffoldingCandidate(EdgeWithDistance& e, EdgeSet& candidate_set, Edge* current_path, size_t grown_path_len,
- ConstructedPathT& constructed_paths, bool is_gap) const {
- bool looking_for_tip = is_gap;
- //Search back from e until a tip or the maximum length back
- TRACE(" === Searching back === ");
- TRACE( "Distances: search = " << search_dist_ << ", grown = " << grown_path_len << ", estimated gap = " << e.d_);
- VERIFY(search_dist_ >= grown_path_len);
- VERIFY((int) search_dist_ >= e.d_);
-
- size_t max_length_back = search_dist_ - grown_path_len;
- TRACE(search_dist_ << " " << grown_path_len);
- TRACE( "Searchin for edge of length " << g_.length(e.e_) << " to dist " << max_length_back);
- NextPathSearcher back_searcher(g_, cover_map_, max_length_back, weight_counter_, max_paths_);
- BidirectionalPath jumped_edge(g_, g_.conjugate(e.e_));
- BidirectionalPathSet back_paths = back_searcher.FindNextPaths(jumped_edge, jumped_edge.Back(), false);
- TRACE(" === DONE SEARCHING === ");
- TRACE("Found " << back_paths.size() << " is tip " << IsInTip(g_.EdgeStart(e.e_)) << " look for tip " << looking_for_tip);
-
- if (back_paths.empty()) {
- if (IsInTip(g_.EdgeStart(e.e_)) && looking_for_tip) {
- TRACE( "Added tip edge " << g_.int_id(e.e_) << " (" << g_.length(e.e_) << ")" << ", distance " << e.d_);
- constructed_paths.insert(make_pair(e.e_, PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
- } else if (!IsInTip(g_.EdgeStart(e.e_)) && !looking_for_tip) {
- constructed_paths.insert(make_pair(e.e_, PathWithDistance(BidirectionalPath(g_, e.e_), e.d_)));
- }
- } else {
- TRACE("Found several back paths " << back_paths.size());
- BidirectionalPathSet reached_paths;
- FilterBackPaths(back_paths, g_.conjugate(current_path->GetId()), reached_paths, search_dist_ - grown_path_len);
- //Found a path back to the init path
- if (reached_paths.size() > 0 && !looking_for_tip) {
- TRACE("Found " << reached_paths.size() << " direct path(s) back");
- int i = 0;
- for (BidirectionalPath* p : reached_paths) {
- TRACE("Processing reached path " << i++);
- BidirectionalPath cp = p->Conjugate();
-                //Adding the jumped edge since it is not included in the path
- cp.PushBack(e.e_);
- //cp.Print();
- int reached_edge_pos = cp.FindLast(current_path->GetId());
- VERIFY(reached_edge_pos != -1);
- AddConstructedPath(cp, reached_edge_pos + 1, 0, constructed_paths);
- }
- } else if (reached_paths.size() > 0 && looking_for_tip) {
- DEBUG("Impossible: back path reaches tip");
- } else if (looking_for_tip) {
- TRACE( "Found " << back_paths.size() << " path(s) going back to tip");
- int i = 0;
- for (BidirectionalPath* p : back_paths) {
- DEBUG("Processing tip path " << i++);
- BidirectionalPath cp = p->Conjugate();
-                //Adding the jumped edge since it is not included in the path
- cp.PushBack(e.e_);
- AddConstructedPath(cp, 0, EstimateGapForPath(candidate_set, cp), constructed_paths);
- }
- }
- }
- for (BidirectionalPath* p : back_paths) {
- delete p;
- }
-}
-
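-//For every scaffolding candidate that occurs in the path, the estimator below
-//accumulates (paired-info distance - offset of that candidate within the path)
-//and divides by the total number of candidates, so candidates absent from the
-//path contribute zero. A non-positive average falls back to a default gap of
-//100 bp. The division assumes the candidate set is non-empty.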
-inline int NextPathSearcher::EstimateGapForPath(EdgeSet& candidate_set, const BidirectionalPath& p) const {
- int gap = 0;
- int count = 0;
- for (EdgeWithDistance e : candidate_set) {
- int pos = p.FindFirst(e.e_);
- if (pos != -1) {
- size_t length_to_e = 0;
- for (int i = 0; i < pos; ++i) {
- length_to_e += p.LengthAt(i);
- }
- gap += e.d_ - (int) length_to_e;
- }
- ++count;
- }
- gap /= count;
- return gap > 0 ? gap : 100;
-}
-
-inline void NextPathSearcher::AddConstructedPath(const BidirectionalPath& cp, size_t from, int gap, ConstructedPathT& constructed_paths) const {
- VERIFY(!cp.Empty());
-
-    //Add only if no constructed path with the same candidate (back edge) and the same front edge exists yet
- EdgeId candidate = cp.Back();
- for (auto it = constructed_paths.lower_bound(candidate); it != constructed_paths.upper_bound(candidate); ++it) {
- if (it->second.p_.Front() == cp.Front()) {
- return;
- }
- }
-
- TRACE("Adding path starting from " << from);
- constructed_paths.insert(make_pair(candidate, PathWithDistance(cp.SubPath(from), gap)));
- TRACE("add constructed path " << g_.int_id(candidate));
- //cp.Print();
-
- for (size_t i = 0; i < cp.Size() - 1; ++i) {
- EdgeId edge = cp[i];
- for (auto it = constructed_paths.lower_bound(edge); it != constructed_paths.upper_bound(edge); ++it) {
- TRACE("found " << g_.int_id(edge));
- //it->second.p_.Print();
- TRACE("clear");
- it->second.p_.Clear();
- }
- }
-}
-inline bool NextPathSearcher::IsOutTip(VertexId v) const {
- if (g_.OutgoingEdgeCount(v) == 0) {
- return true;
- }
- if (g_.OutgoingEdgeCount(v) != 1) {
- return false;
- }
- EdgeId oute = *g_.OutgoingEdges(v).begin();
- for (EdgeId ine : g_.IncomingEdges(v)) {
- if (oute == ine) {
- return true;
- }
- }
- return false;
-}
-inline bool NextPathSearcher::IsInTip(VertexId v) const {
- if (g_.IncomingEdgeCount(v) == 0) {
- return true;
- }
- if (g_.IncomingEdgeCount(v) != 1) {
- return false;
- }
- EdgeId ine = *g_.IncomingEdges(v).begin();
- for (EdgeId oute : g_.OutgoingEdges(v)) {
- if (oute == ine) {
- return true;
- }
- }
- return false;
-}
-inline void NextPathSearcher::FilterBackPaths(BidirectionalPathSet& back_paths, EdgeId edge_to_reach, BidirectionalPathSet& reached_paths,
- size_t max_len) const {
- TRACE("Searching for proper back paths");
-
- int i = 0;
- for (auto piter = back_paths.begin(); piter != back_paths.end();) {
- BidirectionalPath* p = *piter;
- VERIFY(!p->Empty());
- EdgeId last_e = p->Back();
- VertexId last_v = g_.EdgeEnd(last_e);
- TRACE("Processing path " << i++);
- //p->Print();
- if (p->FindFirst(edge_to_reach) != -1) {
- reached_paths.insert(p);
- ++piter;
-        } else if (!IsInTip(last_v) && p->Length() < max_len) {
- ++piter;
- } else {
- delete p;
- piter = back_paths.erase(piter);
- }
- }
-}
-
-inline void NextPathSearcher::JoinPathsByGraph(ConstructedPathT& constructed_paths) const {
- TRACE("== try to join paths using graph ==");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- //p1->second.p_.Print();
- }
- TRACE("== printed ==");
-
- //Removing edges whose seed is contained in any other path
- set<EdgeId> to_remove;
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- if (to_remove.count(p1->first) > 0) {
- continue;
- }
- for (auto p2 = constructed_paths.begin(); p2 != constructed_paths.end(); ++p2) {
- if (p1->first == p2->first || to_remove.count(p2->first) > 0) {
- continue;
- }
- if (p1->second.p_.Contains(p2->first)) {
- to_remove.insert(p2->first);
- }
- }
- }
- for (auto p = constructed_paths.begin(); p != constructed_paths.end(); ) {
- if (to_remove.count(p->first) > 0) {
- p = constructed_paths.erase(p);
- } else {
- ++p;
- }
- }
-}
-
-inline void NextPathSearcher::JoinPathsByPI(ConstructedPathT& constructed_paths) const {
- DEBUG("== try to join paths ===");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- p1->second.p_.Print();
- }
- DEBUG("== printed ===");
-
- //Checking paired info
- set<EdgeId> visited;
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- if (visited.count(p1->first) > 0) {
- continue;
- }
- for (auto p2 = constructed_paths.begin(); p2 != constructed_paths.end(); ++p2) {
- if (p1->first == p2->first) {
- continue;
- }
- BidirectionalPath& path1 = p1->second.p_;
- BidirectionalPath& path2 = p2->second.p_;
- bool has_pi = false;
- for (size_t i = 0; i < path1.Size(); ++i) {
-
- for (size_t j = 0; j < path2.Size(); ++j) {
- size_t len_to_e2 = path2.Length() - path2.LengthAt(j);
- size_t dist = path1.LengthAt(i) + len_to_e2;
- size_t min_dist = (size_t) max(0, (int) dist - (int) weight_counter_.GetLib()->GetLeftVar());
- size_t max_dist = dist + search_dist_;
- DEBUG("try to find pair info between " << g_.int_id(path1[i]) << " and " << g_.int_id(path2[j])
- << " distance from " << min_dist
- <<" to " << max_dist);
- if (path1[i] != path2[j] &&
- weight_counter_.HasPI(path1[i], path2[j], min_dist, max_dist)) {
- has_pi = true;
- break;
- }
- }
- if (has_pi) {
- break;
- }
- }
-
- set<EdgeId> edges_path1;
- for (size_t i = 0; i < path1.Size(); ++i) {
- edges_path1.insert(path1.At(i));
- }
- for (size_t i = 0; i < path2.Size(); ++i) {
- if (edges_path1.count(path2.At(i)) > 0 || edges_path1.count(g_.conjugate(path2.At(i))) > 0) {
- has_pi = false;
- }
- }
- if (has_pi) {
- DEBUG("has pi from ");
- path1.Print();
- DEBUG("to");
- path2.Print();
- path1.PushBack(path2.Front(), 100);
- for (int i = 1; i < (int) path2.Size(); ++i) {
- path1.PushBack(path2[i], path2.GapAt(i), path2.TrashPreviousAt(i), path2.TrashCurrentAt(i));
- }
- DEBUG("new path");
- path1.Print();
- path2.Clear();
- visited.insert(p2->first);
- }
- }
- }
-}
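-
-//The two Generate() overloads below enumerate orderings of the constructed
-//paths by recursive in-place swaps (a standard permutation generator), pruning
-//any ordering whose consecutive fragments are not linked in the connections
-//map. The wrapper refuses inputs with more than 5 fragments (returning no
-//orderings), so at most 5! = 120 orderings are ever examined.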
-inline void Generate(size_t l, size_t r, vector<size_t> a,
- vector<vector<size_t> >& res, vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) {
- if (l == r) {
- DEBUG("result " << a.size())
- res.push_back(a);
- } else {
- for (size_t i = l; i < r; ++i) {
- if (l > 0 && connections[all_paths[a[l - 1]]].count(all_paths[a[i]]) == 0) {
- DEBUG(" not connected " << a[l-1] << " and " << a[i])
- continue;
- }
- DEBUG(" connected " << l-1 << " and " << i)
- size_t v = a[l];
- a[l] = a[i];
- a[i] = v;
- Generate(l + 1, r, a, res, all_paths, connections);
- v = a[l];
- a[l] = a[i];
- a[i] = v;
- }
- }
-}
-
-inline vector<vector<size_t> > Generate(size_t n, vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) {
- vector<vector<size_t> > result;
- if (n > 5) {
- return result;
- }
- vector<size_t> a;
- for (size_t i = 0; i < n; ++i) {
- a.push_back(i);
- }
- Generate(0, n, a, result, all_paths, connections);
- return result;
-}
-
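-//FindDistances() runs a bounded Dijkstra (distance bound search_dist_, with
-//3000 limiting the search size) from the end of the given path and, for each
-//candidate path it can reach, reports the summed length of the shortest
-//connecting edge sequence plus k, presumably to account for the k-mer overlap
-//at the junction.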
-inline map<PathWithDistance*, size_t> NextPathSearcher::FindDistances(const BidirectionalPath& p, vector<PathWithDistance*>& paths) const {
- DEBUG("find distances from e " << g_.int_id(p.Back()))
- map<PathWithDistance*, size_t> result;
- DijkstraHelper<Graph>::BoundedDijkstra dijkstra(DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, search_dist_, 3000));
- dijkstra.Run(g_.EdgeEnd(p.Back()));
- DEBUG("paths size " << paths.size());
- for (auto ipath = paths.begin(); ipath != paths.end(); ++ipath) {
- vector<EdgeId> shortest_path = dijkstra.GetShortestPathTo(g_.EdgeStart((*ipath)->p_.Front()));
- if (shortest_path.size() != 0) {
- int gap = 0;
- for (size_t i = 0; i < shortest_path.size(); ++i) {
- gap += (int) g_.length(shortest_path[i]);
- }
- gap += (int) g_.k();
- result[*ipath] = gap;
- }
- }
- DEBUG("return result " << result.size());
- return result;
-}
-
-inline void NextPathSearcher::FindConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const {
- for (auto p1 = all_paths.begin(); p1 != all_paths.end(); ++p1) {
- map<PathWithDistance*, size_t> distances = FindDistances((*p1)->p_, all_paths);
- connections[*p1] = set<PathWithDistance*>();
- for (auto iter = distances.begin(); iter != distances.end(); ++iter) {
- if ((*p1)->p_.Length() + iter->second < search_dist_){
- connections[*p1].insert(iter->first);
- }
- }
- }
-}
-
-inline void NextPathSearcher::ConnectPaths(const BidirectionalPath& init_path, vector<vector<PathWithDistance*> >& variants) const {
- if (variants.size() == 1 && variants[0].size() > 0) {
- vector<PathWithDistance*> res = variants[0];
- vector<PathWithDistance*> for_dijkstra;
- BidirectionalPath& path1 = res[0]->p_;
- for_dijkstra.push_back(res[0]);
- map<PathWithDistance*, size_t> distances = FindDistances(init_path, for_dijkstra);
- size_t gap = distances.count(res[0]) > 0 ? distances[res[0]] : 100 + g_.k();
- BidirectionalPath p(path1);
- path1.Clear();
- path1.PushBack(p.Front(), (int)gap);
- path1.PushBack(p.SubPath(1));
- for (size_t i = 1; i < res.size(); ++i) {
- for_dijkstra.clear();
- for_dijkstra.push_back(res[i]);
- BidirectionalPath& path2 = res[i]->p_;
- distances = FindDistances(path1, for_dijkstra);
- gap = distances.count(res[i]) > 0 ? distances[res[i]] : 100 + g_.k();
- path1.PushBack(path2.Front(), (int)gap);
-            for (int j = 1; j < (int) path2.Size(); ++j) {
-                path1.PushBack(path2[j], path2.GapAt(j), path2.TrashPreviousAt(j), path2.TrashCurrentAt(j));
- }
- path2.Clear();
- }
- } else if (variants.size() > 1) {
- vector<PathWithDistance*> res = variants[0];
- EdgeId last = res.back()->p_.Back();
- for (size_t i = 1; i < variants.size(); ++i) {
- if (last != variants[i].back()->p_.Back()) {
- return;
- }
- }
- for (size_t i = 0; i < res.size(); ++i) {
- res[i]->p_.Clear();
- }
- int gap = (int) 1000 + (int) g_.k();
- res[0]->p_.PushBack(last, gap);
- }
-}
-
-inline vector<vector<PathWithDistance*> > NextPathSearcher::FilterConnections(vector<PathWithDistance*>& all_paths, map<PathWithDistance*, set<PathWithDistance*> >& connections) const {
- vector<vector<PathWithDistance*> > variants;
- DEBUG("filter connections " << connections.size() << " all paths size " << all_paths.size())
- vector<vector<size_t> > permutations = Generate(all_paths.size(), all_paths, connections);
- DEBUG("generated all permutations " << permutations.size());
- for (size_t i = 0; i < permutations.size(); ++i) {
- vector<PathWithDistance*> variant;
- for (size_t j = 0; j < permutations[i].size(); ++j) {
- variant.push_back(all_paths[permutations[i][j]]);
- }
- variants.push_back(variant);
- }
- return variants;
-}
-
-inline void NextPathSearcher::JoinPathsByDejikstra(const BidirectionalPath& init_path, ConstructedPathT& constructed_paths) const {
-    DEBUG("== try to join paths by Dijkstra ===");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- p1->second.p_.Print();
- }
- DEBUG("== printed ===");
-
- vector<PathWithDistance*> all_paths;
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- if (p1->second.p_.Size() != 0) {
- all_paths.push_back(&p1->second);
- }
- }
- map<PathWithDistance*, set<PathWithDistance*> > connections;
- FindConnections(all_paths, connections);
- vector<vector<PathWithDistance*> > variants = FilterConnections(all_paths, connections);
- ConnectPaths(init_path, variants);
-
- DEBUG("== after to join paths ===");
- for (auto p1 = constructed_paths.begin(); p1 != constructed_paths.end(); ++p1) {
- p1->second.p_.Print();
- }
- DEBUG("== printed ===");
-}
-
-} // namespace path_extend
diff --git a/src/modules/algorithms/path_extend/overlap_analysis.hpp b/src/modules/algorithms/path_extend/overlap_analysis.hpp
deleted file mode 100644
index b119a7d..0000000
--- a/src/modules/algorithms/path_extend/overlap_analysis.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-#pragma once
-
-#include "dev_support/logger/logger.hpp"
-#include "dev_support/range.hpp"
-#include "ssw/ssw_cpp.h"
-
-namespace debruijn_graph {
-using omnigraph::Range;
-
-struct OverlapInfo {
- Range r1;
- Range r2;
- size_t match_cnt;
-
- OverlapInfo(const Range& r1_, const Range& r2_, size_t match_cnt_)
- : r1(r1_),
- r2(r2_),
- match_cnt(match_cnt_) {
- VERIFY(match_cnt <= std::min(r1.size(), r2.size()));
- }
-
- OverlapInfo()
- : match_cnt(0) {
- }
-
- double identity() const {
- if (match_cnt == 0)
- return 0.;
- return (double)match_cnt / (double)size();
- }
-
- size_t size() const {
- return std::max(r1.size(), r2.size());
- }
-
- bool operator==(const OverlapInfo &that) const {
- return r1 == that.r1 && r2 == that.r2 && match_cnt == that.match_cnt;
- }
-
- bool operator!=(const OverlapInfo &that) const {
- return !(*this == that);
- }
-};
-
-std::ostream& operator<<(std::ostream& os, const OverlapInfo& info) {
- return os << "R1: [" << info.r1.start_pos << ", " << info.r1.end_pos
- << "]; R2: [" << info.r2.start_pos << ", " << info.r2.end_pos << "]"
- << "; match_cnt: " << info.match_cnt;
-}
-
-class SWOverlapAnalyzer {
- static const uint32_t CIGAR_FLAG_MASK = (1 << 4) - 1;
- static const uint32_t CIGAR_MATCH_FLAG = 7;
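-    //SSW reports the CIGAR as packed 32-bit entries in the BAM convention:
-    //the low 4 bits hold the operation code and the remaining bits the run
-    //length; operation 7 corresponds to '=' (sequence match), which is what
-    //CountMatches() accumulates.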
- typedef typename Graph::EdgeId EdgeId;
- size_t flank_length_;
-
- const StripedSmithWaterman::Aligner aligner_;
- const StripedSmithWaterman::Filter filter_;
-
- size_t CountMatches(std::vector<uint32_t> cigar) const {
- size_t match_cnt = 0;
- for (uint32_t entry : cigar) {
- if ((entry & CIGAR_FLAG_MASK) == CIGAR_MATCH_FLAG) {
- match_cnt += (entry >> 4);
- }
- }
- return match_cnt;
- }
-
- OverlapInfo InnerAnalyze(const Sequence& s1, const Sequence& s2) const {
- if (s1.size() == 0 || s2.size() == 0) {
- return OverlapInfo();
- }
- StripedSmithWaterman::Alignment alignment;
- if (aligner_.Align(s1.str().c_str(), s2.str().c_str(), int(s2.size()), filter_, &alignment)) {
- if (alignment.sw_score > 0) {
- return OverlapInfo(Range(alignment.query_begin, alignment.query_end + 1),
- Range(alignment.ref_begin, alignment.ref_end + 1),
- CountMatches(alignment.cigar));
- }
- }
- return OverlapInfo();
- }
-
-public:
- SWOverlapAnalyzer(size_t flank_length)
- : flank_length_(flank_length),
- aligner_(/*match_score*/1,
- /*mismatch_penalty*/3,
- /*gap_opening_penalty*/4,
- /*gap_extending_penalty*/3) {
- }
-
-
- OverlapInfo AnalyzeOverlap(const Sequence& s1, const Sequence& s2) const {
- DEBUG("Analysis started");
- size_t start1 = flank_length_ > s1.size() ? 0 : s1.size() - flank_length_;
- size_t end2 = flank_length_ > s2.size() ? s2.size() : flank_length_;
-
- DEBUG("s1 " << s1.Subseq(start1, s1.size()));
- DEBUG("s2 " << s2.Subseq(0, end2));
- OverlapInfo result = InnerAnalyze(s1.Subseq(start1, s1.size()), s2.Subseq(0, end2));
- if (result == OverlapInfo()) {
- DEBUG("Empty overlap")
- return result;
- }
-
- result.r1.shift(int(start1));
- DEBUG("Result " << result)
- return result;
- }
-
- template<class Graph>
- OverlapInfo AnalyzeOverlap(const Graph& g, EdgeId e1, EdgeId e2) const {
- DEBUG("Analyzing edges " << g.str(e1) << " and " << g.str(e2));
- return AnalyzeOverlap(g.EdgeNucls(e1), g.EdgeNucls(e2));
- }
-
-private:
- DECL_LOGGER("SWOverlapAnalyzer");
-};
-
-}
diff --git a/src/modules/algorithms/path_extend/paired_library.hpp b/src/modules/algorithms/path_extend/paired_library.hpp
deleted file mode 100644
index f176ab9..0000000
--- a/src/modules/algorithms/path_extend/paired_library.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * paired_library.hpp
- *
- * Created on: Feb 19, 2012
- * Author: andrey
- */
-
-#ifndef PAIRED_LIBRARY_HPP_
-#define PAIRED_LIBRARY_HPP_
-
-#include "pipeline/graph_pack.hpp"
-#include "paired_info/paired_info.hpp"
-#include "ideal_pair_info.hpp"
-
-#include "math/xmath.h"
-
-namespace path_extend {
-
-using debruijn_graph::Graph;
-using debruijn_graph::EdgeId;
-
-using omnigraph::de::PairedInfoIndexT;
-typedef omnigraph::de::PairInfo<EdgeId> DePairInfo;
-using omnigraph::de::Point;
-
-struct PairedInfoLibrary {
- PairedInfoLibrary(size_t k, const Graph& g, size_t readS, size_t is,
- size_t is_min, size_t is_max, size_t is_var,
- bool is_mp,
- const std::map<int, size_t>& is_distribution)
- : g_(g),
- k_(k),
- read_size_(readS),
- is_(is),
- is_min_(is_min),
- is_max_(is_max),
- is_var_(is_var),
- is_mp_(is_mp),
- single_threshold_(-1.0),
- coverage_coeff_(1.0),
- ideal_pi_counter_(g, (int) is_min, (int) is_max, readS, is_distribution) {
- }
-
- virtual ~PairedInfoLibrary() {}
-
- void SetCoverage(double cov) { coverage_coeff_ = cov; }
- void SetSingleThreshold(double threshold) { single_threshold_ = threshold; }
-
- virtual size_t FindJumpEdges(EdgeId e, set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const = 0;
- virtual void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const = 0;
- virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int distance, bool from_interval = false) const = 0;
- virtual double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const = 0;
-
- double IdealPairedInfo(EdgeId e1, EdgeId e2, int distance, bool additive = false) const {
- return ideal_pi_counter_.IdealPairedInfo(e1, e2, distance, additive);
- }
-
- size_t GetISMin() const { return is_min_; }
- double GetSingleThreshold() const { return single_threshold_; }
- double GetCoverageCoeff() const { return coverage_coeff_; }
- size_t GetISMax() const { return is_max_; }
- size_t GetIsVar() const { return is_var_; }
- size_t GetLeftVar() const { return is_ - is_min_; }
- size_t GetRightVar() const { return is_max_ - is_; }
- size_t GetReadSize() const { return read_size_; }
- bool IsMp() const { return is_mp_; }
-
- const Graph& g_;
- size_t k_;
- size_t read_size_;
- size_t is_;
- size_t is_min_;
- size_t is_max_;
- size_t is_var_;
- bool is_mp_;
- double single_threshold_;
- double coverage_coeff_;
- IdealPairInfoCounter ideal_pi_counter_;
-protected:
- DECL_LOGGER("PathExtendPI");
-};
-
-template<class Index>
-struct PairedInfoLibraryWithIndex : public PairedInfoLibrary {
-
- PairedInfoLibraryWithIndex(size_t k, const Graph& g, size_t readS, size_t is, size_t is_min, size_t is_max, size_t is_div,
- const Index& index, bool is_mp,
- const std::map<int, size_t>& is_distribution)
- : PairedInfoLibrary(k, g, readS, is, is_min, is_max, is_div, is_mp, is_distribution),
- index_(index) {}
-
- size_t FindJumpEdges(EdgeId e, std::set<EdgeId>& result, int min_dist, int max_dist, size_t min_len = 0) const override {
- VERIFY(index_.size() > 0);
- result.clear();
-
- auto infos = index_.Get(e);
- // We do not care about iteration order here - all the edges collected
- // will be inside std::set<EdgeId>
- for (auto it : infos) {
- EdgeId e2 = it.first;
- if (e2 == e)
- continue;
- if (g_.length(e2) < min_len)
- continue;
- for (auto point : it.second) {
- omnigraph::de::DEDistance dist = point.d;
- if (math::le(dist, (omnigraph::de::DEDistance) max_dist) &&
- math::ge(dist, (omnigraph::de::DEDistance) min_dist)) {
- result.insert(e2);
- }
- }
- }
- return result.size();
- }
-
-
- void CountDistances(EdgeId e1, EdgeId e2, vector<int>& dist, vector<double>& w) const override {
- VERIFY(index_.size() > 0);
- if (e1 == e2)
- return;
-
- for (auto point : index_.Get(e1, e2)) {
- int pairedDistance = de::rounded_d(point);
- dist.push_back(pairedDistance);
- w.push_back(point.weight);
- }
- }
-
- double CountPairedInfo(EdgeId e1, EdgeId e2, int distance,
- bool from_interval = false) const override {
- VERIFY(index_.size() != 0);
- double weight = 0.0;
-
- for (auto point : index_.Get(e1, e2)) {
- int pairedDistance = de::rounded_d(point);
- int distanceDev = (int) point.variance(); //max((int) pointIter->var, (int) is_variation_);
- //Can be modified according to distance comparison
- int d_min = distance - distanceDev;
- int d_max = distance + distanceDev;
-
- if (from_interval) {
- d_min -= (int) (is_ - is_min_);
- d_max += (int) (is_max_ - is_);
- }
- if (pairedDistance >= d_min && pairedDistance <= d_max) {
- weight += point.weight;
- }
- }
- return weight;
- }
-
- double CountPairedInfo(EdgeId e1, EdgeId e2, int dist_min, int dist_max) const override {
- VERIFY(index_.size() != 0);
- double weight = 0.0;
- for (const auto &point : index_.Get(e1, e2)) {
- int dist = de::rounded_d(point);
- if (dist >= dist_min && dist <= dist_max)
- weight += point.weight;
- }
- return weight;
- }
-
- const Index& index_;
-protected:
- DECL_LOGGER("PathExtendPI");
-};
-
-typedef std::vector<shared_ptr<PairedInfoLibrary> > PairedInfoLibraries;
-
-} // path extend
-
-#endif /* PAIRED_LIBRARY_HPP_ */
diff --git a/src/modules/algorithms/path_extend/path_extend_launch.hpp b/src/modules/algorithms/path_extend/path_extend_launch.hpp
deleted file mode 100644
index ba1d4e3..0000000
--- a/src/modules/algorithms/path_extend/path_extend_launch.hpp
+++ /dev/null
@@ -1,1257 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * lc_launch.hpp
- *
- * Created on: Dec 1, 2011
- * Author: andrey
- */
-
-#ifndef PATH_EXTEND_LAUNCH_HPP_
-#define PATH_EXTEND_LAUNCH_HPP_
-
-#include "scaffolder2015/scaffold_graph_constructor.hpp"
-#include "pe_config_struct.hpp"
-#include "pe_resolver.hpp"
-#include "path_extender.hpp"
-#include "pe_io.hpp"
-#include "path_visualizer.hpp"
-#include "loop_traverser.hpp"
-#include "assembly_graph/graph_alignment/long_read_storage.hpp"
-#include "next_path_searcher.hpp"
-#include "scaffolder2015/extension_chooser2015.hpp"
-#include "algorithms/genome_consistance_checker.hpp"
-#include "scaffolder2015/scaffold_graph.hpp"
-#include "scaffolder2015/scaffold_graph_visualizer.hpp"
-
-namespace path_extend {
-
-using namespace debruijn_graph;
-
-struct PathExtendParamsContainer {
-
- PathExtendParamsContainer(const pe_config::MainPEParamsT& pe_cfg_,
- const std::string& output_dir_,
- const std::string& contigs_name_,
- const std::string& scf_name_,
- config::pipeline_type mode_,
- bool uneven_depth_,
- bool avoid_rc_connections_,
- bool use_scaffolder_,
- bool output_broken_scaffolds_ = true):
- pe_cfg(pe_cfg_),
- pset(pe_cfg_.param_set),
- output_dir(output_dir_),
- etc_dir(output_dir + pe_cfg_.etc_dir + "/"),
- contigs_name(scf_name_),
- broken_contigs(contigs_name_),
- mode(mode_),
- uneven_depth(uneven_depth_),
- avoid_rc_connections(avoid_rc_connections_),
- use_scaffolder(use_scaffolder_),
- traverse_loops(true),
- output_broken_scaffolds(output_broken_scaffolds_)
- {
- if (!(use_scaffolder && pset.scaffolder_options.enabled)) {
- contigs_name = contigs_name_;
- traverse_loops = false;
- output_broken_scaffolds = false;
- }
- }
-
- const pe_config::MainPEParamsT& pe_cfg;
- const pe_config::ParamSetT& pset;
-
- std::string output_dir;
- std::string etc_dir;
-
- std::string contigs_name;
- std::string broken_contigs;
-
- config::pipeline_type mode;
- bool uneven_depth;
-
- bool avoid_rc_connections;
- bool use_scaffolder;
- bool traverse_loops;
- bool output_broken_scaffolds;
-};
-
-inline void DebugOutputPaths(const conj_graph_pack& gp,
- const PathExtendParamsContainer& params,
- const PathContainer& paths,
- const string& name) {
- PathInfoWriter path_writer;
- PathVisualizer visualizer;
-
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
- ContigWriter writer(gp.g, constructor, gp.components, params.mode == config::pipeline_type::plasmid);
-
- if (!params.pe_cfg.debug_output) {
- return;
- }
- writer.OutputPaths(paths, params.etc_dir + name);
- if (params.pe_cfg.output.write_paths) {
- path_writer.WritePaths(paths, params.etc_dir + name + ".dat");
- }
- if (params.pe_cfg.viz.print_paths) {
- visualizer.writeGraphWithPathsSimple(gp, params.etc_dir + name + ".dot", name, paths);
- }
-}
-
-inline double GetWeightThreshold(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT& pset) {
- return lib->IsMp() ? pset.mate_pair_options.weight_threshold : pset.extension_options.weight_threshold;
-}
-
-inline double GetPriorityCoeff(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT& pset) {
- return lib->IsMp() ? pset.mate_pair_options.priority_coeff : pset.extension_options.priority_coeff;
-}
-
-inline void SetSingleThresholdForLib(shared_ptr<PairedInfoLibrary> lib, const pe_config::ParamSetT &pset, double threshold, double correction_coeff = 1.0) {
- if (lib->IsMp()) {
- lib->SetSingleThreshold(pset.mate_pair_options.use_default_single_threshold || math::le(threshold, 0.0) ?
- pset.mate_pair_options.single_threshold : threshold);
- }
- else {
- double t = pset.extension_options.use_default_single_threshold || math::le(threshold, 0.0) ?
- pset.extension_options.single_threshold : threshold;
- t = correction_coeff * t;
- lib->SetSingleThreshold(t);
- }
-}
-
-
-inline void OutputBrokenScaffolds(PathContainer& paths,
- const PathExtendParamsContainer& params,
- int k,
- const ContigWriter& writer,
- const std::string& filename) {
- if (!params.pset.scaffolder_options.enabled
- || !params.use_scaffolder
- || params.pe_cfg.obs == obs_none) {
- return;
- }
-
- int min_gap = params.pe_cfg.obs == obs_break_all ? k / 2 : k;
-
- ScaffoldBreaker breaker(min_gap, paths);
- breaker.container().SortByLength();
- writer.OutputPaths(breaker.container(), filename);
-}
-
-inline void AddPathsToContainer(const conj_graph_pack& gp,
- const std::vector<PathInfo<Graph> > paths,
- size_t size_threshold, PathContainer& result) {
- for (size_t i = 0; i < paths.size(); ++i) {
- auto path = paths.at(i);
- vector<EdgeId> edges = path.getPath();
- if (edges.size() <= size_threshold) {
- continue;
- }
- BidirectionalPath* new_path = new BidirectionalPath(gp.g, edges);
- BidirectionalPath* conj_path = new BidirectionalPath(new_path->Conjugate());
- new_path->SetWeight((float) path.getWeight());
- conj_path->SetWeight((float) path.getWeight());
- result.AddPair(new_path, conj_path);
- }
- DEBUG("Long reads paths " << result.size() << " == ");
-}
-
-bool HasOnlyMPLibs(const config::dataset& dataset_info) {
- for (const auto& lib : dataset_info.reads) {
- if (!((lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs) &&
- lib.data().mean_insert_size > 0.0)) {
- return false;
- }
- }
- return true;
-}
-
-bool UseCoverageResolverForSingleReads(const config::dataset& dataset_info,
- const io::LibraryType& type) {
- return HasOnlyMPLibs(dataset_info) && (type == io::LibraryType::HQMatePairs);
-}
-
-inline size_t CountEdgesInGraph(const Graph& g) {
- size_t count = 0;
- for (auto iter = g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- count++;
- }
- return count;
-}
-
-inline size_t GetNumberMPPaths(const Graph& g) {
- size_t count_edge = CountEdgesInGraph(g);
- if (count_edge < 1000) {
- return 1000;
- }
- if (count_edge < 10000) {
- return 100;
- }
- return 50;
-}
-
-inline string LibStr(size_t count) {
- return count == 1 ? "library" : "libraries";
-}
-
-inline void ClonePathContainer(PathContainer& spaths, PathContainer& tpaths, GraphCoverageMap& tmap) {
- tpaths.clear();
- tmap.Clear();
-
- for (auto iter = spaths.begin(); iter != spaths.end(); ++iter) {
- BidirectionalPath& path = *iter.get();
- BidirectionalPath* new_path = new BidirectionalPath(path.graph());
- new_path->Subscribe(&tmap);
- new_path->PushBack(path);
-
- BidirectionalPath& cpath = *iter.getConjugate();
- BidirectionalPath* new_cpath = new BidirectionalPath(cpath.graph());
- new_cpath->Subscribe(&tmap);
- new_cpath->PushBack(cpath);
-
- tpaths.AddPair(new_path, new_cpath);
- }
-}
-
-inline void FinalizePaths(const PathExtendParamsContainer& params,
- PathContainer& paths,
- const Graph& g,
- GraphCoverageMap& cover_map,
- size_t min_edge_len,
- size_t max_path_diff,
- bool mate_pairs = false) {
- PathExtendResolver resolver(cover_map.graph());
-
- if (params.pset.remove_overlaps) {
- resolver.removeOverlaps(paths, cover_map, min_edge_len, max_path_diff,
- params.pset.cut_all_overlaps,
- (params.mode == config::pipeline_type::moleculo));
- }
- else {
- resolver.removeEqualPaths(paths, cover_map, min_edge_len);
- }
- if (mate_pairs) {
- resolver.RemoveMatePairEnds(paths, min_edge_len);
- }
- if (params.avoid_rc_connections) {
- paths.FilterInterstandBulges();
- }
- paths.FilterEmptyPaths();
- if (!mate_pairs) {
- resolver.addUncoveredEdges(paths, cover_map);
- }
- if (params.pset.path_filtration.enabled) {
-        LengthPathFilter(g, params.pset.path_filtration.min_length).filter(paths);
- IsolatedPathFilter(g, params.pset.path_filtration.min_length_for_low_covered, params.pset.path_filtration.min_coverage).filter(paths);
- IsolatedPathFilter(g, params.pset.path_filtration.isolated_min_length).filter(paths);
- }
- paths.SortByLength();
- for(auto& path : paths) {
- path.first->ResetOverlaps();
- }
-
-}
-
-inline void TraverseLoops(PathContainer& paths, GraphCoverageMap& cover_map, shared_ptr<ContigsMaker> extender) {
- INFO("Traversing tandem repeats");
- LoopTraverser loopTraverser(cover_map.graph(), cover_map, extender);
- loopTraverser.TraverseAllLoops();
- paths.SortByLength();
-}
-
-inline bool IsForSingleReadExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
- io::LibraryType lt = lib.type();
- return (lib.data().single_reads_mapped ||
- lt == io::LibraryType::PacBioReads ||
- lt == io::LibraryType::SangerReads ||
- lt == io::LibraryType::NanoporeReads ||
- lib.is_contig_lib());
-}
-
-inline bool IsForPEExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
- return (lib.type() == io::LibraryType::PairedEnd &&
- lib.data().mean_insert_size > 0.0);
-}
-
-inline bool IsForShortLoopExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
- return (lib.type() == io::LibraryType::PairedEnd &&
- lib.data().mean_insert_size > 0.0);
-}
-
-inline bool IsForScaffoldingExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
- return (lib.type() == io::LibraryType::PairedEnd &&
- lib.data().mean_insert_size > 0.0);
-}
-
-inline bool IsForMPExtender(const io::SequencingLibrary<config::DataSetData> &lib) {
- return lib.data().mean_insert_size > 0.0 &&
- (lib.type() == io::LibraryType::HQMatePairs ||
- lib.type() == io::LibraryType::MatePairs);
-}
-
-enum class PathExtendStage {
- PEStage,
- PEPolishing,
- MPStage,
- FinalizingPEStage,
- FinalPolishing,
-};
-
-inline bool IsPEStage(PathExtendStage stage) {
- return stage == PathExtendStage::PEPolishing || stage == PathExtendStage::PEStage;
-}
-
-inline bool IsMPStage(PathExtendStage stage) {
- return stage == PathExtendStage::MPStage;
-}
-
-inline bool IsFinalStage(PathExtendStage stage) {
- return stage == PathExtendStage::FinalizingPEStage || stage == PathExtendStage::FinalPolishing;
-}
-
-inline bool IsPolishingStage(PathExtendStage stage) {
- return stage == PathExtendStage::PEPolishing || stage == PathExtendStage::FinalPolishing;
-}
-
-
-template<class Index>
-inline shared_ptr<PairedInfoLibrary> MakeNewLib(const config::dataset::Library& lib,
- const conj_graph_pack::graph_t& g,
- const Index& paired_index) {
- size_t read_length = lib.data().read_length;
- size_t is = (size_t) lib.data().mean_insert_size;
- int is_min = (int) lib.data().insert_size_left_quantile;
- int is_max = (int) lib.data().insert_size_right_quantile;
- int var = (int) lib.data().insert_size_deviation;
- bool is_mp = lib.type() == io::LibraryType::MatePairs || lib.type() == io::LibraryType::HQMatePairs ;
- return make_shared< PairedInfoLibraryWithIndex<decltype(paired_index)> >(g.k(), g, read_length,
- is, is_min > 0.0 ? size_t(is_min) : 0, is_max > 0.0 ? size_t(is_max) : 0,
- size_t(var),
- paired_index, is_mp,
- lib.data().insert_size_distribution);
-}
-
-pe_config::LongReads GetLongReadsConfig(const PathExtendParamsContainer& params,
- const io::LibraryType& type) {
- if (io::SequencingLibraryBase::is_long_read_lib(type)) {
- return params.pe_cfg.long_reads.pacbio_reads;
- } else if (type == io::LibraryType::PathExtendContigs){
- return params.pe_cfg.long_reads.meta_contigs;
- } else if (io::SequencingLibraryBase::is_contig_lib(type)) {
- return params.pe_cfg.long_reads.contigs;
- }
- return params.pe_cfg.long_reads.single_reads;
-}
-
-
-inline shared_ptr<ExtensionChooser> MakeLongReadsExtensionChooser(const config::dataset::Library& lib,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp) {
- PathContainer paths;
- AddPathsToContainer(gp, gp.single_long_reads[lib_index].GetAllPaths(), 1, paths);
-
- auto long_reads_config = GetLongReadsConfig(params, lib.type());
- return make_shared<LongReadsExtensionChooser>(gp.g, paths, long_reads_config.filtering,
- long_reads_config.weight_priority,
- long_reads_config.unique_edge_priority,
- long_reads_config.min_significant_overlap,
- params.pset.extension_options.max_repeat_length,
- params.uneven_depth);
-}
-
-
-inline shared_ptr<SimpleExtender> MakeLongReadsExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map) {
- const auto& lib = dataset_info.reads[lib_index];
- size_t resolvable_repeat_length_bound = 10000ul;
- if (!lib.is_contig_lib()) {
- resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
- }
- INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
-
-
- auto long_read_ec = MakeLongReadsExtensionChooser(lib, lib_index, params, gp);
- return make_shared<SimpleExtender>(gp, cov_map,
- long_read_ec,
- resolvable_repeat_length_bound,
- params.pset.loop_removal.max_loops,
- true, /* investigate short loops */
- UseCoverageResolverForSingleReads(dataset_info, lib.type()));
-}
-
-inline shared_ptr<SimpleExtender> MakeLongEdgePEExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- bool investigate_loops) {
-
- const auto& lib = dataset_info.reads[lib_index];
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
- SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
-
- shared_ptr<WeightCounter> wc =
- make_shared<PathCoverWeightCounter>(gp.g, paired_lib, params.pset.normalize_weight);
- shared_ptr<ExtensionChooser> extension =
- make_shared<LongEdgeExtensionChooser>(gp.g, wc,
- GetWeightThreshold(paired_lib, params.pset),
- GetPriorityCoeff(paired_lib, params.pset));
-
- return make_shared<SimpleExtender>(gp, cov_map,
- extension,
- paired_lib->GetISMax(),
- params.pset.loop_removal.max_loops,
- investigate_loops,
- false /*use short loop coverage resolver*/);
-}
-
-inline shared_ptr<SimpleExtensionChooser> MakeMetaExtensionChooser(shared_ptr<PairedInfoLibrary> lib,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- size_t read_length) {
- VERIFY(params.mode == config::pipeline_type::meta);
- VERIFY(!lib->IsMp());
- shared_ptr<WeightCounter> wc = make_shared<MetagenomicWeightCounter>(gp.g,
- lib,
- read_length, //read_length
- 0.3, //normalized_threshold
- 3, //raw_threshold
- 0 /*estimation_edge_length*/ );
- return make_shared<SimpleExtensionChooser>(gp.g, wc,
- params.pset.extension_options.weight_threshold,
- params.pset.extension_options.priority_coeff);
-}
-
-inline shared_ptr<SimpleExtender> MakeMetaExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- bool investigate_loops) {
-
- const auto& lib = dataset_info.reads[lib_index];
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
-
- return make_shared<SimpleExtender>(gp, cov_map,
- MakeMetaExtensionChooser(paired_lib, params, gp, dataset_info.RL()),
- paired_lib->GetISMax(),
- params.pset.loop_removal.max_loops,
- investigate_loops,
- false /*use short loop coverage resolver*/);
-}
-
-inline shared_ptr<SimpleExtender> MakePEExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- bool investigate_loops) {
-
- const auto& lib = dataset_info.reads[lib_index];
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
- SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
-
- shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, paired_lib, params.pset.normalize_weight);
- auto extension = make_shared<SimpleExtensionChooser>(gp.g, wc,
- GetWeightThreshold(paired_lib, params.pset),
- GetPriorityCoeff(paired_lib, params.pset));
-
- return make_shared<SimpleExtender>(gp, cov_map,
- extension,
- paired_lib->GetISMax(),
- params.pset.loop_removal.max_loops,
- investigate_loops,
- false /*use short loop coverage resolver*/);
-}
-
-
-inline shared_ptr<PathExtender> MakeScaffoldingExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map) {
- const auto& lib = dataset_info.reads[lib_index];
- const auto& pset = params.pset;
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.scaffolding_indices[lib_index]);
-
- shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, paired_lib);
-
- auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter,
- pset.scaffolder_options.cl_threshold,
- pset.scaffolder_options.var_coeff);
-
- vector<shared_ptr<GapJoiner>> joiners;
- if (params.pset.scaffolder_options.use_la_gap_joiner)
- joiners.push_back(std::make_shared<LAGapJoiner>(gp.g, pset.scaffolder_options.min_overlap_length,
- pset.scaffolder_options.flank_multiplication_coefficient,
- pset.scaffolder_options.flank_addition_coefficient));
-
-
- joiners.push_back(std::make_shared<HammingGapJoiner>(gp.g,
- pset.scaffolder_options.min_gap_score,
- pset.scaffolder_options.short_overlap,
- (int) pset.scaffolder_options.basic_overlap_coeff * dataset_info.RL()));
-
- auto composite_gap_joiner = std::make_shared<CompositeGapJoiner>(gp.g,
- joiners,
- size_t(pset.scaffolder_options.max_can_overlap * (double) gp.g.k()), /* may overlap threshold */
- int(math::round((double) gp.g.k() - pset.scaffolder_options.var_coeff * (double) paired_lib->GetIsVar())), /* must overlap threshold */
- pset.scaffolder_options.artificial_gap);
-
- return make_shared<ScaffoldingPathExtender>(gp, cov_map, scaff_chooser,
- composite_gap_joiner,
- paired_lib->GetISMax(),
- pset.loop_removal.max_loops,
- false, /* investigate short loops */
- params.avoid_rc_connections);
-}
-
-
-inline shared_ptr<PathExtender> MakeRNAScaffoldingExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map) {
-
- const auto& lib = dataset_info.reads[lib_index];
- const auto& pset = params.pset;
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
-
- shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, paired_lib);
-
- auto scaff_chooser = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter, pset.scaffolder_options.cutoff, pset.scaffolder_options.var_coeff);
- auto scaff_chooser2 = std::make_shared<ScaffoldingExtensionChooser>(gp.g, counter, pset.scaffolder_options.hard_cutoff, pset.scaffolder_options.var_coeff);
-
- vector<shared_ptr<GapJoiner>> joiners;
- if (params.pset.scaffolder_options.use_la_gap_joiner)
- joiners.push_back(std::make_shared<LAGapJoiner>(gp.g, pset.scaffolder_options.min_overlap_length,
- pset.scaffolder_options.flank_multiplication_coefficient,
- pset.scaffolder_options.flank_addition_coefficient));
-
-
- joiners.push_back(std::make_shared<HammingGapJoiner>(gp.g,
- pset.scaffolder_options.min_gap_score,
- pset.scaffolder_options.short_overlap,
- (int) pset.scaffolder_options.basic_overlap_coeff * dataset_info.RL()));
-
- auto composite_gap_joiner = std::make_shared<CompositeGapJoiner>(gp.g,
- joiners,
- size_t(pset.scaffolder_options.max_can_overlap * (double) gp.g.k()), /* may overlap threshold */
- int(math::round((double) gp.g.k() - pset.scaffolder_options.var_coeff * (double) paired_lib->GetIsVar())), /* must overlap threshold */
- pset.scaffolder_options.artificial_gap);
-
- VERIFY(pset.scaffolder_options.min_overlap_for_rna_scaffolding.is_initialized());
- return make_shared<RNAScaffoldingPathExtender>(gp, cov_map,
- scaff_chooser,
- scaff_chooser2,
- composite_gap_joiner,
- paired_lib->GetISMax(),
- pset.loop_removal.max_loops,
- false /* investigate short loops */,
- *pset.scaffolder_options.min_overlap_for_rna_scaffolding);
-}
-
-
-inline shared_ptr<PathExtender> MakeScaffolding2015Extender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- const ScaffoldingUniqueEdgeStorage& storage) {
-
- const auto& lib = dataset_info.reads[lib_index];
- const auto& pset = params.pset;
- shared_ptr<PairedInfoLibrary> paired_lib;
- INFO("Creating Scaffolding 2015 extender for lib #" << lib_index);
-
-    //TODO: temporary solution
- if (gp.paired_indices[lib_index].size() > gp.clustered_indices[lib_index].size()) {
- INFO("Paired unclustered indices not empty, using them");
- paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
- } else if (gp.clustered_indices[lib_index].size() != 0 ) {
- INFO("clustered indices not empty, using them");
- paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
- } else {
- ERROR("All paired indices are empty!");
- }
-
- shared_ptr<WeightCounter> counter = make_shared<ReadCountWeightCounter>(gp.g, paired_lib);
-//TODO: was copy-pasted from MakeScaffoldingExtender, refactor the 2015 extension chooser
- DEBUG("creating extchooser");
-
- auto scaff_chooser = std::make_shared<ExtensionChooser2015>(gp.g,
- counter,
- lib_index,
- storage,
- pset.scaffolder_options.cl_threshold,
- pset.scaffolder_options.var_coeff,
- pset.scaffolding2015.relative_weight_cutoff);
-
- auto gap_joiner = std::make_shared<HammingGapJoiner>(gp.g, pset.scaffolder_options.min_gap_score,
- pset.scaffolder_options.short_overlap,
- (int) pset.scaffolder_options.basic_overlap_coeff * dataset_info.RL());
-
- return make_shared<ScaffoldingPathExtender>(gp, cov_map,
- scaff_chooser,
- gap_joiner,
- paired_lib->GetISMax(),
- pset.loop_removal.max_loops,
- false, /* investigate short loops */
- params.avoid_rc_connections,
- false /* jump only from tips */);
-}
-
-
-inline shared_ptr<SimpleExtender> MakeMPExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- const PathContainer& paths) {
-
- const auto& lib = dataset_info.reads[lib_index];
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
-
- SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
-
- size_t max_number_of_paths_to_search = GetNumberMPPaths(gp.g);
- DEBUG("max number of mp paths " << max_number_of_paths_to_search);
-
- shared_ptr<MatePairExtensionChooser> chooser =
- make_shared<MatePairExtensionChooser>(gp.g,
- paired_lib,
- paths,
- max_number_of_paths_to_search,
- params.uneven_depth);
-
- return make_shared<SimpleExtender>(gp, cov_map,
- chooser,
- paired_lib->GetISMax(),
- params.pset.loop_removal.mp_max_loops,
- true, /* investigate short loops */
- false /*use short loop coverage resolver*/);
-}
-
-
-inline shared_ptr<SimpleExtender> MakeCoordCoverageExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map) {
-
- const auto& lib = dataset_info.reads[lib_index];
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
-
- CoverageAwareIdealInfoProvider provider(gp.g, paired_lib, -1ul, 0);
- auto coord_chooser = make_shared<CoordinatedCoverageExtensionChooser>(gp.g, provider,
- params.pset.coordinated_coverage.max_edge_length_in_repeat,
- params.pset.coordinated_coverage.delta,
- params.pset.coordinated_coverage.min_path_len);
- auto chooser = make_shared<JointExtensionChooser>(gp.g, MakeMetaExtensionChooser(paired_lib, params, gp, dataset_info.RL()), coord_chooser);
-
- return make_shared<SimpleExtender>(gp, cov_map, chooser,
- -1ul /* insert size */,
- params.pset.loop_removal.mp_max_loops,
- true, /* investigate short loops */
- false /*use short loop coverage resolver*/);
-}
-
-
-inline shared_ptr<SimpleExtender> MakeRNAExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- bool investigate_loops) {
-
- const auto& lib = dataset_info.reads[lib_index];
- shared_ptr<PairedInfoLibrary> paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
- SetSingleThresholdForLib(paired_lib, params.pset, lib.data().pi_threshold);
- INFO("Threshold for lib #" << lib_index << ": " << paired_lib->GetSingleThreshold());
-
- shared_ptr<WeightCounter> wc = make_shared<PathCoverWeightCounter>(gp.g, paired_lib, params.pset.normalize_weight);
- shared_ptr<RNAExtensionChooser> extension =
- make_shared<RNAExtensionChooser>(gp.g, wc,
- GetWeightThreshold(paired_lib, params.pset),
- GetPriorityCoeff(paired_lib, params.pset));
-
- return make_shared<MultiExtender>(gp, cov_map,
- extension,
- paired_lib->GetISMax(),
- params.pset.loop_removal.max_loops,
- investigate_loops,
- false /*use short loop coverage resolver*/);
-}
-
-
-inline shared_ptr<SimpleExtender> MakeRNALongReadsExtender(const config::dataset& dataset_info,
- size_t lib_index,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map) {
-
-    VERIFY_MSG(false, "Long reads RNA extender is not implemented yet");
-
- const auto& lib = dataset_info.reads[lib_index];
- size_t resolvable_repeat_length_bound = 10000ul;
- if (!lib.is_contig_lib()) {
- resolvable_repeat_length_bound = std::max(resolvable_repeat_length_bound, lib.data().read_length);
- }
- INFO("resolvable_repeat_length_bound set to " << resolvable_repeat_length_bound);
-
- auto long_reads_ec = MakeLongReadsExtensionChooser(lib, lib_index, params, gp);
-
- return make_shared<SimpleExtender>(gp, cov_map,
- long_reads_ec,
- resolvable_repeat_length_bound,
- params.pset.loop_removal.max_loops,
- true, /* investigate short loops */
- UseCoverageResolverForSingleReads(dataset_info, lib.type()));
-}
-
-
-template<typename Base, typename T>
-inline bool instanceof(const T *ptr) {
- return dynamic_cast<const Base*>(ptr) != nullptr;
-}
-
-
-//Used for debug purpose only
-inline void PrintExtenders(vector<shared_ptr<PathExtender> >& extenders) {
- DEBUG("Extenders in vector:");
- for(size_t i = 0; i < extenders.size(); ++i) {
- string type = typeid(*extenders[i]).name();
-        DEBUG("Extender #" << i << ": " << type);
- if (instanceof<SimpleExtender>(extenders[i].get())) {
- auto ec = ((SimpleExtender *) extenders[i].get())->GetExtensionChooser();
- string chooser_type = typeid(*ec).name();
-            DEBUG("  Extension chooser: " << chooser_type);
- }
- else if (instanceof<ScaffoldingPathExtender>(extenders[i].get())) {
- auto ec = ((ScaffoldingPathExtender *) extenders[i].get())->GetExtensionChooser();
- string chooser_type = typeid(*ec).name();
-            DEBUG("  Extension chooser: " << chooser_type);
- }
- }
-}
-
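-//Assembles the full extender list: libraries are visited in io::LibraryPriotity
-//order and, for each library type, the created extenders are grouped into plain
-//PE, scaffolding-2015, short-loop PE, scaffolding and mate-pair extenders and
-//appended in that order, which fixes the order in which path extension is
-//attempted. A coordinated coverage extender may be appended last.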
-inline vector<shared_ptr<PathExtender> > MakeAllExtenders(PathExtendStage stage,
- const config::dataset& dataset_info,
- const PathExtendParamsContainer& params,
- const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- const ScaffoldingUniqueEdgeStorage& storage,
- const PathContainer& paths_for_mp = PathContainer()) {
-
- vector<shared_ptr<PathExtender> > result;
- vector<shared_ptr<PathExtender> > pes;
- vector<shared_ptr<PathExtender> > pes2015;
- vector<shared_ptr<PathExtender> > pe_loops;
- vector<shared_ptr<PathExtender> > pe_scafs;
- vector<shared_ptr<PathExtender> > mps;
-
- size_t single_read_libs = 0;
- size_t pe_libs = 0;
- size_t scf_pe_libs = 0;
- size_t mp_libs = 0;
-
- const auto& pset = params.pset;
-
- for (io::LibraryType lt : io::LibraryPriotity) {
- for (size_t lib_index = 0; lib_index < dataset_info.reads.lib_count(); ++lib_index) {
- const auto& lib = dataset_info.reads[lib_index];
- if (lib.type() != lt)
- continue;
-
- //TODO: scaff2015 does not need any single read libs?
- if (IsForSingleReadExtender(lib) && pset.sm != sm_2015) {
- result.push_back(MakeLongReadsExtender(dataset_info, lib_index, params, gp, cov_map));
- ++single_read_libs;
- }
- if (IsForPEExtender(lib)) {
- ++pe_libs;
- if (IsPEStage(stage) && IsOldPEEnabled(pset.sm)) {
- if (params.mode == config::pipeline_type::meta)
- //TODO proper configuration via config
- pes.push_back(MakeMetaExtender(dataset_info, lib_index, params, gp, cov_map, false));
- else if (params.mode == config::pipeline_type::moleculo)
- pes.push_back(MakeLongEdgePEExtender(dataset_info, lib_index, params, gp, cov_map, false));
- else if (pset.multi_path_extend && !IsPolishingStage(stage))
- pes.push_back(MakeRNAExtender(dataset_info, lib_index, params, gp, cov_map, false));
- else
- pes.push_back(MakePEExtender(dataset_info, lib_index, params, gp, cov_map, false));
- }
- else if (pset.sm == sm_2015) {
- pes2015.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
- }
- }
- //FIXME logic is very cryptic!
- if (IsForShortLoopExtender(lib) && IsOldPEEnabled(pset.sm)) {
- if (params.mode == config::pipeline_type::meta)
- pes.push_back(MakeMetaExtender(dataset_info, lib_index, params, gp, cov_map, true));
- else if (pset.multi_path_extend && !IsPolishingStage(stage))
- pes.push_back(MakeRNAExtender(dataset_info, lib_index, params, gp, cov_map, true));
- else
- pe_loops.push_back(MakePEExtender(dataset_info, lib_index, params, gp, cov_map, true));
- }
- if (IsForScaffoldingExtender(lib) && params.use_scaffolder && pset.scaffolder_options.enabled) {
- ++scf_pe_libs;
- if (params.mode == config::pipeline_type::rna) {
- pe_scafs.push_back(MakeRNAScaffoldingExtender(dataset_info, lib_index, params, gp, cov_map));
- }
- else {
- switch (pset.sm) {
- case sm_old: {
- pe_scafs.push_back(MakeScaffoldingExtender(dataset_info, lib_index, params, gp, cov_map));
- break;
- }
- case sm_old_pe_2015: {
- pe_scafs.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
- break;
- }
- case sm_combined: {
- pe_scafs.push_back(MakeScaffoldingExtender(dataset_info, lib_index, params, gp, cov_map));
- pe_scafs.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
- break;
- }
- default:
- break;
- }
- }
- }
- if (IsForMPExtender(lib) && IsMPStage(stage)) {
- ++mp_libs;
- switch (pset.sm) {
- case sm_old: {
- mps.push_back(MakeMPExtender(dataset_info, lib_index, params, gp, cov_map, paths_for_mp));
- break;
- }
- case sm_old_pe_2015: {
- mps.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
- break;
- }
- case sm_2015: {
- mps.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
- break;
- }
- case sm_combined: {
- mps.push_back(MakeMPExtender(dataset_info, lib_index, params, gp, cov_map, paths_for_mp));
- mps.push_back(MakeScaffolding2015Extender(dataset_info, lib_index, params, gp, cov_map, storage));
- break;
- }
- default:
- break;
- }
- }
- }
-
- result.insert(result.end(), pes.begin(), pes.end());
- result.insert(result.end(), pes2015.begin(), pes2015.end());
- result.insert(result.end(), pe_loops.begin(), pe_loops.end());
- result.insert(result.end(), pe_scafs.begin(), pe_scafs.end());
- result.insert(result.end(), mps.begin(), mps.end());
- pes.clear();
- pe_loops.clear();
- pe_scafs.clear();
- pes2015.clear();
- mps.clear();
- }
-
- INFO("Using " << pe_libs << " paired-end " << LibStr(pe_libs));
- INFO("Using " << scf_pe_libs << " paired-end scaffolding " << LibStr(scf_pe_libs));
- INFO("Using " << mp_libs << " mate-pair " << LibStr(mp_libs));
- INFO("Using " << single_read_libs << " single read " << LibStr(single_read_libs));
- INFO("Scaffolder is " << (pset.scaffolder_options.enabled ? "on" : "off"));
-
- if (pset.use_coordinated_coverage) {
- INFO("Using additional coordinated coverage extender");
- result.push_back(MakeCoordCoverageExtender(dataset_info, 0 /* lib index */, params, gp, cov_map));
- }
-
- PrintExtenders(result);
- return result;
-}
-
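-//Builds the scaffold graph over the unique-edge set: one paired-info
-//connection condition per usable paired library, plus an optional
-//assembly-graph connectivity condition, all passed to the constructor below.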
-inline shared_ptr<scaffold_graph::ScaffoldGraph> ConstructScaffoldGraph(const config::dataset& dataset_info,
- const pe_config::ParamSetT::ScaffoldGraphParamsT& params,
- const conj_graph_pack& gp,
- const ScaffoldingUniqueEdgeStorage& edge_storage) {
- using namespace scaffold_graph;
- vector<shared_ptr<ConnectionCondition>> conditions;
-
- INFO("Constructing connections");
- LengthEdgeCondition edge_condition(gp.g, edge_storage.GetMinLength());
-
- for (size_t lib_index = 0; lib_index < dataset_info.reads.lib_count(); ++lib_index) {
- const auto& lib = dataset_info.reads[lib_index];
- if (lib.is_paired()) {
- shared_ptr<PairedInfoLibrary> paired_lib;
- if (IsForMPExtender(lib))
- paired_lib = MakeNewLib(lib, gp.g, gp.paired_indices[lib_index]);
- else if (IsForPEExtender(lib))
- paired_lib = MakeNewLib(lib, gp.g, gp.clustered_indices[lib_index]);
-            else {
-                //Skip libraries that provide no usable paired info instead of
-                //passing a null PairedInfoLibrary to the connection condition
-                INFO("Unusable paired lib #" << lib_index);
-                continue;
-            }
- conditions.push_back(make_shared<AdvancedPairedConnectionCondition>(gp.g, paired_lib, lib_index,
- params.always_add,
- params.never_add,
- params.relative_threshold));
- }
- }
- if (params.graph_connectivity) {
- auto as_con = make_shared<AssemblyGraphConnectionCondition>(gp.g, params.max_path_length, edge_storage);
- for (auto e_iter = gp.g.ConstEdgeBegin(); !e_iter.IsEnd(); ++e_iter) {
- if (edge_condition.IsSuitable(*e_iter))
- as_con->AddInterestingEdge(*e_iter);
- }
- conditions.push_back(as_con);
- }
- INFO("Total conditions " << conditions.size());
-
- INFO("Constructing scaffold graph from set of size " << edge_storage.GetSet().size());
-
- DefaultScaffoldGraphConstructor constructor(gp.g, edge_storage.GetSet(), conditions, edge_condition);
- auto scaffoldGraph = constructor.Construct();
-
- INFO("Scaffold graph contains " << scaffoldGraph->VertexCount() << " vertices and " << scaffoldGraph->EdgeCount() << " edges");
- return scaffoldGraph;
-}
-
-
-inline void PrintScaffoldGraph(shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph,
- const set<EdgeId>& main_edge_set,
- const string& filename) {
- using namespace scaffold_graph;
-
- auto vcolorer = make_shared<ScaffoldVertexSetColorer>(main_edge_set);
- auto ecolorer = make_shared<ScaffoldEdgeColorer>();
- CompositeGraphColorer <ScaffoldGraph> colorer(vcolorer, ecolorer);
-
-    INFO("Visualizing single graph");
- ScaffoldGraphVisualizer singleVisualizer(*scaffoldGraph, false);
- std::ofstream single_dot;
- single_dot.open((filename + "_single.dot").c_str());
- singleVisualizer.Visualize(single_dot, colorer);
- single_dot.close();
-
-    INFO("Visualizing paired graph");
- ScaffoldGraphVisualizer pairedVisualizer(*scaffoldGraph, true);
- std::ofstream paired_dot;
- paired_dot.open((filename + "_paired.dot").c_str());
- pairedVisualizer.Visualize(paired_dot, colorer);
- paired_dot.close();
-
-    INFO("Printing scaffold graph");
- std::ofstream data_stream;
- data_stream.open((filename + ".data").c_str());
- scaffoldGraph->Print(data_stream);
- data_stream.close();
-}
-
-
-inline size_t FindOverlapLenForStage(PathExtendStage stage, const config::dataset& dataset_info) {
- size_t res = 0;
- for (const auto& lib : dataset_info.reads) {
- if (IsForPEExtender(lib) && IsPEStage(stage)) {
- res = max(res, (size_t) lib.data().insert_size_right_quantile);
- } else if (IsForShortLoopExtender(lib)) {
- res = max(res, (size_t) lib.data().insert_size_right_quantile);
- } else if (IsForMPExtender(lib) && IsMPStage(stage)) {
- res = max(res, (size_t) lib.data().insert_size_right_quantile);
- }
- }
- return res;
-}
-
-inline bool MPLibsExist(const config::dataset& dataset_info) {
- for (const auto& lib : dataset_info.reads)
- if (IsForMPExtender(lib))
- return true;
-
- return false;
-}
-
-inline void CountMisassembliesWithReference(debruijn_graph::GenomeConsistenceChecker& genome_checker, const PathContainer& paths) {
-    size_t total_mis = 0, gap_mis = 0;
- genome_checker.SpellGenome();
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- BidirectionalPath *path = iter.get();
- auto map_res = genome_checker.CountMisassemblies(*path);
- if (map_res.misassemblies > 0) {
-            INFO("There are " << map_res.misassemblies << " misassemblies in path: ");
- path->PrintInfo();
- total_mis += map_res.misassemblies;
- }
- if (map_res.wrong_gap_size > 0) {
-            INFO("There are " << map_res.wrong_gap_size << " wrong gaps in path: ");
- path->PrintInfo();
- gap_mis += map_res.wrong_gap_size;
- }
- }
-    INFO("In total found " << total_mis << " misassemblies and " << gap_mis << " wrong gaps.");
-}
-
-inline ScaffoldingUniqueEdgeStorage FillUniqueEdgeStorage(const conj_graph_pack& gp,
- const config::dataset& dataset_info,
- size_t& min_unique_length,
- double& unique_variation,
- bool autodetect) {
-
- ScaffoldingUniqueEdgeStorage main_unique_storage;
- //Setting scaffolding2015 parameters
- if (autodetect) {
- INFO("Autodetecting unique edge set parameters...");
- bool pe_found = false;
- //TODO constants
- size_t min_MP_IS = 10000;
- for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
-
- if (IsForPEExtender(dataset_info.reads[i])) {
- pe_found = true;
- }
- if (IsForMPExtender(dataset_info.reads[i])) {
- min_MP_IS = min(min_MP_IS, (size_t) dataset_info.reads[i].data().mean_insert_size);
- }
- }
- if (pe_found) {
- //TODO constants
- unique_variation = 0.5;
- INFO("PE lib found, we believe in coverage");
- } else {
- unique_variation = 50;
-            INFO("No PE libs found, we do not believe in coverage");
- }
- min_unique_length = min_MP_IS;
- INFO("Minimal unique edge length set to the smallest MP library IS: " << min_unique_length);
-
- } else {
-        INFO("Unique edge set constructed with parameters from config: length " << min_unique_length
- << " variation " << unique_variation);
- }
- ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp, min_unique_length, unique_variation);
- unique_edge_analyzer.FillUniqueEdgeStorage(main_unique_storage);
-
- return main_unique_storage;
-}
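A minimal standalone sketch of the autodetection above (not SPAdes code; the mate-pair insert sizes and the presence of a PE library are made-up inputs):

// Toy illustration: the minimal unique edge length is the smallest MP insert
// size (capped by the 10000 default), and the allowed coverage variation
// depends on whether a paired-end library is present (0.5 with PE, 50 without).
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<size_t> mp_insert_sizes = {6000, 4500};  // assumed mate-pair libs
    bool pe_found = true;                                // assumed: a PE lib is present
    size_t min_unique_length = 10000;
    for (size_t is : mp_insert_sizes)
        min_unique_length = std::min(min_unique_length, is);
    double unique_variation = pe_found ? 0.5 : 50.0;
    std::printf("min_unique_length=%zu variation=%.1f\n", min_unique_length, unique_variation);
}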
-
-
-inline void ResolveRepeatsPe(const config::dataset& dataset_info,
- const PathExtendParamsContainer& params,
- conj_graph_pack& gp) {
-
- INFO("ExSPAnder repeat resolving tool started");
- const pe_config::ParamSetT &pset = params.pset;
-
- ScaffoldingUniqueEdgeStorage main_unique_storage;
- auto sc_mode = pset.sm;
- auto min_unique_length = pset.scaffolding2015.min_unique_length;
-    auto unique_variation = pset.scaffolding2015.unique_coverage_variation;
- bool detect_repeats_online = !(IsScaffolder2015Enabled(sc_mode) || params.mode == config::pipeline_type::meta);
-
- //Fill the storage to enable unique edge check
- if (IsScaffolder2015Enabled(sc_mode)) {
- main_unique_storage = FillUniqueEdgeStorage(gp, dataset_info,
- min_unique_length,
-                                                     unique_variation,
- pset.scaffolding2015.autodetect);
- }
-
- make_dir(params.output_dir);
- make_dir(params.etc_dir);
-
-
- //Scaffold graph
- shared_ptr<scaffold_graph::ScaffoldGraph> scaffoldGraph;
- if (pset.scaffold_graph_params.construct) {
- scaffoldGraph = ConstructScaffoldGraph(dataset_info, params.pset.scaffold_graph_params, gp, main_unique_storage);
- if (pset.scaffold_graph_params.output) {
- PrintScaffoldGraph(scaffoldGraph, main_unique_storage.GetSet(), params.etc_dir + "scaffold_graph");
- }
- }
-
-
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
- ContigWriter writer(gp.g, constructor, gp.components, params.mode == config::pipeline_type::plasmid);
-
-
-//make pe + long reads extenders
- GraphCoverageMap cover_map(gp.g);
- INFO("SUBSTAGE = paired-end libraries")
- PathExtendStage exspander_stage = PathExtendStage::PEStage;
- vector<shared_ptr<PathExtender> > all_libs =
- MakeAllExtenders(exspander_stage, dataset_info, params, gp, cover_map, main_unique_storage);
-
- //Parameters are subject to change
- size_t max_is_right_quantile = max(FindOverlapLenForStage(exspander_stage, dataset_info), gp.g.k() + 100);
- size_t min_edge_len = 100;
- size_t max_edge_diff_pe = /*cfg::get().mode == config::pipeline_type::rna ? 0 :*/ max_is_right_quantile;
-
- shared_ptr<CompositeExtender> mainPE = make_shared<CompositeExtender>(gp.g, cover_map, all_libs,
- main_unique_storage,
- max_is_right_quantile,
- pset.extension_options.max_repeat_length,
- detect_repeats_online);
-
-//extend pe + long reads
- PathExtendResolver resolver(gp.g);
- auto seeds = resolver.makeSimpleSeeds();
- DebugOutputPaths(gp, params, seeds, "init_paths");
- seeds.SortByLength();
- INFO("Growing paths using paired-end and long single reads");
- INFO("Multi path extend is " << (cfg::get().pe_params.param_set.multi_path_extend ? "on" : "off"))
- INFO("Overlap removal is " << (cfg::get().pe_params.param_set.remove_overlaps ? "on" : "off"))
- auto paths = resolver.extendSeeds(seeds, *mainPE);
- paths.SortByLength();
- DebugOutputPaths(gp, params, paths, "pe_before_overlap");
-
- PathContainer clone_paths;
- GraphCoverageMap clone_map(gp.g);
- bool mp_exist = MPLibsExist(dataset_info);
-
- if (mp_exist) {
- ClonePathContainer(paths, clone_paths, clone_map);
- }
-
- exspander_stage = PathExtendStage::PEPolishing;
- all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, cover_map, main_unique_storage);
- mainPE = make_shared<CompositeExtender>(gp.g, cover_map, all_libs,
- main_unique_storage,
- max_is_right_quantile,
- pset.extension_options.max_repeat_length,
- detect_repeats_online);
-
- //We do not run overlap removal in 2015 mode
- if (!IsScaffolder2015Enabled(sc_mode))
- FinalizePaths(params, paths, gp.g, cover_map, min_edge_len, max_edge_diff_pe);
- if (params.output_broken_scaffolds) {
- OutputBrokenScaffolds(paths, params, (int) gp.g.k(), writer,
- params.output_dir + (mp_exist ? "pe_contigs" : params.broken_contigs));
- }
- DebugOutputPaths(gp, params, paths, "pe_before_traverse");
- if (params.traverse_loops) {
- TraverseLoops(paths, cover_map, mainPE);
- FinalizePaths(params, paths, gp.g, cover_map, min_edge_len, max_edge_diff_pe);
- }
- DebugOutputPaths(gp, params, paths, (mp_exist ? "pe_final_paths" : "final_paths"));
- writer.OutputPaths(paths, params.output_dir + (mp_exist ? "pe_scaffolds" : params.contigs_name));
-
- cover_map.Clear();
- seeds.DeleteAllPaths();
- paths.DeleteAllPaths();
- if (!mp_exist) {
- return;
- }
-
-//MP
- DebugOutputPaths(gp, params, clone_paths, "mp_before_extend");
-
- INFO("SUBSTAGE = mate-pair libraries ")
- exspander_stage = PathExtendStage::MPStage;
- all_libs.clear();
- max_is_right_quantile = FindOverlapLenForStage(exspander_stage, dataset_info);
- PathContainer mp_paths(clone_paths);
-
- if (IsScaffolder2015Enabled(sc_mode)) {
- //TODO: constants
- for (auto cur_length = min_unique_length; cur_length > 500; cur_length -= 500) {
- ScaffoldingUniqueEdgeStorage current_unique_storage;
-            ScaffoldingUniqueEdgeAnalyzer unique_edge_analyzer(gp, cur_length, unique_variation);
- unique_edge_analyzer.FillUniqueEdgeStorage(current_unique_storage);
- all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, current_unique_storage, clone_paths);
- shared_ptr<CompositeExtender> mp_main_pe = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- main_unique_storage,
- max_is_right_quantile,
- pset.extension_options.max_repeat_length,
- detect_repeats_online);
- INFO("Growing paths using mate-pairs unique length " << cur_length);
- mp_paths = resolver.extendSeeds(mp_paths, *mp_main_pe);
- DebugOutputPaths(gp, params, mp_paths, "mp_before_overlap_" + std::to_string(cur_length));
- }
- } else {
- all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, main_unique_storage, clone_paths);
- shared_ptr<CompositeExtender> mp_main_pe = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- main_unique_storage,
- max_is_right_quantile,
- pset.extension_options.max_repeat_length,
- detect_repeats_online);
- INFO("Growing paths using mate-pairs");
- mp_paths = resolver.extendSeeds(clone_paths, *mp_main_pe);
-
- DebugOutputPaths(gp, params, mp_paths, "mp_before_overlap");
- FinalizePaths(params, mp_paths, gp.g, clone_map, max_is_right_quantile, max_is_right_quantile, true);
- }
- DebugOutputPaths(gp, params, mp_paths, "mp_final_paths");
- DEBUG("Paths are grown with mate-pairs");
-
-//MP end
-
-//pe again
- INFO("SUBSTAGE = polishing paths")
- exspander_stage = PathExtendStage::FinalizingPEStage;
- all_libs.clear();
- all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, main_unique_storage);
- max_is_right_quantile = FindOverlapLenForStage(exspander_stage, dataset_info);
- shared_ptr<CompositeExtender> last_extender = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- main_unique_storage,
- max_is_right_quantile,
- pset.extension_options.max_repeat_length,
- detect_repeats_online);
-
- auto last_paths = resolver.extendSeeds(mp_paths, *last_extender);
- DebugOutputPaths(gp, params, last_paths, "mp2_before_overlap");
-
- exspander_stage = PathExtendStage::FinalPolishing;
- all_libs = MakeAllExtenders(exspander_stage, dataset_info, params, gp, clone_map, main_unique_storage);
- last_extender = make_shared<CompositeExtender>(gp.g, clone_map, all_libs,
- main_unique_storage,
- max_is_right_quantile,
- pset.extension_options.max_repeat_length,
- detect_repeats_online);
- if (!IsScaffolder2015Enabled(sc_mode)) {
- FinalizePaths(params, last_paths, gp.g, clone_map, min_edge_len, max_is_right_quantile);
- DebugOutputPaths(gp, params, last_paths, "mp2_before_traverse");
- }
-
- TraverseLoops(last_paths, clone_map, last_extender);
- FinalizePaths(params, last_paths, gp.g, clone_map, min_edge_len, max_is_right_quantile);
-
-//result
- if (params.output_broken_scaffolds) {
- OutputBrokenScaffolds(last_paths, params, (int) gp.g.k(), writer, params.output_dir + params.broken_contigs);
- }
- debruijn_graph::GenomeConsistenceChecker genome_checker (gp, main_unique_storage, 1000, 0.2);
- DebugOutputPaths(gp, params, last_paths, "mp2_final_paths");
- writer.OutputPaths(last_paths, params.output_dir + params.contigs_name);
- if (gp.genome.size() > 0)
- CountMisassembliesWithReference(genome_checker, last_paths);
- //FinalizeUniquenessPaths();
-
-//TODO: destructor?
- last_paths.DeleteAllPaths();
- seeds.DeleteAllPaths();
- mp_paths.DeleteAllPaths();
- clone_paths.DeleteAllPaths();
-
- INFO("ExSPAnder repeat resolving tool finished");
-}
-
-} /* path_extend */
-
-
-
-#endif /* PATH_EXTEND_LAUNCH_HPP_ */
diff --git a/src/modules/algorithms/path_extend/path_extender.hpp b/src/modules/algorithms/path_extend/path_extender.hpp
deleted file mode 100644
index 0c8bda5..0000000
--- a/src/modules/algorithms/path_extend/path_extender.hpp
+++ /dev/null
@@ -1,1561 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2011-2014 Saint-Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//****************************************************************************
-
-/*
- * path_extender.hpp
- *
- * Created on: Mar 5, 2012
- * Author: andrey
- */
-
-#pragma once
-
-
-#include "extension_chooser.hpp"
-#include "path_filter.hpp"
-#include "overlap_analysis.hpp"
-#include "assembly_graph/graph_support/scaff_supplementary.hpp"
-#include <cmath>
-
-
-namespace path_extend {
-
-class ShortLoopResolver {
-public:
- ShortLoopResolver(const Graph& g)
- : g_(g) { }
-
- virtual ~ShortLoopResolver() { }
-
- virtual void ResolveShortLoop(BidirectionalPath& path) const = 0;
-
-protected:
- DECL_LOGGER("PathExtender")
- const Graph& g_;
-
- void UndoCycles(BidirectionalPath& p, EdgeId next_edge) const {
- if (p.Size() <= 2) {
- return;
- }
- EdgeId first_edge = p.Back();
- EdgeId second_edge = next_edge;
- while (p.Size() > 2) {
- if (p.At(p.Size() - 1) == first_edge && p.At(p.Size() - 2) == second_edge) {
- p.PopBack(2);
- } else {
-                return;
- }
- }
- }
-
- void MakeCycleStep(BidirectionalPath& path, EdgeId e) const {
- if (path.Size() == 0) {
- return;
- }
- EdgeId pathEnd = path.Back();
- path.PushBack(e);
- path.PushBack(pathEnd);
- }
-};
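A minimal standalone sketch of the MakeCycleStep pattern above (not SPAdes code; plain integers stand in for EdgeIds):

// Each "cycle step" pushes the loop edge and then re-pushes the previous last
// edge, so two steps turn the path ..., e into ..., e, loop, e, loop, e.
#include <iostream>
#include <vector>

int main() {
    std::vector<int> path = {1, 2};  // 2 is the edge entering the short loop
    int loop_edge = 3;
    for (int step = 0; step < 2; ++step) {
        int back = path.back();
        path.push_back(loop_edge);
        path.push_back(back);
    }
    for (int e : path) std::cout << e << ' ';  // prints: 1 2 3 2 3 2
    std::cout << '\n';
}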
-
-class CovShortLoopResolver : public ShortLoopResolver {
-public:
- CovShortLoopResolver(const conj_graph_pack& gp)
- : ShortLoopResolver(gp.g), gp_(gp) {
-
- }
-
- void ResolveShortLoop(BidirectionalPath& path) const override {
- DEBUG("resolve short loop by coverage");
- path.Print();
-
- pair<EdgeId, EdgeId> edges;
- if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
- DEBUG("Coverage Short Loop Resolver");
- UndoCycles(path, edges.first);
- EdgeId e1 = path.Back();
- EdgeId e2 = edges.first;
- EdgeId e_out = edges.second;
- auto prob_e_in = g_.IncomingEdges(g_.EdgeEnd(e2));
- EdgeId e_in = *prob_e_in.begin();
- size_t count = 0;
- for (auto edge = prob_e_in.begin(); edge != prob_e_in.end(); ++edge) {
- if (*edge != e2)
- e_in = *edge;
- count++;
- }
- if (count != 2) {
- return;
- }
- double in_cov = gp_.flanking_cov.GetOutCov(e_in); //g_.coverage(e_in);
- double out_cov = gp_.flanking_cov.GetInCov(e_out); //g_.coverage(e_out);
- double cov = (in_cov + out_cov) / 2.0;
- double time1 = math::round(gp_.flanking_cov.GetInCov(e1) / cov);//math::round(gp_.g.coverage(e1) / cov);
- double time2 = math::round(gp_.flanking_cov.GetInCov(e2) / cov);////math::round(gp_.g.coverage(e2) / cov);
- size_t time = (size_t) std::max(0.0, std::min(time1 - 1.0, time2));
- for (size_t i = 0; i < time; ++i) {
- MakeCycleStep(path, edges.first);
- }
- path.PushBack(edges.second);
- DEBUG("loop with start " << g_.int_id(e_in)
-                          << " e1 " << g_.int_id(e1)
-                          << " e2 " << g_.int_id(e2)
-                          << " out " << g_.int_id(e_out)
- << " cov in = " << in_cov
- << " cov out " << out_cov
- << " cov " << cov
- << " cov e1 = " << gp_.g.coverage(e1)
- << " cov e2 = " << gp_.g.coverage(e2)
- << " time1 = " << time1
- << " time2 = " << time2
- << " time = " << time);
- }
- }
-private:
- const conj_graph_pack& gp_;
-};
-
-class SimpleLoopResolver : public ShortLoopResolver {
-
-public:
- SimpleLoopResolver(Graph& g) : ShortLoopResolver(g) { }
-
- void ResolveShortLoop(BidirectionalPath& path) const override {
- pair<EdgeId, EdgeId> edges;
- if (path.Size() >= 1 && GetLoopAndExit(g_, path.Back(), edges)) {
- DEBUG("Resolving short loop...");
- EdgeId e = path.Back();
- path.PushBack(edges.first);
- path.PushBack(e);
- path.PushBack(edges.second);
- DEBUG("Resolving short loop done");
- }
- }
-
-protected:
- DECL_LOGGER("PathExtender")
-};
-
-class LoopResolver : public ShortLoopResolver {
- static const size_t ITER_COUNT = 10;
- const WeightCounter& wc_;
-
-public:
- LoopResolver(const Graph& g, const WeightCounter& wc)
- : ShortLoopResolver(g),
- wc_(wc) { }
-
- void MakeBestChoice(BidirectionalPath& path, pair<EdgeId, EdgeId>& edges) const {
- UndoCycles(path, edges.first);
- BidirectionalPath experiment(path);
- double max_weight = wc_.CountWeight(experiment, edges.second);
- double diff = max_weight - wc_.CountWeight(experiment, edges.first);
- size_t maxIter = 0;
- for (size_t i = 1; i <= ITER_COUNT; ++i) {
- double weight = wc_.CountWeight(experiment, edges.first);
- if (weight > 0) {
- MakeCycleStep(experiment, edges.first);
- weight = wc_.CountWeight(experiment, edges.second);
- double weight2 = wc_.CountWeight(experiment, edges.first);
- if (weight > max_weight || (weight == max_weight && weight - weight2 > diff)
- || (weight == max_weight && weight - weight2 == diff && i == 1)) {
- max_weight = weight;
- maxIter = i;
- diff = weight - weight2;
- }
- }
- }
- for (size_t i = 0; i < maxIter; ++i) {
- MakeCycleStep(path, edges.first);
- }
- path.PushBack(edges.second);
- }
-
- void ResolveShortLoop(BidirectionalPath& path) const override {
- pair<EdgeId, EdgeId> edges;
- if (path.Size() >=1 && GetLoopAndExit(g_, path.Back(), edges)) {
- DEBUG("Resolving short loop...");
- MakeBestChoice(path, edges);
- DEBUG("Resolving short loop done");
- }
- }
-};
-
-class GapJoiner {
-
-public:
- static const int INVALID_GAP = -1000000;
- GapJoiner(const Graph& g)
- : g_(g) { }
-
- virtual Gap FixGap( EdgeId source, EdgeId sink, int initial_gap) const = 0;
-
- virtual ~GapJoiner() { }
-protected:
- const Graph& g_;
-};
-
-class SimpleGapJoiner : public GapJoiner {
-
-public:
- SimpleGapJoiner(const Graph& g) : GapJoiner(g) { }
-
- Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
- if (initial_gap > 2 * (int) g_.k()) {
- return Gap(initial_gap);
- }
- for (int l = (int) g_.k(); l > 0; --l) {
-            //Compare the suffix of source with the prefix of sink
-            if (g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l) == g_.EdgeNucls(sink).Subseq(0, l)) {
- DEBUG("Found correct gap length");
-                DEBUG("Initial: " << initial_gap << ", new gap: " << g_.k() - l);
- return Gap((int) g_.k() - l);
- }
- }
-        DEBUG("Perfect overlap not found, initial: " << initial_gap);
- return Gap(initial_gap);
- }
-};
-
-class HammingGapJoiner: public GapJoiner {
- const double min_gap_score_;
- const size_t short_overlap_threshold_;
- const size_t basic_overlap_length_;
-
- vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
-        vector<size_t> answer;
- for (size_t i = 0; i < s1.size(); ++i)
- if (s1[i] != s2[i])
- answer.push_back(i);
- return answer;
- }
-
- size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- size_t dist = 0;
- for (size_t i = 0; i < s1.size(); ++i) {
- if (s1[i] != s2[i]) {
- dist++;
- }
- }
- return dist;
- }
-
-// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
-// VERIFY(s1.size() == s2.size());
-// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
-// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
-// }
-
-
- double ScoreGap(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size();
- }
-
-public:
-
- //todo review parameters in usages
- HammingGapJoiner(const Graph& g,
- double min_gap_score,
- size_t short_overlap_threshold,
- size_t basic_overlap_length):
- GapJoiner(g),
- min_gap_score_(min_gap_score),
- short_overlap_threshold_(short_overlap_threshold),
- basic_overlap_length_(basic_overlap_length)
- {
- DEBUG("HammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
- "\n short_overlap_threshold " << short_overlap_threshold_ <<
- "\n basic_overlap_length " << basic_overlap_length_);
- }
-
- //estimated_gap is in k-mers
- Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
-
- size_t corrected_start_overlap = basic_overlap_length_;
- if (estimated_gap < 0) {
- corrected_start_overlap -= estimated_gap;
- }
-
- corrected_start_overlap = min(corrected_start_overlap,
- g_.k() + min(g_.length(source), g_.length(sink)));
-
- DEBUG("Corrected max overlap " << corrected_start_overlap);
-
- double best_score = min_gap_score_;
- int fixed_gap = INVALID_GAP;
-
- double overlap_coeff = 0.3;
- size_t min_overlap = 1ul;
- if (estimated_gap < 0) {
- size_t estimated_overlap = g_.k() - estimated_gap;
- min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
- }
- //todo better usage of estimated overlap
- DEBUG("Min overlap " << min_overlap);
-
- for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
- //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
- //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
- double score = 0;
- score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
- g_.EdgeNucls(sink).Subseq(0, l));
- if (math::gr(score, best_score)) {
- TRACE("Curr overlap " << l);
- TRACE("Score: " << score);
- best_score = score;
- fixed_gap = int(g_.k() - l);
- }
-
- if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
- //look at "short" overlaps only if long overlaps couldn't be found
- DEBUG("Not looking at short overlaps");
- break;
- }
- }
-
- if (fixed_gap != INVALID_GAP) {
- DEBUG("Found candidate gap length with score " << best_score);
- DEBUG("Estimated gap: " << estimated_gap <<
-                  ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap << ")");
- }
- return Gap(fixed_gap);
- }
-
-private:
- DECL_LOGGER("HammingGapJoiner");
-};
-
-//deprecated!
-//fixme reduce code duplication with HammingGapJoiner
-class LikelihoodHammingGapJoiner: public GapJoiner {
- static const size_t DEFAULT_PADDING_LENGTH = 10;
- const double min_gap_score_;
- const size_t short_overlap_threshold_;
- const size_t basic_overlap_length_;
-
- vector<size_t> DiffPos(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
-        vector<size_t> answer;
- for (size_t i = 0; i < s1.size(); ++i)
- if (s1[i] != s2[i])
- answer.push_back(i);
- return answer;
- }
-
- size_t HammingDistance(const Sequence& s1, const Sequence& s2) const {
- VERIFY(s1.size() == s2.size());
- size_t dist = 0;
- for (size_t i = 0; i < s1.size(); ++i) {
- if (s1[i] != s2[i]) {
- dist++;
- }
- }
- return dist;
- }
-
-// double ScoreGap(const Sequence& s1, const Sequence& s2, int gap, int initial_gap) const {
-// VERIFY(s1.size() == s2.size());
-// return 1.0 - (double) HammingDistance(s1, s2) / (double) s1.size()
-// - (double) abs(gap - initial_gap) / (double) (2 * g_.k());
-// }
-
- //FIXME use GC content, change match prob and use partition of tip sequence into bad and good part
- double ScoreGap(const Sequence& s1, const Sequence& s2) const {
- static double match_prob = 0.9;
- static double log_match_prob = log2(match_prob);
- static double log_mismatch_prob = log2(1. - match_prob);
- VERIFY(s1.size() == s2.size());
- size_t n = s1.size();
- size_t mismatches = HammingDistance(s1, s2);
- VERIFY(mismatches <= n);
- return 2.*double(n) + double(n - mismatches) * log_match_prob + double(mismatches) * log_mismatch_prob;
- }
-
-public:
-
- //todo review parameters in usages
- LikelihoodHammingGapJoiner(const Graph& g,
- double min_gap_score,
- size_t short_overlap_threshold,
- size_t basic_overlap_length):
- GapJoiner(g),
- min_gap_score_(min_gap_score),
- short_overlap_threshold_(short_overlap_threshold),
- basic_overlap_length_(basic_overlap_length)
- {
- DEBUG("LikelihoodHammingGapJoiner params: \n min_gap_score " << min_gap_score_ <<
- "\n short_overlap_threshold " << short_overlap_threshold_ <<
- "\n basic_overlap_length " << basic_overlap_length_);
- }
-
- //estimated_gap is in k-mers
- Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
-
- size_t corrected_start_overlap = basic_overlap_length_;
- if (estimated_gap < 0) {
- corrected_start_overlap -= estimated_gap;
- }
-
- corrected_start_overlap = min(corrected_start_overlap,
- g_.k() + min(g_.length(source), g_.length(sink)));
-
- DEBUG("Corrected max overlap " << corrected_start_overlap);
-
- double best_score = min_gap_score_;
- int fixed_gap = INVALID_GAP;
-
- double overlap_coeff = 0.3;
- size_t min_overlap = 1ul;
- if (estimated_gap < 0) {
- size_t estimated_overlap = g_.k() - estimated_gap;
- min_overlap = max(size_t(math::round(overlap_coeff * double(estimated_overlap))), 1ul);
- }
- //todo better usage of estimated overlap
- DEBUG("Min overlap " << min_overlap);
-
- for (size_t l = corrected_start_overlap; l >= min_overlap; --l) {
- //TRACE("Sink: " << g_.EdgeNucls(sink).Subseq(g_.length(sink) + g_.k() - l).str());
- //TRACE("Source: " << g_.EdgeNucls(source).Subseq(0, l));
- double score = 0;
- score = ScoreGap(g_.EdgeNucls(source).Subseq(g_.length(source) + g_.k() - l),
- g_.EdgeNucls(sink).Subseq(0, l));
- if (math::gr(score, best_score)) {
- TRACE("Curr overlap " << l);
- TRACE("Score: " << score);
- best_score = score;
- fixed_gap = int(g_.k() - l);
- }
-
- if (l == short_overlap_threshold_ && fixed_gap != INVALID_GAP) {
- //look at "short" overlaps only if long overlaps couldn't be found
- DEBUG("Not looking at short overlaps");
- break;
- }
- }
-
- if (fixed_gap != INVALID_GAP) {
- DEBUG("Found candidate gap length with score " << best_score);
- DEBUG("Estimated gap: " << estimated_gap <<
-                  ", fixed gap: " << fixed_gap << " (overlap " << g_.k() - fixed_gap << ")");
- }
- return Gap(fixed_gap);
- }
-
-private:
- DECL_LOGGER("LikelihoodHammingGapJoiner");
-};
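A minimal standalone sketch of the likelihood score used by ScoreGap above (not SPAdes code; the example sequences are made up):

// score = 2*n + (n - mismatches) * log2(p) + mismatches * log2(1 - p), with p = 0.9.
#include <cmath>
#include <cstdio>
#include <string>

static double score_gap(const std::string& s1, const std::string& s2) {
    const double p = 0.9;
    size_t n = s1.size(), m = 0;
    for (size_t i = 0; i < n; ++i)
        if (s1[i] != s2[i]) ++m;
    return 2.0 * double(n) + double(n - m) * std::log2(p) + double(m) * std::log2(1.0 - p);
}

int main() {
    // 20 positions with 2 mismatches scores ~30.6; a perfect 20-position overlap ~37.0.
    std::printf("%.2f\n", score_gap("ACGTACGTACGTACGTACGT", "ACGTACGTACCTACGTACGA"));
    std::printf("%.2f\n", score_gap("ACGTACGTACGTACGTACGT", "ACGTACGTACGTACGTACGT"));
}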
-
-//"LA" here stands for local alignment
-class LAGapJoiner: public GapJoiner {
-public:
- LAGapJoiner(const Graph& g, size_t min_la_length,
- double flank_multiplication_coefficient,
- double flank_addition_coefficient) :
- GapJoiner(g), min_la_length_(min_la_length), flank_addition_coefficient_(
- flank_addition_coefficient), flank_multiplication_coefficient_(
- flank_multiplication_coefficient) {
- DEBUG("flank_multiplication_coefficient - " << flank_multiplication_coefficient_);
- DEBUG("flank_addition_coefficient_ - " << flank_addition_coefficient_ );
- }
-
- Gap FixGap(EdgeId source, EdgeId sink, int initial_gap) const override {
-
- DEBUG("Overlap doesn't exceed " << size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
- SWOverlapAnalyzer overlap_analyzer(
- size_t(abs(initial_gap) * ESTIMATED_GAP_MULTIPLIER) + GAP_ADDITIONAL_COEFFICIENT);
-
- auto overlap_info = overlap_analyzer.AnalyzeOverlap(g_, source,
- sink);
-
- DEBUG(overlap_info);
-
- if (overlap_info.size() < min_la_length_) {
- DEBUG("Low alignment size");
- return Gap(INVALID_GAP);
- }
-
- size_t max_flank_length = max(overlap_info.r2.start_pos,
- g_.length(source) + g_.k() - overlap_info.r1.end_pos);
- DEBUG("Max flank length - " << max_flank_length);
-
- if ((double) max_flank_length * flank_multiplication_coefficient_
- + flank_addition_coefficient_ > overlap_info.size()) {
- DEBUG("Too long flanks for such alignment");
- return Gap(INVALID_GAP);
- }
-
- if (math::ls(overlap_info.identity(), IDENTITY_RATIO)) {
- DEBUG("Low identity score");
- return Gap(INVALID_GAP);
- }
-
- if ((g_.length(source) + g_.k()) - overlap_info.r1.end_pos > g_.length(source)) {
- DEBUG("Save kmers. Don't want to have edges shorter than k");
- return Gap(INVALID_GAP);
- }
-
- if (overlap_info.r2.start_pos > g_.length(sink)) {
- DEBUG("Save kmers. Don't want to have edges shorter than k");
- return Gap(INVALID_GAP);
- }
-
- return Gap(
- (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos
- + g_.k()),
- (uint32_t) (g_.length(source) + g_.k()
- - overlap_info.r1.end_pos),
- (uint32_t) overlap_info.r2.start_pos);
- }
-
-private:
- DECL_LOGGER("LAGapJoiner");
- const size_t min_la_length_;
- const double flank_addition_coefficient_;
- const double flank_multiplication_coefficient_;
- constexpr static double IDENTITY_RATIO = 0.9;
- constexpr static double ESTIMATED_GAP_MULTIPLIER = 2.0;
- const size_t GAP_ADDITIONAL_COEFFICIENT = 30;
-};
-
-
-class CompositeGapJoiner: public GapJoiner {
-public:
-
- CompositeGapJoiner(const Graph& g,
- const vector<shared_ptr<GapJoiner>>& joiners,
-                       size_t may_overlap_threshold,
-                       int must_overlap_threshold,
-                       size_t artificial_gap) :
-            GapJoiner(g),
-            joiners_(joiners),
-            may_overlap_threshold_(may_overlap_threshold),
-            must_overlap_threshold_(must_overlap_threshold),
-            artificial_gap_(artificial_gap)
- { }
-
- Gap FixGap(EdgeId source, EdgeId sink, int estimated_gap) const override {
- DEBUG("Trying to fix estimated gap " << estimated_gap <<
- " between " << g_.str(source) << " and " << g_.str(sink));
-
- if (estimated_gap > int(g_.k() + may_overlap_threshold_)) {
- DEBUG("Edges are supposed to be too far to check overlaps");
- return Gap(estimated_gap);
- }
-
- for (auto joiner : joiners_) {
- Gap gap = joiner->FixGap(source, sink, estimated_gap);
- if (gap.gap_ != GapJoiner::INVALID_GAP) {
- return gap;
- }
- }
-
- //couldn't find decent overlap
- if (estimated_gap < must_overlap_threshold_) {
- DEBUG("Estimated gap looks unreliable");
- return Gap(INVALID_GAP);
- } else {
- DEBUG("Overlap was not found");
- return Gap(max(estimated_gap, int(g_.k() + artificial_gap_)));
- }
- }
-
-private:
- vector<shared_ptr<GapJoiner>> joiners_;
- const size_t may_overlap_threshold_;
- const int must_overlap_threshold_;
- const size_t artificial_gap_;
-
- DECL_LOGGER("CompositeGapJoiner");
-};
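A minimal standalone sketch of the fallback logic in FixGap above (not SPAdes code; k and the three thresholds are made-up values):

// If the edges are estimated to be far apart, keep the estimate; otherwise ask
// the joiners; if none succeeds, either reject a strongly negative estimate or
// pad the gap to at least k + artificial_gap.
#include <algorithm>
#include <cstdio>

int fix_gap_fallback(int estimated_gap, bool some_joiner_succeeded) {
    const int k = 55, may_overlap = 100, must_overlap = -20, artificial = 10;
    const int INVALID_GAP = -1000000;
    if (estimated_gap > k + may_overlap) return estimated_gap;
    if (some_joiner_succeeded) return estimated_gap;  // stand-in for a joiner's answer
    if (estimated_gap < must_overlap) return INVALID_GAP;
    return std::max(estimated_gap, k + artificial);
}

int main() {
    std::printf("%d\n", fix_gap_fallback(300, false));  // 300: kept as is
    std::printf("%d\n", fix_gap_fallback(-50, false));  // -1000000: unreliable estimate
    std::printf("%d\n", fix_gap_fallback(20, false));   // 65: padded to k + artificial
}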
-
-//FIXME move to tests
-//Just for tests; see overlap_analysis_tests.
-inline Gap MimicLAGapJoiner(Sequence& s1, Sequence& s2) {
- const int INVALID_GAP = -1000000;
- constexpr static double IDENTITY_RATIO = 0.9;
-
- SWOverlapAnalyzer overlap_analyzer_(10000);
- auto overlap_info = overlap_analyzer_.AnalyzeOverlap(s1, s2);
- size_t min_la_length_ = 4;
- if (overlap_info.size() < min_la_length_) {
- DEBUG("Low alignment size");
- return Gap(INVALID_GAP);
- }
- if (overlap_info.identity() < IDENTITY_RATIO) {
- DEBUG("Low identity score");
- return Gap(INVALID_GAP);
- }
- std::cout << overlap_info;
-
- return Gap(
- (int) (-overlap_info.r1.size() - overlap_info.r2.start_pos),
- (uint32_t) (s1.size() - overlap_info.r1.end_pos),
- (uint32_t) overlap_info.r2.start_pos);
-}
-
-
-//Detects a cycle as a minimal suffix longer than IS that also occurs earlier in the path. Overlap is allowed.
-class InsertSizeLoopDetector {
-protected:
- const Graph& g_;
- const GraphCoverageMap& cov_map_;
- size_t min_cycle_len_;
-
-public:
- InsertSizeLoopDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t is): g_(g), cov_map_(cov_map), min_cycle_len_(is) {
- }
-
- size_t GetMinCycleLenth() const {
- return min_cycle_len_;
- }
-
- bool CheckCycledNonIS(const BidirectionalPath& path) const {
- if (path.Size() <= 2) {
- return false;
- }
- BidirectionalPath last = path.SubPath(path.Size() - 2);
- int pos = path.FindFirst(last);
- VERIFY(pos >= 0);
- return size_t(pos) != path.Size() - 2;
- }
-
- bool CheckCycled(const BidirectionalPath& path) const {
- return FindCycleStart(path) != -1;
- }
-//first suffix longer than min_cycle_len
- int FindPosIS(const BidirectionalPath& path) const {
- int i = (int) path.Size() - 1;
- while (i >= 0 && path.LengthAt(i) < min_cycle_len_) {
- --i;
- }
- return i;
- }
- int FindCycleStart(const BidirectionalPath& path) const {
- TRACE("Looking for IS cycle " << min_cycle_len_);
- int i = FindPosIS(path);
- TRACE("last is pos " << i);
- if (i < 0) return -1;
-//Tail
- BidirectionalPath last = path.SubPath(i);
- //last.Print();
-
- int pos = path.FindFirst(last);
-// not a cycle
-        if (pos == i) pos = -1;
-        TRACE("looking for 1st IS cycle " << pos);
- return pos;
- }
-
-//After a cycle is detected, removes the minimal suffix longer than IS.
-//Returns the position where the cycle begins.
- int RemoveCycle(BidirectionalPath& path) const {
- int pos = FindCycleStart(path);
- DEBUG("Found IS cycle " << pos);
- if (pos == -1) {
- return -1;
- }
-
- int last_edge_pos = FindPosIS(path);
- VERIFY(last_edge_pos > -1);
- DEBUG("last edge pos " << last_edge_pos);
- VERIFY(last_edge_pos > pos);
- for (int i = (int) path.Size() - 1; i >= last_edge_pos; --i) {
- path.PopBack();
- }
- VERIFY((int) path.Size() == last_edge_pos);
- VERIFY(pos < (int) path.Size());
-        DEBUG("result pos " << pos);
- return pos;
- }
-};
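A minimal standalone sketch of the suffix-recurrence test behind FindCycleStart above (not SPAdes code; integers stand in for EdgeIds and std::search plays the role of FindFirst):

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
    std::vector<int> path = {1, 2, 3, 4, 2, 3, 4};  // the suffix {2, 3, 4} repeats earlier
    std::vector<int> tail(path.end() - 3, path.end());
    auto it = std::search(path.begin(), path.end(), tail.begin(), tail.end());
    size_t first_pos = size_t(it - path.begin());   // 1: an earlier occurrence, not the suffix itself
    bool is_cycle = first_pos + tail.size() < path.size();
    std::cout << "cycle starts at " << first_pos << ", detected: " << is_cycle << '\n';
}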
-
-class RepeatDetector {
-public:
- RepeatDetector(const Graph& g, const GraphCoverageMap& cov_map, size_t max_repeat_len)
- : g_(g),
- cov_map_(cov_map),
- used_paths_(),
- repeat_len_(max_repeat_len){
- empty_ = new BidirectionalPath(g_);
- }
- ~RepeatDetector() {
- delete empty_;
- }
-
- BidirectionalPath* RepeatPath(const BidirectionalPath& p) {
- if (p.Size() == 0) {
- return empty_;
- }
- EdgeId last_e = p.Back();
- BidirectionalPathSet cov_paths = cov_map_.GetCoveringPaths(last_e);
- DEBUG("cov paths for e " << g_.int_id(last_e) << " size " << cov_paths.size());
- size_t max_common_size = 0;
- BidirectionalPath* result_p = empty_;
- for (BidirectionalPath* cov_p : cov_paths) {
- if (used_paths_.find(cov_p) == used_paths_.end() || cov_p == &p || cov_p == p.GetConjPath()) {
- continue;
- }
- size_t common_size = MaxCommonSize(p, *cov_p);
-            DEBUG("max common size with path " << cov_p->GetId() << " is " << common_size);
- if (common_size == 0) {
- continue;
- }
- VERIFY(common_size <= p.Size());
- if (p.LengthAt(p.Size() - common_size) > repeat_len_) {
- DEBUG("repeat from " << (p.Size() - common_size) << " length " << p.LengthAt(p.Size() - common_size) << " repeat length " << repeat_len_);
- max_common_size = max(common_size, max_common_size);
- result_p = cov_p;
- }
- }
- used_paths_.insert(&p);
- DEBUG("max common size " << max_common_size);
- return result_p;
- }
- size_t MaxCommonSize(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
-        DEBUG("max common size")
- EdgeId last_e = p1.Back();
- vector<size_t> positions2 = p2.FindAll(last_e);
- DEBUG("pos size " << positions2.size())
- size_t max_common_size = 0;
- for (size_t pos2 : positions2) {
- size_t common_size = MaxCommonSize(p1, p1.Size() - 1, p2, pos2);
- DEBUG("max common size from " << pos2 << " is " << common_size);
- max_common_size = max(max_common_size, common_size);
- }
- return max_common_size;
- }
-private:
- size_t MaxCommonSize(const BidirectionalPath& p1, size_t pos1, const BidirectionalPath& p2, size_t pos2) const {
- int i1 = (int) pos1;
- int i2 = (int) pos2;
- while (i1 >= 0 && i2 >= 0 &&
- p1.At((size_t) i1) == p2.At((size_t) i2) &&
- p1.GapAt((size_t) i1) == p2.GapAt((size_t) i2)) {
- i1--;
- i2--;
- }
-        if (i1 >= 0 && i2 >= 0 && p1.At((size_t) i1) == p2.At((size_t) i2)) {
- i1--;
- i2--;
- }
-
- VERIFY(i1 <= (int)pos1);
- return std::max(size_t((int) pos1 - i1), (size_t)1);
- }
- const Graph& g_;
- const GraphCoverageMap& cov_map_;
- set<const BidirectionalPath*> used_paths_;
- size_t repeat_len_;
- BidirectionalPath* empty_;
-};
-
-class ContigsMaker {
-public:
- ContigsMaker(const Graph & g)
- : g_(g) { }
-
- virtual ~ContigsMaker() { }
-
- virtual void GrowPath(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
-
- virtual void GrowPathSimple(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
-
- virtual void GrowAll(PathContainer & paths, PathContainer& paths_storage) = 0;
-
-protected:
- const Graph& g_;
- DECL_LOGGER("PathExtender")
-};
-
-struct UsedUniqueStorage {
- set<EdgeId> used_;
-
- const ScaffoldingUniqueEdgeStorage& unique_;
-
-    UsedUniqueStorage(const ScaffoldingUniqueEdgeStorage& unique): used_(), unique_(unique) {}
-
- void insert(EdgeId e) {
- if (unique_.IsUnique(e)) {
- used_.insert(e);
- used_.insert(e->conjugate());
- }
- }
-
- bool IsUsedAndUnique(EdgeId e) const {
- return (unique_.IsUnique(e) && used_.find(e) != used_.end());
- }
-
- bool UniqueCheckEnabled() const {
- return unique_.size() > 0;
- }
-
-
-};
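A minimal standalone sketch of the unique-edge bookkeeping above (not SPAdes code; integers stand in for EdgeIds and a plain set stands in for ScaffoldingUniqueEdgeStorage):

#include <cassert>
#include <set>

struct ToyUsedUniqueStorage {
    std::set<int> used;
    const std::set<int>& unique;
    explicit ToyUsedUniqueStorage(const std::set<int>& u) : unique(u) {}
    void insert(int e, int conjugate) {
        // Only unique edges are recorded, together with their conjugates.
        if (unique.count(e)) { used.insert(e); used.insert(conjugate); }
    }
    bool IsUsedAndUnique(int e) const { return unique.count(e) && used.count(e); }
};

int main() {
    std::set<int> unique_edges = {5, 7};
    ToyUsedUniqueStorage storage(unique_edges);
    storage.insert(5, -5);  // unique edge: recorded
    storage.insert(3, -3);  // non-unique edge: ignored
    assert(storage.IsUsedAndUnique(5));
    assert(!storage.IsUsedAndUnique(3));
    return 0;
}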
-
-class PathExtender {
-public:
- PathExtender(const Graph & g):
- g_(g){ }
-
- virtual ~PathExtender() { }
-
- virtual bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
-
- void AddUniqueEdgeStorage(shared_ptr<UsedUniqueStorage> used_storage) {
- used_storage_ = used_storage;
- }
-protected:
- const Graph& g_;
- shared_ptr<UsedUniqueStorage> used_storage_;
- DECL_LOGGER("PathExtender")
-};
-
-class CompositeExtender : public ContigsMaker {
-public:
- CompositeExtender(Graph & g, GraphCoverageMap& cov_map,
- size_t max_diff_len,
- size_t max_repeat_length,
- bool detect_repeats_online)
- : ContigsMaker(g),
- cover_map_(cov_map),
- repeat_detector_(g, cover_map_, 2 * max_repeat_length),
- extenders_(),
- max_diff_len_(max_diff_len),
- max_repeat_len_(max_repeat_length),
- detect_repeats_online_(detect_repeats_online) {
- }
-
- CompositeExtender(Graph & g, GraphCoverageMap& cov_map,
- vector<shared_ptr<PathExtender> > pes,
- const ScaffoldingUniqueEdgeStorage& unique,
- size_t max_diff_len,
- size_t max_repeat_length,
- bool detect_repeats_online)
- : ContigsMaker(g),
- cover_map_(cov_map),
- repeat_detector_(g, cover_map_, 2 * max_repeat_length),
- extenders_(),
- max_diff_len_(max_diff_len),
- max_repeat_len_(max_repeat_length),
- detect_repeats_online_(detect_repeats_online) {
- extenders_ = pes;
-        used_storage_ = make_shared<UsedUniqueStorage>(unique);
- for (auto ex: extenders_) {
- ex->AddUniqueEdgeStorage(used_storage_);
- }
- }
-
- void AddExtender(shared_ptr<PathExtender> pe) {
- extenders_.push_back(pe);
- pe->AddUniqueEdgeStorage(used_storage_);
- }
-
- void GrowAll(PathContainer& paths, PathContainer& result) override {
- result.clear();
- GrowAllPaths(paths, result);
- LengthPathFilter filter(g_, 0);
- filter.filter(result);
- }
-
- void GrowPath(BidirectionalPath& path, PathContainer* paths_storage) override {
- while (MakeGrowStep(path, paths_storage)) { }
- }
-
- void GrowPathSimple(BidirectionalPath& path, PathContainer* paths_storage) override {
- while (MakeGrowStep(path, paths_storage, false)) { }
- }
-
- bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage, bool detect_repeats_online_local = true) {
- DEBUG("make grow step composite extender");
- if (detect_repeats_online_ && detect_repeats_online_local) {
- BidirectionalPath *repeat_path = repeat_detector_.RepeatPath(path);
- size_t repeat_size = repeat_detector_.MaxCommonSize(path, *repeat_path);
-
- if (repeat_size > 0) {
- DEBUG("repeat with length " << repeat_size);
- path.Print();
- repeat_path->Print();
- BidirectionalPath repeat = path.SubPath(path.Size() - repeat_size);
- int begin_repeat = repeat_path->FindLast(repeat);
- VERIFY(begin_repeat > -1);
- size_t end_repeat = (size_t) begin_repeat + repeat_size;
- DEBUG("not consistent subpaths ");
- BidirectionalPath begin1 = path.SubPath(0, path.Size() - repeat_size);
- begin1.Print();
- BidirectionalPath begin2 = repeat_path->SubPath(0, begin_repeat);
- begin2.Print();
-                int gap_in_repeat_path = repeat_path->GapAt(begin_repeat);
- BidirectionalPath end2 = repeat_path->SubPath(end_repeat);
- BidirectionalPath begin1_conj = path.SubPath(0, path.Size() - repeat_size + 1).Conjugate();
- BidirectionalPath begin2_conj = repeat_path->SubPath(0, begin_repeat + 1).Conjugate();
- pair<size_t, size_t> last = ComparePaths(0, 0, begin1_conj, begin2_conj, max_diff_len_);
- DEBUG("last " << last.first << " last2 " << last.second);
- path.Clear();
- repeat_path->Clear();
- int gap_len = repeat.GapAt(0);
-
- if (begin2.Size() == 0 || last.second != 0) { //TODO: incorrect: common edges, but then different ends
- path.PushBack(begin1);
- repeat_path->PushBack(begin2);
- } else {
-                    gap_len = gap_in_repeat_path;
- path.PushBack(begin2);
- repeat_path->PushBack(begin1);
- }
-
- path.PushBack(repeat.At(0), gap_len);
- path.PushBack(repeat.SubPath(1));
- path.PushBack(end2);
- DEBUG("new path");
- path.Print();
- return false;
- }
- }
-
- size_t current = 0;
- while (current < extenders_.size()) {
- DEBUG("step " << current << " of total " << extenders_.size());
- if (extenders_[current]->MakeGrowStep(path, paths_storage)) {
- return true;
- }
- ++current;
- }
- return false;
- }
-
-private:
- GraphCoverageMap& cover_map_;
- RepeatDetector repeat_detector_;
- vector<shared_ptr<PathExtender> > extenders_;
- size_t max_diff_len_;
- size_t max_repeat_len_;
- bool detect_repeats_online_;
- shared_ptr<UsedUniqueStorage> used_storage_;
-
- void SubscribeCoverageMap(BidirectionalPath * path) {
- path->Subscribe(&cover_map_);
- for (size_t i = 0; i < path->Size(); ++i) {
- cover_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
- }
- }
-
- void GrowAllPaths(PathContainer& paths, PathContainer& result) {
- cover_map_.Clear();
- for (size_t i = 0; i < paths.size(); ++i) {
- VERBOSE_POWER_T2(i, 100, "Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
- if (paths.size() > 10 && i % (paths.size() / 10 + 1) == 0) {
- INFO("Processed " << i << " paths from " << paths.size() << " (" << i * 100 / paths.size() << "%)");
- }
-//In 2015 modes, do not use a seed that was already used in other paths.
- if (used_storage_->UniqueCheckEnabled()) {
- bool was_used = false;
-                for (size_t ind = 0; ind < paths.Get(i)->Size(); ind++) {
- EdgeId eid = paths.Get(i)->At(ind);
- if (used_storage_->IsUsedAndUnique(eid)) {
- was_used = true; break;
- } else {
- used_storage_->insert(eid);
- }
- }
- if (was_used) {
- DEBUG("skipping already used seed");
- continue;
- }
- }
-//TODO: coverage_map should be exterminated
- if (!cover_map_.IsCovered(*paths.Get(i))) {
- BidirectionalPath * path = new BidirectionalPath(*paths.Get(i));
- BidirectionalPath * conjugatePath = new BidirectionalPath(*paths.GetConjugate(i));
- result.AddPair(path, conjugatePath);
- SubscribeCoverageMap(path);
- SubscribeCoverageMap(conjugatePath);
- size_t count_trying = 0;
- size_t current_path_len = 0;
- do {
- current_path_len = path->Length();
- count_trying++;
- GrowPath(*path, &result);
- GrowPath(*conjugatePath, &result);
- } while (count_trying < 10 && (path->Length() != current_path_len));
- path->CheckConjugateEnd(max_repeat_len_);
- DEBUG("result path " << path->GetId());
- path->Print();
- }
- }
- }
-
-};
-
-//All path extenders inherit from this one.
-
-class LoopDetectingPathExtender : public PathExtender {
-
-protected:
- size_t maxLoops_;
- bool investigate_short_loops_;
- bool use_short_loop_cov_resolver_;
- CovShortLoopResolver cov_loop_resolver_;
-
- vector<shared_ptr<BidirectionalPath> > visited_cycles_;
- InsertSizeLoopDetector is_detector_;
- const GraphCoverageMap& cov_map_;
-
-public:
- LoopDetectingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, size_t max_loops,
- bool investigate_short_loops,
- bool use_short_loop_cov_resolver, size_t is)
- : PathExtender(gp.g),
- maxLoops_(max_loops),
- investigate_short_loops_(investigate_short_loops),
- use_short_loop_cov_resolver_(use_short_loop_cov_resolver),
- cov_loop_resolver_(gp),
- is_detector_(gp.g, cov_map, is),
- cov_map_(cov_map) {
-
- }
-
- size_t getMaxLoops() const {
- return maxLoops_;
- }
-
- bool isInvestigateShortLoops() const {
- return investigate_short_loops_;
- }
-
- void setInvestigateShortLoops(bool investigateShortLoops) {
- this->investigate_short_loops_ = investigateShortLoops;
- }
-
- void setMaxLoops(size_t maxLoops) {
- if (maxLoops != 0) {
- this->maxLoops_ = maxLoops;
- }
- }
-//Seems that this is out of date.
- bool InExistingLoop(const BidirectionalPath& path) {
- TRACE("Checking existing loops");
- int j = 0;
- for (auto cycle : visited_cycles_) {
- VERBOSE_POWER2(j++, "checking ");
- int pos = path.FindLast(*cycle);
- if (pos == -1)
- continue;
-
- int start_cycle_pos = pos + (int) cycle->Size();
- bool only_cycles_in_tail = true;
- int last_cycle_pos = start_cycle_pos;
-            DEBUG("start_cycle pos " << last_cycle_pos);
- for (int i = start_cycle_pos; i < (int) path.Size() - (int) cycle->Size(); i += (int) cycle->Size()) {
- if (!path.CompareFrom(i, *cycle)) {
- only_cycles_in_tail = false;
- break;
- } else {
- last_cycle_pos = i + (int) cycle->Size();
- DEBUG("last cycle pos changed " << last_cycle_pos);
- }
- }
- DEBUG("last_cycle_pos " << last_cycle_pos);
- only_cycles_in_tail = only_cycles_in_tail && cycle->CompareFrom(0, path.SubPath(last_cycle_pos));
- if (only_cycles_in_tail) {
-// seems that most of this is useless, checking
- VERIFY (last_cycle_pos == start_cycle_pos);
- DEBUG("find cycle " << last_cycle_pos);
- DEBUG("path");
- path.Print();
- DEBUG("last subpath");
- path.SubPath(last_cycle_pos).Print();
- DEBUG("cycle");
- cycle->Print();
- DEBUG("last_cycle_pos " << last_cycle_pos << " path size " << path.Size());
- VERIFY(last_cycle_pos <= (int)path.Size());
- DEBUG("last cycle pos + cycle " << last_cycle_pos + (int)cycle->Size());
- VERIFY(last_cycle_pos + (int)cycle->Size() >= (int)path.Size());
-
- return true;
- }
- }
- return false;
- }
-
- void AddCycledEdges(const BidirectionalPath& path, size_t pos) {
- if (pos >= path.Size()) {
- DEBUG("Wrong position in IS cycle");
- return;
- }
- visited_cycles_.push_back(std::make_shared<BidirectionalPath>(path.SubPath(pos)));
- DEBUG("add cycle");
- path.SubPath(pos).Print();
- }
-
- bool DetectCycle(BidirectionalPath& path) {
- DEBUG("detect cycle");
- if (is_detector_.CheckCycled(path)) {
- DEBUG("Checking IS cycle");
- int loop_pos = is_detector_.RemoveCycle(path);
- DEBUG("Removed IS cycle");
- if (loop_pos != -1) {
- AddCycledEdges(path, loop_pos);
- return true;
- }
- }
- return false;
- }
-
- bool DetectCycleScaffolding(BidirectionalPath& path) {
- return is_detector_.CheckCycledNonIS(path);
- }
-
- virtual bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage = nullptr) = 0;
-
- virtual bool ResolveShortLoopByCov(BidirectionalPath& path) = 0;
-
- virtual bool ResolveShortLoopByPI(BidirectionalPath& path) = 0;
-
- virtual bool CanInvestigateShortLoop() const {
- return false;
- }
-
- bool MakeGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override {
- if (InExistingLoop(path)) {
- DEBUG("in existing loop");
- return false;
- }
- bool result = false;
- LoopDetector loop_detector(&path, cov_map_);
- if (DetectCycle(path)) {
- result = false;
- } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back()) && use_short_loop_cov_resolver_) {
- DEBUG("edge in short loop");
- result = ResolveShortLoop(path);
- } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop() && use_short_loop_cov_resolver_) {
- DEBUG("Prev edge in short loop");
- path.PopBack();
- result = ResolveShortLoop(path);
- } else {
- DEBUG("Making step");
- result = MakeSimpleGrowStep(path, paths_storage);
- DEBUG("Made step");
- if (DetectCycle(path)) {
- result = false;
- } else if (path.Size() >= 1 && InvestigateShortLoop() && loop_detector.EdgeInShortLoop(path.Back())) {
- DEBUG("Edge in short loop");
- result = ResolveShortLoop(path);
- } else if (InvestigateShortLoop() && loop_detector.PrevEdgeInShortLoop()) {
- DEBUG("Prev edge in short loop");
- path.PopBack();
- result = ResolveShortLoop(path);
- }
- }
- return result;
- }
-
-private:
- bool ResolveShortLoop(BidirectionalPath& p) {
- if (use_short_loop_cov_resolver_) {
- return ResolveShortLoopByCov(p);
- } else {
- return ResolveShortLoopByPI(p);
- }
- }
-
- bool InvestigateShortLoop() {
- return investigate_short_loops_ && (use_short_loop_cov_resolver_ || CanInvestigateShortLoop());
- }
-protected:
- DECL_LOGGER("LoopDetectingPathExtender")
-};
-
-class SimpleExtender: public LoopDetectingPathExtender {
-
-protected:
-
- shared_ptr<ExtensionChooser> extensionChooser_;
-
- void FindFollowingEdges(BidirectionalPath& path, ExtensionChooser::EdgeContainer * result) {
- DEBUG("Looking for the following edges")
- result->clear();
- vector<EdgeId> edges;
- DEBUG("Pushing back")
- push_back_all(edges, g_.OutgoingEdges(g_.EdgeEnd(path.Back())));
- result->reserve(edges.size());
- for (auto iter = edges.begin(); iter != edges.end(); ++iter) {
-            DEBUG("Adding edge with distance " << g_.int_id(*iter));
- result->push_back(EdgeWithDistance(*iter, 0));
- }
- DEBUG("Following edges found");
- }
-
-
-public:
-
- SimpleExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, shared_ptr<ExtensionChooser> ec,
- size_t is, size_t max_loops, bool investigate_short_loops, bool use_short_loop_cov_resolver):
- LoopDetectingPathExtender(gp, cov_map, max_loops, investigate_short_loops, use_short_loop_cov_resolver, is),
- extensionChooser_(ec) {
- }
-
- std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
- return extensionChooser_;
- }
-
- bool CanInvestigateShortLoop() const override {
- return extensionChooser_->WeightCounterBased();
- }
-
- bool ResolveShortLoopByCov(BidirectionalPath& path) override {
- LoopDetector loop_detector(&path, cov_map_);
- size_t init_len = path.Length();
- bool result = false;
- while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
- cov_loop_resolver_.ResolveShortLoop(path);
- if (init_len == path.Length()) {
- return result;
- } else {
- result = true;
- }
- init_len = path.Length();
- }
- return true;
- }
-
- bool ResolveShortLoopByPI(BidirectionalPath& path) override {
- if (extensionChooser_->WeightCounterBased()) {
- LoopResolver loop_resolver(g_, extensionChooser_->wc());
- LoopDetector loop_detector(&path, cov_map_);
- size_t init_len = path.Length();
- bool result = false;
- while (path.Size() >= 1 && loop_detector.EdgeInShortLoop(path.Back())) {
- loop_resolver.ResolveShortLoop(path);
- if (init_len == path.Length()) {
- return result;
- } else {
- result = true;
- }
- init_len = path.Length();
- }
- return true;
- }
- return false;
- }
-
- bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* paths_storage) override {
- ExtensionChooser::EdgeContainer candidates;
- return FilterCandidates(path, candidates) and AddCandidates(path, paths_storage, candidates);
- }
-
-protected:
- virtual bool FilterCandidates(BidirectionalPath& path, ExtensionChooser::EdgeContainer& candidates) {
- if (path.Size() == 0) {
- return false;
- }
- DEBUG("Simple grow step");
- path.Print();
- FindFollowingEdges(path, &candidates);
- DEBUG("found candidates");
- DEBUG(candidates.size())
- if (candidates.size() == 1) {
- LoopDetector loop_detector(&path, cov_map_);
- if (!investigate_short_loops_ && (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
- && extensionChooser_->WeightCounterBased()) {
- return false;
- }
- }
- DEBUG("more filtering");
- candidates = extensionChooser_->Filter(path, candidates);
- DEBUG("filtered candidates");
- DEBUG(candidates.size())
- return true;
- }
-
- virtual bool AddCandidates(BidirectionalPath& path, PathContainer* /*paths_storage*/, ExtensionChooser::EdgeContainer& candidates) {
- if (candidates.size() != 1)
- return false;
-
- LoopDetector loop_detector(&path, cov_map_);
-        DEBUG("loop detector");
- if (!investigate_short_loops_ &&
- (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
- && extensionChooser_->WeightCounterBased()) {
- return false;
- }
- DEBUG("push");
- EdgeId eid = candidates.back().e_;
-//In 2015 modes, when a unique edge that was already used is encountered, it is not added and path growing stops.
-//That allows us to avoid overlap removal hacks used earlier.
- if (used_storage_->UniqueCheckEnabled()) {
- if (used_storage_->IsUsedAndUnique(eid)) {
- return false;
- } else {
- used_storage_->insert(eid);
- }
- }
- path.PushBack(eid, candidates.back().d_);
- DEBUG("push done");
- return true;
- }
-
-protected:
- DECL_LOGGER("SimpleExtender")
-
-};
-
-
-class MultiExtender: public SimpleExtender {
-
-protected:
- size_t max_candidates_;
-
-public:
-
- MultiExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, shared_ptr<ExtensionChooser> ec,
- size_t is, size_t max_loops, bool investigate_short_loops, bool use_short_loop_cov_resolver,
- size_t max_candidates = 0):
- SimpleExtender(gp, cov_map, ec, is, max_loops, investigate_short_loops, use_short_loop_cov_resolver),
- max_candidates_(max_candidates) {
- }
-
-protected:
- virtual bool AddCandidates(BidirectionalPath& path, PathContainer* paths_storage, ExtensionChooser::EdgeContainer& candidates) override {
- bool res = false;
-
- if (candidates.size() == 1) {
- LoopDetector loop_detector(&path, cov_map_);
-            DEBUG("loop detector");
- if (!investigate_short_loops_ &&
- (loop_detector.EdgeInShortLoop(path.Back()) or loop_detector.EdgeInShortLoop(candidates.back().e_))
- && extensionChooser_->WeightCounterBased()) {
- return false;
- }
- DEBUG("push");
- EdgeId eid = candidates.back().e_;
- path.PushBack(eid, candidates.back().d_);
- DEBUG("push done");
- return true;
- }
- else if (candidates.size() == 2 && (max_candidates_ == 0 || candidates.size() <= max_candidates_)) {
- //Check for bulge
- auto v = g_.EdgeStart(candidates.front().e_);
- auto u = g_.EdgeEnd(candidates.front().e_);
- for (auto edge : candidates) {
- if (v != g_.EdgeStart(edge.e_) || u != g_.EdgeEnd(edge.e_))
- return false;
- }
-
- LoopDetector loop_detector(&path, cov_map_);
- DEBUG("loop detector");
- if (!investigate_short_loops_ && loop_detector.EdgeInShortLoop(path.Back())
- && extensionChooser_->WeightCounterBased()) {
- return false;
- }
-//The first candidate is added to THIS path.
- else if (not (!investigate_short_loops_ && loop_detector.EdgeInShortLoop(candidates.front().e_)
- && extensionChooser_->WeightCounterBased())) {
- DEBUG("push");
- path.PushBack(candidates.front().e_, candidates.front().d_);
- DEBUG("push done");
- res = true;
- }
- if (candidates.size() > 1) {
- DEBUG("Found " << candidates.size() << " candidates");
- }
-//Create new paths for all candidates other than the first.
- for (size_t i = 1; i < candidates.size(); ++i) {
- if (not (!investigate_short_loops_ && loop_detector.EdgeInShortLoop(candidates.front().e_)
- && extensionChooser_->WeightCounterBased())) {
- BidirectionalPath *p = new BidirectionalPath(path);
- p->PushBack(candidates[i].e_, candidates[i].d_);
- BidirectionalPath *cp = new BidirectionalPath(p->Conjugate());
- paths_storage->AddPair(p, cp);
- }
- }
- }
-
- return res;
- }
-
-protected:
- DECL_LOGGER("MultiExtender")
-
-};
-
-
-class ScaffoldingPathExtender: public LoopDetectingPathExtender {
-private:
- std::shared_ptr<ExtensionChooser> extension_chooser_;
- ExtensionChooser::EdgeContainer sources_;
- std::shared_ptr<GapJoiner> gap_joiner_;
- bool avoid_rc_connections_;
-
-//When check_sink_ is set to false we can scaffold not only tips.
- bool check_sink_;
-
- void InitSources() {
- sources_.clear();
-
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.IncomingEdgeCount(g_.EdgeStart(*iter)) == 0) {
- sources_.push_back(EdgeWithDistance(*iter, 0));
- }
- }
- }
-
- bool IsSink(EdgeId e) const {
- return g_.OutgoingEdgeCount(g_.EdgeEnd(e)) == 0;
- }
-
-protected:
- virtual bool GapSatisfies(int /*gap*/) const {
- return true;
- }
-
- bool MakeSimpleGrowStepForChooser(BidirectionalPath& path, std::shared_ptr<ExtensionChooser> ec, bool must_overlap = false) {
- if (path.Size() < 1 || (check_sink_ && !IsSink(path.Back()))) {
- return false;
- }
- DEBUG("scaffolding:");
- DEBUG("Simple grow step, growing path");
- path.Print();
- ExtensionChooser::EdgeContainer candidates = ec->Filter(path, sources_);
- DEBUG("scaffolding candidates " << candidates.size() << " from sources " << sources_.size());
-
- //DEBUG("Extension chooser threshold = " << ec->GetThreshold())
- DEBUG("Candidate size = " << candidates.size())
- if (candidates.size() == 1) {
- if (candidates[0].e_ == path.Back()
- || (avoid_rc_connections_ && candidates[0].e_ == g_.conjugate(path.Back()))) {
- return false;
- }
- BidirectionalPath temp_path(path);
- temp_path.PushBack(candidates[0].e_);
- if (this->DetectCycleScaffolding(temp_path)) {
- return false;
- }
-
- EdgeId eid = candidates.back().e_;
- if (check_sink_) {
- Gap gap = gap_joiner_->FixGap(path.Back(), candidates.back().e_, candidates.back().d_);
- DEBUG("Gap after fixing " << gap.gap_ << " (was " << candidates.back().d_ << ")");
- if (gap.gap_ != GapJoiner::INVALID_GAP) {
- DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length() <<
- ", fixed gap length: " << gap.gap_ << ", trash length: " << gap.trash_previous_ << "-" <<
- gap.trash_current_);
-
- if (used_storage_->UniqueCheckEnabled()) {
- if (used_storage_->IsUsedAndUnique(eid)) {
- return false;
- } else {
- used_storage_->insert(eid);
- }
- }
-
- if (must_overlap && GapSatisfies(gap.gap_)) {
-                        DEBUG("Overlap is not large enough")
- return false;
- }
- DEBUG("Overlap is good, success")
- path.PushBack(eid, gap);
- return true;
- }
- else {
- DEBUG("Looks like wrong scaffolding. PathId: " << path.GetId() << " path length: " <<
- path.Length() << ", fixed gap length: " << candidates.back().d_ << ", fixed = " << gap.gap_);
- return false;
- }
- }
- else {
- DEBUG("Gap joiners off");
- DEBUG("Scaffolding. PathId: " << path.GetId() << " path length: " << path.Length()
- << ", fixed gap length: " << candidates.back().d_);
-
- if (used_storage_->UniqueCheckEnabled()) {
- if (used_storage_->IsUsedAndUnique(eid)) {
- return false;
- } else {
- used_storage_->insert(eid);
- }
- }
- path.PushBack(candidates.back().e_, candidates.back().d_);
- return true;
- }
- }
- DEBUG("scaffolding end");
- return false;
- }
-
-public:
-
- ScaffoldingPathExtender(const conj_graph_pack& gp,
- const GraphCoverageMap& cov_map,
- std::shared_ptr<ExtensionChooser> extension_chooser,
- std::shared_ptr<GapJoiner> gap_joiner,
- size_t is,
- size_t max_loops,
- bool investigate_short_loops,
- bool avoid_rc_connections,
- bool check_sink = true):
- LoopDetectingPathExtender(gp, cov_map, max_loops, investigate_short_loops, false, is),
- extension_chooser_(extension_chooser),
- gap_joiner_(gap_joiner),
- avoid_rc_connections_(avoid_rc_connections),
- check_sink_(check_sink)
- {
- InitSources();
- }
-
- bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
- return MakeSimpleGrowStepForChooser(path, extension_chooser_);
- }
-
- bool ResolveShortLoopByCov(BidirectionalPath&) override {
- return false;
- }
-
- bool ResolveShortLoopByPI(BidirectionalPath&) override {
- return false;
- }
-
- std::shared_ptr<ExtensionChooser> GetExtensionChooser() const {
- return extension_chooser_;
- }
-
-protected:
- DECL_LOGGER("ScaffoldingPathExtender");
-};
-
-
-class RNAScaffoldingPathExtender: public ScaffoldingPathExtender {
- std::shared_ptr<ExtensionChooser> strict_extension_chooser_;
-
- int min_overlap_;
-
-protected:
- bool GapSatisfies(int gap) const override {
- return gap > (int) g_.k() - min_overlap_;
- }
-
-public:
-
- RNAScaffoldingPathExtender(const conj_graph_pack& gp, const GraphCoverageMap& cov_map, std::shared_ptr<ExtensionChooser> extension_chooser,
- std::shared_ptr<ExtensionChooser> strict_extension_chooser,
- std::shared_ptr<GapJoiner> gap_joiner,
- size_t is,
- size_t max_loops,
- bool investigate_short_loops,
- int min_overlap = 0):
- ScaffoldingPathExtender(gp, cov_map, extension_chooser, gap_joiner, is, max_loops, investigate_short_loops, true),
- strict_extension_chooser_(strict_extension_chooser), min_overlap_(min_overlap) {}
-
-
- bool MakeSimpleGrowStep(BidirectionalPath& path, PathContainer* /*paths_storage*/) override {
- return MakeSimpleGrowStepForChooser(path, GetExtensionChooser(), true) ||
- MakeSimpleGrowStepForChooser(path, strict_extension_chooser_);
- }
-
-};
-
-}
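
A small self-contained sketch (not from the deleted sources) of the arithmetic behind RNAScaffoldingPathExtender::GapSatisfies above: a scaffolding gap of size gap implies an overlap of roughly k - gap, so gap > k - min_overlap means the implied overlap is shorter than min_overlap and the candidate is rejected when an overlap is required. The concrete numbers below are made up for illustration.

    #include <cassert>

    int main() {
        const int k = 55;            // assumed k-mer size
        const int min_overlap = 10;  // assumed min_overlap_ of the RNA extender
        const int gap = 50;          // implied overlap: k - gap = 5 < min_overlap
        const bool gap_too_large = gap > k - min_overlap;  // what GapSatisfies() tests
        assert(gap_too_large);       // the extender therefore refuses this extension
        return 0;
    }
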
diff --git a/src/modules/algorithms/path_extend/path_filter.hpp b/src/modules/algorithms/path_extend/path_filter.hpp
deleted file mode 100644
index fa19ce9..0000000
--- a/src/modules/algorithms/path_extend/path_filter.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * path_filter.hpp
- *
- * Created on: Mar 14, 2012
- * Author: andrey
- */
-
-#ifndef PATH_FILTER_HPP_
-#define PATH_FILTER_HPP_
-
-#include "assembly_graph/paths/bidirectional_path.hpp"
-
-namespace path_extend {
-
-class CopyOnWritePathFilter {
-
-protected:
- Graph& g;
-
-public:
- CopyOnWritePathFilter(Graph& g_): g(g_) {
- }
-
- virtual bool predicate(BidirectionalPath& path) = 0;
-
- PathContainer filter(PathContainer& paths) {
- PathContainer result;
-
- for (size_t i = 0; i < paths.size(); ++i) {
- if (predicate(*paths.Get(i)) || predicate(*paths.GetConjugate(i))) {
- result.AddPair(paths.Get(i), paths.GetConjugate(i));
- }
- }
-
- return result;
- }
-
-};
-
-
-class IdFilter: public CopyOnWritePathFilter {
-
-protected:
- std::set<size_t> ids;
-
-public:
-
- IdFilter(Graph& g_, std::set<size_t> ids_): CopyOnWritePathFilter(g_), ids(ids_) {
- }
-
- virtual bool predicate(BidirectionalPath& path) {
- return ids.count(path.GetId()) > 0;
- }
-};
-
-
-class ErasingPathFilter {
-
-protected:
- const Graph& g;
-
-public:
- ErasingPathFilter(const Graph& g_): g(g_) {
- }
-
- virtual bool predicate(BidirectionalPath& path) = 0;
-
- void filter(PathContainer& paths) {
- for (PathContainer::Iterator iter = paths.begin(); iter != paths.end(); ) {
- if (predicate(*iter.get()) || predicate(*iter.getConjugate())) {
- iter = paths.erase(iter);
- }
- else {
- ++iter;
- }
- }
- }
-
-};
-
-
-class CoveragePathFilter: public ErasingPathFilter {
-
-protected:
- double minCoverage;
-
-public:
- CoveragePathFilter(Graph& g_, double cov): ErasingPathFilter(g_), minCoverage(cov) {
-
- }
-
- virtual bool predicate(BidirectionalPath& path) {
- for (size_t i = 0; i < path.Size(); ++i) {
- if (math::ls(g.coverage(path[i]), minCoverage)) {
- return true;
- }
- }
- return false;
- }
-};
-
-
-class LengthPathFilter: public ErasingPathFilter {
-
-protected:
- size_t minLength;
-
-public:
- LengthPathFilter(const Graph& g_, size_t len): ErasingPathFilter(g_), minLength(len) {
- }
-
- virtual bool predicate(BidirectionalPath& path) {
- return path.Length() <= minLength;
- }
-};
-
-
-class IsolatedPathFilter: public ErasingPathFilter {
-
-protected:
- size_t min_length_;
-
- double min_cov_;
-
-public:
- IsolatedPathFilter(const Graph& g_, size_t min_length, double min_cov = 10000000.0):
- ErasingPathFilter(g_),
- min_length_(min_length),
- min_cov_(min_cov) {
- }
-
- virtual bool predicate(BidirectionalPath& path) {
- if (path.Empty())
- return true;
-
- if (path.Size() <= 2) {
- auto v1 = g.EdgeStart(path.Front());
- auto v2 = g.EdgeEnd(path.Back());
-
- return g.IncomingEdgeCount(v1) == 0 &&
- g.OutgoingEdgeCount(v2) == 0 &&
- path.Length() < min_length_ &&
- math::ls(path.Coverage(), min_cov_);
- }
- return false;
- }
-};
-
-}
-
-#endif /* PATH_FILTER_HPP_ */
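
A short usage sketch (not from the deleted sources) of the filter hierarchy above: subclasses of ErasingPathFilter return true from predicate() for paths that should be erased in place, while CopyOnWritePathFilter subclasses keep the paths for which predicate() holds. The class below is hypothetical and assumes the types declared in this header.

    // Sketch: erase every path consisting of a single edge.
    class SingleEdgePathFilter : public path_extend::ErasingPathFilter {
    public:
        SingleEdgePathFilter(const Graph& g) : ErasingPathFilter(g) {}

        virtual bool predicate(path_extend::BidirectionalPath& path) {
            return path.Size() <= 1;  // true means "remove this path"
        }
    };

    // Hypothetical usage, given a Graph g and a PathContainer paths:
    //   SingleEdgePathFilter(g).filter(paths);
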
diff --git a/src/modules/algorithms/path_extend/path_visualizer.hpp b/src/modules/algorithms/path_extend/path_visualizer.hpp
deleted file mode 100644
index abcd4ad..0000000
--- a/src/modules/algorithms/path_extend/path_visualizer.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * path_visualizer.hpp
- *
- * Created on: Mar 22, 2012
- * Author: andrey
- */
-
-#ifndef PATH_VISUALIZER_HPP_
-#define PATH_VISUALIZER_HPP_
-
-#include "assembly_graph/paths/bidirectional_path.hpp"
-#include "assembly_graph/stats/picture_dump.hpp"
-
-namespace path_extend {
-
-using namespace debruijn_graph;
-
-template<class Graph>
-class PathGraphLabeler : public AbstractGraphLabeler<Graph> {
- typedef AbstractGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- std::map<EdgeId, std::string> labels_;
-
-public:
- PathGraphLabeler(const Graph& g, const PathContainer& paths) : base(g) {
- for(size_t i = 0; i < paths.size(); ++i) {
- BidirectionalPath * path = paths.Get(i);
- for (size_t j = 0; j < path->Size(); ++j) {
- if (labels_.count(path->At(j)) > 0) {
- labels_[path->At(j)] += ", ";
- }
- labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
- }
-
- path = paths.GetConjugate(i);
- for (size_t j = 0; j < path->Size(); ++j) {
- if (labels_.count(path->At(j)) > 0) {
- labels_[path->At(j)] += ", ";
- }
- labels_[path->At(j)] += "(" + ToString(path->GetId()) + " : " + ToString(j) + ")";
- }
- }
- }
-
- virtual std::string label(VertexId /*vertexId*/) const {
- return "";
- }
-
- virtual std::string label(EdgeId edgeId) const {
- auto label = labels_.find(edgeId);
- return label == labels_.end() ? "" : label->second;
- }
-};
-
-
-class PathVisualizer {
-
-protected:
- bool writeLength;
- bool writePos;
-
-public:
-
- PathVisualizer(): writeLength(true), writePos(true) {
-
- }
-
- void writeGraphWithPathsSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name, const PathContainer& paths) const {
- INFO("Visualizing graph " << graph_name << " to file " << file_name);
- std::fstream filestr;
- filestr.open(file_name.c_str(), std::fstream::out);
-
- StrGraphLabeler<Graph> str_labeler(gp.g);
- PathGraphLabeler<Graph> path_labeler(gp.g, paths);
- CoverageGraphLabeler<Graph> cov_labler(gp.g);
- EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
-
- CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, path_labeler, pos_labeler);
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
- if (gp.index.IsAttached()) {
- colorer = stats::DefaultColorer(gp);
- } else {
- colorer = omnigraph::visualization::DefaultColorer(gp.g);
- }
-
- omnigraph::visualization::ComponentVisualizer<Graph> visualizer(gp.g, false);
- omnigraph::visualization::EmptyGraphLinker<Graph> linker;
- visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
- filestr.close();
- INFO("Visualizing graph done");
- }
-
- void writeGraphSimple(const conj_graph_pack& gp, const string& file_name, const string& graph_name) const{
- INFO("Visualizing graph " << graph_name << " to file " << file_name);
- std::fstream filestr;
- filestr.open(file_name.c_str(), std::fstream::out);
-
- StrGraphLabeler<Graph> str_labeler(gp.g);
- EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
- CoverageGraphLabeler<Graph> cov_labler(gp.g);
- CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler, pos_labeler);
-
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
-
- if (gp.index.IsAttached()) {
- colorer = stats::DefaultColorer(gp);
- } else {
- Path<EdgeId> empty;
- colorer = omnigraph::visualization::DefaultColorer(gp.g, empty, empty);
- }
-
- omnigraph::visualization::ComponentVisualizer<Graph> visualizer(gp.g, false);
- omnigraph::visualization::EmptyGraphLinker<Graph> linker;
- visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
- filestr.close();
- INFO("Visualizing graph done");
- }
-
- void writeGraphSimple(const Graph& g, const string& file_name, const string& graph_name) const{
- INFO("Visualizing graph " << graph_name << " to file " << file_name);
- std::fstream filestr;
- filestr.open(file_name.c_str(), std::fstream::out);
-
- StrGraphLabeler<Graph> str_labeler(g);
- CoverageGraphLabeler<Graph> cov_labler(g);
- CompositeLabeler<Graph> composite_labeler(str_labeler, cov_labler);
-
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer;
-
- Path<EdgeId> empty;
- colorer = omnigraph::visualization::DefaultColorer(g, empty, empty);
-
- omnigraph::visualization::ComponentVisualizer<Graph> visualizer(g, false);
- omnigraph::visualization::EmptyGraphLinker<Graph> linker;
- visualizer.Visualize(filestr, composite_labeler, *colorer, linker);
- filestr.close();
- INFO("Visualizing graph done");
- }
-
- bool isWriteLength() const
- {
- return writeLength;
- }
-
- bool isWritePos() const
- {
- return writePos;
- }
-
- void setWriteLength(bool writeLength)
- {
- this->writeLength = writeLength;
- }
-
- void setWritePos(bool writePos)
- {
- this->writePos = writePos;
- }
-};
-
-}
-
-#endif /* PATH_VISUALIZER_HPP_ */
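
A brief hypothetical usage sketch (not from the deleted sources) of the visualizer above; it assumes a populated conj_graph_pack gp and a PathContainer paths, and writes a dot file whose edges carry the "(path_id : position)" labels produced by PathGraphLabeler.

    path_extend::PathVisualizer viz;
    viz.writeGraphWithPathsSimple(gp, "graph_with_paths.dot", "assembly_graph", paths);
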
diff --git a/src/modules/algorithms/path_extend/pe_config_struct.cpp b/src/modules/algorithms/path_extend/pe_config_struct.cpp
deleted file mode 100644
index 1acab7c..0000000
--- a/src/modules/algorithms/path_extend/pe_config_struct.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "pe_config_struct.hpp"
-#include "pipeline/config_common.hpp"
-
-namespace path_extend {
-
-void load(output_broken_scaffolds& obs, boost::property_tree::ptree const& pt, std::string const& key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- obs = pe_config::output_broken_scaffolds_id(ep);
- }
-}
-
-void load(scaffolding_mode &sm, boost::property_tree::ptree const& pt, std::string const& key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- sm = pe_config::scaffolding_mode_id(ep);
- }
-}
-
-void load(pe_config::ParamSetT::ScaffoldGraphParamsT& sg, boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(sg.construct, pt, "construct" );
- load(sg.output, pt, "output" );
- load(sg.always_add, pt, "always_add" );
- load(sg.never_add, pt, "never_add" );
- load(sg.relative_threshold, pt, "relative_threshold" );
- load(sg.graph_connectivity, pt, "graph_connectivity");
- load(sg.max_path_length, pt, "max_path_length" );
-}
-
-void load(pe_config::OutputParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(o.write_overlaped_paths, pt, "write_overlaped_paths" , complete);
- load(o.write_paths, pt, "write_paths" , complete);
-}
-
-void load(pe_config::VisualizeParamsT& o, boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(o.print_overlaped_paths, pt, "print_overlaped_paths" , complete);
- load(o.print_paths, pt, "print_paths" , complete);
-}
-
-void load(pe_config::ParamSetT::ExtensionOptionsT& es,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(es.use_default_single_threshold, pt, "use_default_single_threshold", complete);
- load(es.priority_coeff, pt, "priority_coeff", complete);
- load(es.weight_threshold, pt, "weight_threshold", complete);
- load(es.single_threshold, pt, "single_threshold", complete);
- load(es.max_repeat_length, pt, "max_repeat_length", complete);
-}
-
-void load(pe_config::ParamSetT::LoopRemovalT& lr,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(lr.max_loops, pt, "max_loops", complete);
- load(lr.mp_max_loops, pt, "mp_max_loops", complete);
-}
-
-void load(pe_config::ParamSetT::CoordinatedCoverageT& coord_cov,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(coord_cov.max_edge_length_in_repeat, pt, "max_edge_length_repeat", complete);
- load(coord_cov.delta, pt, "delta", complete);
- load(coord_cov.min_path_len, pt, "min_path_len", complete);
-}
-
-void load(pe_config::ParamSetT::ScaffolderOptionsT& so,
- boost::property_tree::ptree const& pt, bool complete)
-{
- using config_common::load;
- load(so.enabled, pt, "enabled" , complete);
- load(so.cutoff , pt, "cutoff", complete);
- load(so.hard_cutoff , pt, "hard_cutoff", complete);
- load(so.rel_cutoff , pt, "rel_cutoff", complete);
- load(so.sum_threshold , pt, "sum_threshold", complete);
-
- load(so.cluster_info , pt, "cluster_info", complete);
- load(so.cl_threshold , pt, "cl_threshold", complete);
-
- load(so.use_la_gap_joiner , pt, "use_la_gap_joiner", complete);
- load(so.min_gap_score , pt, "min_gap_score", complete);
- load(so.max_must_overlap , pt, "max_must_overlap", complete);
- load(so.max_can_overlap , pt, "max_can_overlap", complete);
- load(so.short_overlap , pt, "short_overlap", complete);
- load(so.artificial_gap , pt, "artificial_gap", complete);
- load(so.use_old_score , pt, "use_old_score", complete);
- load(so.min_overlap_length, pt, "min_overlap_length", complete);
- load(so.flank_addition_coefficient, pt, "flank_addition_coefficient", complete);
- load(so.flank_multiplication_coefficient, pt, "flank_multiplication_coefficient", complete);
-
- load(so.var_coeff , pt, "var_coeff", complete);
- load(so.basic_overlap_coeff, pt, "basic_overlap_coeff", complete);
-
- if (pt.count("min_overlap_for_rna_scaffolding")) {
- VERIFY_MSG(!so.min_overlap_for_rna_scaffolding, "Option can be loaded only once");
- so.min_overlap_for_rna_scaffolding.reset(0);
- load(*so.min_overlap_for_rna_scaffolding, pt, "min_overlap_for_rna_scaffolding");
- }
-}
-
-
-void load(pe_config::ParamSetT::PathFiltrationT& pf,
- boost::property_tree::ptree const& pt, bool complete)
-{
- using config_common::load;
- load(pf.enabled , pt, "enabled" , complete);
- if (pf.enabled) {
- load(pf.min_length , pt, "min_length" , complete);
- load(pf.isolated_min_length , pt, "isolated_min_length" , complete);
- load(pf.min_length_for_low_covered , pt, "min_length_for_low_covered" , complete);
- load(pf.min_coverage , pt, "min_coverage" , complete);
- }
-}
-
-void load(pe_config::ParamSetT& p, boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(p.sm, pt, "scaffolding_mode", complete);
- load(p.normalize_weight, pt, "normalize_weight", complete);
- load(p.cut_all_overlaps, pt, "cut_all_overlaps", complete);
- load(p.remove_overlaps, pt, "remove_overlaps", complete);
- load(p.multi_path_extend, pt, "multi_path_extend", complete);
- load(p.split_edge_length, pt, "split_edge_length", complete);
- load(p.extension_options, pt, "extension_options", complete);
- load(p.mate_pair_options, pt, "mate_pair_options", complete);
- load(p.scaffolder_options, pt, "scaffolder", complete);
- load(p.loop_removal, pt, "loop_removal", complete);
- load(p.coordinated_coverage, pt, "coordinated_coverage", complete);
- load(p.use_coordinated_coverage, pt, "use_coordinated_coverage", complete);
- load(p.scaffolding2015, pt, "scaffolding2015", complete);
- load(p.scaffold_graph_params, pt, "scaffold_graph", complete);
- load(p.path_filtration, pt, "path_cleaning", complete);
-
-}
-
-
-void load(pe_config::LongReads& p, boost::property_tree::ptree const& pt,
- bool complete) {
- using config_common::load;
- load(p.filtering, pt, "filtering", complete);
- load(p.weight_priority, pt, "weight_priority", complete);
- load(p.unique_edge_priority, pt, "unique_edge_priority", complete);
- load(p.min_significant_overlap, pt, "min_significant_overlap", complete);
-
-}
-
-void load(pe_config::ParamSetT::Scaffolding2015& p, boost::property_tree::ptree const& pt,
- bool) {
- using config_common::load;
- load(p.autodetect, pt, "autodetect");
- load(p.min_unique_length, pt, "min_unique_length");
- load(p.unique_coverage_variation, pt, "unique_coverage_variation");
- load(p.relative_weight_cutoff, pt, "relative_weight_cutoff");
-
-}
-
-void load(pe_config::AllLongReads& p, boost::property_tree::ptree const& pt,
- bool complete) {
- using config_common::load;
- load(p.pacbio_reads, pt, "pacbio_reads", complete);
- load(p.single_reads, pt, "single_reads", complete);
- load(p.contigs, pt, "contigs", complete);
- load(p.meta_contigs, pt, "meta_untrusted_contigs", complete);
-}
-
-void load(pe_config::MainPEParamsT& p, boost::property_tree::ptree const& pt,
- bool complete) {
- using config_common::load;
- load(p.debug_output, pt, "debug_output", complete);
- load(p.output, pt, "output", complete);
- load(p.viz, pt, "visualize", complete);
- load(p.obs, pt, "output_broken_scaffolds", complete);
- load(p.param_set, pt, "params", complete);
- load(p.long_reads, pt, "long_reads", complete);
- if (!p.debug_output) {
- p.output.DisableAll();
- p.viz.DisableAll();
- }
- p.etc_dir = "path_extend";
-}
-
-//// main long contigs config load function
-//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete) {
-// using config_common::load;
-//
-// load(pe_cfg.dataset_name , pt, "dataset", complete);
-// load(pe_cfg.params , pt, "pe_params", complete);
-//}
-
-};
-
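
A minimal sketch (not from the deleted sources) of the loading pattern used throughout this file: when complete is false, a key only overrides the current value if it is present in the property tree, which is how mode-specific configs patch the defaults from pe_params.info. The helper name below is hypothetical.

    #include <string>
    #include <boost/property_tree/ptree.hpp>

    // Sketch: mirror of the "load only if present, unless complete" idiom above.
    template <class T>
    void load_optional(T& value, const boost::property_tree::ptree& pt,
                       const std::string& key, bool complete) {
        if (complete || pt.find(key) != pt.not_found())
            value = pt.get<T>(key);
    }
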
diff --git a/src/modules/algorithms/path_extend/pe_config_struct.hpp b/src/modules/algorithms/path_extend/pe_config_struct.hpp
deleted file mode 100644
index 620f7c8..0000000
--- a/src/modules/algorithms/path_extend/pe_config_struct.hpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * lc_config_struct.hpp
- *
- * Created on: Aug 16, 2011
- * Author: Alexey.Gurevich
- */
-
-#ifndef LC_CONFIG_STRUCT_HPP_
-#define LC_CONFIG_STRUCT_HPP_
-
-#include "pipeline/config_singl.hpp"
-#include "dev_support/cpp_utils.hpp"
-
-#include <boost/optional.hpp>
-#include <boost/property_tree/ptree_fwd.hpp>
-#include <boost/bimap.hpp>
-
-#include <string>
-#include <vector>
-
-namespace path_extend {
-
-enum output_broken_scaffolds {
- obs_none,
- obs_break_gaps,
- obs_break_all
-};
-
-enum scaffolding_mode {
- sm_old,
- sm_2015,
- sm_combined,
- sm_old_pe_2015
-};
-
-inline bool IsScaffolder2015Enabled(const scaffolding_mode mode) {
- return (mode == sm_old_pe_2015 || mode == sm_2015 || mode == sm_combined);
-}
-
-inline bool IsOldPEEnabled(const scaffolding_mode mode) {
- return (mode == sm_old_pe_2015 || mode == sm_old || mode == sm_combined);
-}
-
-// struct for path extend subproject's configuration file
-struct pe_config {
-
- typedef boost::bimap<std::string, output_broken_scaffolds> output_broken_scaffolds_id_mapping;
-
- static const output_broken_scaffolds_id_mapping FillOBSInfo() {
- output_broken_scaffolds_id_mapping::value_type info[] = {
- output_broken_scaffolds_id_mapping::value_type("none", obs_none),
- output_broken_scaffolds_id_mapping::value_type("break_gaps", obs_break_gaps),
- output_broken_scaffolds_id_mapping::value_type("break_all", obs_break_all)
- };
-
- return output_broken_scaffolds_id_mapping(info, utils::array_end(info));
- }
-
- static const output_broken_scaffolds_id_mapping &output_broken_scaffolds_info() {
- static output_broken_scaffolds_id_mapping output_broken_scaffolds_info = FillOBSInfo();
- return output_broken_scaffolds_info;
- }
-
- static const std::string &output_broken_scaffolds_name(output_broken_scaffolds obs) {
- auto it = output_broken_scaffolds_info().right.find(obs);
- VERIFY_MSG(it != output_broken_scaffolds_info().right.end(),
- "No name for output broken scaffolds mode id = " << obs);
-
- return it->second;
- }
-
- static output_broken_scaffolds output_broken_scaffolds_id(std::string name) {
- auto it = output_broken_scaffolds_info().left.find(name);
- VERIFY_MSG(it != output_broken_scaffolds_info().left.end(),
- "There is no output broken scaffolds mode with name = " << name);
-
- return it->second;
- }
-
- typedef boost::bimap<std::string, scaffolding_mode> scaffolding_mode_id_mapping;
-
- static const scaffolding_mode_id_mapping FillSMInfo() {
- scaffolding_mode_id_mapping::value_type info[] = {
- scaffolding_mode_id_mapping::value_type("old", sm_old),
- scaffolding_mode_id_mapping::value_type("2015", sm_2015),
- scaffolding_mode_id_mapping::value_type("combined", sm_combined),
- scaffolding_mode_id_mapping::value_type("old_pe_2015", sm_old_pe_2015)
- };
-
- return scaffolding_mode_id_mapping(info, utils::array_end(info));
- }
-
- static const scaffolding_mode_id_mapping &scaffolding_mode_info() {
- static scaffolding_mode_id_mapping scaffolding_mode_info = FillSMInfo();
- return scaffolding_mode_info;
- }
-
- static const std::string &scaffolding_mode_name(scaffolding_mode sm) {
- auto it = scaffolding_mode_info().right.find(sm);
- VERIFY_MSG(it != scaffolding_mode_info().right.end(),
- "No name for scaffolding mode id = " << sm);
-
- return it->second;
- }
-
- static scaffolding_mode scaffolding_mode_id(std::string name) {
- auto it = scaffolding_mode_info().left.find(name);
- VERIFY_MSG(it != scaffolding_mode_info().left.end(),
- "There is no scaffolding mode with name = " << name);
-
- return it->second;
- }
-
- struct OutputParamsT {
- bool write_overlaped_paths;
- bool write_paths;
-
- void DisableAll() {
- write_overlaped_paths = false;
- write_paths = false;
- }
- };
-
-
- struct VisualizeParamsT {
- bool print_overlaped_paths;
- bool print_paths;
-
- void DisableAll() {
- print_overlaped_paths = false;
- print_paths = false;
- }
- };
-
- struct ParamSetT {
- scaffolding_mode sm;
-
- bool normalize_weight;
- size_t split_edge_length;
-
- bool multi_path_extend;
- bool remove_overlaps;
- bool cut_all_overlaps;
-
- struct ExtensionOptionsT {
- bool use_default_single_threshold;
- double single_threshold;
- double weight_threshold;
- double priority_coeff;
- size_t max_repeat_length;
- } extension_options;
-
- ExtensionOptionsT mate_pair_options;
-
-
- struct ScaffolderOptionsT {
- bool enabled;
- int cutoff;
- int hard_cutoff;
- double rel_cutoff;
- double sum_threshold;
-
- bool cluster_info;
- double cl_threshold;
-
- bool use_la_gap_joiner;
- double min_gap_score;
- double max_must_overlap;
- double max_can_overlap;
- int short_overlap;
- size_t artificial_gap;
-
- bool use_old_score;
-
- double var_coeff;
- double basic_overlap_coeff;
-
- size_t min_overlap_length;
- double flank_addition_coefficient;
- double flank_multiplication_coefficient;
-
- boost::optional<int> min_overlap_for_rna_scaffolding;
- } scaffolder_options;
-
-
- struct LoopRemovalT {
- size_t max_loops;
- size_t mp_max_loops;
- } loop_removal;
-
- struct PathFiltrationT {
- bool enabled;
- size_t min_length;
- size_t isolated_min_length;
- size_t min_length_for_low_covered;
- double min_coverage;
- } path_filtration;
-
-
- bool use_coordinated_coverage;
-
- struct CoordinatedCoverageT {
- size_t max_edge_length_in_repeat;
- double delta;
- size_t min_path_len;
- } coordinated_coverage;
-
- struct Scaffolding2015 {
- bool autodetect;
- size_t min_unique_length;
- double unique_coverage_variation;
- double relative_weight_cutoff;
- } scaffolding2015;
-
- struct ScaffoldGraphParamsT {
- bool construct;
- bool output;
- size_t always_add;
- size_t never_add;
- double relative_threshold;
- bool graph_connectivity;
- size_t max_path_length;
- } scaffold_graph_params;
- };
-
- struct LongReads {
- double filtering;
- double weight_priority;
- double unique_edge_priority;
- size_t min_significant_overlap;
- };
-
- struct AllLongReads {
- LongReads single_reads;
- LongReads pacbio_reads;
- LongReads contigs;
- LongReads meta_contigs;
- };
-
-
- struct MainPEParamsT {
- output_broken_scaffolds obs;
-
- bool finalize_paths;
- bool debug_output;
- std::string etc_dir;
-
- OutputParamsT output;
- VisualizeParamsT viz;
- ParamSetT param_set;
- AllLongReads long_reads;
- }; // params;
-
-};
-
-void load(pe_config::ParamSetT &p, boost::property_tree::ptree const &pt, bool complete = true);
-void load(pe_config::MainPEParamsT &p, boost::property_tree::ptree const &pt, bool complete = true);
-//void load(pe_config& pe_cfg, boost::property_tree::ptree const& pt, bool complete);
-
-}
-
-//typedef config_common::config<path_extend::pe_config> pe_cfg;
-
-#endif /* CONFIG_STRUCT_HPP_ */
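
A tiny sketch (not from the deleted sources) of how the bimap-backed lookups above are used; the asserts restate relationships that follow directly from the tables and predicates in this header.

    #include <cassert>

    void check_mode_tables() {
        using namespace path_extend;
        assert(pe_config::scaffolding_mode_id("old_pe_2015") == sm_old_pe_2015);
        assert(pe_config::scaffolding_mode_name(sm_2015) == "2015");
        // "combined" enables both the 2015 scaffolder and the old PE scaffolder.
        assert(IsScaffolder2015Enabled(sm_combined) && IsOldPEEnabled(sm_combined));
    }
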
diff --git a/src/modules/algorithms/path_extend/pe_io.hpp b/src/modules/algorithms/path_extend/pe_io.hpp
deleted file mode 100644
index a31623c..0000000
--- a/src/modules/algorithms/path_extend/pe_io.hpp
+++ /dev/null
@@ -1,290 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef PE_IO_HPP_
-#define PE_IO_HPP_
-
-
-#include "algorithms/genome_consistance_checker.hpp"
-#include "assembly_graph/paths/bidirectional_path.hpp"
-#include "assembly_graph/graph_support/contig_output.hpp"
-#include "assembly_graph/components/connected_component.hpp"
-#include "io/reads_io/osequencestream.hpp"
-#include <stack>
-#include <algorithm>
-
-namespace path_extend {
-
-using namespace debruijn_graph;
-
-class ContigWriter {
-protected:
- DECL_LOGGER("PathExtendIO")
-
-protected:
- const Graph& g_;
- ContigConstructor<Graph> &constructor_;
- size_t k_;
- map<EdgeId, ExtendedContigIdT> ids_;
- const ConnectedComponentCounter &c_counter_;
- bool plasmid_contig_naming_;
-
- //TODO: add constructor
- string ToString(const BidirectionalPath& path) const {
- stringstream ss;
- if (path.IsInterstrandBulge() && path.Size() == 1) {
- ss << constructor_.construct(path.Back()).first.substr(k_, g_.length(path.Back()) - k_);
- return ss.str();
- }
-
- if (!path.Empty()) {
- ss << constructor_.construct(path[0]).first.substr(0, k_);
- }
-
-
- size_t i = 0;
- while (i < path.Size()) {
- int gap = i == 0 ? 0 : path.GapAt(i);
- if (gap > (int) k_) {
- for (size_t j = 0; j < gap - k_; ++j) {
- ss << "N";
- }
- ss << constructor_.construct(path[i]).first;
- }
- else {
- int overlapLen = (int) k_ - gap;
- if (overlapLen >= (int) g_.length(path[i]) + (int) k_) {
- overlapLen -= (int) g_.length(path[i]) + (int) k_;
- ++i;
- //skipping overlapping edges
- while (i < path.Size() && overlapLen >= (int) g_.length(path[i]) + path.GapAt(i)) {
- overlapLen -= (int) g_.length(path[i]) + path.GapAt(i);
- ++i;
- }
- if (i == path.Size()) {
- break;
- }
-
- overlapLen = overlapLen + (int) k_ - path.GapAt(i);
- if(overlapLen < 0) {
- for (size_t j = 0; j < abs(overlapLen); ++j) {
- ss << "N";
- }
- overlapLen = 0;
- }
- }
- auto temp_str = g_.EdgeNucls(path[i]).Subseq(overlapLen).str();
- if(i != path.Size() - 1) {
- for(size_t j = 0 ; j < path.TrashPreviousAt(i + 1); ++j) {
- temp_str.pop_back();
- if(temp_str.size() == 0) {
- break;
- }
- }
- }
- ss << temp_str;
- }
- ++i;
- }
- return ss.str();
- }
-
- string ToFASTGString(const BidirectionalPath& path) const {
- if (path.Empty())
- return "";
- string res = ids_.at(path.Front()).short_id_;
- for (size_t i = 1; i < path.Size(); ++i) {
- if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i])) {
- res += ";\n" + ids_.at(path[i]).short_id_;
- }
- else {
- res += "," + ids_.at(path[i]).short_id_;
- }
- }
- return res;
- }
-
-
-public:
- ContigWriter(const Graph& g,
- ContigConstructor<Graph> &constructor,
- const ConnectedComponentCounter &c_counter,
- bool plasmid_contig_naming = false):
- g_(g), constructor_(constructor), k_(g.k()),
- ids_(), c_counter_(c_counter),
- plasmid_contig_naming_(plasmid_contig_naming)
- {
- MakeContigIdMap(g_, ids_, c_counter, "NODE");
- }
-
-
- void WriteEdges(const string &filename) const {
- INFO("Outputting edges to " << filename);
- io::osequencestream_with_id oss(filename);
-
- set<EdgeId> included;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (included.count(*iter) == 0) {
- oss.setCoverage(g_.coverage(*iter));
- oss.setID((int) g_.int_id(*iter));
- oss << g_.EdgeNucls(*iter);
-
- included.insert(*iter);
- included.insert(g_.conjugate(*iter));
- }
- }
- DEBUG("Contigs written");
- }
-
-
- void WritePaths(const PathContainer &paths, const string &filename) const {
- INFO("Outputting path data to " << filename);
- std::ofstream oss;
- oss.open(filename.c_str());
- int i = 1;
- oss << paths.size() << endl;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- //oss << i << endl;
- i++;
- BidirectionalPath* path = iter.get();
- if (path->GetId() % 2 != 0) {
- path = path->GetConjPath();
- }
- oss << "PATH " << path->GetId() << " " << path->Size() << " " << path->Length() + k_ << endl;
- for (size_t j = 0; j < path->Size(); ++j) {
- oss << g_.int_id(path->At(j)) << " " << g_.length(path->At(j)) << " " << path->GapAt(j) << " " << path->TrashPreviousAt(j) << " " << path->TrashCurrentAt(j) << endl;
- }
- //oss << endl;
- }
- oss.close();
- DEBUG("Edges written");
- }
-
- void LoadPaths(PathContainer &paths, GraphCoverageMap &cover_map, const string &filename) const {
- paths.clear();
- map<size_t, EdgeId> int_ids;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- int_ids.insert(make_pair(g_.int_id(*iter), *iter));
- }
-
- std::ifstream iss;
- iss.open(filename);
- size_t psize;
- iss >> psize;
- for(size_t i = 0; i < psize && !iss.eof(); ++i) {
- string s;
- size_t id;
- size_t size;
- size_t len;
- iss >> s >> id >> size >> len;
- VERIFY(s == "PATH");
-
- BidirectionalPath * path = new BidirectionalPath(g_);
- BidirectionalPath * conjugatePath = new BidirectionalPath(g_);
- paths.AddPair(path, conjugatePath);
- path->Subscribe(&cover_map);
- conjugatePath->Subscribe(&cover_map);
- for (size_t j = 0; !iss.eof() && j < size; ++j) {
- size_t eid;
- size_t elen;
- int gap;
- uint32_t trash_prev;
- uint32_t trash_current;
-
- iss >> eid >> elen >> gap >> trash_prev >> trash_current;
- Gap gap_struct(gap, trash_prev, trash_current);
- EdgeId edge = int_ids[eid];
- conjugatePath->PushBack(edge, gap_struct);
- VERIFY(g_.length(edge) == elen);
- }
- VERIFY(path->Length() + k_ == len);
- }
- VERIFY(psize == paths.size());
- iss.close();
- }
-
- void WritePathsToFASTA(const PathContainer &paths,
- const string &filename_base,
- bool write_fastg = true) const {
-
- INFO("Writing contigs to " << filename_base);
- io::osequencestream_simple oss(filename_base + ".fasta");
-
- std::ofstream os_fastg;
- if (write_fastg)
- os_fastg.open((filename_base + ".paths").c_str());
-
- int i = 0;
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if (iter.get()->Length() <= 0)
- continue;
- i++;
- DEBUG("NODE " << i);
- BidirectionalPath* path = iter.get();
- path->Print();
- string contig_id;
- string path_string = ToString(*path);
- if (plasmid_contig_naming_) {
- EdgeId e = path->At(0);
- size_t component = c_counter_.GetComponent(e);
- contig_id = io::MakeContigComponentId(i, path_string.length(), path->Coverage(), component);
- } else {
- contig_id = io::MakeContigId(i, path_string.length(), path->Coverage());
- }
- oss.set_header(contig_id);
- if (write_fastg) {
- os_fastg << contig_id<< endl;
- os_fastg << ToFASTGString(*iter.get()) << endl;
- os_fastg << contig_id << "'" << endl;
- os_fastg << ToFASTGString(*iter.getConjugate()) << endl;
- }
- oss << path_string;
- }
- if (write_fastg)
- os_fastg.close();
- DEBUG("Contigs written");
- }
-
- void WriteFASTGPaths(const PathContainer& paths, const string& filename) const {
- INFO("Writing FASTG paths to " << filename);
- std::ofstream oss(filename.c_str());
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- if (iter.get()->Length() <= 0)
- continue;
- oss << ToFASTGString(*iter.get()) << endl;
- oss << ToFASTGString(*iter.getConjugate()) << endl;
- }
- oss.close();
- }
-
- void OutputPaths(const PathContainer& paths, const string& filename_base) const {
- WritePathsToFASTA(paths, filename_base);
- }
-
-};
-
-
-class PathInfoWriter {
-protected:
- DECL_LOGGER("PathExtendIO")
-
-public:
-
- void WritePaths(const PathContainer &paths, const string &filename){
- std::ofstream oss(filename.c_str());
-
- for (auto iter = paths.begin(); iter != paths.end(); ++iter) {
- iter.get()->Print(oss);
- }
-
- oss.close();
- }
-};
-
-}
-
-#endif /* PE_IO_HPP_ */
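
A sketch (not from the deleted sources) of the plain-text layout produced by ContigWriter::WritePaths and parsed back by LoadPaths; the angle-bracket field names are descriptive only, not part of the file.

    <number_of_paths>
    PATH <path_id> <edge_count> <path_length_plus_k>
    <edge_int_id> <edge_length> <gap> <trash_previous> <trash_current>   (one line per edge)
    ... next PATH record ...
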
diff --git a/src/modules/algorithms/path_extend/pe_resolver.hpp b/src/modules/algorithms/path_extend/pe_resolver.hpp
deleted file mode 100644
index bc36993..0000000
--- a/src/modules/algorithms/path_extend/pe_resolver.hpp
+++ /dev/null
@@ -1,523 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pe_resolver.hpp
- *
- * Created on: Mar 12, 2012
- * Author: andrey
- */
-
-#ifndef PE_RESOLVER_HPP_
-#define PE_RESOLVER_HPP_
-
-#include "path_extender.hpp"
-#include "pe_io.hpp"
-
-namespace path_extend {
-
-
-class SimpleOverlapRemover {
-
-public:
- SimpleOverlapRemover(const Graph& g, GraphCoverageMap& cm)
- : g_(g), coverage_map_(cm) {
- }
-
- void RemoveOverlaps(PathContainer& paths) const {
- for (size_t i = 0; i < paths.size(); i++) {
- FindAndRemovePathOverlap(paths, paths.Get(i));
- FindAndRemovePathOverlap(paths, paths.GetConjugate(i));
- }
- }
-
- void CutPseudoSelfConjugatePaths(PathContainer& paths) {
- vector<pair<BidirectionalPath *, BidirectionalPath *>> tmp_paths(paths.begin(), paths.end());
- for(auto it = tmp_paths.begin(); it != tmp_paths.end(); ++it) {
- BidirectionalPath * path1 = it->first;
- BidirectionalPath * path2 = it->second;
- bool ups = false;
- if(path1 != path2) {
- size_t last = 0;
- while(last < path1->Size() && path1->operator [](last) == path2->operator [](last)) {
- last++;
- }
- if(last > 0) {
- AddOverlap(paths, path1, 0, last - 1);
- path1->PopBack(last);
- path2->PopBack(last);
- }
- }
- if(ups) path1->Print();
- }
- }
-
- void RemoveSimilarPaths(PathContainer& paths, size_t min_edge_len, size_t max_path_diff, bool del_only_equal, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
- DEBUG("== Removing similar paths ==");
- DEBUG("Min edge len " << min_edge_len << ", max path diff " << max_path_diff)
- DEBUG("Only equal " << del_only_equal << ", subpaths " << del_subpaths << ", starts " << del_begins << ", all " << del_all << ", add starts " << add_overlap_begins);
- std::vector<EdgeId> edges = GetSortedEdges();
- for (size_t edgeIndex = 0; edgeIndex < edges.size(); ++edgeIndex) {
- EdgeId edge = edges.at(edgeIndex);
- BidirectionalPathSet cov_paths = coverage_map_.GetCoveringPaths(edge);
- std::vector<BidirectionalPath*> cov_vect(cov_paths.begin(), cov_paths.end());
- std::sort(cov_vect.begin(), cov_vect.end(), PathIdCompare);
- for (size_t vect_i = 0; vect_i < cov_vect.size(); ++vect_i) {
- BidirectionalPath* path1 = cov_vect.at(vect_i);
- if (cov_paths.find(path1) == cov_paths.end()) {
- continue;
- }
- for (size_t vect_i1 = vect_i + 1; vect_i1 < cov_vect.size(); ++vect_i1) {
- BidirectionalPath* path2 = cov_vect.at(vect_i1);
- if (path1 == path2 || path1 == path2->GetConjPath()) {
- continue;
- }
- if (cov_paths.find(path2) == cov_paths.end())
- continue;
- if ((*path1) == (*path2)) {
- if (path2->IsOverlap()) {
- path1->SetOverlap(true);
- }
- DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
- path2->Print();
- path1->Print();
- path2->Clear();
- cov_paths = coverage_map_.GetCoveringPaths(edge);
- continue;
- }
- if (g_.length(edge) <= min_edge_len || path1->IsOverlap() || path2->IsOverlap() || del_only_equal) {
- continue;
- }
- CompareAndCut(paths, edge, path1, path2, max_path_diff,
- del_subpaths, del_begins, del_all, add_overlap_begins);
- cov_paths = coverage_map_.GetCoveringPaths(edge);
- }
- }
- }
-        DEBUG("== End removing similar paths ==");
- }
-
-private:
-
- void SubscribeCoverageMap(BidirectionalPath* path) const {
- path->Subscribe(&coverage_map_);
- for (size_t i = 0; i < path->Size(); ++i) {
- coverage_map_.BackEdgeAdded(path->At(i), path, path->GapAt(i));
- }
- }
-
- void CompareAndCut(PathContainer& paths, EdgeId edge, BidirectionalPath* path1, BidirectionalPath* path2,
- size_t max_path_diff,
- bool del_subpaths, bool del_begins,
- bool del_all, bool add_overlap_begins) const {
- vector<size_t> positions1 = path1->FindAll(edge);
- vector<size_t> positions2 = path2->FindAll(edge);
- size_t i1 = 0;
- size_t i2 = 0;
- bool renewed = false;
- while (i1 < positions1.size()) {
- while (i2 < positions2.size()) {
- DEBUG("CompareAndCutFromPos paths " << g_.int_id(edge));
- CompareAndCutFromPos(paths, path1, (int) positions1[i1], path2,
- (int) positions2[i2], max_path_diff,
- del_subpaths, del_begins, del_all, add_overlap_begins);
-
- if (positions1[i1] >= path1->Size() || path1->At(positions1[i1]) != edge || positions2[i2] >= path2->Size() || path2->At(positions2[i2]) != edge) {
- vector<size_t> new_positions1 = path1->FindAll(edge);
- vector<size_t> new_positions2 = path2->FindAll(edge);
-
- if (new_positions1.size() == positions1.size() && new_positions2.size() == positions2.size()) {
- return;
- }
- else {
- positions1 = new_positions1;
- positions2 = new_positions2;
- i1 = 0;
- i2 = 0;
- renewed = true;
- break;
- }
- ++i2;
- }
- ++i2;
- }
-
- if (renewed) {
- renewed = false;
- continue;
- }
- ++i1;
- }
- }
-
- void CompareAndCutFromPos(PathContainer& paths, BidirectionalPath* path1, int pos1,
- BidirectionalPath* path2, int pos2,
- size_t max_path_diff,
- bool delete_subpaths, bool delete_begins,
- bool delete_all, bool add_overlap_begins) const {
- int last2 = pos2;
- int last1 = pos1;
- if (last1 >= (int) path1->Size() || last2 >= (int) path2->Size()) {
- return;
- }
- vector<int> other_path_end;
- pair<int, int> posRes = ComparePaths(last1, last2, *path1, *path2, max_path_diff);
- last1 = posRes.first;
- last2 = posRes.second;
- BidirectionalPath* conj1 = path1->GetConjPath();
- BidirectionalPath* conj2 = path2->GetConjPath();
- size_t first1 = conj1->Size() - pos1 - 1;
- size_t first2 = conj2->Size() - pos2 - 1;
- posRes = ComparePaths(first1, first2, *conj1, *conj2, max_path_diff);
- first2 = conj2->Size() - posRes.second - 1;
- first1 = conj1->Size() - posRes.first - 1;
- if ((int)path2->LengthAt(last2) - (int)g_.length(path2->At(last2)) < (int) max_path_diff) {
- last2 = (int)path2->Size() - 1;
- }
- if ((int)path2->Length() - (int)path2->LengthAt(first2) < (int) max_path_diff) {
- first2 = 0;
- }
- if ((int)path1->LengthAt(last1) - (int)g_.length(path1->At(last1)) < (int) max_path_diff) {
- last1 = (int)path1->Size() - 1;
- }
- if ((int)path1->Length() - (int)path1->LengthAt(first1) < (int) max_path_diff) {
- first1 = 0;
- }
-
- CutOverlaps(paths, path1, first1, last1, path1->Size(), path2,
- first2, last2, path2->Size(), delete_subpaths,
- delete_begins, delete_all, add_overlap_begins);
- }
-
- void AddOverlap(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1) const {
- BidirectionalPath* overlap = new BidirectionalPath(path1->SubPath(first1, last1 + 1));
- BidirectionalPath* conj_overlap = new BidirectionalPath(overlap->Conjugate());
- SubscribeCoverageMap(overlap);
- SubscribeCoverageMap(conj_overlap);
- paths.AddPair(overlap, conj_overlap);
- }
-
- bool CutOverlaps(PathContainer& paths, BidirectionalPath* path1, size_t first1, size_t last1, size_t size1, BidirectionalPath* path2, size_t first2,
- size_t last2, size_t size2, bool del_subpaths, bool del_begins, bool del_all, bool add_overlap_begins) const {
- if (first1 == 0 && last1 == size1 - 1 && del_subpaths) {
- DEBUG("Removing path " << path1->GetId() << " because of path " << path2->GetId());
- path1->Print();
- path2->Print();
- path1->Clear();
- } else if (first2 == 0 && last2 == size2 - 1 && del_subpaths) {
- DEBUG("Removing path " << path2->GetId() << " because of path " << path1->GetId());
- path2->Print();
- path1->Print();
- path2->Clear();
- } else if (first2 == 0 && first1 == 0 && del_begins) {
- DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar starts");
- DEBUG("Path 1: " << last1 << " edges of length " << path1->Length() - path1->LengthAt(min(last1 + 1, path1->Size() - 1)));
- DEBUG("Path 2: " << last2 << " edges of length " << path2->Length() - path2->LengthAt(min(last2 + 1, path2->Size() - 1)));
- DEBUG("Path 1 has overlap start " << path1->HasOverlapedBegin() << ", path 2 has overlap start " << path2->HasOverlapedBegin());
-
- if (add_overlap_begins) {
- AddOverlap(paths, path1, first1, last1);
- DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
- path2->Print();
- path1->Print();
- path1->GetConjPath()->PopBack(last1 + 1);
- path2->GetConjPath()->PopBack(last2 + 1);
- } else if (path1->Length() < path2->Length()) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- path1->Print();
- path2->Print();
- path1->GetConjPath()->PopBack(last1 + 1);
- } else {
- DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
- path2->Print();
- path1->Print();
- path2->GetConjPath()->PopBack(last2 + 1);
- }
- } else if ((last1 == size1 - 1 && last2 == size2 - 1) && del_begins) {
- DEBUG("Path " << path1->GetId() << ", len " << path1->Length() << " and path " << path2->GetId() << ", len " << path2->Length() << " have similar ends");
- DEBUG("Path 1: " << path1->Size() - first1 << " edges of length " << path1->LengthAt(first1));
- DEBUG("Path 2: " << path2->Size() - first2 << " edges of length " << path2->LengthAt(first2));
- DEBUG("Path 1 has overlap end " << path1->HasOverlapedEnd() << ", path 2 has overlap end " << path2->HasOverlapedEnd());
-
- if (add_overlap_begins){
- AddOverlap(paths, path1, first1, last1);
- DEBUG("Detaching overlap " << path2->GetId() << " and " << path1->GetId());
- path2->Print();
- path1->Print();
- path1->PopBack(last1 + 1 - first1);
- path2->PopBack(last2 + 1 - first2);
- }
- if (path1->Length() < path2->Length()) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- path1->Print();
- path2->Print();
- path1->PopBack(last1 + 1 - first1);
- } else {
- DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
- path2->Print();
- path1->Print();
- path2->PopBack(last2 + 1 - first2);
- }
- } else if (first2 == 0 && del_all) {
- DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
- DEBUG("Does it have overlap in the beginning: " << path2->HasOverlapedBegin());
- path2->Print();
- DEBUG(" >>>> ")
- path1->Print();
- DEBUG(" ==== ");
- path2->GetConjPath()->PopBack(last2 + 1);
- } else if (last2 == size2 - 1 && del_all) {
- DEBUG("Detaching overlap from " << path2->GetId() << " because of "<< path1->GetId());
- DEBUG("Does it have overlap in the end: " << path2->HasOverlapedEnd());
- path2->Print();
- DEBUG(" >>>> ")
- path1->Print();
- DEBUG(" ==== ");
- path2->PopBack(last1 + 1 - first1);
- } else if (first1 == 0 && del_all) {
- DEBUG("Detaching overlap from " << path1->GetConjPath()->GetId() << " because of "<< path2->GetId());
- DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
- path1->Print();
- DEBUG(" >>>> ")
- path2->Print();
- DEBUG(" ==== ");
- path1->GetConjPath()->PopBack(last1 + 1);
- } else if (last1 == size1 - 1 && del_all) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- DEBUG("Does it have overlap in the end: " << path1->HasOverlapedBegin());
- path1->Print();
- DEBUG(" >>>> ")
- path2->Print();
- DEBUG(" ==== ");
- path1->PopBack(last1 + 1 - first1);
- } else {
- return false;
- }
- return true;
- }
-
- std::vector<EdgeId> GetSortedEdges() const {
- std::set<EdgeId> edges_set;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- edges_set.insert(*iter);
- edges_set.insert(g_.conjugate(*iter));
- }
- std::vector<EdgeId> edges(edges_set.begin(), edges_set.end());
- std::sort(edges.begin(), edges.end(), EdgeLengthAndIdComparator(g_));
- return edges;
- }
-
- bool HasAlreadyOverlapedEnd(BidirectionalPath * path) const {
- return !path->IsOverlap() and path->HasOverlapedEnd();
- }
-
- bool HasAlreadyOverlapedBegin(BidirectionalPath * path) const {
- return !path->IsOverlap() and path->HasOverlapedBegin();
- }
-
- bool IsSamePath(BidirectionalPath * path1,
- BidirectionalPath * path2) const {
- return *path2 == *path1 or *path2 == *path1->GetConjPath();
- }
-
- void RemoveOverlap(PathContainer& paths, BidirectionalPath* path1,
- BidirectionalPath* path2, size_t overlap_size) const {
- BidirectionalPath* conj2 = path2->GetConjPath();
- if (path1->IsOverlap() && overlap_size == path1->Size()) {
- DEBUG("Detaching overlap from " << path2->GetConjPath()->GetId() << " because of "<< path1->GetId());
- path2->Print();
- path1->Print();
- conj2->PopBack(overlap_size);
- path2->SetOverlapedBeginTo(path1);
- } else if (path2->IsOverlap() && path2->Size() == overlap_size) {
- DEBUG("Detaching overlap from " << path1->GetId() << " because of "<< path2->GetId());
- path1->Print();
- path2->Print();
- path1->PopBack(overlap_size);
- path1->SetOverlapedEndTo(path2);
- } else if (overlap_size < path2->Size()
- && overlap_size < path1->Size()) {
- BidirectionalPath* overlap = new BidirectionalPath(g_, path1->Back());
- BidirectionalPath* conj_overlap = new BidirectionalPath(g_, g_.conjugate(path1->Back()));
- SubscribeCoverageMap(overlap);
- SubscribeCoverageMap(conj_overlap);
- paths.AddPair(overlap, conj_overlap);
- DEBUG("Detaching overlap " << path1->GetId() << " and " << conj2->GetId());
- path1->Print();
- conj2->Print();
- path1->PopBack();
- conj2->PopBack();
-
- for (size_t i = 1; i < overlap_size; ++i) {
- conj_overlap->PushBack(g_.conjugate(path1->Back()));
- path1->PopBack();
- conj2->PopBack();
- }
- overlap->SetOverlap(true);
- path1->SetOverlapedEndTo(overlap);
- path2->SetOverlapedBeginTo(overlap);
- }
- }
-
- void FindAndRemovePathOverlap(PathContainer& all_paths,
- BidirectionalPath* path1) const {
- int last = (int) path1->Size() - 1;
- if (last <= 0 or coverage_map_.GetCoverage(path1->At(last)) <= 1) {
- return;
- }
- BidirectionalPathSet paths =
- coverage_map_.GetCoveringPaths(path1->At(last));
- BidirectionalPath* overlap_path = NULL;
- size_t overlap_size = 0;
- for (auto path_iter = paths.begin(); path_iter != paths.end();
- ++path_iter) {
- if (IsSamePath(*path_iter, path1)) {
- continue;
- }
- size_t over_size = path1->OverlapEndSize(*path_iter);
- if (over_size > overlap_size) {
- overlap_size = over_size;
- overlap_path = *path_iter;
- } else if (over_size == overlap_size &&
- (overlap_path == NULL || (*path_iter)->GetId() < overlap_path->GetId())) {
- overlap_path = *path_iter;
- }
- }
- if (overlap_path == NULL) {
- return;
- }
- if (overlap_size > 0) {
- RemoveOverlap(all_paths, path1, overlap_path, overlap_size);
- }
- }
-
- class EdgeLengthAndIdComparator {
- public:
- EdgeLengthAndIdComparator(const Graph& g)
- : g_(g) {
- }
- bool operator()(const EdgeId& e1, const EdgeId& e2) const {
- if (g_.length(e1) > g_.length(e2)) {
- return true;
- }
- if (g_.length(e2) > g_.length(e1)) {
- return false;
- }
- return e1.int_id() < e2.int_id();
- }
- private:
- const Graph& g_;
- };
-
- const Graph& g_;
- GraphCoverageMap& coverage_map_;
-protected:
- DECL_LOGGER("PEResolver")
-};
-
-class PathExtendResolver {
-
-protected:
- const Graph& g_;
- size_t k_;
-
-public:
- PathExtendResolver(const Graph& g): g_(g), k_(g.k()) {
- }
-
- PathContainer makeSimpleSeeds() {
- std::set<EdgeId> included;
- PathContainer edges;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.int_id(*iter) <= 0 or InTwoEdgeCycle(*iter, g_))
- continue;
- if (included.count(*iter) == 0) {
- BidirectionalPath * first = new BidirectionalPath(g_, *iter);
- BidirectionalPath * second = new BidirectionalPath(g_, g_.conjugate(*iter));
- edges.AddPair(first,second);
- included.insert(*iter);
- included.insert(g_.conjugate(*iter));
- }
- }
- return edges;
- }
-
- PathContainer extendSeeds(PathContainer& seeds, ContigsMaker& pathExtender) {
- PathContainer paths;
- pathExtender.GrowAll(seeds, paths);
- return paths;
- }
-
- void removeEqualPaths(PathContainer& paths, GraphCoverageMap& coverage_map,
- size_t max_overlap) {
-
- SimpleOverlapRemover remover(g_, coverage_map);
- remover.RemoveSimilarPaths(paths, max_overlap, max_overlap, true, false, false, false, false);
- }
-
- void removeOverlaps(PathContainer& paths, GraphCoverageMap& coverage_map,
- size_t min_edge_len, size_t max_path_diff,
- bool add_overlaps_begin,
- bool cut_preudo_self_conjugate) {
- SimpleOverlapRemover remover(g_, coverage_map);
- if (cut_preudo_self_conjugate)
- remover.CutPseudoSelfConjugatePaths(paths);
- //writer.WritePathsToFASTA(paths, output_dir + "/before.fasta");
- //DEBUG("Removing subpaths");
-        //delete not only equal paths,
- remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, false, false, add_overlaps_begin);
- //writer.WritePathsToFASTA(paths, output_dir + "/remove_similar.fasta");
- //DEBUG("Remove overlaps")
- remover.RemoveOverlaps(paths);
- //writer.WritePathsToFASTA(paths, output_dir + "/after_remove_overlaps.fasta");
- remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, true, false, false, false, add_overlaps_begin);
- //writer.WritePathsToFASTA(paths, output_dir + "/remove_equal.fasta");
- //DEBUG("remove similar path. Max difference " << max_overlap);
- remover.RemoveSimilarPaths(paths, min_edge_len, max_path_diff, false, true, true, true, add_overlaps_begin);
- DEBUG("end removing");
- }
-
- void RemoveMatePairEnds(PathContainer& paths, size_t min_edge_len) const {
- DEBUG("remove mp ends");
- for (size_t i = 0; i < paths.size(); ++i) {
- RemoveMatePairEnd(*paths.Get(i), min_edge_len);
- RemoveMatePairEnd(*paths.GetConjugate(i), min_edge_len);
- }
- }
-
- void addUncoveredEdges(PathContainer& paths, GraphCoverageMap& coverageMap) {
- std::set<EdgeId> included;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (included.count(*iter) == 0 && !coverageMap.IsCovered(*iter)) {
- BidirectionalPath* path = new BidirectionalPath(g_, *iter);
- BidirectionalPath* conj = new BidirectionalPath(g_, g_.conjugate(*iter));
- path->Subscribe(&coverageMap);
- conj->Subscribe(&coverageMap);
- coverageMap.BackEdgeAdded(path->At(0), path, path->GapAt(0));
- coverageMap.BackEdgeAdded(conj->At(0), conj, conj->GapAt(0));
- paths.AddPair(path, conj);
- included.insert(*iter);
- included.insert(g_.conjugate(*iter));
- }
- }
- }
-
-private:
- void RemoveMatePairEnd(BidirectionalPath& path, size_t min_edge_len) const {
- int pos = int(path.Size()) - 1;
- while (pos > 0 and g_.length(path.At(pos)) < min_edge_len) {
- path.PopBack();
- pos--;
- }
- }
-protected:
- DECL_LOGGER("PEResolver")
-};
-
-} /* PE_RESOLVER_HPP_ */
-
-#endif
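
A condensed sketch (not from the deleted sources) of the pass order inside PathExtendResolver::removeOverlaps above, spelling out the RemoveSimilarPaths flags (del_only_equal, del_subpaths, del_begins, del_all, add_overlap_begins):

    // 1. CutPseudoSelfConjugatePaths             (only if cut_preudo_self_conjugate)
    // 2. RemoveSimilarPaths(.., false, true,  false, false, add)  // drop covered sub-paths
    // 3. RemoveOverlaps(paths)                                    // trim pairwise end overlaps
    // 4. RemoveSimilarPaths(.., true,  false, false, false, add)  // drop exact duplicates
    // 5. RemoveSimilarPaths(.., false, true,  true,  true,  add)  // final aggressive cleanup
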
diff --git a/src/modules/algorithms/path_extend/pe_utils.hpp b/src/modules/algorithms/path_extend/pe_utils.hpp
deleted file mode 100644
index f061af5..0000000
--- a/src/modules/algorithms/path_extend/pe_utils.hpp
+++ /dev/null
@@ -1,462 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pe_utils.hpp
- *
- * Created on: Nov 27, 2012
- * Author: andrey
- */
-
-#ifndef PE_UTILS_HPP_
-#define PE_UTILS_HPP_
-
-#include "assembly_graph/paths/bidirectional_path.hpp"
-
-using namespace debruijn_graph;
-
-namespace path_extend {
-
-//Checks whether we are in a cycle of length 2, used only for seed selection.
-inline bool InTwoEdgeCycle(EdgeId e, const Graph &g) {
- auto v = g.EdgeEnd(e);
- if (g.OutgoingEdgeCount(v) >= 1) {
- auto edges = g.OutgoingEdges(v);
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- if (g.EdgeStart(e) == g.EdgeEnd(*it)) {
- return true;
- }
- }
- }
- return false;
-}
-
-inline bool InBuble(EdgeId e, const Graph& g) {
- auto edges = g.OutgoingEdges(g.EdgeStart(e));
- auto endVertex = g.EdgeEnd(e);
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- if ((g.EdgeEnd(*it) == endVertex) and (*it != e)) {
- return true;
- }
- }
- return false;
-}
-
-
-// Handles all paths in PathContainer.
-// For each edge, output all paths that traverse it. If a path contains the edge multiple times, count each occurrence. The position of the edge within the path is not reported.
-//TODO: The internal logic is convoluted and should be rewritten.
-//TODO: Memory leaks, inefficient data structure.
-class GraphCoverageMap: public PathListener {
-
-public:
- typedef BidirectionalPathMultiset MapDataT;
-
-
-protected:
- const Graph& g_;
-
- std::map <EdgeId, MapDataT * > edgeCoverage_;
-
- MapDataT * empty_;
-
- virtual void EdgeAdded(EdgeId e, BidirectionalPath * path, Gap /*gap*/) {
- auto iter = edgeCoverage_.find(e);
- if (iter == edgeCoverage_.end()) {
- edgeCoverage_.insert(std::make_pair(e, new MapDataT()));
- }
- edgeCoverage_[e]->insert(path);
- }
-
- virtual void EdgeRemoved(EdgeId e, BidirectionalPath * path) {
- auto iter = edgeCoverage_.find(e);
- if (iter != edgeCoverage_.end()) {
- if (iter->second->count(path) == 0) {
- DEBUG("Error erasing path from coverage map");
- } else {
- auto entry = iter->second->find(path);
- iter->second->erase(entry);
- }
- }
- }
-
-public:
- GraphCoverageMap(const Graph& g) : g_(g), edgeCoverage_() {
- empty_ = new MapDataT();
- }
-
- GraphCoverageMap(const Graph& g, const PathContainer& paths) : g_(g), edgeCoverage_() {
- empty_ = new MapDataT();
- for (size_t i = 0; i < paths.size(); ++i) {
- for (size_t j = 0; j < paths.Get(i)->Size(); ++j) {
- EdgeAdded(paths.Get(i)->At(j), paths.Get(i), paths.Get(i)->GapAt(j));
- }
- for (size_t j = 0; j < paths.GetConjugate(i)->Size(); ++j) {
- EdgeAdded(paths.GetConjugate(i)->At(j), paths.GetConjugate(i), paths.GetConjugate(i)->GapAt(j));
- }
- }
- }
-
- virtual ~GraphCoverageMap() {
- delete empty_;
- for (auto iter = edgeCoverage_.begin(); iter != edgeCoverage_.end(); ++iter) {
- delete iter->second;
- }
- }
-
- void Clear() {
- for (auto iter = edgeCoverage_.begin(); iter != edgeCoverage_.end(); ++iter) {
- MapDataT* cover_paths = iter->second;
- for (auto ipath = cover_paths->begin(); ipath != cover_paths->end(); ++ipath) {
- BidirectionalPath* p = *ipath;
- p->Unsubscribe(this);
- }
- delete cover_paths;
- }
- edgeCoverage_.clear();
- }
-
- void Subscribe(BidirectionalPath * path) {
- path->Subscribe(this);
- for (size_t i = 0; i < path->Size(); ++i) {
- BackEdgeAdded(path->At(i), path, path->GapAt(i));
- }
- }
-
- virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) {
- EdgeAdded(e, path, gap);
- }
-
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) {
- EdgeAdded(e, path, gap);
- }
-
- virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) {
- EdgeRemoved(e, path);
- }
-
- virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) {
- EdgeRemoved(e, path);
- }
-
- MapDataT * GetEdgePaths(EdgeId e) const {
- auto iter = edgeCoverage_.find(e);
- if (iter != edgeCoverage_.end()) {
- return iter->second;
- }
- return empty_;
- }
-
- int GetCoverage(EdgeId e) const {
- return (int) GetEdgePaths(e)->size();
- }
-
- bool IsCovered(EdgeId e) const {
- return GetCoverage(e) > 0;
- }
-
- bool IsCovered(const BidirectionalPath& path) const {
- for (size_t i = 0; i < path.Size(); ++i) {
- if (!IsCovered(path[i])) {
- return false;
- }
- }
- return true;
- }
-
- int GetCoverage(const BidirectionalPath& path) const {
- if (path.Empty()) {
- return 0;
- }
-
- int cov = GetCoverage(path[0]);
- for (size_t i = 1; i < path.Size(); ++i) {
- int currentCov = GetCoverage(path[i]);
- if (cov > currentCov) {
- cov = currentCov;
- }
- }
-
- return cov;
- }
-
- BidirectionalPathSet GetCoveringPaths(EdgeId e) const {
- auto mapData = GetEdgePaths(e);
- return BidirectionalPathSet(mapData->begin(), mapData->end());
- }
-
- int GetUniqueCoverage(EdgeId e) const {
- return (int) GetCoveringPaths(e).size();
- }
-
- std::map <EdgeId, MapDataT * >::const_iterator begin() const {
- return edgeCoverage_.begin();
- }
-
- std::map <EdgeId, MapDataT * >::const_iterator end() const {
- return edgeCoverage_.end();
- }
-
- // DEBUG
-
- void PrintUncovered() const {
- DEBUG("Uncovered edges");
- int s = 0;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (!IsCovered(*iter)) {
- DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << ") ~ " << g_.int_id(g_.conjugate(*iter)) << " (" << g_.length(g_.conjugate(*iter)) << ")");
- s += 1;
- }
- }
- DEBUG("Uncovered edges " << s / 2);
- }
-
- void PrintMulticovered() const {
- DEBUG("Multicovered edges");
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- auto paths = GetCoveringPaths(*iter);
- if (paths.size() > 1 && g_.length(*iter) > 1000) {
- DEBUG(g_.int_id(*iter) << " (" << g_.length(*iter) << "). " << " Covered: " << paths.size());
- for (auto path = paths.begin(); path != paths.end(); ++path) {
- (*path)->Print();
- }
- DEBUG("=====");
- }
- }
- }
-
- size_t size() const {
- return edgeCoverage_.size();
- }
-
- const Graph& graph() const {
- return g_;
- }
-
-private:
- GraphCoverageMap(const GraphCoverageMap& t) : g_(t.g_), empty_(t.empty_) {}
-};
-
-inline bool GetLoopAndExit(const Graph& g, EdgeId e, pair<EdgeId, EdgeId>& result) {
- VertexId v = g.EdgeEnd(e);
- VertexId start = g.EdgeStart(e);
- if (g.OutgoingEdgeCount(v) != 2 || g.IncomingEdgeCount(v) != 1 || g.OutgoingEdgeCount(start) != 1 || g.IncomingEdgeCount(start) != 2) {
- return false;
- }
- EdgeId loop;
- EdgeId exit;
- bool loop_found = false;
- bool exit_found = false;
- auto edges = g.OutgoingEdges(v);
- for (auto edge = edges.begin(); edge != edges.end(); ++edge) {
- if (g.EdgeEnd(*edge) == g.EdgeStart(e) && *edge != e) {
- loop = *edge;
- loop_found = true;
- } else if (*edge != e) {
- exit = *edge;
- exit_found = true;
- }
- }
- result = make_pair(loop, exit);
- return exit_found && loop_found;
-}
-
-class LoopDetector {
-public:
- LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map);
- size_t LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const;
- size_t LoopLength(size_t skip_identical_edges, size_t min_cycle_appearences) const;
- bool PathIsLoop(size_t edges) const;
- size_t LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const;
- size_t LastLoopCount(size_t edges) const;
- bool IsCycled(size_t loopLimit, size_t& skip_identical_edges) const;
- size_t EdgesToRemove(size_t skip_identical_edges, bool fullRemoval = false) const;
- void RemoveLoop(size_t skip_identical_edges, bool fullRemoval = true);
- bool EdgeInShortLoop(EdgeId e) const;
- bool PrevEdgeInShortLoop() const;
-private:
- BidirectionalPath* path_;
- const GraphCoverageMap& cov_map_;
- DECL_LOGGER("BidirectionalPath");
-};
-
-inline LoopDetector::LoopDetector(BidirectionalPath* p, const GraphCoverageMap& cov_map)
- : path_(p),
- cov_map_(cov_map) {
-}
-
-inline size_t LoopDetector::LoopEdges(size_t skip_identical_edges, size_t min_cycle_appearences) const {
- if (path_->Size() == 0) {
- return 0;
- }
- EdgeId e = path_->Back();
- size_t count = cov_map_.GetEdgePaths(e)->count(path_);
- if (count <= 1 || count < min_cycle_appearences * (skip_identical_edges + 1)) {
- return 0;
- }
- vector<size_t> edge_positions = path_->FindAll(e);
- VERIFY(edge_positions.size() == count);
- VERIFY(edge_positions.size() >= skip_identical_edges);
- size_t loopSize = edge_positions.back() - edge_positions[edge_positions.size() - 1 - (skip_identical_edges + 1)];
- return loopSize;
-}
-
-inline bool LoopDetector::PathIsLoop(size_t edges) const {
- if (edges == 0 || path_->Size() <= 1)
- return false;
-
- for (size_t i = 0; i < edges; ++i) {
- EdgeId e = path_->At(i);
- for (int j = (int) path_->Size() - ((int) edges - (int) i); j >= 0; j -= (int) edges) {
- if (path_->operator [](j) != e) {
- return false;
- }
- }
- }
- return true;
-}
-
-inline size_t LoopDetector::LastLoopCount(size_t skip_identical_edges, size_t min_cycle_appearences) const {
- size_t edges = LoopEdges(skip_identical_edges, min_cycle_appearences);
- return LastLoopCount(edges);
-}
-
-inline size_t LoopDetector::LastLoopCount(size_t edges) const {
- if (edges == 0) {
- return 0;
- }
-
- BidirectionalPath loop = path_->SubPath(path_->Size() - edges);
- size_t count = 0;
- int i = (int) path_->Size() - (int) edges;
- int delta = -(int) edges;
-
- while (i >= 0) {
- if (!path_->CompareFrom(i, loop)) {
- break;
- }
- ++count;
- i += delta;
- }
-
- return count;
-}
-
-inline bool LoopDetector::IsCycled(size_t loopLimit, size_t& skip_identical_edges) const {
- if (path_->Size() == 0 or cov_map_.GetEdgePaths(path_->Back())->count(path_) < loopLimit) {
- return false;
- }
- skip_identical_edges = 0;
- size_t loop_count = LastLoopCount(skip_identical_edges, loopLimit);
- while (loop_count > 0) {
- if (loop_count >= loopLimit) {
- return true;
- }
- loop_count = LastLoopCount(++skip_identical_edges, loopLimit);
- }
- return false;
-}
-
-inline size_t LoopDetector::EdgesToRemove(size_t skip_identical_edges, bool fullRemoval) const {
- size_t edges = LoopEdges(skip_identical_edges, 1);
- size_t count = LastLoopCount(edges);
- bool onlyCycle = PathIsLoop(edges);
- int result;
-
- if (onlyCycle || path_->Size() <= count * edges) {
- result = (int) path_->Size() - (int) edges;
- } else if (fullRemoval) {
- result = (int) count * (int) edges;
- } else {
- result = (int) (count - 1) * (int) edges;
- }
-
- return result < 0 ? 0 : result;
-}
-
-inline void LoopDetector::RemoveLoop(size_t skip_identical_edges, bool fullRemoval) {
- size_t toRemove = EdgesToRemove(skip_identical_edges, fullRemoval);
- for (size_t i = 0; i < toRemove; ++i) {
- path_->PopBack();
- }
-}
-
-inline bool LoopDetector::EdgeInShortLoop(EdgeId e) const {
- pair<EdgeId, EdgeId> temp;
- return GetLoopAndExit(path_->graph(), e, temp);
-}
-
-inline bool LoopDetector::PrevEdgeInShortLoop() const {
- if (path_->Size() <= 2) {
- return false;
- }
- const Graph& g = path_->graph();
- EdgeId e2 = path_->At(path_->Size() - 1);
- EdgeId e1 = path_->At(path_->Size() - 2);
- VertexId v2 = g.EdgeEnd(e1);
- if (g.OutgoingEdgeCount(v2) == 2 && g.EdgeEnd(e2) == g.EdgeStart(e1) && g.EdgeEnd(e1) == g.EdgeStart(e2)) {
- return EdgeInShortLoop(e1);
- }
- return false;
-}
-
-class ScaffoldBreaker {
-private:
-
- int min_gap_;
-
- PathContainer container_;
-
- void SplitPath(const BidirectionalPath& path) {
- size_t i = 0;
-
- while (i < path.Size()) {
- BidirectionalPath * p = new BidirectionalPath(path.graph(), path[i]);
- ++i;
-
- while (i < path.Size() and path.GapAt(i) <= min_gap_) {
- p->PushBack(path[i], path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
- ++i;
- }
-
- if (i < path.Size()) {
- DEBUG("split path " << i << " gap " << path.GapAt(i));
- p->Print();
- }
-
- BidirectionalPath * cp = new BidirectionalPath(p->Conjugate());
- container_.AddPair(p, cp);
- }
- }
-
-public:
-
- ScaffoldBreaker(int min_gap, const PathContainer &paths)
- : min_gap_(min_gap) {
- for (auto it = paths.begin(); it != paths.end(); ++it) {
- SplitPath(*it.get());
- }
- }
-
- ~ScaffoldBreaker() {
- // FIXME: WTF, Why doesn't PathContainer own the paths?
- container_.DeleteAllPaths();
- }
-
- void clear() {
- container_.clear();
- }
-
- PathContainer& container() {
- return container_;
- }
-
-};
-
-}
-
-#endif /* PE_UTILS_HPP_ */
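The GraphCoverageMap above is essentially a path listener that keeps, for every edge, the multiset of paths containing it. The following simplified, standalone illustration (standard-library types only; Path here is an opaque stand-in, not the SPAdes class) shows that idea:

    #include <cstddef>
    #include <map>
    #include <set>

    using EdgeId = int;
    struct Path;  // opaque; only its address is used as a key

    class SimpleCoverageMap {
        std::map<EdgeId, std::multiset<const Path*>> cover_;
    public:
        // Called when a path gains an edge (front or back).
        void EdgeAdded(EdgeId e, const Path* p) { cover_[e].insert(p); }

        // Called when a path loses an edge; erases a single occurrence only,
        // so a path traversing the edge twice stays counted for the other occurrence.
        void EdgeRemoved(EdgeId e, const Path* p) {
            auto it = cover_.find(e);
            if (it == cover_.end()) return;
            auto entry = it->second.find(p);
            if (entry != it->second.end()) it->second.erase(entry);
        }

        std::size_t GetCoverage(EdgeId e) const {
            auto it = cover_.find(e);
            return it == cover_.end() ? 0 : it->second.size();
        }
        bool IsCovered(EdgeId e) const { return GetCoverage(e) > 0; }
    };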
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.cpp b/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.cpp
deleted file mode 100644
index 14ba367..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-#include "connection_condition2015.hpp"
-namespace path_extend {
-
-PairedLibConnectionCondition::PairedLibConnectionCondition(const debruijn_graph::Graph &graph,
- shared_ptr <PairedInfoLibrary> lib,
- size_t lib_index,
- size_t min_read_count) :
- graph_(graph),
- lib_(lib),
- lib_index_(lib_index),
- min_read_count_(min_read_count),
-//TODO reconsider condition
- left_dist_delta_(5 * (int) lib_->GetISMax()),
- right_dist_delta_(max(5 * (int) lib_->GetIsVar(), int(lib_->is_))) {
-}
-
-size_t PairedLibConnectionCondition::GetLibIndex() const {
- return lib_index_;
-}
-
-set <debruijn_graph::EdgeId> PairedLibConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
- set <debruijn_graph::EdgeId> all_edges;
- int e_length = (int) graph_.length(e);
- lib_->FindJumpEdges(e, all_edges, e_length - left_dist_delta_, e_length + right_dist_delta_);
-
- set <debruijn_graph::EdgeId> result;
- for (auto edge : all_edges) {
- if (edge != e && edge != graph_.conjugate(e) &&
- math::ge(GetWeight(e, edge), (double) min_read_count_)) {
- result.insert(edge);
- }
- }
- return result;
-}
-
-double PairedLibConnectionCondition::GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
- int e_length = (int) graph_.length(e1);
- return lib_->CountPairedInfo(e1, e2, e_length - left_dist_delta_, e_length + right_dist_delta_);
-}
-
-AdvancedPairedConnectionCondition::AdvancedPairedConnectionCondition(const debruijn_graph::Graph &graph,
- shared_ptr <PairedInfoLibrary> lib,
- size_t lib_index,
- size_t always_add,
- size_t never_add,
- double relative_threshold):
- PairedLibConnectionCondition(graph, lib, lib_index, never_add),
- always_add_(always_add),
- never_add_(never_add),
- relative_threshold_(relative_threshold) {}
-
-set <debruijn_graph::EdgeId> AdvancedPairedConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
- set <debruijn_graph::EdgeId> all_edges;
- int e_length = (int) graph_.length(e);
- lib_->FindJumpEdges(e, all_edges, e_length - left_dist_delta_, e_length + right_dist_delta_);
-
- double max_weight = 0;
- for (auto edge : all_edges) {
- if (edge != e && edge != graph_.conjugate(e)) {
- double w = GetWeight(e, edge);
- if (math::gr(w, max_weight))
- max_weight = w;
- }
- }
- double threshold = std::max((double) never_add_, std::min((double) always_add_, max_weight * relative_threshold_));
-
- set <debruijn_graph::EdgeId> result;
- for (auto edge : all_edges) {
- if (edge != e && edge != graph_.conjugate(e) &&
- math::ge(GetWeight(e, edge), threshold)) {
- result.insert(edge);
- }
- }
- return result;
-}
-
-
-//TODO: We use the same part of the index twice, is it necessary?
-int PairedLibConnectionCondition::GetMedianGap(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const {
- std::vector<int> distances;
- std::vector<double> weights;
- int e_length = (int) graph_.length(e1);
- lib_->CountDistances(e1, e2, distances, weights);
-    std::vector<pair<int, double> > h;
-    h.reserve(distances.size());
-    for (size_t i = 0; i < distances.size(); i++) {
- if (distances[i] >= e_length - left_dist_delta_ && distances[i] <= e_length + right_dist_delta_)
- h.push_back(std::make_pair(distances[i], weights[i]));
- }
-//TODO: is it really necessary?
- std::sort(h.begin(), h.end());
- double sum = 0.0;
- double sum2 = 0.0;
- for (size_t j = 0; j< h.size(); ++j) {
- sum += h[j].second;
- }
- size_t i = 0;
- for (; i < h.size(); ++i) {
- sum2 += h[i].second;
- if (sum2 * 2 > sum)
- break;
- }
- return (int) round(h[i].first - e_length);
-}
-
-AssemblyGraphConnectionCondition::AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g,
- size_t max_connection_length, const ScaffoldingUniqueEdgeStorage & unique_edges) :
- g_(g), max_connection_length_(max_connection_length), interesting_edge_set_(unique_edges.GetSet()), stored_distances_() {
-}
-
-set <debruijn_graph::EdgeId> AssemblyGraphConnectionCondition::ConnectedWith(debruijn_graph::EdgeId e) const {
- VERIFY_MSG(interesting_edge_set_.find(e)!= interesting_edge_set_.end(), " edge "<< e.int_id() << " not applicable for connection condition");
- if (stored_distances_.find(e) != stored_distances_.end()) {
- return stored_distances_[e];
- }
- stored_distances_.insert(make_pair(e, set<debruijn_graph::EdgeId>()));
- for (auto connected: g_.OutgoingEdges(g_.EdgeEnd(e))) {
- if (interesting_edge_set_.find(connected) != interesting_edge_set_.end()) {
- stored_distances_[e].insert(connected);
- }
- }
- DijkstraHelper<debruijn_graph::Graph>::BoundedDijkstra dijkstra(
- DijkstraHelper<debruijn_graph::Graph>::CreateBoundedDijkstra(g_, max_connection_length_));
- dijkstra.Run(g_.EdgeEnd(e));
- for (auto v: dijkstra.ReachedVertices()) {
- for (auto connected: g_.OutgoingEdges(v)) {
- if (interesting_edge_set_.find(connected) != interesting_edge_set_.end() && dijkstra.GetDistance(v) < max_connection_length_) {
- stored_distances_[e].insert(connected);
- }
- }
- }
- return stored_distances_[e];
-}
-void AssemblyGraphConnectionCondition::AddInterestingEdge(debruijn_graph::EdgeId e) {
- interesting_edge_set_.insert(e);
-}
-double AssemblyGraphConnectionCondition::GetWeight(debruijn_graph::EdgeId, debruijn_graph::EdgeId) const {
- return 1.0;
-}
-
-size_t AssemblyGraphConnectionCondition::GetLibIndex() const {
- return (size_t) - 1;
-}
-
-}
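GetMedianGap above boils down to a weighted median over the distances that fall into the insert-size window, with the edge length subtracted at the end. The following standalone sketch (function name and types are illustrative, not SPAdes API) shows just the weighted-median step:

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Returns the smallest distance at which the accumulated weight
    // exceeds half of the total weight (0 for empty input).
    int WeightedMedian(std::vector<std::pair<int, double>> h) {
        if (h.empty()) return 0;
        std::sort(h.begin(), h.end());
        double total = 0.0;
        for (const auto& p : h) total += p.second;
        double acc = 0.0;
        for (const auto& p : h) {
            acc += p.second;
            if (acc * 2 > total) return p.first;
        }
        return h.back().first;  // reached only if all weights are zero
    }

    // Example: distances {100, 150, 400} with weights {1.0, 3.0, 1.0}
    // give a weighted median of 150.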
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.hpp b/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.hpp
deleted file mode 100644
index 0cfe58e..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/connection_condition2015.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-
-#ifndef CONNECTION_CONDITION2015_HPP
-#define CONNECTION_CONDITION2015_HPP
-#include "algorithms/genome_consistance_checker.hpp"
-#include "dev_support/logger/logger.hpp"
-#include "algorithms/path_extend/paired_library.hpp"
-#include "assembly_graph/graph_support/scaff_supplementary.hpp"
-#include <map>
-#include <set>
-
-namespace path_extend {
-
-/* Connection conditions are used by both the scaffolder's extension chooser and the scaffold graph */
-
-class ConnectionCondition {
-public:
-// Returns the edges that e is connected with.
-//TODO performance issue: think about filtering inside. Return only unique connected edges?
- virtual set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const = 0;
-// Outputs the weight of the pair e1 and e2
- virtual double GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const = 0;
- virtual size_t GetLibIndex() const = 0;
- virtual ~ConnectionCondition() {
- }
-};
-
-// Main (mate pair library) connection condition.
-class PairedLibConnectionCondition : public ConnectionCondition {
-protected:
- const debruijn_graph::Graph &graph_;
- shared_ptr <PairedInfoLibrary> lib_;
- size_t lib_index_;
-//Minimal number of mate pairs required to consider a connection sound
- size_t min_read_count_;
-public:
-//Only paired info with a gap between e1 and e2 in the range [-left_dist_delta_, right_dist_delta_] is taken into account
- int left_dist_delta_;
- int right_dist_delta_;
-
- PairedLibConnectionCondition(const debruijn_graph::Graph &graph,
- shared_ptr <PairedInfoLibrary> lib,
- size_t lib_index,
- size_t min_read_count);
- size_t GetLibIndex() const override;
- set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
- double GetWeight(debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const override;
-//Returns median gap size
- int GetMedianGap (debruijn_graph::EdgeId e1, debruijn_graph::EdgeId e2) const;
-};
-
-//Advanced mate-pair connection condition
-class AdvancedPairedConnectionCondition: public PairedLibConnectionCondition {
-protected:
- size_t always_add_;
- size_t never_add_;
- double relative_threshold_;
-
-public:
- AdvancedPairedConnectionCondition(const debruijn_graph::Graph &graph,
- shared_ptr <PairedInfoLibrary> lib,
- size_t lib_index,
- size_t always_add,
- size_t never_add,
- double relative_threshold);
-
- set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
-
-};
-
-/* Condition used to find edges that are connected in the assembly graph. */
-class AssemblyGraphConnectionCondition : public ConnectionCondition {
-protected:
- const debruijn_graph::Graph &g_;
-//Maximal graph distance allowed for a connection.
- size_t max_connection_length_;
- set<EdgeId> interesting_edge_set_;
- mutable map <debruijn_graph::Graph::EdgeId, set<debruijn_graph::Graph::EdgeId>> stored_distances_;
-public:
- AssemblyGraphConnectionCondition(const debruijn_graph::Graph &g, size_t max_connection_length,
- const ScaffoldingUniqueEdgeStorage& unique_edges);
- void AddInterestingEdge(debruijn_graph::EdgeId e);
- set <debruijn_graph::EdgeId> ConnectedWith(debruijn_graph::EdgeId e) const override;
- double GetWeight(debruijn_graph::EdgeId, debruijn_graph::EdgeId) const override;
- size_t GetLibIndex() const override;
-};
-}
-
-#endif //PROJECT_CONNECTION_CONDITION2015_HPP
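To make the interface declared above concrete, here is a toy, self-contained implementation (not part of SPAdes; the adjacency map stands in for a paired-read library): ConnectedWith enumerates supported candidate edges and GetWeight reports the support for a particular pair.

    #include <cstddef>
    #include <map>
    #include <set>

    using EdgeId = int;

    struct ToyConnectionCondition {
        // e1 -> (e2 -> weight); plays the role of a paired-info library.
        std::map<EdgeId, std::map<EdgeId, double>> support;
        std::size_t lib_index = 0;

        std::set<EdgeId> ConnectedWith(EdgeId e) const {
            std::set<EdgeId> result;
            auto it = support.find(e);
            if (it == support.end()) return result;
            for (const auto& kv : it->second)
                if (kv.second > 0.0) result.insert(kv.first);
            return result;
        }

        double GetWeight(EdgeId e1, EdgeId e2) const {
            auto it = support.find(e1);
            if (it == support.end()) return 0.0;
            auto jt = it->second.find(e2);
            return jt == it->second.end() ? 0.0 : jt->second;
        }

        std::size_t GetLibIndex() const { return lib_index; }
    };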
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.cpp b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.cpp
deleted file mode 100644
index 1e2af32..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-//
-// Created by lab42 on 8/26/15.
-//
-
-#include "extension_chooser2015.hpp"
-
-namespace path_extend {
-using namespace std;
-
-std::pair<EdgeId, int> ExtensionChooser2015::FindLastUniqueInPath(const BidirectionalPath& path) const {
- for (int i = (int)path.Size() - 1; i >= 0; --i) {
- if (unique_edges_.IsUnique(path.At(i))) {
- return std::make_pair(path.At(i), i);
- }
- }
- return std::make_pair(EdgeId(0), -1);
-}
-
-ExtensionChooser::EdgeContainer ExtensionChooser2015::FindNextUniqueEdge(const EdgeId from) const {
- VERIFY(unique_edges_.IsUnique(from));
- EdgeContainer result;
- set<EdgeId> candidate_edges = paired_connection_condition_.ConnectedWith(from);
- vector<pair<double, pair<EdgeId, int >>> to_sort;
- for (EdgeId e : candidate_edges) {
- if (!unique_edges_.IsUnique(e)) {
- continue;
- }
- double sum = paired_connection_condition_.GetWeight(from, e);
- DEBUG("edge " << g_.int_id(e) << " weight " << sum);
- if (sum < absolute_weight_threshold_) {
- DEBUG("Edge " << g_.int_id(e) << " weight " << sum << " failed absolute weight threshold " << absolute_weight_threshold_);
- continue;
- }
- int gap = paired_connection_condition_.GetMedianGap(from, e);
-
- auto connected_with = graph_connection_condition_.ConnectedWith(from);
- if (connected_with.find(e) != connected_with.end()) {
- sum *= graph_connection_bonus_;
- }
- to_sort.push_back(make_pair(sum, make_pair(e, gap)));
- }
-//descending order, reverse iterators;
- sort(to_sort.rbegin(), to_sort.rend());
- for(size_t j = 0; j < to_sort.size(); j++) {
- if (j == 0 || to_sort[j].first* relative_weight_threshold_ > to_sort[j - 1].first) {
- result.push_back(EdgeWithDistance(to_sort[j].second.first, to_sort[j].second.second));
- DEBUG("Edge " << g_.int_id(to_sort[j].second.first) << " gap " << to_sort[j].second.second << " weight "<< to_sort[j].first << " passed absolute weight threshold " << absolute_weight_threshold_);
- } else {
- DEBUG ("Edge " << g_.int_id(to_sort[j].second.first) << " weight " << to_sort[j].first << " failed relative weight threshold " << relative_weight_threshold_);
- DEBUG("other removed");
- break;
- }
- }
- return result;
-}
-
-ExtensionChooser::EdgeContainer ExtensionChooser2015::Filter(const BidirectionalPath& path, const ExtensionChooser::EdgeContainer& /*edges*/) const {
-// set<EdgeId> candidates = FindCandidates(path);
- pair<EdgeId, int> last_unique = FindLastUniqueInPath(path);
- EdgeContainer result;
-
- if (last_unique.second < 0) {
-// No unique edge found
- return result;
- }
-
- result = FindNextUniqueEdge(last_unique.first);
-//Backward check. We connect edges iff they are the best continuations of each other.
- if (result.size() == 1) {
-        //We should reduce the gap size by the length of the edges that come after the last unique edge.
- result[0].d_ -= int (path.LengthAt(last_unique.second) - g_.length(last_unique.first));
-
- DEBUG("For edge " << g_.int_id(last_unique.first) << " unique next edge "<< result[0].e_ <<" found, doing backwards check ");
- EdgeContainer backwards_check = FindNextUniqueEdge(g_.conjugate(result[0].e_));
- if ((backwards_check.size() != 1) || (g_.conjugate(backwards_check[0].e_) != last_unique.first)) {
- result.clear();
- }
- }
- return result;
-}
-
-}
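The core of the backwards check in ExtensionChooser2015::Filter above is a mutual-best test: an extension from the last unique edge to a candidate is kept only if extending from the candidate's conjugate uniquely points back. A standalone sketch follows (the callables are hypothetical stand-ins for FindNextUniqueEdge and conjugation, not SPAdes API):

    #include <functional>
    #include <vector>

    using EdgeId = int;

    bool MutualBestExtension(EdgeId last_unique, EdgeId candidate,
                             const std::function<std::vector<EdgeId>(EdgeId)>& next_unique,
                             const std::function<EdgeId(EdgeId)>& conjugate) {
        // Extend backwards from the conjugate of the candidate ...
        std::vector<EdgeId> backwards = next_unique(conjugate(candidate));
        // ... and accept only if it uniquely points back at the last unique edge.
        return backwards.size() == 1 && conjugate(backwards[0]) == last_unique;
    }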
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp b/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp
deleted file mode 100644
index f4ba49c..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/extension_chooser2015.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//
-// Created by lab42 on 8/26/15.
-//
-#pragma once
-
-#include "algorithms/path_extend/extension_chooser.hpp"
-#include "connection_condition2015.hpp"
-#include "algorithms/genome_consistance_checker.hpp"
-#include "dev_support/logger/logger.hpp"
-#include <map>
-#include <set>
-namespace path_extend {
-class ExtensionChooser2015: public ScaffoldingExtensionChooser {
-private:
- const ScaffoldingUniqueEdgeStorage& unique_edges_;
-// for possible connections e1 and e2: if weight(e1) > relative_weight_threshold_ * weight(e2), then e2 will be ignored
- double relative_weight_threshold_;
- PairedLibConnectionCondition paired_connection_condition_;
- AssemblyGraphConnectionCondition graph_connection_condition_;
-// weight < absolute_weight_threshold_ will be ignored
- size_t absolute_weight_threshold_;
-// multiplier for pairs which are also connected in the assembly graph.
- double graph_connection_bonus_;
-
-protected:
-//If path contains no unique edges return -1
- pair<EdgeId, int> FindLastUniqueInPath(const BidirectionalPath& path) const;
-//Find all possible next unique edges confirmed with mate-pair information. (absolute/relative)_weight_threshold_ used for filtering
- EdgeContainer FindNextUniqueEdge(const EdgeId from) const;
- DECL_LOGGER("ExtensionChooser2015")
-public:
- ExtensionChooser2015(const Graph& g,
- shared_ptr<WeightCounter> wc,
- size_t lib_index,
- const ScaffoldingUniqueEdgeStorage& unique_edges,
- double cl_weight_threshold,
- double is_scatter_coeff,
- double relative_threshold):
- //TODO: constants are subject to reconsider
- ScaffoldingExtensionChooser(g, wc, cl_weight_threshold, is_scatter_coeff),
- unique_edges_(unique_edges),
- relative_weight_threshold_(relative_threshold),
- paired_connection_condition_(g, wc->get_libptr(), lib_index, 0),
- graph_connection_condition_(g, 2 * unique_edges_.GetMinLength(), unique_edges),
-                    //TODO: move to config!
- absolute_weight_threshold_(2),
- graph_connection_bonus_(2) {
- INFO("ExtensionChooser2015 created");
- }
-/* @param edges is not actually used and is kept for compatibility
- * @returns the possible next edge if there is a unique one, otherwise an empty container
- */
-
- EdgeContainer Filter(const BidirectionalPath& path, const EdgeContainer& edges) const override;
-};
-
-
-}
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.cpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.cpp
deleted file mode 100644
index 7e3312a..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.cpp
+++ /dev/null
@@ -1,275 +0,0 @@
-#include "scaffold_graph.hpp"
-
-
-namespace path_extend {
-namespace scaffold_graph {
-
-std::atomic<ScaffoldGraph::ScaffoldEdgeIdT> ScaffoldGraph::ScaffoldEdge::scaffold_edge_id_{0};
-
-void ScaffoldGraph::AddEdgeSimple(const ScaffoldGraph::ScaffoldEdge &e, size_t conjugate_id) {
- edges_.emplace(e.getId(), e);
- outgoing_edges_.emplace(e.getStart(), e.getId());
- incoming_edges_.emplace(e.getEnd(), e.getId());
- conjugate_[e.getId()] = conjugate_id;
-}
-
-void ScaffoldGraph::DeleteOutgoing(const ScaffoldGraph::ScaffoldEdge &e) {
- auto e_range = outgoing_edges_.equal_range(e.getStart());
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- if (edges_.at(edge_id->second) == e) {
- outgoing_edges_.erase(edge_id);
- }
- }
-}
-
-void ScaffoldGraph::DeleteIncoming(const ScaffoldGraph::ScaffoldEdge &e) {
- auto e_range = incoming_edges_.equal_range(e.getEnd());
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- if (edges_.at(edge_id->second) == e) {
- incoming_edges_.erase(edge_id);
- }
- }
-}
-
-void ScaffoldGraph::DeleteAllOutgoingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
- auto e_range = outgoing_edges_.equal_range(v);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- DeleteIncoming(edges_.at(edge_id->second));
- }
- outgoing_edges_.erase(v);
-}
-
-void ScaffoldGraph::DeleteEdgeFromStorage(const ScaffoldGraph::ScaffoldEdge &e) {
- VERIFY(!Exists(e));
-
- size_t conjugate_id = conjugate_[e.getId()];
- edges_.erase(e.getId());
- edges_.erase(conjugate_id);
- conjugate_.erase(e.getId());
- conjugate_.erase(conjugate_id);
-}
-
-void ScaffoldGraph::DeleteAllIncomingEdgesSimple(ScaffoldGraph::ScaffoldVertex v) {
- auto e_range = incoming_edges_.equal_range(v);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- DeleteOutgoing(edges_.at(edge_id->second));
- }
- incoming_edges_.erase(v);
-}
-
-bool ScaffoldGraph::Exists(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return vertices_.count(assembly_graph_edge) != 0;
-}
-
-bool ScaffoldGraph::Exists(const ScaffoldGraph::ScaffoldEdge &e) const {
- auto e_range = outgoing_edges_.equal_range(e.getStart());
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- if (edges_.at(edge_id->second) == e) {
- return true;
- }
- }
- return false;
-}
-
-ScaffoldGraph::ScaffoldVertex ScaffoldGraph::conjugate(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return assembly_graph_.conjugate(assembly_graph_edge);
-}
-
-ScaffoldGraph::ScaffoldEdge ScaffoldGraph::conjugate(const ScaffoldGraph::ScaffoldEdge &e) const {
- auto iter = conjugate_.find(e.getId());
- if (iter != conjugate_.end()) {
- return edges_.at(iter->second);
- }
- return ScaffoldEdge(conjugate(e.getEnd()), conjugate(e.getStart()), e.getColor(), e.getWeight());
-}
-
-bool ScaffoldGraph::AddVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
- if (!Exists(assembly_graph_edge)) {
- VERIFY(!Exists(conjugate(assembly_graph_edge)));
- vertices_.insert(assembly_graph_edge);
- vertices_.insert(conjugate(assembly_graph_edge));
- return true;
- }
- return false;
-}
-
-void ScaffoldGraph::AddVertices(const set<ScaffoldGraph::ScaffoldVertex> &vertices) {
- for (auto v : vertices) {
- AddVertex(v);
- }
-}
-
-bool ScaffoldGraph::AddEdge(ScaffoldGraph::ScaffoldVertex v1, ScaffoldGraph::ScaffoldVertex v2, size_t lib_id, double weight) {
- VERIFY(Exists(v1));
- VERIFY(Exists(v2));
-
- ScaffoldEdge e(v1, v2, lib_id, weight);
- if (Exists(e)) {
- VERIFY(Exists(conjugate(e)));
- return false;
- }
-
- auto conj = conjugate(e);
- AddEdgeSimple(e, conj.getId());
- AddEdgeSimple(conj, e.getId());
- return true;
-}
-
-void ScaffoldGraph::Print(ostream &os) const {
- for (auto v: vertices_) {
- os << "Vertex " << int_id(v) << " ~ " << int_id(conjugate(v))
- << ": len = " << assembly_graph_.length(v) << ", cov = " << assembly_graph_.coverage(v) << endl;
- }
- for (auto e_iter = edges_.begin(); e_iter != edges_.end(); ++e_iter) {
- os << "Edge " << e_iter->second.getId() << " ~ " << conjugate(e_iter->second).getId() <<
- ": " << int_id(e_iter->second.getStart()) << " -> " << int_id(e_iter->second.getEnd()) <<
- ", lib index = " << e_iter->second.getColor() << ", weight " << e_iter->second.getWeight() << endl;
- }
-}
-
-ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- VERIFY(HasUniqueIncoming(assembly_graph_edge));
- return edges_.at(incoming_edges_.find(assembly_graph_edge)->second);
-}
-
-ScaffoldGraph::ScaffoldEdge ScaffoldGraph::UniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- VERIFY(HasUniqueOutgoing(assembly_graph_edge));
- return edges_.at(outgoing_edges_.find(assembly_graph_edge)->second);
-}
-
-bool ScaffoldGraph::HasUniqueIncoming(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return IncomingEdgeCount(assembly_graph_edge) == 1;
-}
-
-bool ScaffoldGraph::HasUniqueOutgoing(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return OutgoingEdgeCount(assembly_graph_edge) == 1;
-}
-
-size_t ScaffoldGraph::IncomingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return incoming_edges_.count(assembly_graph_edge);
-}
-
-size_t ScaffoldGraph::OutgoingEdgeCount(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- return outgoing_edges_.count(assembly_graph_edge);
-}
-
-vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::IncomingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- vector<ScaffoldEdge> result;
- auto e_range = incoming_edges_.equal_range(assembly_graph_edge);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- result.push_back(edges_.at(edge_id->second));
- }
- return result;
-}
-
-vector<ScaffoldGraph::ScaffoldEdge> ScaffoldGraph::OutgoingEdges(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
- vector<ScaffoldEdge> result;
- auto e_range = outgoing_edges_.equal_range(assembly_graph_edge);
- for (auto edge_id = e_range.first; edge_id != e_range.second; ++edge_id) {
- result.push_back(edges_.at(edge_id->second));
- }
- return result;
-}
-
-const debruijn_graph::Graph &ScaffoldGraph::AssemblyGraph() const {
- return assembly_graph_;
-}
-
-size_t ScaffoldGraph::EdgeCount() const {
- return edges_.size();
-}
-
-size_t ScaffoldGraph::VertexCount() const {
- return vertices_.size();
-}
-
-ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeEnd(ScaffoldEdge e) const {
- return e.getEnd();
-}
-
-ScaffoldGraph::ScaffoldVertex ScaffoldGraph::EdgeStart(ScaffoldEdge e) const {
- return e.getStart();
-}
-
-size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldEdge e) const {
- return e.getId();
-}
-
-size_t ScaffoldGraph::int_id(ScaffoldGraph::ScaffoldVertex v) const {
- return assembly_graph_.int_id(v);
-}
-
-ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::eend() const {
- return ConstScaffoldEdgeIterator(edges_.cend());
-}
-
-ScaffoldGraph::ConstScaffoldEdgeIterator ScaffoldGraph::ebegin() const {
- return ConstScaffoldEdgeIterator(edges_.cbegin());
-}
-
-ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vend() const {
- return vertices_.cend();
-}
-
-ScaffoldGraph::VertexStorage::const_iterator ScaffoldGraph::vbegin() const {
- return vertices_.cbegin();
-}
-
-adt::iterator_range<ScaffoldGraph::VertexStorage::const_iterator> ScaffoldGraph::vertices() const {
- return adt::make_range(vbegin(), vend());
-}
-
-adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> ScaffoldGraph::edges() const {
- return adt::make_range(ebegin(), eend());
-}
-
-bool ScaffoldGraph::IsVertexIsolated(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) const {
-    bool result = incoming_edges_.count(assembly_graph_edge) == 0 && outgoing_edges_.count(assembly_graph_edge) == 0;
- VERIFY((incoming_edges_.count(conjugate(assembly_graph_edge)) == 0
- && incoming_edges_.count(assembly_graph_edge) == 0) == result);
- return result;
-}
-
-bool ScaffoldGraph::RemoveVertex(ScaffoldGraph::ScaffoldVertex assembly_graph_edge) {
- if (Exists(assembly_graph_edge)) {
- VERIFY(Exists(conjugate(assembly_graph_edge)));
-
- DeleteAllOutgoingEdgesSimple(assembly_graph_edge);
- DeleteAllIncomingEdgesSimple(assembly_graph_edge);
- DeleteAllOutgoingEdgesSimple(conjugate(assembly_graph_edge));
- DeleteAllIncomingEdgesSimple(conjugate(assembly_graph_edge));
-
- VERIFY(incoming_edges_.count(assembly_graph_edge) == 0);
- VERIFY(outgoing_edges_.count(assembly_graph_edge) == 0);
- VERIFY(incoming_edges_.count(conjugate(assembly_graph_edge)) == 0);
- VERIFY(outgoing_edges_.count(conjugate(assembly_graph_edge)) == 0);
-
- vertices_.erase(assembly_graph_edge);
- vertices_.erase(conjugate(assembly_graph_edge));
-
- return true;
- }
- return false;
-}
-
-bool ScaffoldGraph::RemoveEdge(const ScaffoldGraph::ScaffoldEdge &e) {
- if (Exists(e)) {
- VERIFY(Exists(conjugate(e)));
- DeleteOutgoing(e);
- DeleteIncoming(e);
- DeleteOutgoing(conjugate(e));
- DeleteIncoming(conjugate(e));
- DeleteEdgeFromStorage(e);
-
- return true;
- }
- return false;
-}
-
-bool ScaffoldGraph::AddEdge(const ScaffoldGraph::ScaffoldEdge &e) {
- return AddEdge(e.getStart(), e.getEnd(), e.getColor(), e.getWeight());
-}
-
-} //scaffold_graph
-} //path_extend
\ No newline at end of file
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.hpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.hpp
deleted file mode 100644
index 5e51863..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-//
-// Created by andrey on 17.09.15.
-//
-#pragma once
-
-#include "dev_support/logger/logger.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-#include "algorithms/path_extend/paired_library.hpp"
-#include "connection_condition2015.hpp"
-
-#include "dev_support/standard_base.hpp"
-#include "utils/adt/iterator_range.hpp"
-
-namespace path_extend {
-namespace scaffold_graph {
-
-//do NOT add "using namespace debruijn_graph" to avoid confusion between the EdgeId typedefs
-
-class ScaffoldGraph {
-
-public:
- //EdgeId in de Bruijn graph is vertex in scaffolding graph
- typedef debruijn_graph::EdgeId ScaffoldVertex;
-
- //Unique edge id
- typedef size_t ScaffoldEdgeIdT;
-
-    //Scaffold edge information class
- struct ScaffoldEdge {
- private:
- //unique id
- ScaffoldEdgeIdT id_;
- //id counter
- static std::atomic<ScaffoldEdgeIdT> scaffold_edge_id_;
-
- ScaffoldVertex start_;
- ScaffoldVertex end_;
- //color = lib#
- size_t color_;
- //read pair weight or anything else
- double weight_;
-
- public:
-
- ScaffoldEdge(ScaffoldVertex start, ScaffoldVertex end, size_t lib_id = (size_t) -1, double weight = 0) :
- id_(scaffold_edge_id_++),
- start_(start), end_(end),
- color_(lib_id),
- weight_(weight) {
- }
-
- ScaffoldEdgeIdT getId() const {
- return id_;
- }
-
-
- size_t getColor() const {
- return color_;
- }
-
- double getWeight() const {
- return weight_;
- }
-
- const ScaffoldVertex getStart() const {
- return start_;
- }
-
- const ScaffoldVertex getEnd() const {
- return end_;
- }
-
- bool operator==(const ScaffoldEdge &e) const {
- return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
- }
-
- bool operator==(const ScaffoldEdge &e) {
- return color_ == e.color_ && weight_ == e.weight_ && start_ == e.start_ && end_ == e.end_;
- }
- };
-
-    //typedefs allowing use in templated graph visualizers
- typedef ScaffoldVertex VertexId;
- typedef ScaffoldEdge EdgeId;
-
-    //All vertices are stored in a set
- typedef std::set<ScaffoldVertex> VertexStorage;
- //Edges are stored in map: Id -> Edge Information
- typedef std::unordered_map<ScaffoldEdgeIdT, ScaffoldEdge> EdgeStorage;
-    //Adjacency list contains vertex and edge id (instead of the whole edge information)
- typedef std::unordered_multimap<ScaffoldVertex, ScaffoldEdgeIdT> AdjacencyStorage;
-
- struct ConstScaffoldEdgeIterator: public boost::iterator_facade<ConstScaffoldEdgeIterator,
- const ScaffoldEdge,
- boost::forward_traversal_tag> {
- private:
- EdgeStorage::const_iterator iter_;
-
- public:
- ConstScaffoldEdgeIterator(EdgeStorage::const_iterator iter) : iter_(iter) {
- }
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() {
- ++iter_;
- }
-
- bool equal(const ConstScaffoldEdgeIterator &other) const {
- return iter_ == other.iter_;
- }
-
- const ScaffoldEdge& dereference() const {
- return iter_->second;
- }
- };
-
-//TODO:: fix this. Seems that only ebegin and eend are broken.
-private:
- EdgeStorage edges_;
-
- VertexStorage vertices_;
-
- const debruijn_graph::Graph &assembly_graph_;
-
- //Map for storing conjugate scaffolding edges
- std::unordered_map<ScaffoldEdgeIdT, ScaffoldEdgeIdT> conjugate_;
-
- AdjacencyStorage outgoing_edges_;
-
- AdjacencyStorage incoming_edges_;
-
-    //Add an edge without any checks and without adding its conjugate
- void AddEdgeSimple(const ScaffoldEdge &e, size_t conjugate_id);
-
-    //Delete an outgoing edge from the adjacency list without checks
-    //and without removing the conjugate and the respective incoming edge
- void DeleteOutgoing(const ScaffoldEdge &e);
-
-    //Delete an incoming edge from the adjacency list without checks
-    //and without removing the conjugate and the respective outgoing edge
- void DeleteIncoming(const ScaffoldEdge &e);
-
- //Delete all edge info from storage
- void DeleteEdgeFromStorage(const ScaffoldEdge &e);
-
-    //Delete all edges outgoing from v from the adjacency lists
- void DeleteAllOutgoingEdgesSimple(ScaffoldVertex v);
-
-    //Delete all edges incoming to v from the adjacency lists
- void DeleteAllIncomingEdgesSimple(ScaffoldVertex v);
-
-public:
- ScaffoldGraph(const debruijn_graph::Graph &g) : assembly_graph_(g) {
- }
-
- bool Exists(ScaffoldVertex assembly_graph_edge) const;
-
- bool Exists(const ScaffoldEdge &e) const;
-
- ScaffoldVertex conjugate(ScaffoldVertex assembly_graph_edge) const;
-
-    //Return a structure that is equal to the conjugate of e (not exactly the same structure as stored in the graph)
- ScaffoldEdge conjugate(const ScaffoldEdge &e) const;
-
-    //Add an isolated vertex to the graph if it does not exist
- bool AddVertex(ScaffoldVertex assembly_graph_edge);
-
- void AddVertices(const set<ScaffoldVertex> &vertices);
-
-    //Add an edge (and its conjugate) if it does not exist;
-    //v1 and v2 must exist
- bool AddEdge(ScaffoldVertex v1, ScaffoldVertex v2, size_t lib_id, double weight);
-
- bool AddEdge(const ScaffoldEdge &e);
-
-    //Remove an edge from the edge container and all adjacency lists
- bool RemoveEdge(const ScaffoldEdge &e);
-
- //Remove vertex and all adjacent edges
- bool RemoveVertex(ScaffoldVertex assembly_graph_edge);
-
- bool IsVertexIsolated(ScaffoldVertex assembly_graph_edge) const;
-
- VertexStorage::const_iterator vbegin() const;
-
- VertexStorage::const_iterator vend() const;
-
- adt::iterator_range<VertexStorage::const_iterator> vertices() const;
-
- ConstScaffoldEdgeIterator ebegin() const;
-
- ConstScaffoldEdgeIterator eend() const;
-
- adt::iterator_range<ScaffoldGraph::ConstScaffoldEdgeIterator> edges() const;
-
- size_t int_id(ScaffoldVertex v) const;
-
- size_t int_id(ScaffoldEdge e) const;
-
- ScaffoldVertex EdgeStart(ScaffoldEdge e) const;
-
- ScaffoldVertex EdgeEnd(ScaffoldEdge e) const;
-
- size_t VertexCount() const;
-
- size_t EdgeCount() const;
-
- const debruijn_graph::Graph & AssemblyGraph() const;
-
- vector<ScaffoldEdge> OutgoingEdges(ScaffoldVertex assembly_graph_edge) const;
-
- vector<ScaffoldEdge> IncomingEdges(ScaffoldVertex assembly_graph_edge) const;
-
- size_t OutgoingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
-
- size_t IncomingEdgeCount(ScaffoldVertex assembly_graph_edge) const;
-
- bool HasUniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
-
- bool HasUniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
-
- ScaffoldEdge UniqueOutgoing(ScaffoldVertex assembly_graph_edge) const;
-
- ScaffoldEdge UniqueIncoming(ScaffoldVertex assembly_graph_edge) const;
-
- void Print(ostream &os) const;
-
-};
-
-} //scaffold_graph
-} //path_extend
-
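A minimal sketch (standard-library containers only; the vertex type and the Conj function are illustrative stand-ins, not the SPAdes types) of the conjugate-symmetric storage described above: every vertex is inserted together with its conjugate, and every edge is stored twice, with the two copies cross-referencing each other by id.

    #include <cstddef>
    #include <set>
    #include <unordered_map>

    using Vertex = int;
    inline Vertex Conj(Vertex v) { return -v; }  // hypothetical conjugation

    struct ToyScaffoldGraph {
        struct Edge { Vertex start; Vertex end; double weight; };

        std::set<Vertex> vertices;
        std::unordered_map<std::size_t, Edge> edges;
        std::unordered_map<std::size_t, std::size_t> conjugate;  // edge id <-> conjugate edge id
        std::size_t next_id = 0;

        void AddVertex(Vertex v) {
            vertices.insert(v);
            vertices.insert(Conj(v));
        }

        void AddEdge(Vertex v1, Vertex v2, double w) {
            std::size_t id = next_id++;
            std::size_t conj_id = next_id++;
            edges[id] = Edge{v1, v2, w};
            // The conjugate edge swaps the endpoints and conjugates them.
            edges[conj_id] = Edge{Conj(v2), Conj(v1), w};
            conjugate[id] = conj_id;
            conjugate[conj_id] = id;
        }
    };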
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.cpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
deleted file mode 100644
index 61a813b..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//
-// Created by andrey on 04.12.15.
-//
-
-#include "scaffold_graph_constructor.hpp"
-
-namespace path_extend {
-namespace scaffold_graph {
-
-
-bool LengthEdgeCondition::IsSuitable(debruijn_graph::EdgeId e) const {
- return graph_.length(e) >= min_length_;
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromEdgeConditions(const EdgeCondition &edge_condition,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only) {
- for (auto e = graph_->AssemblyGraph().ConstEdgeBegin(); !e.IsEnd(); ++e) {
- if (edge_condition.IsSuitable(*e)) {
- graph_->AddVertex(*e);
- }
- }
- ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromSet(const set<EdgeId> edge_set,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only) {
- graph_->AddVertices(edge_set);
- ConstructFromConditions(connection_conditions, use_terminal_vertices_only);
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromConditions(vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only) {
-//TODO:: awful. It depends on the ordering of the connection conditions.
- for (auto condition : connection_conditions) {
- if (condition->GetLibIndex() == (size_t) -1)
- ConstructFromSingleCondition(condition, true);
- else
- ConstructFromSingleCondition(condition, use_terminal_vertices_only);
- }
-}
-
-void BaseScaffoldGraphConstructor::ConstructFromSingleCondition(const shared_ptr<ConnectionCondition> condition,
- bool use_terminal_vertices_only) {
- for (const auto& v : graph_->vertices()) {
- TRACE("Vertex " << graph_->int_id(v));
-
- if (use_terminal_vertices_only && graph_->OutgoingEdgeCount(v) > 0)
- continue;
-
- auto connected_with = condition->ConnectedWith(v);
- for (auto connected : connected_with) {
- TRACE("Connected with " << graph_->int_id(connected));
- if (graph_->Exists(connected)) {
- if (use_terminal_vertices_only && graph_->IncomingEdgeCount(connected) > 0)
- continue;
- graph_->AddEdge(v, connected, condition->GetLibIndex(), condition->GetWeight(v, connected));
- }
- }
- }
-}
-
-
-shared_ptr<ScaffoldGraph> SimpleScaffoldGraphConstructor::Construct() {
- ConstructFromSet(edge_set_, connection_conditions_);
- return graph_;
-}
-
-shared_ptr<ScaffoldGraph> DefaultScaffoldGraphConstructor::Construct() {
- ConstructFromSet(edge_set_, connection_conditions_);
- ConstructFromEdgeConditions(edge_condition_, connection_conditions_);
- return graph_;
-}
-
-} //scaffold_graph
-} //path_extend
\ No newline at end of file
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.hpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.hpp
deleted file mode 100644
index bbf45f4..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_constructor.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-//
-// Created by andrey on 04.12.15.
-//
-
-#pragma once
-
-#include "scaffold_graph.hpp"
-
-
-namespace path_extend {
-namespace scaffold_graph {
-
-//De Bruijn graph edge condition interface
-class EdgeCondition {
-public:
- virtual bool IsSuitable(debruijn_graph::EdgeId e) const = 0;
-
- virtual ~EdgeCondition() { }
-
-};
-
-//Edge length condition
-class LengthEdgeCondition: public EdgeCondition {
- const debruijn_graph::Graph &graph_;
-
- size_t min_length_;
-
-public:
- LengthEdgeCondition(const debruijn_graph::Graph &graph, size_t min_len) : graph_(graph), min_length_(min_len) {
- }
-
- bool IsSuitable(debruijn_graph::EdgeId e) const;
-};
-
-//Interface
-class ScaffoldGraphConstructor {
-
-public:
- virtual shared_ptr<ScaffoldGraph> Construct() = 0;
-};
-
-//Basic scaffold graph constructor functions
-class BaseScaffoldGraphConstructor: public ScaffoldGraphConstructor {
-protected:
- shared_ptr<ScaffoldGraph> graph_;
-
- BaseScaffoldGraphConstructor(const debruijn_graph::Graph& assembly_graph) {
- graph_ = make_shared<ScaffoldGraph>(assembly_graph);
- }
-
- void ConstructFromSingleCondition(const shared_ptr<ConnectionCondition> condition,
- bool use_terminal_vertices_only);
-
- void ConstructFromConditions(vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only = false);
-
- void ConstructFromSet(const set<EdgeId> edge_set,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only = false);
-
- void ConstructFromEdgeConditions(const EdgeCondition& edge_condition,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- bool use_terminal_vertices_only = false);
-};
-
-
-class SimpleScaffoldGraphConstructor: public BaseScaffoldGraphConstructor {
-protected:
- const set<EdgeId>& edge_set_;
- vector<shared_ptr<ConnectionCondition>>& connection_conditions_;
-
-public:
- SimpleScaffoldGraphConstructor(const debruijn_graph::Graph& assembly_graph,
- const set<EdgeId>& edge_set,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions):
- BaseScaffoldGraphConstructor(assembly_graph),
- edge_set_(edge_set), connection_conditions_(connection_conditions) {}
-
- shared_ptr<ScaffoldGraph> Construct() override;
-};
-
-class DefaultScaffoldGraphConstructor: public SimpleScaffoldGraphConstructor {
-protected:
- const EdgeCondition& edge_condition_;
-
-public:
- DefaultScaffoldGraphConstructor(const debruijn_graph::Graph& assembly_graph,
- const set<EdgeId>& edge_set,
- vector<shared_ptr<ConnectionCondition>> &connection_conditions,
- const EdgeCondition& edge_condition):
- SimpleScaffoldGraphConstructor(assembly_graph, edge_set, connection_conditions),
- edge_condition_(edge_condition)
- {}
-
- shared_ptr<ScaffoldGraph> Construct() override;
-};
-
-
-} //scaffold_graph
-} //path_extend
-
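A rough sketch (all names and callables here are hypothetical, not the SPAdes API) of the two-phase construction performed by DefaultScaffoldGraphConstructor above: first select the vertices that satisfy an edge condition such as a minimum length, then query each connection condition to add scaffold edges between the selected vertices.

    #include <functional>
    #include <set>
    #include <utility>
    #include <vector>

    using EdgeId = int;

    std::vector<std::pair<EdgeId, EdgeId>> BuildScaffoldEdges(
            const std::vector<EdgeId>& all_edges,
            const std::function<bool(EdgeId)>& edge_condition,
            const std::function<std::set<EdgeId>(EdgeId)>& connected_with) {
        // Phase 1: vertices are the edges passing the condition.
        std::set<EdgeId> vertices;
        for (EdgeId e : all_edges)
            if (edge_condition(e)) vertices.insert(e);

        // Phase 2: connect pairs reported by the connection condition,
        // keeping only targets that are themselves vertices.
        std::vector<std::pair<EdgeId, EdgeId>> scaffold_edges;
        for (EdgeId v : vertices)
            for (EdgeId u : connected_with(v))
                if (vertices.count(u)) scaffold_edges.emplace_back(v, u);
        return scaffold_edges;
    }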
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
deleted file mode 100644
index 8e5aec6..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-//
-// Created by andrey on 21.09.15.
-//
-
-#include "scaffold_graph_visualizer.hpp"
-
-namespace path_extend{ namespace scaffold_graph {
-
-const map<size_t, string> ScaffoldEdgeColorer::color_map =
- {{(size_t) -1, "black"},
- {0, "red"},
- {1, "blue"},
- {2, "green"},
- {3, "magenta"},
- {4, "orange"},
- {5, "cyan"}};
-
-const string ScaffoldEdgeColorer::default_color = "black";
-
-string ScaffoldGraphLabeler::label(EdgeId e) const {
- return "ID: " + ToString(e.getId()) +
- "\\n Weight: " + ToString(e.getWeight()) +
- "\\n Lib#: " + ToString(e.getColor());
-}
-
-string ScaffoldGraphLabeler::label(VertexId v) const {
- return "ID: " + ToString(graph_.int_id(v)) +
- "\\n Len: " + ToString(graph_.AssemblyGraph().length(v)) +
- "\\n Cov: " + ToString(graph_.AssemblyGraph().coverage(v));
-}
-
-void ScaffoldGraphVisualizer::Visualize(GraphPrinter<ScaffoldGraph> &printer) {
- printer.open();
- printer.AddVertices(graph_.vbegin(), graph_.vend());
- //for (auto e = graph_.ebegin(); e != graph_.eend(); ++e) {
- for (const auto& e : graph_.edges()) {
- printer.AddEdge(e);
- }
- printer.close();
-}
-
-void ScaffoldGraphVisualizer::Visualize(ostream &os, CompositeGraphColorer<ScaffoldGraph>& colorer) {
- ScaffoldGraphLabeler labeler(graph_);
- EmptyGraphLinker<ScaffoldGraph> linker;
-
- if (paired_) {
- PairedGraphPrinter <ScaffoldGraph> printer(graph_, os, labeler, colorer, linker);
- Visualize(printer);
- } else {
- SingleGraphPrinter <ScaffoldGraph> printer(graph_, os, labeler, colorer, linker);
- Visualize(printer);
- }
-}
-
-string ScaffoldEdgeColorer::GetValue(ScaffoldGraph::EdgeId e) const {
- auto it = color_map.find(e.getColor());
- if (it != color_map.end()) {
- return it->second;
- }
- return default_color;
-}
-
-string ScaffoldVertexSetColorer::GetValue(ScaffoldGraph::VertexId v) const {
- if (vertex_set_.count(v) > 0)
- return "white";
- return "yellow";
-}
-} //scaffold_graph
-} //path_extend
-
-
-
diff --git a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp b/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
deleted file mode 100644
index 2ed651c..0000000
--- a/src/modules/algorithms/path_extend/scaffolder2015/scaffold_graph_visualizer.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//
-// Created by andrey on 21.09.15.
-//
-
-#ifndef PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
-#define PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
-
-#include "pipeline/graphio.hpp"
-#include "scaffold_graph.hpp"
-
-namespace path_extend { namespace scaffold_graph {
-
-using namespace omnigraph::visualization;
-
-
-class ScaffoldGraphLabeler : public GraphLabeler<ScaffoldGraph> {
-
-private:
- const ScaffoldGraph &graph_;
-
-public:
- ScaffoldGraphLabeler(const ScaffoldGraph &graph) : graph_(graph) {
- }
-
- string label(VertexId v) const;
-
- string label(EdgeId e) const;
-};
-
-
-class ScaffoldEdgeColorer : public ElementColorer<ScaffoldGraph::EdgeId> {
-private:
- static const map<size_t, string> color_map;
-
- static const string default_color;
-
-public:
- string GetValue(ScaffoldGraph::EdgeId e) const;
-};
-
-
-class ScaffoldVertexSetColorer : public ElementColorer<ScaffoldGraph::VertexId> {
- private:
- set<ScaffoldGraph::VertexId> vertex_set_;
-
- public:
- ScaffoldVertexSetColorer(const set<ScaffoldGraph::VertexId>& vertex_set): vertex_set_(vertex_set) {
- }
-
- string GetValue(ScaffoldGraph::VertexId v) const;
-};
-
-class ScaffoldGraphVisualizer {
-
- const ScaffoldGraph &graph_;
- const bool paired_;
-
-private:
- void Visualize(GraphPrinter<ScaffoldGraph> &printer);
-
-public:
- ScaffoldGraphVisualizer(const ScaffoldGraph &graph, bool paired = true) :
- graph_(graph), paired_(paired) {
- }
-
- void Visualize(ostream &os, CompositeGraphColorer<ScaffoldGraph>& colorer);
-};
-
-} //scaffold_graph
-} //path_extend
-
-
-#endif //PROJECT_SCAFFOLD_GRAPH_VISUALIZER_HPP
diff --git a/src/modules/algorithms/path_extend/split_graph_pair_info.hpp b/src/modules/algorithms/path_extend/split_graph_pair_info.hpp
deleted file mode 100644
index 8991d57..0000000
--- a/src/modules/algorithms/path_extend/split_graph_pair_info.hpp
+++ /dev/null
@@ -1,449 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * split_graph_pair_info.hpp
- *
- * Created on: May 14, 2013
- * Author: ira
- */
-
-#ifndef SPLIT_GRAPH_PAIR_INFO_HPP_
-#define SPLIT_GRAPH_PAIR_INFO_HPP_
-
-#include <paired_info/weights.hpp>
-#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
-#include "io/dataset_support/read_converter.hpp"
-#include "ideal_pair_info.hpp"
-
-using namespace debruijn_graph;
-
-namespace path_extend {
-
-inline double FindIntersection(vector<double>& pi1, vector<double>& pi2) {
- std::sort(pi1.begin(), pi1.end());
- std::sort(pi2.begin(), pi2.end());
- size_t iter1 = 0;
- size_t iter2 = 0;
- double threshold = 0.0;
- double percent1 = 0.0;
- double percent2 = 1.0;
- while (percent1 < percent2 and iter1 < pi1.size() and iter2 < pi2.size()) {
- threshold = pi1[iter1];
- while (iter2 < pi2.size() and pi2[iter2] <= threshold) {
- iter2++;
- }
- percent1 = (double) iter1 / (double) pi1.size();
- percent2 = 1.0 - (double) iter2 / (double) pi2.size();
- iter1 += 1;
- }
- return threshold;
-}
-
-class Basket {
- EdgeId edgeId_;
- size_t index_;
-
-public:
- Basket(EdgeId edgeId, size_t index)
- : edgeId_(edgeId), index_(index) { }
-
- Basket(const Basket& b)
- : edgeId_(b.edgeId_), index_(b.index_) {}
-
- const EdgeId edgeId() const {
- return edgeId_;
- }
-
- size_t index() const {
- return index_;
- }
-
- bool operator<(const Basket& rhs) const {
- if (edgeId() != rhs.edgeId()) {
- return edgeId() < rhs.edgeId();
- }
- return index() < rhs.index();
- }
-
- bool operator==(const Basket& rhs) const {
- return edgeId() == rhs.edgeId() && index() == rhs.index();
- }
-};
-
-struct PairInfo {
- double weight_;
- double distance_;
- size_t count_;
-
- PairInfo()
- : weight_(0.), distance_(0.), count_(0) {}
-
- PairInfo(double weight, double distance, size_t count = 0)
- : weight_(weight), distance_(distance), count_(count) {}
-
-};
-
-class EdgePairInfo {
- EdgeId edgeId_;
- size_t basket_size_;
- vector<map<Basket, PairInfo> > pair_info_;
-
-public:
- EdgePairInfo() {
- basket_size_ = 0;
- }
-
- EdgePairInfo(size_t length, EdgeId edgeId, size_t basket_size)
- : edgeId_(edgeId),
- basket_size_(basket_size) {
- size_t count_baskets = length / basket_size_ + 1;
- for (size_t index = 0; index < count_baskets; ++index) {
- pair_info_.push_back(map<Basket, PairInfo>());
- }
- }
-
- EdgePairInfo(const EdgePairInfo& pairInfo)
- : edgeId_(pairInfo.edgeId_),
- basket_size_(pairInfo.basket_size_) {
- for (size_t index = 0; index < pairInfo.pair_info_.size(); ++index) {
- pair_info_.push_back(pairInfo.pair_info_[index]);
- }
- }
-
- void AddPairInfo(size_t pos_begin1, size_t pos_end1, EdgeId edgeId2,
- size_t pos_begin2, size_t pos_end2, double weight,
- double edge_distance) {
- size_t begin_basket_index1 = GetBasketIndex(pos_begin1);
- size_t end_basket_index1 = GetBasketIndex(pos_end1);
- size_t begin_basket_index2 = GetBasketIndex(pos_begin2);
- size_t end_basket_index2 = GetBasketIndex(pos_end2);
- for (size_t index1 = begin_basket_index1; index1 <= end_basket_index1;
- ++index1) {
- for (size_t index2 = begin_basket_index2;
- index2 <= end_basket_index2; ++index2) {
- AddPairInfoToBasket(index1, edgeId2, index2, weight,
- edge_distance);
- }
- }
- }
-
- void AddPairInfo(const EdgePairInfo& edgePairInfo) {
- for (size_t index = 0; index < pair_info_.size(); ++index) {
- const map<Basket, PairInfo>& basketInfoToAdd = edgePairInfo
- .pair_info_[index];
- map<Basket, PairInfo>& oldBasketInfo = pair_info_[index];
- for (auto iter = basketInfoToAdd.begin();
- iter != basketInfoToAdd.end(); ++iter) {
- if (oldBasketInfo.find(iter->first) == oldBasketInfo.end()) {
- oldBasketInfo[iter->first] = iter->second;
- } else {
- PairInfo& pairInfo = oldBasketInfo[iter->first];
- oldBasketInfo[iter->first] = PairInfo(
- pairInfo.weight_ + iter->second.weight_,
- CountNewDistance(pairInfo, iter->second.distance_,
- iter->second.count_),
- iter->second.count_ + pairInfo.count_);
- }
- }
- }
- }
-
- map<Basket, PairInfo>& GetInfo(size_t index) {
- return pair_info_.at(index);
- }
-
- size_t size() {
- return pair_info_.size();
- }
-
-private:
- size_t GetBasketIndex(size_t pos) const {
- return pos / basket_size_;
- }
-
- void AddPairInfoToBasket(size_t index1, EdgeId edgeId2, size_t index2,
- double weight, double edge_distance) {
- Basket basket2(edgeId2, index2);
- if (pair_info_[index1].find(basket2) == pair_info_[index1].end()) {
- pair_info_[index1][basket2] = PairInfo(0.0, 0);
- }
- PairInfo oldPairInfo = pair_info_[index1][basket2];
- double basket_distance = GetBasketDistance(edge_distance, index1,
- index2);
- pair_info_[index1][basket2] = PairInfo(
- oldPairInfo.weight_ + weight,
- CountNewDistance(oldPairInfo, basket_distance),
- oldPairInfo.count_ + 1);
- }
-
- double CountNewDistance(PairInfo& oldPairInfo, double distance,
- size_t count = 1) {
- return (oldPairInfo.distance_ * (double) oldPairInfo.count_
- + distance * (double) count)
- / (double) (oldPairInfo.count_ + count);
- }
-
- double GetBasketDistance(double edge_distance, size_t index1,
- size_t index2) {
- return edge_distance - (double) index1 * (double) basket_size_
- + (double) index2 * (double) basket_size_;
- }
-};
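The basket bookkeeping above boils down to an incremental weighted mean: CountNewDistance keeps each basket's distance_ equal to the average distance over all observations merged into it so far, weighted by their counts. A small self-contained sketch of that update rule (the Acc type, the Merge name and the numbers are made up for illustration):

#include <cstdio>

struct Acc {
    double distance;
    unsigned long count;
};

// Same update as CountNewDistance: a count-weighted running average.
static Acc Merge(Acc old_acc, double distance, unsigned long count) {
    Acc res;
    res.distance = (old_acc.distance * (double) old_acc.count + distance * (double) count)
                   / (double) (old_acc.count + count);
    res.count = old_acc.count + count;
    return res;
}

int main() {
    Acc acc = {0.0, 0};              // empty basket
    acc = Merge(acc, 100.0, 1);      // one observation at distance 100
    acc = Merge(acc, 120.0, 3);      // three more at 120 -> (100 + 3 * 120) / 4 = 115
    std::printf("distance = %.1f, count = %lu\n", acc.distance, acc.count);
    return 0;
}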
-
-class BasketsPairInfoIndex {
- const conj_graph_pack& gp_;
- size_t basket_size_;
- map<EdgeId, EdgePairInfo> pair_info_;
-
-public:
- BasketsPairInfoIndex(const conj_graph_pack& gp, size_t basket_size)
- : gp_(gp),
- basket_size_(basket_size) {
- }
-
- void AddPairInfo(EdgeId edgeId1, size_t pos_begin1, size_t pos_end1,
- EdgeId edgeId2, size_t pos_begin2, size_t pos_end2,
- double weight, double edge_distance) {
- if (pair_info_.find(edgeId1) == pair_info_.end()) {
- EdgePairInfo edgePairInfo2(gp_.g.length(edgeId1), edgeId1,
- basket_size_);
- pair_info_.insert(make_pair(edgeId1, edgePairInfo2));
- }
- pair_info_[edgeId1].AddPairInfo(pos_begin1, pos_end1, edgeId2,
- pos_begin2, pos_end2, weight,
- edge_distance);
- }
-
- EdgePairInfo& GetEdgePairInfo(EdgeId edgeId) {
- return pair_info_[edgeId];
- }
-
- void AddAll(const BasketsPairInfoIndex& index) {
- for (auto it = index.pair_info_.begin(); it != index.pair_info_.end();
- ++it) {
- if (pair_info_.find(it->first) == pair_info_.end()) {
- pair_info_.insert(make_pair(it->first, it->second));
- } else {
- pair_info_[it->first].AddPairInfo(it->second);
- }
- }
- }
-
- void Clear() {
- pair_info_.clear();
- }
-
- size_t size() const {
- return pair_info_.size();
- }
-
-};
-
-class SplitGraphPairInfo : public SequenceMapperListener {
-
-public:
- //TODO: d_min = ? d_max = ? for ideal_pi_counter_
- SplitGraphPairInfo(conj_graph_pack& gp, size_t is,
- size_t is_var,
- size_t is_min, size_t is_max,
- size_t read_size, size_t /* k */, size_t basket_size,
- const std::map<int, size_t>& is_distribution)
- : gp_(gp),
- is_(is),
- is_var_(is_var),
- is_min_(is_min),
- is_max_(is_max),
- basket_size_(basket_size),
- basket_index_(gp, basket_size),
- threshold_(-1),
- ideal_pi_counter_(gp.g, (int)is_min_,
- (int)is_max_, read_size, is_distribution) {
-
- }
-
- void StartProcessLibrary(size_t threads_count) override {
- baskets_buffer_.clear();
- for (size_t i = 0; i < threads_count; ++i)
- baskets_buffer_.emplace_back(gp_, basket_size_);
- }
-
- void ProcessPairedRead(size_t thread_index,
- const io::PairedRead& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) override {
- ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
- read1, read2, r.distance());
- }
-
- void ProcessPairedRead(size_t thread_index,
- const io::PairedReadSeq& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) override {
- ProcessPairedRead(baskets_buffer_[thread_index], r.first().size(), r.second().size(),
- read1, read2, r.distance());
- }
-
- void ProcessSingleRead(size_t, const io::SingleRead&, const MappingPath<EdgeId>&) override {
- //only paired reads are interesting
- }
-
- void ProcessSingleRead(size_t, const io::SingleReadSeq&, const MappingPath<EdgeId>&) override {
- //only paired reads are interesting
- }
- void MergeBuffer(size_t thread_index) override {
- basket_index_.AddAll(baskets_buffer_[thread_index]);
- baskets_buffer_[thread_index].Clear();
- }
-
- void StopProcessLibrary() override {
- for (size_t i = 0; i < baskets_buffer_.size(); ++i)
- MergeBuffer(i);
-
- FindThreshold();
-
- baskets_buffer_.clear();
- }
-
- double GetThreshold() const {
- return threshold_;
- }
-
-private:
- void FindThreshold() {
- size_t min_long_edge = basket_size_;
- const Graph& g = gp_.g;
- vector<double> good_pi;
- vector<double> bad_pi;
- double insert_size_min = (double) is_ - 2. * (double) is_var_;
- double insert_size_max = (double) is_ + 2. * (double) is_var_;
- for (auto e = g.ConstEdgeBegin(); !e.IsEnd(); ++e) {
- EdgeId edge = *e;
-
- if (g.length(edge) > min_long_edge) {
- if (g.int_id(edge) <= 0)
- continue;
-
- EdgePairInfo& edge_pi = basket_index_.GetEdgePairInfo(edge);
- if (edge_pi.size() == 0)
- continue;
- size_t count_baskets = LastBasketIndex(edge, (int) insert_size_max,
- edge_pi);
- for (size_t index = 0; index <= count_baskets; ++index) {

- map<Basket, PairInfo>& basket_info = edge_pi.GetInfo(index);
- set<size_t> pair_baskets = GetBaskets(index,
- (int) insert_size_min,
- (int) insert_size_max,
- edge_pi);
- for (auto iter = basket_info.begin(); iter != basket_info.end(); ++iter) {
- PairInfo& pi = iter->second;
- if (iter->first.edgeId() == edge &&
- pair_baskets.find(iter->first.index()) != pair_baskets.end()) {
- good_pi.push_back(GetNormalizedWeight(pi));
- } else {
- bad_pi.push_back(GetNormalizedWeight(pi));
- }
- }
- }
- }
- }
- DEBUG("good pi size " << good_pi.size() << " bad pi size " << bad_pi.size());
- threshold_ = FindIntersection(good_pi, bad_pi);
- INFO("Threshold for paired information " << threshold_);
- }
-
- size_t LastBasketIndex(EdgeId edgeId, int insert_size_max,
- EdgePairInfo& edge_pair_info) {
- return min((gp_.g.length(edgeId) - insert_size_max) / basket_size_,
- edge_pair_info.size() - 1);
- }
-
- size_t FindBeginPairBasket(size_t index, int insert_size_min,
- EdgePairInfo& edge_pair_info) {
- return min(index + insert_size_min / basket_size_,
- edge_pair_info.size() - 1);
- }
-
- size_t FindEndPairBasket(size_t index, int insert_size_max,
- EdgePairInfo& edge_pair_info) {
- return min(index + insert_size_max / basket_size_,
- edge_pair_info.size() - 1);
- }
-
- set<size_t> GetBaskets(size_t index, int insert_size_min,
- int insert_size_max, EdgePairInfo& edge_pair_info) {
- set<size_t> result;
- size_t begin = FindBeginPairBasket(index, insert_size_min,
- edge_pair_info);
- size_t end = FindEndPairBasket(index, insert_size_max, edge_pair_info);
- for (size_t pair_index = begin; pair_index <= end; ++pair_index) {
- result.insert(pair_index);
- }
- return result;
- }
-
- double GetNormalizedWeight(PairInfo& pi) {
- return pi.weight_
- / ideal_pi_counter_.IdealPairedInfo(basket_size_, basket_size_,
- (int) pi.distance_);
- }
-
- void InnerProcess(BasketsPairInfoIndex& basket_index,
- const MappingPath<EdgeId>& path1,
- const MappingPath<EdgeId>& path2,
- size_t read_distance) {
- for (size_t i = 0; i < path1.size(); ++i) {
- pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
- for (size_t j = 0; j < path2.size(); ++j) {
- pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
- double weight = PairedReadCountWeight(mapping_edge_1.second,
- mapping_edge_2.second);
- size_t kmer_distance = read_distance
- + mapping_edge_2.second.initial_range.end_pos
- - mapping_edge_1.second.initial_range.start_pos;
- int edge_distance = (int) kmer_distance
- + (int) mapping_edge_1.second.mapped_range.start_pos
- - (int) mapping_edge_2.second.mapped_range.end_pos;
-
- basket_index.AddPairInfo(
- mapping_edge_1.first,
- mapping_edge_1.second.mapped_range.start_pos,
- mapping_edge_1.second.mapped_range.end_pos,
- mapping_edge_2.first,
- mapping_edge_2.second.mapped_range.start_pos,
- mapping_edge_2.second.mapped_range.end_pos, weight,
- (double) edge_distance);
- }
- }
- }
-
- void ProcessPairedRead(BasketsPairInfoIndex& basket_index,
- size_t r1_length,
- size_t r2_length,
- const MappingPath<EdgeId>& path1,
- const MappingPath<EdgeId>& path2,
- size_t read_distance) {
- InnerProcess(basket_index, path1, path2, read_distance);
- InnerProcess(basket_index, ConjugateMapping(gp_.g, path2, r2_length),
- ConjugateMapping(gp_.g, path1, r1_length), read_distance);
- }
-
- const conj_graph_pack& gp_;
- size_t is_;
- size_t is_var_;
- size_t is_min_;
- size_t is_max_;
- size_t basket_size_;
- BasketsPairInfoIndex basket_index_;
- vector<BasketsPairInfoIndex> baskets_buffer_;
- double threshold_;
- IdealPairInfoCounter ideal_pi_counter_;
-};
-
-} /* path_extend */
-
-#endif /* SPLIT_GRAPH_PAIR_INFO_HPP_ */
diff --git a/src/modules/algorithms/path_extend/weight_counter.hpp b/src/modules/algorithms/path_extend/weight_counter.hpp
deleted file mode 100644
index a2d224b..0000000
--- a/src/modules/algorithms/path_extend/weight_counter.hpp
+++ /dev/null
@@ -1,544 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * weight_counter.hpp
- *
- * Created on: Feb 19, 2012
- * Author: andrey
- */
-
-#ifndef WEIGHT_COUNTER_HPP_
-#define WEIGHT_COUNTER_HPP_
-
-#include "assembly_graph/paths/bidirectional_path.hpp"
-#include "paired_library.hpp"
-#include <algorithm>
-#include <boost/math/special_functions/fpclassify.hpp>
-
-namespace path_extend {
-
-inline int median(const vector<int>& dist, const vector<double>& w, int min, int max) {
- VERIFY(dist.size() == w.size());
- double S = 0;
- for (size_t i = 0; i < w.size(); ++i) {
- if (dist[i] >= min && dist[i] <= max)
- S += w[i];
- }
- if (S == 0) {
- DEBUG("Empty histogram");
- return 0;
- }
-
- double sum = S;
- for (size_t i = 0; i < w.size(); ++i) {
- if (dist[i] >= min && dist[i] <= max) {
- sum -= w[i];
- if (sum <= S / 2) {
- return dist[i];
- }
- }
- }
- VERIFY(false);
- return -1;
-}
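The median helper above computes a weighted median over a distance histogram, ignoring bins outside [min, max]. A standalone toy check of the same computation (reimplemented here without the project's VERIFY/DEBUG macros; the WeightedMedian name and the histogram values are invented):

#include <cstddef>
#include <cstdio>
#include <vector>

// Weighted median over a distance histogram restricted to [min, max].
static int WeightedMedian(const std::vector<int>& dist, const std::vector<double>& w,
                          int min, int max) {
    double total = 0.0;
    for (std::size_t i = 0; i < w.size(); ++i)
        if (dist[i] >= min && dist[i] <= max)
            total += w[i];
    if (total == 0.0)
        return 0;                         // empty histogram
    double remaining = total;
    for (std::size_t i = 0; i < w.size(); ++i) {
        if (dist[i] >= min && dist[i] <= max) {
            remaining -= w[i];
            if (remaining <= total / 2)
                return dist[i];
        }
    }
    return -1;                            // not reached for consistent input
}

int main() {
    std::vector<int> dist = {90, 100, 150, 200, 900};   // 900 lies outside [0, 300]
    std::vector<double> w = {2.0, 1.0, 3.0, 1.0, 5.0};
    std::printf("median = %d\n", WeightedMedian(dist, w, 0, 300));   // prints 150
    return 0;
}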
-
-struct EdgeWithPairedInfo {
- size_t e_;
- double pi_;
-
- EdgeWithPairedInfo(size_t e_, double pi) :
- e_(e_), pi_(pi) {
-
- }
-};
-
-struct EdgeWithDistance {
- EdgeId e_;
- int d_;
-
- EdgeWithDistance(EdgeId e, size_t d) :
- e_(e), d_((int) d) {
- }
-
- struct DistanceComparator {
- bool operator()(const EdgeWithDistance& e1, const EdgeWithDistance& e2) {
- if (e1.d_ == e2.d_)
- return e1.e_ < e2.e_;
- return e1.d_ > e2.d_;
- }
- };
-
- //static DistanceComparator comparator;
-};
-
-class IdealInfoProvider {
-public:
- virtual ~IdealInfoProvider() {}
-
- virtual std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const = 0;
-};
-
-class BasicIdealInfoProvider : public IdealInfoProvider {
- const shared_ptr<PairedInfoLibrary> lib_;
-public:
- BasicIdealInfoProvider(const shared_ptr<PairedInfoLibrary>& lib) : lib_(lib) {
- }
-
- std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
- std::vector<EdgeWithPairedInfo> covered;
- for (int i = (int) path.Size() - 1; i >= 0; --i) {
- double w = lib_->IdealPairedInfo(path[i], candidate,
- (int) path.LengthAt(i));
- //FIXME think if we need extremely low ideal weights
- if (math::gr(w, 0.)) {
- covered.push_back(EdgeWithPairedInfo(i, w));
- }
- }
- return covered;
- }
-};
-
-class WeightCounter {
-
-protected:
- const Graph& g_;
- const shared_ptr<PairedInfoLibrary> lib_;
- bool normalize_weight_;
- shared_ptr<IdealInfoProvider> ideal_provider_;
-
-public:
-
- WeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- bool normalize_weight = true,
- shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
- g_(g), lib_(lib), normalize_weight_(normalize_weight), ideal_provider_(ideal_provider) {
- if (!ideal_provider_) {
- ideal_provider_ = make_shared<BasicIdealInfoProvider>(lib);
- }
- }
-
- virtual std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const = 0;
-
- virtual double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges = std::set<size_t>(), int gapLength = 0) const = 0;
-
- const PairedInfoLibrary& lib() const {
- return *lib_;
- }
-
- const shared_ptr<PairedInfoLibrary> get_libptr() const {
- return lib_;
- };
-
-private:
- DECL_LOGGER("WeightCounter");
-};
-
-class ReadCountWeightCounter: public WeightCounter {
-
- std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
- int add_gap = 0) const {
- std::vector<EdgeWithPairedInfo> answer;
-
- for (const EdgeWithPairedInfo& e_w_pi : ideal_provider_->FindCoveredEdges(path, e)) {
- double w = lib_->CountPairedInfo(path[e_w_pi.e_], e,
- (int) path.LengthAt(e_w_pi.e_) + add_gap);
-
- if (normalize_weight_) {
- w /= e_w_pi.pi_;
- }
- answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, w));
- }
-
- return answer;
- }
-
-public:
-
- ReadCountWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- bool normalize_weight = true,
- shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
- WeightCounter(g, lib, normalize_weight, ideal_provider) {
- }
-
- double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges, int gap) const override {
- double weight = 0.0;
-
- for (const auto& e_w_pi : CountLib(path, e, gap)) {
- if (!excluded_edges.count(e_w_pi.e_)) {
- weight += e_w_pi.pi_;
- }
- }
-
- return weight;
- }
-
- std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const override {
- std::set<size_t> answer;
- for (const auto& e_w_pi : CountLib(path, e, gap)) {
- if (math::gr(e_w_pi.pi_, 0.)) {
- answer.insert(e_w_pi.e_);
- }
- }
-
- return answer;
- }
-
-};
-
-class PathCoverWeightCounter: public WeightCounter {
- double single_threshold_;
-
- double TotalIdealNonExcluded(const std::vector<EdgeWithPairedInfo>& ideally_covered_edges,
- const std::set<size_t>& excluded_edges) const {
- double ideal_total = 0.0;
-
- for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
- if (!excluded_edges.count(e_w_pi.e_))
- ideal_total += e_w_pi.pi_;
- }
-
- return ideal_total;
- }
-
- std::vector<EdgeWithPairedInfo> CountLib(const BidirectionalPath& path, EdgeId e,
- const std::vector<EdgeWithPairedInfo>& ideally_covered_edges, int add_gap = 0) const {
- std::vector<EdgeWithPairedInfo> answer;
-
- for (const EdgeWithPairedInfo& e_w_pi : ideally_covered_edges) {
- double ideal_weight = e_w_pi.pi_;
-
- double weight = lib_->CountPairedInfo(
- path[e_w_pi.e_], e,
- (int) path.LengthAt(e_w_pi.e_) + add_gap);
-
- if (normalize_weight_) {
- weight /= ideal_weight;
- }
-
- if (math::ge(weight, single_threshold_)) {
- answer.push_back(EdgeWithPairedInfo(e_w_pi.e_, ideal_weight));
- }
- }
-
- return answer;
- }
-
-public:
-
- PathCoverWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- bool normalize_weight = true,
- double single_threshold = -1.,
- shared_ptr<IdealInfoProvider> ideal_provider = nullptr) :
- WeightCounter(g, lib, normalize_weight, ideal_provider), single_threshold_(single_threshold) {
- if (math::ls(single_threshold_, 0.)) {
- single_threshold_ = lib_->GetSingleThreshold();
- }
- }
-
- double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges, int gap) const override {
- double lib_weight = 0.;
- const auto ideal_coverage = ideal_provider_->FindCoveredEdges(path, e);
-
- for (const auto& e_w_pi : CountLib(path, e, ideal_coverage, gap)) {
- if (!excluded_edges.count(e_w_pi.e_)) {
- lib_weight += e_w_pi.pi_;
- }
- }
-
- double total_ideal_coverage = TotalIdealNonExcluded(ideal_coverage, excluded_edges);
- return math::eq(total_ideal_coverage, 0.) ? 0. : lib_weight / total_ideal_coverage;
- }
-
- std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const override {
- std::set<size_t> answer;
- for (const auto& e_w_pi : CountLib(path, e, ideal_provider_->FindCoveredEdges(path, e), gap)) {
- if (math::gr(e_w_pi.pi_, 0.)) {
- answer.insert(e_w_pi.e_);
- }
- }
- return answer;
- }
-};
-
-class CoverageAwareIdealInfoProvider : public BasicIdealInfoProvider {
- static constexpr double MAGIC_COEFF = 2.;
- const Graph& g_;
- size_t read_length_;
- size_t estimation_edge_length_;
-
-public:
- // works for a single library only
- double EstimatePathCoverage(const BidirectionalPath& path) const {
- double answer = -1.0;
- for (int i = (int) path.Size() - 1; i >= 0; --i) {
- EdgeId e = path.At(i);
- if (g_.length(e) > estimation_edge_length_) {
- if (answer < 0 || g_.coverage(e) < answer) {
- answer = g_.coverage(e);
- }
- }
- }
- return answer;
- }
-
- CoverageAwareIdealInfoProvider(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- size_t read_length, size_t estimation_edge_length) :
- BasicIdealInfoProvider(lib), g_(g), read_length_(read_length),
- estimation_edge_length_(estimation_edge_length) {
- VERIFY(read_length_ > g_.k());
- }
-
- std::vector<EdgeWithPairedInfo> FindCoveredEdges(const BidirectionalPath& path, EdgeId candidate) const override {
- VERIFY(read_length_ != -1ul);
- double estimated_coverage = EstimatePathCoverage(path);
- VERIFY(math::gr(estimated_coverage, 0.));
-
- double correction_coeff = estimated_coverage / ((double(read_length_) - double(g_.k())) * MAGIC_COEFF);
-
- std::vector<EdgeWithPairedInfo> answer = BasicIdealInfoProvider::FindCoveredEdges(path, candidate);
- for (auto& e_w_pi : answer) {
- e_w_pi.pi_ *= correction_coeff;
- }
- return answer;
- }
-};
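CoverageAwareIdealInfoProvider above rescales the ideal pair info by estimated_coverage / ((read_length - k) * MAGIC_COEFF), so the expectation tracks the coverage actually observed on the long edges of the path. A toy computation of that factor (the read length, k and coverage values below are invented for illustration):

#include <cstdio>

// Reproduces the arithmetic of correction_coeff from FindCoveredEdges above.
int main() {
    double read_length = 100.0, k = 55.0, magic_coeff = 2.0;   // example values only
    double estimated_coverage = 30.0;
    double correction = estimated_coverage / ((read_length - k) * magic_coeff);
    std::printf("correction = %.3f\n", correction);            // prints 0.333
    return 0;
}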
-
-//FIXME optimize number of calls of EstimatePathCoverage(path)
-class MetagenomicWeightCounter: public WeightCounter {
- static const size_t LENGTH_BOUND = 500;
- shared_ptr<CoverageAwareIdealInfoProvider> cov_info_provider_;
- shared_ptr<WeightCounter> normalizing_wc_;
- shared_ptr<WeightCounter> raw_wc_;
-
-public:
-
- // a negative raw_threshold leads to a halt if the path contains no sufficiently long edges
- MetagenomicWeightCounter(const Graph& g, const shared_ptr<PairedInfoLibrary>& lib,
- size_t read_length, double normalized_threshold, double raw_threshold,
- size_t estimation_edge_length = LENGTH_BOUND) :
- WeightCounter(g, lib) {
- cov_info_provider_ = make_shared<CoverageAwareIdealInfoProvider>(g, lib, read_length, estimation_edge_length);
- normalizing_wc_ = make_shared<PathCoverWeightCounter>(g, lib, true, normalized_threshold, cov_info_provider_);
- if (math::ge(raw_threshold, 0.)) {
- raw_wc_ = make_shared<PathCoverWeightCounter>(g, lib, false, raw_threshold);
- }
- }
-
- double CountWeight(const BidirectionalPath& path, EdgeId e,
- const std::set<size_t>& excluded_edges, int gap = 0) const override {
- if (math::gr(cov_info_provider_->EstimatePathCoverage(path), 0.)) {
- return normalizing_wc_->CountWeight(path, e, excluded_edges, gap);
- } else if (raw_wc_) {
- return raw_wc_->CountWeight(path, e, excluded_edges, gap);
- } else {
- return 0.;
- }
- }
-
- std::set<size_t> PairInfoExist(const BidirectionalPath& path, EdgeId e,
- int gap = 0) const override {
- static std::set<size_t> empty;
- if (math::gr(cov_info_provider_->EstimatePathCoverage(path), 0.)) {
- return normalizing_wc_->PairInfoExist(path, e, gap);
- } else if (raw_wc_) {
- return raw_wc_->PairInfoExist(path, e, gap);
- } else {
- return empty;
- }
- }
-};
-
-class PathsWeightCounter {
-public:
- PathsWeightCounter(const Graph& g, shared_ptr<PairedInfoLibrary> lib, size_t min_read_count);
- PathsWeightCounter(const PathsWeightCounter& w);
- map<size_t, double> FindPairInfoFromPath(
- const BidirectionalPath& path1, size_t from1, size_t to1,
- const BidirectionalPath& path2, size_t from2, size_t to2) const;
- double CountPairInfo(const BidirectionalPath& path1, size_t from1,
- size_t to1, const BidirectionalPath& path2,
- size_t from2, size_t to2, bool normalize = true) const;
- double CountPairInfo(const BidirectionalPath& path1, size_t from1,
- size_t to1, EdgeId edge, size_t gap) const;
- void SetCommonWeightFrom(size_t iedge, double weight);
- void ClearCommonWeight();
- void FindJumpCandidates(EdgeId e, int min_dist, int max_dist, size_t min_len, set<EdgeId>& result) const;
- void FindJumpEdges(EdgeId e, set<EdgeId>& candidates, int min_dist, int max_dist, vector<EdgeWithDistance>& result) const;
- const shared_ptr<PairedInfoLibrary> GetLib() const {
- return lib_;
- }
- bool HasPI(EdgeId e1, EdgeId e2, int dist) const;
- bool HasPI(EdgeId e1, EdgeId e2, size_t dist_min, size_t dist_max) const;
- double PI(EdgeId e1, EdgeId e2, int dist) const;
- bool HasIdealPI(EdgeId e1, EdgeId e2, int dist) const;
- double IdealPI(EdgeId e1, EdgeId e2, int dist) const;
-
-private:
- void FindPairInfo(const BidirectionalPath& path1, size_t from1, size_t to1,
- const BidirectionalPath& path2, size_t from2, size_t to2,
- map<size_t, double>& pi, double& ideal_pi) const;
- void FindPairInfo(EdgeId e1, EdgeId e2, size_t dist, double& ideal_w,
- double& result_w) const;
-
- const Graph& g_;
- shared_ptr<PairedInfoLibrary> lib_;
- std::map<size_t, double> common_w_;
- size_t min_read_count_;
- DECL_LOGGER("WeightCounter");
-};
-
-inline PathsWeightCounter::PathsWeightCounter(const Graph& g, shared_ptr<PairedInfoLibrary> lib, size_t min_read_count) : g_(g), lib_(lib), min_read_count_(min_read_count) {
-
-}
-
-inline PathsWeightCounter::PathsWeightCounter(const PathsWeightCounter& w): g_(w.g_), lib_(w.lib_), min_read_count_(w.min_read_count_) {
-
-}
-
-inline double PathsWeightCounter::CountPairInfo(const BidirectionalPath& path1,
- size_t from1, size_t to1,
- const BidirectionalPath& path2,
- size_t from2, size_t to2, bool normalize) const {
- map<size_t, double> pi;
- double ideal_pi = 0.0;
- FindPairInfo(path1, from1, to1, path2, from2, to2,
- pi, ideal_pi);
- double result = 0.0;
- double all_common = 0.0;
- for (size_t i = from1; i < to1; ++i) {
- if (common_w_.find(i) != common_w_.end()) {
- all_common += common_w_.at(i);
- }
- result += pi[i];
- }
- DEBUG("ideal_pi " << ideal_pi << " common " << all_common << " result " << result);
- ideal_pi -= all_common;
- result -= all_common;
- double total_result = math::gr(ideal_pi, 0.0) ? result / ideal_pi : 0.0;
- total_result = math::gr(total_result, 0.0) ? total_result : 0.0;
- DEBUG("ideal_pi " << ideal_pi << " result " << result << " total_result " << total_result);
- return normalize ? total_result : result;
-}
-
-inline double PathsWeightCounter::CountPairInfo(const BidirectionalPath& path1,
- size_t from1, size_t to1, EdgeId edge,
- size_t gap) const {
- double result = 0.0;
- for (size_t i1 = from1; i1 < to1; ++i1) {
- double ideal_w, w;
- FindPairInfo(path1.At(i1), edge, gap + path1.LengthAt(i1), ideal_w, w);
- result += w;
- }
- return result;
-}
-
-inline void PathsWeightCounter::FindPairInfo(const BidirectionalPath& path1,
- size_t from1, size_t to1,
- const BidirectionalPath& path2,
- size_t from2, size_t to2,
- map<size_t, double>& pi,
- double& ideal_pi) const {
- stringstream str;
- for (size_t i = 0; i < path2.Size(); ++i) {
- str << g_.int_id(path2.At(i)) << " ";
- }
- DEBUG("pair info for path " << str.str());
- for (size_t i1 = from1; i1 < to1; ++i1) {
- for (size_t i2 = from2; i2 < to2; ++i2) {
- size_t dist = path1.LengthAt(i1) + path2.Length()
- - path2.LengthAt(i2);
- double ideal_w = 0.0;
- double w = 0.0;
- FindPairInfo(path1.At(i1), path2.At(i2), dist, ideal_w, w);
- ideal_pi += ideal_w;
- if (pi.find(i1) == pi.end()) {
- pi[i1] = 0;
- }
- pi[i1] += w;
- }
- }
-}
-
-inline void PathsWeightCounter::FindPairInfo(EdgeId e1, EdgeId e2, size_t dist,
- double& ideal_w, double& result_w) const {
- ideal_w = lib_->IdealPairedInfo(e1, e2, (int) dist, true);
- result_w = 0.0;
- if (ideal_w == 0.0) {
- return;
- }
- if (HasPI(e1, e2, (int) dist)) {
- result_w = ideal_w;
- }
-}
-
-inline map<size_t, double> PathsWeightCounter::FindPairInfoFromPath(
- const BidirectionalPath& path1, size_t from1, size_t to1,
- const BidirectionalPath& path2, size_t from2, size_t to2) const {
- map<size_t, double> pi;
- double ideal_pi = 0;
- FindPairInfo(path1, from1, to1, path2, from2, to2, pi, ideal_pi);
- return pi;
-}
-
-inline void PathsWeightCounter::FindJumpCandidates(EdgeId e, int min_dist, int max_dist, size_t min_len, set<EdgeId>& result) const {
- result.clear();
- lib_->FindJumpEdges(e, result, min_dist, max_dist, min_len);
-}
-
-inline void PathsWeightCounter::FindJumpEdges(EdgeId e, set<EdgeId>& edges, int min_dist, int max_dist, vector<EdgeWithDistance>& result) const {
- result.clear();
-
- for (auto e2 = edges.begin(); e2 != edges.end(); ++e2) {
- vector<int> distances;
- vector<double> weights;
- lib_->CountDistances(e, *e2, distances, weights);
- int median_distance = median(distances, weights, min_dist, max_dist);
-
- if (HasPI(e, *e2, median_distance)) {
- result.push_back(EdgeWithDistance(*e2, median_distance));
- }
- }
-}
-
-inline void PathsWeightCounter::SetCommonWeightFrom(size_t iedge, double weight) {
- common_w_[iedge] = weight;
-}
-
-inline void PathsWeightCounter::ClearCommonWeight() {
- common_w_.clear();
-}
-
-inline double PathsWeightCounter::PI(EdgeId e1, EdgeId e2, int dist) const {
- double w = lib_->CountPairedInfo(e1, e2, dist, true);
- return w > (double) min_read_count_ ? w : 0.0;
-}
-
-inline bool PathsWeightCounter::HasPI(EdgeId e1, EdgeId e2, int dist) const {
- return lib_->CountPairedInfo(e1, e2, dist, true) > (double) min_read_count_;
-}
-
-inline bool PathsWeightCounter::HasIdealPI(EdgeId e1, EdgeId e2, int dist) const {
- return lib_->IdealPairedInfo(e1, e2, dist, true) > 0.0;
-}
-
-inline double PathsWeightCounter::IdealPI(EdgeId e1, EdgeId e2, int dist) const {
- return lib_->IdealPairedInfo(e1, e2, dist, true);
-}
-
-inline bool PathsWeightCounter::HasPI(EdgeId e1, EdgeId e2, size_t dist_min, size_t dist_max) const {
- return lib_->CountPairedInfo(e1, e2, (int) dist_min, (int) dist_max) > min_read_count_;
-}
-};
-
-#endif /* WEIGHT_COUNTER_HPP_ */
diff --git a/src/modules/algorithms/simplification/bulge_remover.hpp b/src/modules/algorithms/simplification/bulge_remover.hpp
deleted file mode 100644
index 1ab3de6..0000000
--- a/src/modules/algorithms/simplification/bulge_remover.hpp
+++ /dev/null
@@ -1,783 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * bulge_remover.hpp
- *
- * Created on: Apr 13, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include "assembly_graph/graph_support/parallel_processing.hpp"
-#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-#include "assembly_graph/paths/path_processor.hpp"
-#include "assembly_graph/graph_support/comparators.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-#include "data_structures/sequence/sequence_tools.hpp"
-#include "dev_support/standard_base.hpp"
-#include <cmath>
-#include <stack>
-#include "math/xmath.h"
-
-namespace omnigraph {
-
-template<class Graph>
-struct SimplePathCondition {
- typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
-
- SimplePathCondition(const Graph& g) :
- g_(g) {
-
- }
-
- bool operator()(EdgeId edge, const vector<EdgeId>& path) const {
- if (edge == g_.conjugate(edge))
- return false;
- for (size_t i = 0; i < path.size(); ++i)
- if (edge == path[i] || edge == g_.conjugate(path[i]))
- return false;
- for (size_t i = 0; i < path.size(); ++i) {
- if (path[i] == g_.conjugate(path[i])) {
- return false;
- }
- for (size_t j = i + 1; j < path.size(); ++j)
- if (path[i] == path[j] || path[i] == g_.conjugate(path[j]))
- return false;
- }
- return true;
- }
-};
-
-template<class Graph>
-bool TrivialCondition(typename Graph::EdgeId,
- const vector<typename Graph::EdgeId>& path) {
- for (size_t i = 0; i < path.size(); ++i)
- for (size_t j = i + 1; j < path.size(); ++j)
- if (path[i] == path[j])
- return false;
- return true;
-}
-
-template<class Graph>
-class MostCoveredSimpleAlternativePathChooser: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- EdgeId forbidden_edge_;
-
- double max_coverage_;
- vector<EdgeId> most_covered_path_;
-
-public:
-
- MostCoveredSimpleAlternativePathChooser(const Graph& g, EdgeId edge) :
- g_(g), forbidden_edge_(edge), max_coverage_(-1.0) {
-
- }
-
- void HandleReversedPath(const vector<EdgeId>& reversed_path) override {
- vector<EdgeId> path = this->ReversePath(reversed_path);
- double path_cov = AvgCoverage(g_, path);
- for (size_t i = 0; i < path.size(); i++) {
- if (path[i] == forbidden_edge_)
- return;
- }
- if (path_cov > max_coverage_ && SimplePathCondition<Graph>(g_)(forbidden_edge_, path)) {
- max_coverage_ = path_cov;
- most_covered_path_ = path;
- }
- }
-
- double max_coverage() {
- return max_coverage_;
- }
-
- const vector<EdgeId>& most_covered_path() {
- return most_covered_path_;
- }
-};
-
-inline size_t CountMaxDifference(size_t absolute_diff, size_t length, double relative_diff) {
- return std::max((size_t) std::floor(relative_diff * (double) length), absolute_diff);
-}
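CountMaxDifference above lets the allowed length difference between a bulge edge and its alternative path grow with the edge length while never dropping below the absolute bound. A toy check with made-up parameter values:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

// Same arithmetic as CountMaxDifference: max(floor(relative_diff * length), absolute_diff).
static std::size_t MaxDifference(std::size_t absolute_diff, std::size_t length, double relative_diff) {
    return std::max((std::size_t) std::floor(relative_diff * (double) length), absolute_diff);
}

int main() {
    std::printf("%zu\n", MaxDifference(3, 1000, 0.01));  // 1% of 1000 dominates: prints 10
    std::printf("%zu\n", MaxDifference(3, 100, 0.01));   // absolute bound dominates: prints 3
    return 0;
}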
-
-template<class Graph>
-class BulgeGluer {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
- Graph& g_;
- BulgeCallbackF opt_callback_;
- std::function<void(EdgeId)> removal_handler_;
-
- void InnerProcessBulge(EdgeId edge, const vector<EdgeId>& path) {
-
- EnsureEndsPositionAligner aligner(CumulativeLength(g_, path),
- g_.length(edge));
- double prefix_length = 0.;
- vector<size_t> bulge_prefix_lengths;
-
- for (EdgeId e : path) {
- prefix_length += (double) g_.length(e);
- bulge_prefix_lengths.push_back(aligner.GetPosition((size_t) prefix_length));
- }
-
- EdgeId edge_to_split = edge;
- size_t prev_length = 0;
-
- TRACE("Process bulge " << path.size() << " edges");
-
- //fixme remove after checking results
- bool flag = false;
- VERIFY(bulge_prefix_lengths.back() == g_.length(edge));
-
- for (size_t i = 0; i < path.size(); ++i) {
- if (bulge_prefix_lengths[i] > prev_length) {
- if (bulge_prefix_lengths[i] - prev_length
- != g_.length(edge_to_split)) {
-
- TRACE("SplitEdge " << g_.str(edge_to_split));
- TRACE(
- "Start: " << g_.str(g_.EdgeStart(edge_to_split)));
- TRACE(
- "Start: " << g_.str(g_.EdgeEnd(edge_to_split)));
-
- pair<EdgeId, EdgeId> split_result = g_.SplitEdge(
- edge_to_split,
- bulge_prefix_lengths[i] - prev_length);
-
- edge_to_split = split_result.second;
-
- TRACE("GlueEdges " << g_.str(split_result.first));
- flag = true;
- g_.GlueEdges(split_result.first, path[i]);
-
- } else {
- TRACE("GlueEdges " << g_.str(edge_to_split));
- flag = true;
- g_.GlueEdges(edge_to_split, path[i]);
- }
- }
- prev_length = bulge_prefix_lengths[i];
- }
- VERIFY(flag);
- }
-
-public:
-
- BulgeGluer(Graph& g, BulgeCallbackF opt_callback = 0,
- std::function<void(EdgeId)> removal_handler = 0) :
- g_(g),
- opt_callback_(opt_callback),
- removal_handler_(removal_handler) {
-
- }
-
- void operator()(EdgeId edge, const vector<EdgeId>& path) {
- if (opt_callback_)
- opt_callback_(edge, path);
-
- if (removal_handler_)
- removal_handler_(edge);
-
- VertexId start = g_.EdgeStart(edge);
- VertexId end = g_.EdgeEnd(edge);
-
- TRACE("Projecting edge " << g_.str(edge));
- InnerProcessBulge(edge, path);
-
- TRACE("Compressing start vertex " << g_.str(start));
- g_.CompressVertex(start);
-
- TRACE("Compressing end vertex " << g_.str(end));
- g_.CompressVertex(end);
- }
-
-};
-
-template<class Graph>
-class AlternativesAnalyzer {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& g_;
- double max_coverage_;
- size_t max_length_;
- double max_relative_coverage_;
- size_t max_delta_;
- double max_relative_delta_;
- size_t max_edge_cnt_;
-
- static vector<EdgeId> EmptyPath() {
- static vector<EdgeId> vec = {};
- return vec;
- }
-
- /**
- * Checks that the alternative path is simple (it contains neither edge e, conjugate(e), nor any
- * repeated or mutually conjugate edges) and that its average coverage times max_relative_coverage_
- * is at least g.coverage(e).
- */
- bool BulgeCondition(EdgeId e, const vector<EdgeId>& path,
- double path_coverage) const {
- return math::ge(path_coverage * max_relative_coverage_,
- g_.coverage(e)) && SimplePathCondition<Graph>(g_)(e, path);
- }
-
-public:
- AlternativesAnalyzer(const Graph& g, double max_coverage, size_t max_length,
- double max_relative_coverage, size_t max_delta,
- double max_relative_delta, size_t max_edge_cnt) :
- g_(g),
- max_coverage_(max_coverage),
- max_length_(max_length),
- max_relative_coverage_(max_relative_coverage),
- max_delta_(max_delta),
- max_relative_delta_(max_relative_delta),
- max_edge_cnt_(max_edge_cnt) {
- DEBUG("Created alternatives analyzer max_length=" << max_length
- << " max_coverage=" << max_coverage
- << " max_relative_coverage=" << max_relative_coverage
- << " max_delta=" << max_delta
- << " max_relative_delta=" << max_relative_delta);
- }
-
- vector<EdgeId> operator()(EdgeId e) const {
- if (g_.length(e) > max_length_ || math::gr(g_.coverage(e), max_coverage_)) {
- return EmptyPath();
- }
-
- size_t kplus_one_mer_coverage = (size_t) math::round((double) g_.length(e) * g_.coverage(e));
- TRACE("Processing edge " << g_.str(e) << " and coverage " << kplus_one_mer_coverage);
-
- size_t delta = CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
-
- MostCoveredSimpleAlternativePathChooser<Graph> path_chooser(g_, e);
-
- VertexId start = g_.EdgeStart(e);
- TRACE("Start " << g_.str(start));
- VertexId end = g_.EdgeEnd(e);
- TRACE("End " << g_.str(end));
-
- ProcessPaths(g_, (g_.length(e) > delta) ? g_.length(e) - delta : 0,
- g_.length(e) + delta, start, end, path_chooser, max_edge_cnt_);
-
- const vector<EdgeId>& path = path_chooser.most_covered_path();
- if (!path.empty()) {
- VERIFY(g_.EdgeStart(path[0]) == start);
- VERIFY(g_.EdgeEnd(path.back()) == end);
- }
-
- double path_coverage = path_chooser.max_coverage();
- if (math::gr(path_coverage, 0.)) {
- TRACE("Best path with coverage " << path_coverage << " is " << PrintPath(g_, path));
-
- if (BulgeCondition(e, path, path_coverage)) {
- TRACE("Satisfied condition");
- return path;
- } else {
- TRACE("Didn't satisfy condition");
- return EmptyPath();
- }
- } else {
- TRACE("Didn't find alternative");
- return EmptyPath();
- }
- }
-
- double max_coverage() const {
- return max_coverage_;
- }
-
- size_t max_length() const {
- return max_length_;
- }
-
-private:
- DECL_LOGGER("AlternativesAnalyzer");
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId>
-NecessaryBulgeCondition(const Graph& g, size_t max_length, double max_coverage) {
- return AddAlternativesPresenceCondition(g,
- pred::And(LengthUpperBound<Graph>(g, max_length),
- CoverageUpperBound<Graph>(g, max_coverage)));
-}
-
-/**
- * This class removes simple bulges from the given graph using the following algorithm: it iterates
- * over the edges of the graph and checks whether each edge is likely to be a simple bulge;
- * if the edge is judged to be one, it is removed by gluing it onto the most covered alternative path.
- */
-//template<class Graph>
-//class OldBulgeRemover: public EdgeProcessingAlgorithm<Graph> {
-// typedef EdgeProcessingAlgorithm<Graph> base;
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-//
-//protected:
-//
-// /*virtual*/
-// bool ProcessEdge(EdgeId e) {
-// TRACE("Considering edge " << this->g().str(e)
-// << " of length " << this->g().length(e)
-// << " and avg coverage " << this->g().coverage(e));
-//
-// if (!HasAlternatives(this->g(), e)) {
-// TRACE("Not possible bulge edge");
-// return false;
-// }
-//
-// for (const auto& analyzer : alternatives_analyzers_) {
-// vector<EdgeId> alternative = analyzer(e);
-// if (!alternative.empty()) {
-// gluer_(e, alternative);
-// return true;
-// }
-// }
-// return false;
-// }
-//
-//public:
-//
-// typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
-//
-//// BulgeRemover(Graph& g, double max_coverage, size_t max_length,
-//// double max_relative_coverage, size_t max_delta,
-//// double max_relative_delta,
-//// size_t max_edge_cnt,
-//// BulgeCallbackF opt_callback = 0,
-//// std::function<void(EdgeId)> removal_handler = 0) :
-//// base(g, true),
-//// gluer_(g, opt_callback, removal_handler) {
-//// DEBUG("Launching br max_length=" << max_length
-//// << " max_coverage=" << max_coverage
-//// << " max_relative_coverage=" << max_relative_coverage
-//// << " max_delta=" << max_delta
-//// << " max_relative_delta=" << max_relative_delta
-//// << " max_number_edges=" << max_edge_cnt);
-//// alternatives_analyzers_.push_back(
-//// AlternativesAnalyzer<Graph>(g, max_coverage,
-//// max_length, max_relative_coverage,
-//// max_delta, max_relative_delta, max_edge_cnt));
-//// }
-//
-// OldBulgeRemover(Graph& g,
-// const std::vector<AlternativesAnalyzer<Graph>>& alternatives_analyzers,
-// BulgeCallbackF opt_callback = 0,
-// std::function<void(EdgeId)> removal_handler = 0) :
-// base(g, true),
-// alternatives_analyzers_(alternatives_analyzers),
-// gluer_(g, opt_callback, removal_handler) {
-// }
-//
-//private:
-// std::vector<AlternativesAnalyzer<Graph>> alternatives_analyzers_;
-// BulgeGluer<Graph> gluer_;
-//private:
-// DECL_LOGGER("BulgeRemover")
-//};
-
-template<class Graph>
-inline double AbsoluteMaxCoverage(const std::vector<AlternativesAnalyzer<Graph>>& alternatives_analyzers) {
- double ans = -1.;
- for (const auto& analyzer : alternatives_analyzers) {
- ans = std::max(ans, analyzer.max_coverage());
- }
- return ans;
-}
-
-//fixme maybe switch on parallel finder?
-template<class Graph, class InterestingElementFinder>
-class BulgeRemover: public PersistentProcessingAlgorithm<Graph,
- typename Graph::EdgeId,
- InterestingElementFinder,
- CoverageComparator<Graph>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef PersistentProcessingAlgorithm<Graph, EdgeId,
- InterestingElementFinder, CoverageComparator<Graph>> base;
-
-protected:
-
- /*virtual*/
- bool Process(EdgeId e) {
- TRACE("Considering edge " << this->g().str(e)
- << " of length " << this->g().length(e)
- << " and avg coverage " << this->g().coverage(e));
-
- if (!HasAlternatives(this->g(), e)) {
- TRACE("Not possible bulge edge");
- return false;
- }
-
- vector<EdgeId> alternative = alternatives_analyzer_(e);
- if (!alternative.empty()) {
- gluer_(e, alternative);
- return true;
- }
- return false;
- }
-
-public:
-
- typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
-
-// BulgeRemover(Graph& g, double max_coverage, size_t max_length,
-// double max_relative_coverage, size_t max_delta,
-// double max_relative_delta,
-// size_t max_edge_cnt,
-// BulgeCallbackF opt_callback = 0,
-// std::function<void(EdgeId)> removal_handler = 0) :
-// base(g, true),
-// gluer_(g, opt_callback, removal_handler) {
-// DEBUG("Launching br max_length=" << max_length
-// << " max_coverage=" << max_coverage
-// << " max_relative_coverage=" << max_relative_coverage
-// << " max_delta=" << max_delta
-// << " max_relative_delta=" << max_relative_delta
-// << " max_number_edges=" << max_edge_cnt);
-// alternatives_analyzers_.push_back(
-// AlternativesAnalyzer<Graph>(g, max_coverage,
-// max_length, max_relative_coverage,
-// max_delta, max_relative_delta, max_edge_cnt));
-// }
-
- BulgeRemover(Graph& g, const InterestingElementFinder& interesting_finder,
- const AlternativesAnalyzer<Graph>& alternatives_analyzer,
- BulgeCallbackF opt_callback = 0,
- std::function<void(EdgeId)> removal_handler = 0,
- bool track_changes = true) :
- base(g,
- interesting_finder,
- /*canonical_only*/true,
- CoverageComparator<Graph>(g),
- track_changes),
- alternatives_analyzer_(alternatives_analyzer),
- gluer_(g, opt_callback, removal_handler) {
- }
-
-private:
- AlternativesAnalyzer<Graph> alternatives_analyzer_;
- BulgeGluer<Graph> gluer_;
-private:
- DECL_LOGGER("BulgeRemover")
-};
-
-template<class Graph, class InterestingElementFinder>
-class ParallelBulgeRemover : public PersistentAlgorithmBase<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef SmartSetIterator<Graph, EdgeId, CoverageComparator<Graph>> SmartEdgeSet;
-
- size_t buff_size_;
- double buff_cov_diff_;
- double buff_cov_rel_diff_;
- AlternativesAnalyzer<Graph> alternatives_analyzer_;
- BulgeGluer<Graph> gluer_;
- InterestingElementFinder interesting_edge_finder_;
- //todo remove
- bool tracking_;
-
- size_t curr_iteration_;
-
- SmartEdgeSet it_;
-
- static vector<EdgeId> EmptyPath() {
- static vector<EdgeId> vec = {};
- return vec;
- }
-
- struct BulgeInfo : private boost::noncopyable {
- size_t id;
- EdgeId e;
- std::vector<EdgeId> alternative;
-
- BulgeInfo() :
- id(-1ul) {
- }
-
- BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId> alternative_) :
- id(id_), e(e_), alternative(std::move(alternative_)) {
-
- }
-
- BulgeInfo(BulgeInfo&& that) {
- *this = std::move(that);
- }
-
- BulgeInfo& operator= (BulgeInfo&& that) {
- id = that.id;
- e = that.e;
- alternative = std::move(that.alternative);
- return *this;
- }
-
-// BulgeInfo(size_t id_, EdgeId e_, std::vector<EdgeId>&& alternative_) :
-// id(id_), e(e_), alternative(std::move(alternative_)) {
-//
-// }
-//
- bool operator< (const BulgeInfo& that) const {
-// VERIFY_MSG(id != that.id, "Ooops " << id);
- return id < that.id;
- }
-
- std::string str(const Graph& g) const {
- std::stringstream ss;
- ss << "BulgeInfo " << id
- << " e: " << g.str(e)
- << " path: " << PrintPath(g, alternative);
- return ss.str();
- }
-
- };
-
- bool CheckInteracting(const BulgeInfo& info, const std::unordered_set<EdgeId>& involved_edges) const {
- if (involved_edges.count(info.e))
- return true;
- for (EdgeId e : info.alternative)
- if (involved_edges.count(e))
- return true;
- return false;
- }
-
- void AccountEdge(EdgeId e, std::unordered_set<EdgeId>& involved_edges) const {
- TRACE("Pushing edge " << this->g().str(e));
- involved_edges.insert(e);
- EdgeId conj = this->g().conjugate(e);
- TRACE("Pushing edge " << this->g().str(conj));
- involved_edges.insert(conj);
- }
-
- void AccountEdges(const BulgeInfo& info, std::unordered_set<EdgeId>& involved_edges) const {
- AccountEdge(info.e, involved_edges);
- for (EdgeId e : info.alternative) {
- AccountEdge(e, involved_edges);
- }
- }
-
- //false if time to stop
- bool FillEdgeBuffer(vector<EdgeId>& buffer, pred::TypedPredicate<EdgeId> proceed_condition) {
- VERIFY(buffer.empty());
- DEBUG("Filling edge buffer of size " << buff_size_);
- perf_counter perf;
- double low_cov = 0.;
- double cov_diff = 0.;
- while (!it_.IsEnd() && buffer.size() < buff_size_) {
- EdgeId e = *it_;
- TRACE("Current edge " << this->g().str(e));
- if (!proceed_condition(e)) {
- TRACE("Stop condition was reached.");
- //need to release last element of the iterator to make it replaceable by new elements
- it_.ReleaseCurrent();
- return false;
- }
-
- double cov = this->g().coverage(e);
- if (buffer.empty()) {
- low_cov = cov;
- cov_diff = max(buff_cov_diff_, buff_cov_rel_diff_ * low_cov);
- } else {
- if (math::gr(cov, low_cov + cov_diff)) {
- //need to release last element of the iterator to make it replaceable by new elements
- it_.ReleaseCurrent();
- return true;
- }
- }
- TRACE("Potential bulge edge");
- buffer.push_back(e);
- ++it_;
- }
-
- DEBUG("Filled in " << perf.time() << " seconds");
- if (buffer.size() == buff_size_) {
- TRACE("Buffer filled");
- return true;
- } else {
- TRACE("No more edges in iterator");
- return false;
- }
- }
-
- std::vector<std::vector<BulgeInfo>> FindBulges(const std::vector<EdgeId> edge_buffer) const {
- DEBUG("Looking for bulges (in parallel). Edge buffer size " << edge_buffer.size());
- perf_counter perf;
- std::vector<std::vector<BulgeInfo>> bulge_buffers(omp_get_max_threads());
- size_t n = edge_buffer.size();
- //order is in agreement with coverage
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < n; ++i) {
- EdgeId e = edge_buffer[i];
- auto alternative = alternatives_analyzer_(e);
- if (!alternative.empty()) {
- bulge_buffers[omp_get_thread_num()].push_back(BulgeInfo(i, e, std::move(alternative)));
- }
- }
- DEBUG("Bulges found in " << perf.time() << " seconds");
- return bulge_buffers;
- }
-
- std::vector<BulgeInfo> MergeBuffers(std::vector<std::vector<BulgeInfo>>&& buffers) const {
- DEBUG("Merging bulge buffers");
- perf_counter perf;
-
- std::vector<BulgeInfo> merged_bulges;
- for (auto& bulge_buffer : buffers) {
- std::copy(std::make_move_iterator(bulge_buffer.begin()),
- std::make_move_iterator(bulge_buffer.end()),
- std::back_inserter(merged_bulges));
- }
-
- DEBUG("Sorting");
- //order is in agreement with coverage
- std::sort(merged_bulges.begin(), merged_bulges.end());
- DEBUG("Total bulges " << merged_bulges.size());
- DEBUG("Buffers merged in " << perf.time() << " seconds");
- return merged_bulges;
- }
-
- SmartEdgeSet RetainIndependentBulges(std::vector<BulgeInfo>& bulges) const {
- DEBUG("Looking for independent bulges");
- size_t total_cnt = bulges.size();
- perf_counter perf;
-
- std::vector<BulgeInfo> filtered;
- filtered.reserve(bulges.size());
- //fixme switch to involved vertices to bring fully parallel glueing closer
- std::unordered_set<EdgeId> involved_edges;
- SmartEdgeSet interacting_edges(this->g(), false, CoverageComparator<Graph>(this->g()));
-
- for (BulgeInfo& info : bulges) {
- TRACE("Analyzing interactions of " << info.str(this->g()));
- if (CheckInteracting(info, involved_edges)) {
- TRACE("Interacting");
- interacting_edges.push(info.e);
- } else {
- TRACE("Independent");
- AccountEdges(info, involved_edges);
- filtered.push_back(std::move(info));
- }
- }
- bulges = std::move(filtered);
-
- DEBUG("Independent bulges identified in " << perf.time() << " seconds");
- DEBUG("Independent cnt " << bulges.size());
- DEBUG("Interacting cnt " << interacting_edges.size());
- VERIFY(bulges.size() + interacting_edges.size() == total_cnt);
-
- return interacting_edges;
- }
-
- bool ProcessBulges(const std::vector<BulgeInfo>& independent_bulges, SmartEdgeSet&& interacting_edges) {
- DEBUG("Processing bulges");
- perf_counter perf;
-
- bool triggered = false;
-
- for (const BulgeInfo& info : independent_bulges) {
- TRACE("Processing bulge " << info.str(this->g()));
- triggered = true;
- gluer_(info.e, info.alternative);
- }
-
- DEBUG("Independent bulges glued in " << perf.time() << " seconds");
- perf.reset();
-
- DEBUG("Processing remaining interacting bulges " << interacting_edges.size());
- //usual br strategy
- for (; !interacting_edges.IsEnd(); ++interacting_edges) {
- EdgeId e = *interacting_edges;
- TRACE("Processing edge " << this->g().str(e));
- std::vector<EdgeId> alternative = alternatives_analyzer_(e);
- if (!alternative.empty()) {
- gluer_(e, alternative);
- triggered = true;
- }
- }
- DEBUG("Interacting edges processed in " << perf.time() << " seconds");
- return triggered;
- }
-
-public:
-
- typedef std::function<void(EdgeId edge, const vector<EdgeId>& path)> BulgeCallbackF;
-
- ParallelBulgeRemover(Graph& g, const InterestingElementFinder& interesting_edge_finder,
- size_t buff_size, double buff_cov_diff,
- double buff_cov_rel_diff, const AlternativesAnalyzer<Graph>& alternatives_analyzer,
- BulgeCallbackF opt_callback = 0,
- std::function<void(EdgeId)> removal_handler = 0,
- bool track_changes = true) :
- PersistentAlgorithmBase<Graph>(g),
- buff_size_(buff_size),
- buff_cov_diff_(buff_cov_diff),
- buff_cov_rel_diff_(buff_cov_rel_diff),
- alternatives_analyzer_(alternatives_analyzer),
- gluer_(g, opt_callback, removal_handler),
- interesting_edge_finder_(interesting_edge_finder),
- tracking_(track_changes),
- curr_iteration_(0),
- it_(g, true, CoverageComparator<Graph>(g), true) {
- VERIFY(buff_size_ > 0);
- it_.Detach();
- }
-
- bool Run(bool force_primary_launch = false) override {
- bool primary_launch = force_primary_launch ? true : curr_iteration_ == 0;
- //todo remove if not needed;
- //potentially can vary coverage threshold in coordination with ec threshold
- auto proceed_condition = pred::AlwaysTrue<EdgeId>();
-
- if (!it_.IsAttached()) {
- it_.Attach();
- }
- if (primary_launch) {
- it_.clear();
- TRACE("Primary launch.");
- TRACE("Start search for interesting edges");
- interesting_edge_finder_.Run(it_);
- TRACE(it_.size() << " interesting edges to process");
- } else {
- VERIFY(tracking_);
- TRACE(it_.size() << " edges to process");
- }
-
- bool triggered = false;
- bool proceed = true;
- while (proceed) {
- std::vector<EdgeId> edge_buffer;
- edge_buffer.reserve(buff_size_);
- proceed = FillEdgeBuffer(edge_buffer, proceed_condition);
-
- std::vector<BulgeInfo> bulges = MergeBuffers(FindBulges(edge_buffer));
-
- auto interacting_edges = RetainIndependentBulges(bulges);
-
- bool inner_triggered = ProcessBulges(bulges, std::move(interacting_edges));
- proceed |= inner_triggered;
- triggered |= inner_triggered;
- }
-
- TRACE("Finished processing. Triggered = " << triggered);
- if (!tracking_)
- it_.Detach();
-
- curr_iteration_++;
-
- return triggered;
- }
-
-private:
- DECL_LOGGER("ParallelBulgeRemover")
-};
-
-}
diff --git a/src/modules/algorithms/simplification/cleaner.hpp b/src/modules/algorithms/simplification/cleaner.hpp
deleted file mode 100644
index 1787e56..0000000
--- a/src/modules/algorithms/simplification/cleaner.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#pragma once
-
-#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-#include "assembly_graph/graph_support/parallel_processing.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-class Cleaner : public PersistentProcessingAlgorithm<Graph,
- typename Graph::VertexId,
- ParallelInterestingElementFinder < Graph, typename Graph::VertexId>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef PersistentProcessingAlgorithm <Graph,
- VertexId, ParallelInterestingElementFinder<Graph, VertexId>> base;
- typedef IsolatedVertexCondition<Graph> ConditionT;
-
- Graph &g_;
- ConditionT isolated_condition_;
-
-public:
- Cleaner(Graph &g, size_t chunk_cnt = 1) :
- base(g,
- ParallelInterestingElementFinder<Graph, VertexId>(g,
- ConditionT(g), chunk_cnt),
- /*canonical only*/true),
- g_(g), isolated_condition_(g) {
- }
-
-protected:
-
- bool Process(VertexId v) {
- if (isolated_condition_.Check(v)) {
- g_.DeleteVertex(v);
- return true;
- } else {
- return false;
- }
- }
-};
-
-}
diff --git a/src/modules/algorithms/simplification/complex_bulge_remover.hpp b/src/modules/algorithms/simplification/complex_bulge_remover.hpp
deleted file mode 100644
index e3a531a..0000000
--- a/src/modules/algorithms/simplification/complex_bulge_remover.hpp
+++ /dev/null
@@ -1,1162 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <cmath>
-#include <stack>
-#include <queue>
-#include "utils/adt/concurrent_dsu.hpp"
-#include "dev_support/standard_base.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-#include "math/xmath.h"
-#include "data_structures/sequence/sequence_tools.hpp"
-#include "assembly_graph/paths/path_processor.hpp"
-#include "visualization/visualization.hpp"
-#include "dominated_set_finder.hpp"
-
-
-namespace omnigraph {
-
-namespace complex_br {
-
-template<class Graph>
-class LocalizedComponent: public GraphActionHandler<Graph> /*: public GraphComponent<Graph>*/{
- typedef GraphActionHandler<Graph> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- VertexId start_vertex_;
- set<VertexId> end_vertices_;
- // note: the stored ranges are inclusive on both ends
- map<VertexId, Range> vertex_depth_;
- multimap<size_t, VertexId> height_2_vertices_;
- size_t diff_threshold_;
-
- bool AllEdgeOut(VertexId v) const {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (contains(g_.EdgeEnd(e)))
- return false;
- }
- return true;
- }
-
- bool AllEdgeIn(VertexId v) const {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (!contains(g_.EdgeEnd(e)))
- return false;
- }
- return true;
- }
-
- size_t Average(Range r) const {
- return r.start_pos;
- }
-
-public:
-
-// template <class It>
- LocalizedComponent(const Graph& g, //It begin, It end,
- VertexId start_vertex/*, const vector<VertexId>& end_vertices*/) :
- base(g, "br_component"), g_(g), start_vertex_(start_vertex) {
- end_vertices_.insert(start_vertex);
- vertex_depth_.insert(make_pair(start_vertex_, Range(0, 0)));
- height_2_vertices_.insert(make_pair(0, start_vertex));
- }
-
- const Graph& g() const {
- return g_;
- }
-
- bool IsEndVertex(VertexId v) const {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (contains(g_.EdgeEnd(e)))
- return false;
- }
- return true;
- }
-
- void AddVertex(VertexId v, Range dist_range) {
-// VERIFY(CheckCloseNeighbour(v));
-// Range r = NeighbourDistanceRange(v);
- DEBUG("Adding vertex " << g_.str(v) << " to the component");
- vertex_depth_.insert(make_pair(v, dist_range));
- height_2_vertices_.insert(make_pair(Average(dist_range), v));
- DEBUG(
- "Range " << dist_range << " Average height " << Average(dist_range));
- for (EdgeId e : g_.IncomingEdges(v)) {
- end_vertices_.erase(g_.EdgeStart(e));
- }
- if (IsEndVertex(v)) {
- end_vertices_.insert(v);
- }
- }
-
- //todo what if path processor will fail inside
- size_t TotalPathCount() const {
- size_t answer = 0;
- for (VertexId end_v : end_vertices_) {
- PathStorageCallback<Graph> path_storage(g_);
- Range r = vertex_depth_.find(end_v)->second;
- ProcessPaths(g_, r.start_pos, r.end_pos, start_vertex_, end_v, path_storage);
- answer += path_storage.size();
- }
- return answer;
- }
-
- bool CheckCompleteness() const {
- for (VertexId v : key_set(vertex_depth_)) {
- if (v == start_vertex_)
- continue;
- if (!AllEdgeIn(v) && !AllEdgeOut(v))
- return false;
- }
- return true;
- }
-
- bool NeedsProjection() const {
- DEBUG("Checking if component needs projection");
- size_t tot_path_count = TotalPathCount();
- bool answer = tot_path_count > end_vertices_.size();
-// more robust to path processor failure this way VERIFY(tot_path_count >= end_vertices_.size());
- if (answer) {
- DEBUG("Needs projection");
- } else {
- DEBUG("Doesn't need projection");
- }
- return answer;
- }
-
- bool contains(VertexId v) const {
- return vertex_depth_.count(v) > 0;
- }
-
- bool contains(EdgeId e) const {
- return contains(g_.EdgeStart(e)) && contains(g_.EdgeEnd(e));
- }
-
- Range distance_range(VertexId v) const {
- VERIFY(contains(v));
- return vertex_depth_.find(v)->second;
- }
-
- size_t avg_distance(VertexId v) const {
- VERIFY(contains(v));
- return Average(vertex_depth_.find(v)->second);
- }
-
- set<size_t> avg_distances() const {
- set<size_t> distances;
- for (VertexId v : key_set(vertex_depth_)) {
- distances.insert(avg_distance(v));
- }
- return distances;
- }
-
- VertexId start_vertex() const {
- return start_vertex_;
- }
-
- const set<VertexId>& end_vertices() const {
- return end_vertices_;
- }
-
- bool CheckCloseNeighbour(VertexId v) const {
- DEBUG("Check if vertex " << g_.str(v) << " can be processed");
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (!contains(g_.EdgeStart(e))) {
- DEBUG(
- "Blocked by unprocessed or external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
- DEBUG("Check fail");
- return false;
- }
- }
- DEBUG("Check ok");
- return true;
- }
-
- GraphComponent<Graph> AsGraphComponent() const {
- set<VertexId> vertices = key_set(vertex_depth_);
- return GraphComponent<Graph>(g_, vertices.begin(), vertices.end());
- }
-
- bool ContainsConjugateVertices() const {
- set<VertexId> conjugate_vertices;
- for (VertexId v : key_set(vertex_depth_)) {
- if (conjugate_vertices.count(v) == 0) {
- conjugate_vertices.insert(g_.conjugate(v));
- } else {
- return true;
- }
- }
- return false;
- }
-
- virtual void HandleDelete(VertexId v) {
- VERIFY(end_vertices_.count(v) == 0);
- if (contains(v)) {
- DEBUG("Deleting vertex " << g_.str(v) << " from the component");
- size_t depth = avg_distance(v);
- vertex_depth_.erase(v);
- for (auto it = height_2_vertices_.lower_bound(depth);
- it != height_2_vertices_.upper_bound(depth); ++it) {
- if (it->second == v) {
- height_2_vertices_.erase(it);
- return;
- }
- }
- VERIFY(false);
- }
-
- }
-
- virtual void HandleDelete(EdgeId /*e*/) {
- //empty for now
- }
-
- virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
- VERIFY(false);
- }
-
- virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) {
- //empty for now
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1, EdgeId /*new_edge_2*/) {
- VERIFY(old_edge != g_.conjugate(old_edge));
- VertexId start = g_.EdgeStart(old_edge);
- VertexId end = g_.EdgeEnd(old_edge);
- if (contains(start)) {
- VERIFY(vertex_depth_.count(end) > 0);
- VERIFY(avg_distance(end) > avg_distance(start));
- VertexId new_vertex = g_.EdgeEnd(new_edge_1);
- Range new_vertex_depth(distance_range(start));
- new_vertex_depth.shift((int) g_.length(new_edge_1));
- //todo do better later (needs to be synched with splitting strategy)
-// + (vertex_depth_[end] - vertex_depth_[start])
-// * g_.length(new_edge_1) / g_.length(old_edge);
- DEBUG(
- "Inserting vertex " << g_.str(new_vertex) << " to component during split");
- vertex_depth_.insert(make_pair(new_vertex, new_vertex_depth));
- height_2_vertices_.insert(
- make_pair(Average(new_vertex_depth), new_vertex));
- }
- }
-
- const multimap<size_t, VertexId>& height_2_vertices() const {
- return height_2_vertices_;
- }
-
- const set<VertexId> vertices_on_height(size_t height) const {
- set<VertexId> answer;
- for (auto it = height_2_vertices_.lower_bound(height);
- it != height_2_vertices_.upper_bound(height); ++it) {
- answer.insert(it->second);
- }
- return answer;
- }
-
-private:
- DECL_LOGGER("LocalizedComponent")
- ;
-};
-
-template<class Graph>
-class SkeletonTree: public GraphActionHandler<Graph> {
- typedef GraphActionHandler<Graph> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
-public:
-
- const set<EdgeId>& edges() const {
- return edges_;
- }
-
- const set<VertexId>& vertices() const {
- return vertices_;
- }
-
- bool Contains(EdgeId e) const {
-// VertexId start = br_comp_.g().EdgeStart(e);
-// if (next_edges_.count(start) > 0) {
-// const vector<EdgeId> edges = next_edges_.find(start)->second;
-// return find(e, next_edges_.lower_bound(start), next_edges_.upper_bound(start)) != edges.end();
-// }
-// return false;
- return edges_.count(e) > 0;
- }
-
- bool Contains(VertexId v) const {
-// return next_edges_.count(v) > 0;
- return vertices_.count(v) > 0;
- }
-
- virtual void HandleDelete(VertexId v) {
- //verify v not in the tree
- VERIFY(!Contains(v));
- }
-
- virtual void HandleDelete(EdgeId e) {
- //verify e not in the tree
- DEBUG("Trying to delete " << br_comp_.g().str(e));
- VERIFY(!Contains(e));
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId /*new_edge*/) {
- //verify false
- for (EdgeId e : old_edges) {
- VERIFY(!Contains(e));
- }
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
-// verify edge2 in tree
-// put new_edge instead of edge2
- DEBUG("Glueing " << br_comp_.g().str(new_edge) << " " << br_comp_.g().str(edge1) << " " << br_comp_.g().str(edge2));
- if (Contains(edge2)) {
- DEBUG("Erasing from tree: " << br_comp_.g().str(edge2));
- DEBUG("Inserting to tree: " << br_comp_.g().str(new_edge));
- edges_.erase(edge2);
- edges_.insert(new_edge);
- }
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
- EdgeId new_edge_2) {
- VERIFY(old_edge != br_comp_.g().conjugate(old_edge));
- if (Contains(old_edge)) {
- edges_.erase(old_edge);
- vertices_.insert(br_comp_.g().EdgeEnd(new_edge_1));
- edges_.insert(new_edge_1);
- edges_.insert(new_edge_2);
- }
- }
-
- SkeletonTree(const LocalizedComponent<Graph>& br_comp,
- const set<EdgeId>& edges) :
- base(br_comp.g(), "br_tree"), br_comp_(br_comp), edges_(edges) {
- DEBUG("Tree edges " << br_comp.g().str(edges));
- for (EdgeId e : edges_) {
- vertices_.insert(br_comp_.g().EdgeStart(e));
- vertices_.insert(br_comp_.g().EdgeEnd(e));
- }
- }
-
-private:
- const LocalizedComponent<Graph>& br_comp_;
- set<EdgeId> edges_;
- set<VertexId> vertices_;
-
-private:
- DECL_LOGGER("SkeletonTree")
- ;
-};
-
-typedef size_t mask;
-typedef mask mixed_color_t;
-typedef unsigned primitive_color_t;
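-//Exit colors are encoded as bit masks: the i-th end vertex of the component gets primitive
-//color 1 << i, and the mixed color of a vertex/edge is the bitwise OR of the primitive
-//colors of all exits reachable from it. Example with 3 exits: a vertex from which exits
-//0 and 2 are reachable has mixed color 0b101 and CountPrimitiveColors returns 2.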
-
-template<class Graph>
-class ComponentColoring: public GraphActionHandler<Graph> {
- typedef GraphActionHandler<Graph> base;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
-public:
-
- size_t CountPrimitiveColors(mixed_color_t color) const {
- size_t cnt = 0;
- for (size_t shift = 0; shift < color_cnt_; ++shift) {
- mixed_color_t prim_color = mixed_color_t(1) << shift;
- if ((prim_color & color) != 0) {
- cnt++;
- }
- }
- VERIFY(cnt > 0);
- return cnt;
- }
-
- primitive_color_t GetAnyPrimitiveColor(mixed_color_t color) const {
- for (size_t shift = 0; shift < color_cnt_; ++shift) {
- if (((mixed_color_t(1) << shift) & color) != 0) {
- return primitive_color_t(shift);
- }
- }
- VERIFY(false);
- return 0;
- }
-
- bool IsSubset(mixed_color_t super_set, mixed_color_t sub_set) const {
- return (super_set | sub_set) == super_set;
- }
-
-private:
-
- const LocalizedComponent<Graph>& comp_;
- const size_t color_cnt_;
- map<VertexId, mixed_color_t> vertex_colors_;
-
- mixed_color_t CountVertexColor(VertexId v) const {
- mixed_color_t answer = mixed_color_t(0);
- for (EdgeId e : comp_.g().OutgoingEdges(v)) {
- answer |= color(e);
- }
- return answer;
- }
-
- void CountAndSetVertexColor(VertexId v) {
- vertex_colors_.insert(make_pair(v, CountVertexColor(v)));
- }
-
- void ColorComponent() {
- DEBUG("Coloring component");
- size_t cnt = 0;
- for (VertexId v : comp_.end_vertices()) {
- mixed_color_t color = mixed_color_t(1) << cnt;
- DEBUG("Coloring exit " << comp_.g().str(v));
- vertex_colors_.insert(make_pair(v, color));
- cnt++;
- }
- for (auto it = comp_.height_2_vertices().rbegin();
- it != comp_.height_2_vertices().rend(); ++it) {
- if (vertex_colors_.count(it->second) == 0) {
- DEBUG("Coloring vertex " << comp_.g().str(it->second));
- CountAndSetVertexColor(it->second);
- }
- }
- DEBUG("Component colored");
- }
-
-public:
-
- ComponentColoring(const LocalizedComponent<Graph>& comp) :
- base(comp.g(), "br_comp_coloring"), comp_(comp), color_cnt_(
- comp_.end_vertices().size()) {
- VERIFY(comp.end_vertices().size() <= sizeof(size_t) * 8);
- ColorComponent();
- }
-
- mixed_color_t color(VertexId v) const {
- auto it = vertex_colors_.find(v);
- if (it == vertex_colors_.end()) {
- DEBUG("No color for vertex " << comp_.g().str(v));
- DEBUG(
- "Incoming edges " << comp_.g().str(comp_.g().IncomingEdges(v)));
- DEBUG(
- "Outgoing edges " << comp_.g().str(comp_.g().OutgoingEdges(v)));
- }
- VERIFY(it != vertex_colors_.end());
- return it->second;
- }
-
- mixed_color_t color(EdgeId e) const {
- return color(comp_.g().EdgeEnd(e));
- }
-
- virtual void HandleDelete(VertexId v) {
- vertex_colors_.erase(v);
- }
-
- virtual void HandleMerge(const vector<EdgeId>& /*old_edges*/, EdgeId /*new_edge*/) {
- VERIFY(false);
- }
-
- virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId edge1, EdgeId edge2) {
- if (comp_.contains(edge1)) {
- VERIFY(comp_.contains(edge2));
- VERIFY(IsSubset(color(edge2), color(edge1)));
- }
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
- EdgeId /*new_edge_2*/) {
- VERIFY(old_edge != comp_.g().conjugate(old_edge));
- if (comp_.contains(old_edge)) {
- CountAndSetVertexColor(comp_.g().EdgeEnd(new_edge_1));
- }
- }
-
-private:
- DECL_LOGGER("ComponentColoring")
- ;
-};
-
-template<class Graph>
-class SkeletonTreeFinder {
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef ConcurrentDSU color_partition_ds_t;
-
- const LocalizedComponent<Graph>& component_;
- const ComponentColoring<Graph>& coloring_;
-
- vector<size_t> level_heights_;
-
- int current_level_;
- color_partition_ds_t current_color_partition_;
-
- set<VertexId> good_vertices_;
- set<EdgeId> good_edges_;
- map<VertexId, vector<EdgeId>> next_edges_;
- map<VertexId, size_t> subtree_coverage_;
-
- bool ConsistentWithPartition(mixed_color_t color) const {
- return current_color_partition_.set_size(
- GetCorrespondingDisjointSet(color))
- == coloring_.CountPrimitiveColors(color);
- }
-
- bool IsGoodEdge(EdgeId e) const {
-// VertexId start = component_.g().EdgeStart(e);
- VertexId end = component_.g().EdgeEnd(e);
- //check if end is good
- if (good_vertices_.count(end) == 0)
- return false;
-
-// is subcase of next case
-// //check if end is from previous level
-// if (component_.avg_distance(end) == level_heights_[current_level_+1])
-// return true;
-
- //check if end color is consistent with partition
- //on level before the start
- return ConsistentWithPartition(coloring_.color(end));
- }
-
- vector<EdgeId> GoodOutgoingEdges(VertexId v) const {
- vector<EdgeId> answer;
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- if (IsGoodEdge(e)) {
- DEBUG("Edge " << component_.g().str(e) << " is classified as good");
- answer.push_back(e);
- } else {
- DEBUG("Edge " << component_.g().str(e) << " is classified as NOT good");
- }
- }
- return answer;
- }
-
- vector<EdgeId> GoodOutgoingEdges(const vector<VertexId>& vertices) const {
- vector<EdgeId> answer;
- for (VertexId v : vertices) {
- if (component_.end_vertices().count(v) == 0) {
- push_back_all(answer, GoodOutgoingEdges(v));
- }
- }
- return answer;
- }
-
- set<EdgeId> VectorAsSet(const vector<EdgeId>& edges) const {
- return set<EdgeId>(edges.begin(), edges.end());
- }
-
- template<class T>
- vector<T> SetAsVector(const set<T>& edges) const {
- return vector<T>(edges.begin(), edges.end());
- }
-
- primitive_color_t GetCorrespondingDisjointSet(mixed_color_t color) const {
- return (primitive_color_t) current_color_partition_.find_set(
- coloring_.GetAnyPrimitiveColor(color));
- }
-
- void UpdateColorPartitionWithVertex(VertexId v) {
- VERIFY(component_.g().OutgoingEdgeCount(v) > 0);
- primitive_color_t ds = GetCorrespondingDisjointSet(
- coloring_.color(*(component_.g().OutgoingEdges(v).begin())));
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- current_color_partition_.unite(ds,
- GetCorrespondingDisjointSet(coloring_.color(e)));
- }
- }
-
- bool IsGoodVertex(VertexId v) const {
- if (!ConsistentWithPartition(coloring_.color(v)))
- return false;
- mixed_color_t union_color_of_good_children = mixed_color_t(0);
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- if (good_edges_.count(e) > 0) {
- union_color_of_good_children |= coloring_.color(e);
- }
- }
- return coloring_.color(v) == union_color_of_good_children;
- }
-
- void Init() {
- current_level_ = (int) level_heights_.size() - 1;
- size_t end_cnt = 0;
- for (VertexId v : component_.end_vertices()) {
- good_vertices_.insert(v);
- subtree_coverage_[v] = 0;
- end_cnt++;
- }
- }
-
- size_t absolute_coverage(EdgeId e) {
- return (size_t) (component_.g().coverage(e) * (double) component_.g().length(e));
- }
-
- void UpdateNextEdgesAndCoverage(VertexId v) {
- map<mixed_color_t, size_t> best_subtrees_coverage;
- map<mixed_color_t, EdgeId> best_alternatives;
- for (EdgeId e : component_.g().OutgoingEdges(v)) {
- if (good_edges_.count(e) > 0) {
- VertexId end = component_.g().EdgeEnd(e);
- mixed_color_t color = coloring_.color(e);
- VERIFY(subtree_coverage_.count(end) > 0);
- if (subtree_coverage_[end] + absolute_coverage(e)
- >= best_subtrees_coverage[color]) {
- best_subtrees_coverage[color] = subtree_coverage_[end]
- + absolute_coverage(e);
- best_alternatives[color] = e;
- }
- }
- }
- size_t coverage = 0;
- for (size_t cov : value_set(best_subtrees_coverage)) {
- coverage += cov;
- }
- next_edges_[v] = SetAsVector<EdgeId>(value_set(best_alternatives));
- subtree_coverage_[v] = coverage;
- }
-
-public:
- SkeletonTreeFinder(const LocalizedComponent<Graph>& component,
- const ComponentColoring<Graph>& coloring) :
- component_(component),
- coloring_(coloring),
- level_heights_(SetAsVector<size_t>(component_.avg_distances())),
- current_level_((int) level_heights_.size() - 1),
- current_color_partition_(component_.end_vertices().size()) {
-
- Init();
- }
-
- const set<EdgeId> GetTreeEdges() const {
- set<EdgeId> answer;
- std::queue<VertexId> vertex_queue;
- vertex_queue.push(component_.start_vertex());
- while (!vertex_queue.empty()) {
- VertexId v = vertex_queue.front();
- vertex_queue.pop();
- if (next_edges_.count(v) == 0)
- continue;
- for (EdgeId e : next_edges_.find(v)->second) {
- answer.insert(e);
- vertex_queue.push(component_.g().EdgeEnd(e));
- }
- }
- return answer;
- }
-
- const map<VertexId, vector<EdgeId>>& GetTree() const {
- return next_edges_;
- }
-
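- //Processes the component level by level, from the exits up towards the start vertex:
- //on each level it collects "good" outgoing edges, merges the corresponding exit colors
- //in the disjoint-set partition, and marks a vertex good when its color is consistent
- //with the partition and fully covered by its good outgoing edges.
- //The search succeeds iff the start vertex ends up marked good.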
- bool FindTree() {
- DEBUG("Looking for tree");
- while (current_level_ >= 0) {
- size_t height = level_heights_[current_level_];
- DEBUG("Processing level " << current_level_ << " on height " << height);
- set<VertexId> level_vertices = component_.vertices_on_height(
- height);
- VERIFY(!level_vertices.empty());
-
- //looking for good edges
- insert_all(good_edges_,
- GoodOutgoingEdges(
- vector<VertexId>(level_vertices.begin(),
- level_vertices.end())));
-
- //counting colors and color partitions
- for (VertexId v : level_vertices) {
- if (component_.end_vertices().count(v) == 0) {
- UpdateColorPartitionWithVertex(v);
- if (IsGoodVertex(v)) {
- DEBUG("Vertex " << component_.g().str(v) << " is classified as good");
- good_vertices_.insert(v);
- UpdateNextEdgesAndCoverage(v);
- } else {
- DEBUG("Vertex " << component_.g().str(v) << " is classified as NOT good");
- }
- }
- }
- current_level_--;
- }
- if (good_vertices_.count(component_.start_vertex()) > 0) {
- DEBUG("Looking for tree was successful");
- return true;
- } else {
- DEBUG("Looking for tree failed");
- return false;
- }
- }
-
-private:
- DECL_LOGGER("SkeletonTreeFinder")
- ;
-};
-
-template<class Graph>
-void PrintComponent(const LocalizedComponent<Graph>& component,
- const SkeletonTree<Graph>& tree, const string& file_name) {
- typedef typename Graph::EdgeId EdgeId;
- const set<EdgeId> tree_edges = tree.edges();
- shared_ptr<omnigraph::visualization::ElementColorer<typename Graph::EdgeId>> edge_colorer = make_shared<omnigraph::visualization::MapColorer<EdgeId>>(
- tree_edges.begin(), tree_edges.end(),"green", ""
- );
- visualization::WriteComponentSinksSources(component.AsGraphComponent(), file_name,
- omnigraph::visualization::DefaultColorer(component.g(), edge_colorer),
- *StrGraphLabelerInstance(component.g()));
-}
-
-template<class Graph>
-void PrintComponent(const LocalizedComponent<Graph>& component,
- const string& file_name) {
- visualization::WriteComponent(component.AsGraphComponent(), file_name,
- omnigraph::visualization::DefaultColorer(component.g()),
- *StrGraphLabelerInstance(component.g()));
-}
-
-
-
-template<class Graph>
-class ComponentProjector {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- Graph& g_;
- const LocalizedComponent<Graph>& component_;
- const ComponentColoring<Graph>& coloring_;
- const SkeletonTree<Graph>& tree_;
-
-// DEBUG("Result: edges " << g_.str(split_res.first) << " " << g_.str(split_res.second));
-// DEBUG("New vertex" << g_.str(inner_v) << " ");
-
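- //Splits every edge of the component at each intermediate level height it spans, so that
- //afterwards every edge connects vertices on consecutive height levels.
- //Returns false if a required split position falls outside the edge, in which case the
- //whole component is skipped.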
- bool SplitComponent() {
- DEBUG("Splitting component");
- set<size_t> level_heights(component_.avg_distances());
- DEBUG("Level heights " << ToString<size_t>(level_heights));
-
- GraphComponent<Graph> gc = component_.AsGraphComponent();
-
- for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
- VertexId start_v = g_.EdgeStart(*it);
- VertexId end_v = g_.EdgeEnd(*it);
- size_t start_dist = component_.avg_distance(start_v);
- size_t end_dist = component_.avg_distance(end_v);
- DEBUG(
- "Processing edge " << g_.str(*it) << " avg_start " << start_dist << " avg_end " << end_dist);
- set<size_t> dist_to_split(level_heights.lower_bound(start_dist),
- level_heights.upper_bound(end_dist));
- DEBUG("Distances to split " << ToString<size_t>(dist_to_split));
-
- size_t offset = start_dist;
- EdgeId e = *it;
- for (auto split_it = dist_to_split.begin();
- split_it != dist_to_split.end(); ++split_it) {
- size_t curr = *split_it;
- if (curr == start_dist || curr == end_dist)
- continue;
- DEBUG("Splitting on " << curr);
- size_t pos = curr - offset;
- if(pos >= g_.length(e)) {
- return false;
- }
- DEBUG("Splitting edge " << g_.str(e) << " on position " << pos);
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, pos);
- //checks accordance
- VertexId inner_v = g_.EdgeEnd(split_res.first);
- VERIFY(component_.avg_distance(inner_v) == curr);
- e = split_res.second;
- offset = curr;
- }
- }
- DEBUG("Component split");
- return true;
- }
-
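- //For a (split) edge e, returns the skeleton tree edge on the same height level whose
- //color is a superset of the color of e; during projection e is glued onto that edge.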
- EdgeId CorrespondingTreeEdge(EdgeId e) const {
- DEBUG("Getting height of vertex " << g_.str(g_.EdgeStart(e)));
- size_t start_height = component_.avg_distance(g_.EdgeStart(e));
- DEBUG("Done");
- mixed_color_t color = coloring_.color(e);
- DEBUG("Getting height of vertex " << g_.str(g_.EdgeEnd(e)));
- size_t end_height = component_.avg_distance(g_.EdgeEnd(e));
- DEBUG("Done");
- for (VertexId v : component_.vertices_on_height(start_height)) {
- if (component_.end_vertices().count(v) == 0) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VERIFY(
- component_.avg_distance(g_.EdgeEnd(e)) == end_height);
- if (tree_.Contains(e)
- && coloring_.IsSubset(coloring_.color(e), color)) {
- return e;
- }
- }
- }
- }
- VERIFY(false);
- return EdgeId(NULL);
- }
-
-public:
-
- bool ProjectComponent() {
- if(!SplitComponent()) {
- DEBUG("Component can't be split");
- return false;
- }
-
- DEBUG("Projecting split component");
- GraphComponent<Graph> gc = component_.AsGraphComponent();
-
- for (auto it = SmartSetIterator<Graph, EdgeId>(g_, gc.e_begin(),
- gc.e_end()); !it.IsEnd(); ++it) {
- DEBUG("Trying to project edge " << g_.str(*it));
- EdgeId target = CorrespondingTreeEdge(*it);
- DEBUG("Target found " << g_.str(target));
- if (target != *it) {
- DEBUG(
- "Glueing " << g_.str(*it) << " to target " << g_.str(target));
- g_.GlueEdges(*it, target);
- DEBUG("Glued");
- }
- DEBUG("Edge processed");
- }
- DEBUG("Component projected");
- return true;
- }
-
- ComponentProjector(Graph& g, const LocalizedComponent<Graph>& component,
- const ComponentColoring<Graph>& coloring,
- const SkeletonTree<Graph>& tree) :
- g_(g), component_(component), coloring_(coloring), tree_(tree) {
-
- }
-
-private:
- DECL_LOGGER("ComponentProjector")
- ;
-};
-
-template<class Graph>
-class LocalizedComponentFinder {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- static const size_t exit_bound = 32;
- static const size_t inf = -1ul;
-
- Graph& g_;
- size_t max_length_;
- size_t length_diff_threshold_;
-
- LocalizedComponent<Graph> comp_;
-
- map<VertexId, Range> dominated_;
- set<VertexId> interfering_;
-
- std::string ToString(EdgeId e) const {
- std::stringstream ss;
- ss << g_.str(e)
- << " start: "
- << g_.str(g_.EdgeStart(e))
- << " end: "
- << g_.str(g_.EdgeEnd(e));
- return ss.str();
- }
-
- bool CheckCompleteness() const {
- if (interfering_.size() == 0) {
- VERIFY(comp_.CheckCompleteness());
- return true;
- }
- return false;
- }
-
- //false if new interfering vertex is not dominated
- //can be slightly modified in new algorithm
- bool ProcessLocality(VertexId processing_v) {
- vector<VertexId> processed_neighb;
- vector<VertexId> unprocessed_neighb;
- for (EdgeId e : g_.OutgoingEdges(processing_v)) {
- VertexId v = g_.EdgeEnd(e);
- if (!comp_.contains(v)) {
- unprocessed_neighb.push_back(v);
- } else {
- processed_neighb.push_back(v);
- }
- }
- if (!processed_neighb.empty()) {
- for (VertexId v : unprocessed_neighb) {
- if (dominated_.count(v) > 0) {
- interfering_.insert(v);
- } else {
- return false;
- }
- }
- }
- return true;
- }
-
- bool AddVertexWithBackwardPaths(VertexId v) {
- DEBUG("Adding vertex with backward paths");
- std::queue<VertexId> q;
- q.push(v);
- while (!q.empty()) {
- VertexId next_v = q.front();
- q.pop();
- if (!ProcessLocality(next_v)) {
- return false;
- }
- if (!comp_.contains(next_v)) {
- VERIFY(dominated_.count(next_v) > 0);
- comp_.AddVertex(next_v, dominated_.find(next_v)->second);
- for (EdgeId e : g_.IncomingEdges(next_v)) {
- q.push(g_.EdgeStart(e));
- }
- }
- }
- return true;
- }
-
- boost::optional<VertexId> ClosestNeighbour() const {
- size_t min_dist = inf;
- boost::optional<VertexId> answer = boost::none;
- for (auto it = dominated_.begin(); it != dominated_.end(); ++it) {
- if (!comp_.contains(it->first) && it->second.start_pos < min_dist) {
- min_dist = it->second.start_pos;
- answer = boost::optional<VertexId>(it->first);
- }
- }
- return answer;
- }
-
- bool ProcessInterferingVertex(VertexId v) {
- interfering_.erase(v);
- return AddVertexWithBackwardPaths(v);
- }
-
- bool CheckPathLengths() const {
- VERIFY(CheckCompleteness());
- for (VertexId v : comp_.end_vertices()) {
- if (comp_.distance_range(v).size() > length_diff_threshold_)
- return false;
- }
- return true;
- }
-
- bool CheckPositiveHeightDiff() const {
- DEBUG("Checking for positive height diff of each edge");
- GraphComponent<Graph> gc = comp_.AsGraphComponent();
- for (auto it = gc.e_begin(); it != gc.e_end(); ++it) {
- size_t start_height = comp_.avg_distance(g_.EdgeStart(*it));
- size_t end_height = comp_.avg_distance(g_.EdgeEnd(*it));
- //VERIFY(end_height >= start_height);
- if (end_height <= start_height) {
- DEBUG("Check failed for edge " << g_.str(*it) << " start_height " << start_height << " end_height " << end_height);
- return false;
- }
- }
- return true;
- }
-
- bool CloseComponent() {
- while (!interfering_.empty()) {
- VertexId v = *interfering_.begin();
- DEBUG("Processing interfering vertex " << g_.str(v));
- if (!ProcessInterferingVertex(v)) {
- DEBUG("Vertex processing failed");
- return false;
- }
- }
- return true;
- }
-
-public:
- LocalizedComponentFinder(Graph& g, size_t max_length,
- size_t length_diff_threshold, VertexId start_v) :
- g_(g), max_length_(max_length), length_diff_threshold_(
- length_diff_threshold), comp_(g, start_v) {
- DEBUG(
- "Component finder from vertex " << g_.str(comp_.start_vertex()) << " created");
- DominatedSetFinder<Graph> dominated_set_finder(g_, start_v, max_length);
- dominated_set_finder.FillDominated();
- dominated_ = dominated_set_finder.dominated();
-// ProcessStartVertex();
- }
-
- bool ProceedFurther() {
- DEBUG("Processing further");
-
- DEBUG("Choosing closest vertex");
- do {
- boost::optional<VertexId> next_v = ClosestNeighbour();
-
- if (next_v) {
- DEBUG(
- "Vertex " << g_.str(*next_v) << " was chosen as closest neighbour");
- interfering_.insert(*next_v);
- DEBUG("Trying to construct closure");
- if (!CloseComponent()) {
- DEBUG("Failed to close component");
- return false;
- } else {
- DEBUG("Component closed");
- }
- } else {
- DEBUG("No more vertices can be added");
- return false;
- }
- } while (!comp_.NeedsProjection());
-
- if (!CheckPathLengths()) {
- DEBUG("Path lengths check failed");
- return false;
- }
- if (!CheckPositiveHeightDiff()) {
- DEBUG("Check for positive height diff of each edge failed");
- return false;
- }
- if (comp_.ContainsConjugateVertices()) {
- DEBUG("Found component contains conjugate vertices");
- return false;
- }
- if (comp_.end_vertices().size() > exit_bound) {
- DEBUG("Too many exits:" << comp_.end_vertices().size());
- return false;
- }
- GraphComponent<Graph> gc = comp_.AsGraphComponent();
- DEBUG("Found component candidate. Vertices: " << g_.str(gc.vertices()));
- return true;
- }
-
- const LocalizedComponent<Graph>& component() {
- return comp_;
- }
-
-private:
- DECL_LOGGER("LocalizedComponentFinder")
- ;
-};
-
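-//Removes complex bulges: for every vertex a localized component of dominated vertices is
-//grown; if it contains redundant start->exit paths, the exits are colored, a skeleton tree
-//is searched for, the component edges are split at the level heights and every remaining
-//edge is glued onto its corresponding tree edge; affected vertices are compressed afterwards.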
-template<class Graph>
-class ComplexBulgeRemover {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- Graph& g_;
- size_t max_length_;
- size_t length_diff_;
-
- string pics_folder_;
-
- bool ProcessComponent(LocalizedComponent<Graph>& component,
- size_t candidate_cnt) {
- DEBUG("Processing component");
- ComponentColoring<Graph> coloring(component);
- SkeletonTreeFinder<Graph> tree_finder(component, coloring);
- DEBUG("Looking for a tree");
- if (tree_finder.FindTree()) {
- DEBUG("Tree found");
-
- SkeletonTree<Graph> tree(component, tree_finder.GetTreeEdges());
-
- if (!pics_folder_.empty()) {
- PrintComponent(component, tree,
- pics_folder_ + "success/"
- + ToString(g_.int_id(component.start_vertex()))
- + "_" + ToString(candidate_cnt) + ".dot");
- }
-
- ComponentProjector<Graph> projector(g_, component, coloring, tree);
- if(!projector.ProjectComponent()) {
- DEBUG("Component can't be projected");
- return false;
- }
- DEBUG(
- "Successfully processed component candidate " << candidate_cnt << " start_v " << g_.str(component.start_vertex()));
- return true;
- } else {
- DEBUG(
- "Failed to find skeleton tree for candidate " << candidate_cnt << " start_v " << g_.str(component.start_vertex()));
- if (!pics_folder_.empty()) {
- //todo check if we rewrite all of the previous pics!
- PrintComponent(component,
- pics_folder_ + "fail/"
- + ToString(g_.int_id(component.start_vertex())) //+ "_" + ToString(candidate_cnt)
- + ".dot");
- }
- return false;
- }
- }
-
-public:
- ComplexBulgeRemover(Graph& g, size_t max_length, size_t length_diff,
- const string& pics_folder = "") :
- g_(g), max_length_(max_length), length_diff_(length_diff), pics_folder_(
- pics_folder) {
- }
-
- bool Run() {
- size_t cnt = 0;
- DEBUG("Complex bulge remover started");
- if (!pics_folder_.empty()) {
-// remove_dir(pics_folder_);
- make_dir(pics_folder_);
- make_dir(pics_folder_ + "success/");
- make_dir(pics_folder_ + "fail/");
- }
- bool something_done_flag = false;
- for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
- DEBUG("Processing vertex " << g_.str(*it));
- size_t candidate_cnt = 0;
- vector<VertexId> vertices_to_post_process;
- { //important scope!!!
- LocalizedComponentFinder<Graph> comp_finder(g_, max_length_,
- length_diff_, *it);
- while (comp_finder.ProceedFurther()) {
- candidate_cnt++;
- DEBUG(
- "Found component candidate " << candidate_cnt << " start_v " << g_.str(*it));
- LocalizedComponent<Graph> component =
- comp_finder.component();
- if (ProcessComponent(component, candidate_cnt)) {
- something_done_flag = true;
- cnt++;
- GraphComponent<Graph> gc = component.AsGraphComponent();
- vertices_to_post_process.insert(
- vertices_to_post_process.end(), gc.v_begin(),
- gc.v_end());
- break;
- }
- }
- }
- for (VertexId v : vertices_to_post_process) {
- it.HandleAdd(v);
- g_.CompressVertex(v);
- }
- }
- DEBUG("Complex bulge remover finished");
- DEBUG("Bulges processed " << cnt);
- return something_done_flag;
- }
-
-private:
- DECL_LOGGER("ComplexBulgeRemover")
- ;
-};
-
-}
-
-}
diff --git a/src/modules/algorithms/simplification/complex_tip_clipper.hpp b/src/modules/algorithms/simplification/complex_tip_clipper.hpp
deleted file mode 100644
index 984cfd5..0000000
--- a/src/modules/algorithms/simplification/complex_tip_clipper.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <limits>
-
-#include "visualization/visualization.hpp"
-#include "compressor.hpp"
-#include "dominated_set_finder.hpp"
-
-
-namespace omnigraph{
-
-
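-//Clips complex tips: small subgraphs hanging off a source vertex (one with no incoming
-//edges) that consist only of short edges, are dominated by that source within a bounded
-//path length, and are weakly covered relative to the edges leaving the component.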
-template<class Graph>
-class ComplexTipClipper {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- Graph& g_;
- double relative_coverage_threshold_;
- size_t edge_length_threshold_;
- size_t max_path_length_;
- string pics_folder_;
- std::function<void(const set<EdgeId>&)> removal_handler_;
-
- bool CheckEdgeLengths(const GraphComponent<Graph>& component) const {
- for(auto e : component.edges()) {
- if(g_.length(e) > edge_length_threshold_) {
- return false;
- }
- }
- return true;
- }
-
-
- bool CheckSize(const GraphComponent<Graph> & component) const {
- return (component.vertices().size() > 1);
- }
-
- void RemoveComplexTip(GraphComponent<Graph>& component) {
- ComponentRemover<Graph> remover(g_, removal_handler_);
- remover.DeleteComponent(component.edges().begin(), component.edges().end());
- }
-
-
- bool CheckPathLengths(const map<VertexId, Range>& ranges) const {
- for(auto r : ranges) {
- if(r.second.start_pos > max_path_length_) {
- return false;
- }
- }
- return true;
- }
-
- double GetTipCoverage(const GraphComponent<Graph> & component) const {
- double cov = numeric_limits<double>::max();
- for(auto edge : component.edges()) {
- cov = std::min(cov, g_.coverage(edge));
- }
- return cov;
- }
-
- double GetOutwardCoverage(const GraphComponent<Graph> & component) const {
- double cov = 0.0;
- for(auto v : component.vertices()) {
- for(auto edge : g_.OutgoingEdges(v)) {
- if(component.contains(edge)) {
- cov = max(cov, g_.coverage(edge));
- }
- }
-
- for(auto edge : g_.IncomingEdges(v)) {
- if(component.contains(edge)) {
- cov = max(cov, g_.coverage(edge));
- }
- }
- }
- return cov;
- }
-
- double GetRelativeTipCoverage(const GraphComponent<Graph> & component) const {
- return GetTipCoverage(component) / GetOutwardCoverage(component);
- }
-
-public:
- ComplexTipClipper(Graph& g, double relative_coverage, size_t max_edge_len, size_t max_path_len, const string& pics_folder = "", std::function<void(const set<EdgeId>&)> removal_handler = 0) :
- g_(g), relative_coverage_threshold_(math::ge(relative_coverage, 0.0) ? relative_coverage : std::numeric_limits<double>::max()), edge_length_threshold_(max_edge_len), max_path_length_(max_path_len), pics_folder_(pics_folder), removal_handler_(removal_handler)
- { }
-
- bool Run() {
- size_t cnt = 0;
- INFO("Complex tip clipper started");
- if (!pics_folder_.empty()) {
- make_dir(pics_folder_);
- }
-
- bool something_done_flag = false;
- for (auto it = g_.SmartVertexBegin(); !it.IsEnd(); ++it) {
- if(g_.IncomingEdgeCount(*it) != 0) {
- continue;
- }
- DEBUG("Processing vertex " << g_.str(*it));
-
- DominatedSetFinder<Graph> dom_finder(g_, *it, max_path_length_ * 2);
-
- if(!dom_finder.FillDominated()) {
- DEBUG("Tip contains too long paths");
- continue;
- }
-
- auto component = dom_finder.AsGraphComponent();
-
- if(!CheckEdgeLengths(component)) {
- DEBUG("Tip contains too long edges");
- continue;
- }
-
- if(!CheckSize(component)) {
- DEBUG("Component doesn't meet size requirements");
- continue;
- }
- auto dominated = dom_finder.dominated();
- if(!CheckPathLengths(dominated)) {
- DEBUG("Tip contains too long paths");
- continue;
- }
-
- if(math::ge(GetRelativeTipCoverage(component), relative_coverage_threshold_)) {
- DEBUG("Tip coverage is too high relative to external edges");
- continue;
- }
-
- if (!pics_folder_.empty()) {
- visualization::WriteComponentSinksSources(component,
- pics_folder_
- + ToString(g_.int_id(*it)) //+ "_" + ToString(candidate_cnt)
- + ".dot");
- }
-
- something_done_flag = true;
- cnt++;
- RemoveComplexTip(component);
- }
- CompressAllVertices(g_);
- DEBUG("Complex tip clipper finished");
- DEBUG("Tips processed " << cnt);
- return something_done_flag;
- }
-private:
- DECL_LOGGER("ComplexTipClipper")
-};
-
-}
diff --git a/src/modules/algorithms/simplification/compressor.hpp b/src/modules/algorithms/simplification/compressor.hpp
deleted file mode 100644
index 27257f0..0000000
--- a/src/modules/algorithms/simplification/compressor.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-#pragma once
-#include "assembly_graph/graph_support/parallel_processing.hpp"
-#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
-namespace omnigraph {
-
-/**
-* Compressor compresses vertices with unique incoming and unique outgoing edge in linear time while
-* simple one-by-one compressing has square complexity.
-*/
-template<class Graph>
-class Compressor : public PersistentProcessingAlgorithm<Graph, typename Graph::VertexId,
- ParallelInterestingElementFinder<Graph, typename Graph::VertexId>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef PersistentProcessingAlgorithm <Graph,
- VertexId, ParallelInterestingElementFinder<Graph, VertexId>> base;
- typedef CompressCondition <Graph> ConditionT;
-
- Graph &graph_;
- ConditionT compress_condition_;
- bool safe_merging_;
-
- bool GoUniqueWayForward(EdgeId &e) {
- VertexId u = graph_.EdgeEnd(e);
- if (!graph_.CheckUniqueOutgoingEdge(u)
- || !graph_.CheckUniqueIncomingEdge(u)) {
- return false;
- }
- e = graph_.GetUniqueOutgoingEdge(u);
- return true;
- }
-
- bool GoUniqueWayBackward(EdgeId &e) {
- VertexId u = graph_.EdgeStart(e);
- if (!graph_.CheckUniqueOutgoingEdge(u)
- || !graph_.CheckUniqueIncomingEdge(u)) {
- return false;
- }
- e = graph_.GetUniqueIncomingEdge(u);
- return true;
- }
-
-//do not use without checks:)
- EdgeId CompressWithoutChecks(VertexId v) {
-
- EdgeId e = graph_.GetUniqueOutgoingEdge(v);
- EdgeId start_edge = e;
- while (GoUniqueWayBackward(e) && e != start_edge
- && !graph_.RelatedVertices(graph_.EdgeStart(e),
- graph_.EdgeEnd(e))) {
- }
- vector <EdgeId> mergeList;
- // e = graph_.conjugate(e);
- start_edge = e;
- do {
- mergeList.push_back(e);
- } while (GoUniqueWayForward(e) && e != start_edge
- && !graph_.RelatedVertices(graph_.EdgeStart(e),
- graph_.EdgeEnd(e)));
- EdgeId new_edge = graph_.MergePath(mergeList, safe_merging_);
- TRACE("Vertex compressed and is now part of edge "
- << graph_.str(new_edge));
- return new_edge;
-
- }
-
-// //todo use graph method!
-// bool CanCompressVertex(VertexId v) const {
-// if (!graph_.CheckUniqueOutgoingEdge(v)
-// || !graph_.CheckUniqueIncomingEdge(v)) {
-// TRACE(
-// "Vertex "
-// << graph_.str(v)
-// << " judged NOT compressible. Proceeding to the next vertex");
-// TRACE("Processing vertex " << graph_.str(v) << " finished");
-// return false;
-// }
-// return true;
-// }
-public:
- Compressor(Graph &graph, size_t chunk_cnt = 1, bool safe_merging = true) :
- base(graph,
- ParallelInterestingElementFinder<Graph, VertexId>(graph,
- ConditionT(graph), chunk_cnt),
- /*canonical only*/true),
- graph_(graph),
- compress_condition_(graph),
- safe_merging_(safe_merging) {
- }
-
- /**
- * Method compresses longest possible path, containing given vertex.
- * @param vertex to be compressed as part of a path
- * @return true if vertex can be compressed and false otherwise
- */
- bool CompressVertex(VertexId v) {
- TRACE("Processing vertex " << graph_.str(v) << " started");
- if (!compress_condition_.Check(v)) {
- return false;
- }
- TRACE("Vertex " << graph_.str(v) << " judged compressible");
- CompressWithoutChecks(v);
- return true;
- }
-
- EdgeId CompressVertexEdgeId(VertexId v) {
- TRACE("Processing vertex " << graph_.str(v) << " started");
- if (!compress_condition_.Check(v)) {
- return EdgeId(0);
- }
- TRACE("Vertex " << graph_.str(v) << " judged compressible");
- return CompressWithoutChecks(v);
- }
-
-// bool IsOfInterest(VertexId v) const {
-// return CanCompressVertex(v);
-// }
-
-protected:
- bool Process(VertexId v) override {
- if (compress_condition_.Check(v)) {
- CompressWithoutChecks(v);
- return true;
- } else {
- return false;
- }
- }
-
-private:
- DECL_LOGGER("Compressor")
-};
-
-/**
-* Method compresses all vertices which can be compressed.
-*/
-template<class Graph>
-bool CompressAllVertices(Graph &g, bool safe_merging = true, size_t chunk_cnt = 1) {
- Compressor<Graph> compressor(g, chunk_cnt, safe_merging);
- return compressor.Run();
-}
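-
-//Minimal usage sketch (illustrative only; g stands for any omnigraph-style graph and v for
-//one of its vertices, with the default chunk_cnt/safe_merging parameters):
-//  CompressAllVertices(g);          //compress every compressible vertex
-//  Compressor<Graph> compressor(g);
-//  compressor.CompressVertex(v);    //compress the longest path through a single vertex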
-}
diff --git a/src/modules/algorithms/simplification/dominated_set_finder.hpp b/src/modules/algorithms/simplification/dominated_set_finder.hpp
deleted file mode 100644
index 050777d..0000000
--- a/src/modules/algorithms/simplification/dominated_set_finder.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-#pragma once
-
-namespace omnigraph {
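-//Finds the set of vertices dominated by a start vertex, i.e. vertices that can only be
-//reached from outside through that start vertex. For every dominated vertex the Range of
-//shortest/longest path lengths from the start is stored; the search aborts once the
-//configured length or vertex count thresholds are exceeded.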
-template<class Graph>
-class DominatedSetFinder {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- VertexId start_vertex_;
- size_t max_length_;
- size_t max_count_;
-
- size_t cnt_;
- std::map<VertexId, Range> dominated_;
-
- bool CheckCanBeProcessed(VertexId v) const {
- DEBUG( "Check if vertex " << g_.str(v) << " is dominated close neighbour");
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (dominated_.count(g_.EdgeStart(e)) == 0) {
- DEBUG( "Blocked by external vertex " << g_.int_id(g_.EdgeStart(e)) << " that starts edge " << g_.int_id(e));
- DEBUG("Check fail");
- return false;
- }
- }
- DEBUG("Check ok");
- return true;
- }
-
- void UpdateCanBeProcessed(VertexId v,
- std::queue<VertexId>& can_be_processed) const {
- DEBUG("Updating can be processed");
- for (EdgeId e : g_.OutgoingEdges(v)) {
- DEBUG("Considering edge " << ToString(e));
- VertexId neighbour_v = g_.EdgeEnd(e);
- if (CheckCanBeProcessed(neighbour_v)) {
- can_be_processed.push(neighbour_v);
- }
- }
- }
-
- Range NeighbourDistanceRange(VertexId v, bool dominated_only = true) const {
- DEBUG("Counting distance range for vertex " << g_.str(v));
- size_t min = numeric_limits<size_t>::max();
- size_t max = 0;
- VERIFY(g_.IncomingEdgeCount(v) > 0);
- VERIFY(!dominated_only || CheckCanBeProcessed(v));
- for (EdgeId e : g_.IncomingEdges(v)) {
- //in case of dominated_only == false
- if (dominated_.count(g_.EdgeStart(e)) == 0)
- continue;
- Range range = dominated_.find(g_.EdgeStart(e))->second;
- range.shift((int) g_.length(e));
- DEBUG("Edge " << g_.str(e) << " provide distance range " << range);
- if (range.start_pos < min)
- min = range.start_pos;
- if (range.end_pos > max)
- max = range.end_pos;
- }
- VERIFY((max > 0) && (min < numeric_limits<size_t>::max()) && (min <= max));
- Range answer(min, max);
- DEBUG("Range " << answer);
- return answer;
- }
-
- bool CheckNoEdgeToStart(VertexId v) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (g_.EdgeEnd(e) == start_vertex_) {
- return false;
- }
- }
- return true;
- }
-
-public:
- DominatedSetFinder(const Graph& g, VertexId v, size_t max_length = -1ul,
- size_t max_count = -1ul)
- : g_(g),
- start_vertex_(v),
- max_length_(max_length),
- max_count_(max_count),
- cnt_(0) {
-
- }
-
- //true if no thresholds exceeded
- bool FillDominated() {
- DEBUG("Adding starting vertex " << g_.str(start_vertex_) << " to dominated set");
- dominated_.insert(make_pair(start_vertex_, Range(0, 0)));
- cnt_++;
- std::queue<VertexId> can_be_processed;
- UpdateCanBeProcessed(start_vertex_, can_be_processed);
- while (!can_be_processed.empty()) {
- if (++cnt_ > max_count_) {
- return false;
- }
- VertexId v = can_be_processed.front();
- can_be_processed.pop();
- Range r = NeighbourDistanceRange(v);
- if (r.start_pos > max_length_) {
- return false;
- }
- //Currently dominated vertices cannot have edge to start vertex
- if (CheckNoEdgeToStart(v)) {
- DEBUG("Adding vertex " << g_.str(v) << " to dominated set");
- dominated_.insert(make_pair(v, r));
- UpdateCanBeProcessed(v, can_be_processed);
- }
- }
- return true;
- }
-
- const map<VertexId, Range>& dominated() const {
- return dominated_;
- }
-
- GraphComponent<Graph> AsGraphComponent() const {
- set<VertexId> vertices = key_set(dominated_);
- return GraphComponent<Graph>(g_, vertices.begin(), vertices.end());
- }
-
- //result has little meaning if FillDominated returned false
- const map<VertexId, Range> CountBorder() const {
- map<VertexId, Range> border;
- for (VertexId v : key_set(dominated_)) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VertexId e_end = g_.EdgeEnd(e);
- if (dominated_.count(e_end) == 0) {
- border[e_end] = NeighbourDistanceRange(e_end, false);
- }
- }
- }
- return border;
- }
-
-};
-}
diff --git a/src/modules/algorithms/simplification/ec_threshold_finder.hpp b/src/modules/algorithms/simplification/ec_threshold_finder.hpp
deleted file mode 100644
index 84d7af2..0000000
--- a/src/modules/algorithms/simplification/ec_threshold_finder.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef OMNI_TOOLS_HPP_
-#define OMNI_TOOLS_HPP_
-
-#include "dev_support/simple_tools.hpp"
-
-#include "dev_support/path_helper.hpp"
-#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
-#include "assembly_graph/graph_support/parallel_processing.hpp"
-#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
-
-#ifdef USE_GLIBCXX_PARALLEL
-#include "parallel/algorithm"
-#endif
-
-namespace omnigraph {
-
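-//Estimates a coverage threshold for erroneous connections from the coverage histogram of
-//short edges (length <= k + 1) flanked by branching vertices, smoothed with a triangular
-//kernel; the final threshold is never taken below the average edge coverage.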
-template<class Graph>
-class ErroneousConnectionThresholdFinder {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- size_t backet_width_;
-
- bool IsInteresting(EdgeId e) const {
- if (graph_.length(e) > graph_.k() + 1)
- return false;
-
- if (graph_.OutgoingEdgeCount(graph_.EdgeStart(e)) < 2 ||
- graph_.IncomingEdgeCount(graph_.EdgeEnd(e)) < 2)
- return false;
-
- std::vector<EdgeId> v1;
- push_back_all(v1, graph_.OutgoingEdges(graph_.EdgeStart(e)));
- std::vector<EdgeId> v2;
- push_back_all(v2, graph_.IncomingEdges(graph_.EdgeEnd(e)));
- bool eq = (v1.size() == 2 && v2.size() == 2) && ((v1[0] == v2[0] && v1[1] == v2[1]) || (v1[0] == v2[1] && v1[1] == v2[0]));
- return !eq;
- }
-
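- //The multipliers min(i + 1, backet_width - i) form a triangular smoothing kernel over the
- //histogram window; e.g. for backet_width == 4 they are 1, 2, 2, 1.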
- double weight(size_t value, const map<size_t, size_t> &histogram,
- size_t backet_width) const {
- double result = 0;
- for (size_t i = 0; i < backet_width && value + i < histogram.size(); i++) {
- result += (double) (getValue(value + i, histogram) * std::min(i + 1, backet_width - i));
- }
- return result;
- }
-
- double Median(double thr = 500.0) const {
- vector<double> coverages;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (graph_.length(*it) > thr)
- coverages.push_back(graph_.coverage(*it));
- }
-
- auto middle_it = coverages.begin() + coverages.size() / 2;
-#ifdef USE_GLIBCXX_PARALLEL
- __gnu_parallel::nth_element(coverages.begin(), middle_it, coverages.end());
-#else
- std::nth_element(coverages.begin(), middle_it, coverages.end());
-#endif
- return coverages[coverages.size() / 2];
- }
-
- size_t getValue(size_t arg, const map<size_t, size_t> &ssmap) const {
- auto it = ssmap.find(arg);
- if (it == ssmap.end())
- return 0;
- else
- return it->second;
- }
-
-public:
- ErroneousConnectionThresholdFinder(const Graph &graph, size_t backet_width = 0) :
- graph_(graph), backet_width_(backet_width) {
- }
-
- double AvgCoverage() const {
- double cov = 0;
- double length = 0;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- cov += graph_.coverage(*it) * (double) graph_.length(*it);
- length += (double) graph_.length(*it);
- }
- return cov / length;
- }
-
- std::map<size_t, size_t> ConstructHistogram() const {
- std::map<size_t, size_t> result;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (IsInteresting(*it))
- result[(size_t)graph_.coverage(*it)]++;
- }
- return result;
- }
-
- double FindThreshold(const map<size_t, size_t> &histogram) const {
- size_t backet_width = backet_width_;
- if (backet_width == 0) {
- backet_width = (size_t)(0.3 * AvgCovereageCounter<Graph>(graph_).Count() + 5);
- }
- size_t size = 0;
- if (histogram.size() != 0)
- size = histogram.rbegin()->first + 1;
- INFO("Bucket size: " << backet_width);
- size_t cnt = 0;
- for (size_t i = 1; i + backet_width < size; i++) {
- if (weight(i, histogram, backet_width) > weight(i - 1, histogram, backet_width))
- cnt++;
-
- if (i > backet_width &&
- weight(i - backet_width, histogram, backet_width) >
- weight(i - backet_width - 1, histogram, backet_width)) {
- cnt--;
- }
- if (2 * cnt >= backet_width)
- return (double) i;
-
- }
- INFO("Proper threshold was not found. Threshold set to 0.1 of average coverage");
- return 0.1 * AvgCovereageCounter<Graph>(graph_).Count();
- }
-
- double FindThreshold() const {
- INFO("Finding threshold started");
- std::map<size_t, size_t> histogram = ConstructHistogram(/*weights*/);
- for (size_t i = 0; i < histogram.size(); i++) {
- TRACE(i << " " << histogram[i]);
- }
- double result = FindThreshold(histogram);
- INFO("Average edge coverage: " << AvgCoverage());
- INFO("Graph threshold: " << result);
- result = std::max(AvgCoverage(), result);
- INFO("Threshold finding finished. Threshold is set to " << result);
- return result;
- }
-private:
- DECL_LOGGER("ThresholdFinder");
-};
-
-}
-
-#endif /* OMNI_TOOLS_HPP_ */
diff --git a/src/modules/algorithms/simplification/erroneous_connection_remover.hpp b/src/modules/algorithms/simplification/erroneous_connection_remover.hpp
deleted file mode 100644
index c755d19..0000000
--- a/src/modules/algorithms/simplification/erroneous_connection_remover.hpp
+++ /dev/null
@@ -1,690 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * erroneous_connection_remover.hpp
- *
- * Created on: May 31, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
-#include "dev_support/func.hpp"
-#include "math/xmath.h"
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
-#include "assembly_graph/graph_core/coverage.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId>
-NecessaryECCondition(const Graph& g, size_t max_length, double max_coverage) {
- return AddAlternativesPresenceCondition(g, pred::And(LengthUpperBound<Graph>(g, max_length),
- CoverageUpperBound<Graph>(g, max_coverage)));
-}
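-
-//Minimal usage sketch (the length and coverage bounds are illustrative placeholders, not
-//values taken from the configs):
-//  auto ec_condition = NecessaryECCondition(g, /*max_length*/ g.k() + 1, /*max_coverage*/ 2.0);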
-
-
-template<class Graph>
-class RelativeCoverageECCondition: public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- const double rcec_ratio_;
-
- template<class ContainerType>
- double SumCompetitorCoverage(EdgeId ec_edge, const ContainerType& edges) const {
- const Graph &g = this->g();
- double sum = 0;
- for (EdgeId e : edges) {
- //update if competitor edge is not loop
- if (e != ec_edge && g.EdgeStart(e) != g.EdgeEnd(e))
- sum += g.coverage(e);
- }
- return sum;
- }
-
- double AvgLocalityCoverage(EdgeId ec_edge) const {
- const Graph &g = this->g();
- VertexId start = g.EdgeStart(ec_edge), end = g.EdgeEnd(ec_edge);
- auto in_start = g.IncomingEdges(start);
- auto out_start = g.OutgoingEdges(start);
- auto in_end = g.IncomingEdges(end);
- auto out_end = g.OutgoingEdges(end);
- double total_edges = double(g.IncomingEdgeCount(start) + g.OutgoingEdgeCount(start) +
- g.IncomingEdgeCount(end) + g.OutgoingEdgeCount(end) - 2);
- return (SumCompetitorCoverage(ec_edge, in_start) +
- SumCompetitorCoverage(ec_edge, out_start) +
- SumCompetitorCoverage(ec_edge, in_end) +
- SumCompetitorCoverage(ec_edge, out_end)) / total_edges;
- }
-
- template<class ContainerType>
- double MaxCompetitorCoverage(EdgeId ec_edge, const ContainerType& edges) const {
- const Graph &g = this->g();
- double result = 0;
- for (EdgeId e : edges) {
- //update if competitor edge is not loop
- if (e != ec_edge && g.EdgeStart(e) != g.EdgeEnd(e))
- result = std::max(result, g.coverage(e));
- }
- return result;
- }
-
- double MaxCompetitorCoverage(EdgeId ec_edge) const {
- const Graph &g = this->g();
- VertexId start = g.EdgeStart(ec_edge), end = g.EdgeEnd(ec_edge);
- auto in_start = g.IncomingEdges(start);
- auto out_start = g.OutgoingEdges(start);
- auto in_end = g.IncomingEdges(end);
- auto out_end = g.OutgoingEdges(end);
- return std::max(
- std::max(MaxCompetitorCoverage(ec_edge, in_start),
- MaxCompetitorCoverage(ec_edge, out_start)),
- std::max(MaxCompetitorCoverage(ec_edge, in_end),
- MaxCompetitorCoverage(ec_edge, out_end)));
- }
-
-public:
-
- RelativeCoverageECCondition(const Graph& g, double rcec_ratio) :
- base(g), rcec_ratio_(rcec_ratio) {
- }
-
- bool Check(EdgeId e) const override {
- //+1 is a trick to deal with edges of 0 coverage from iterative run
- double locality_coverage = AvgLocalityCoverage(e) + 1;
- return math::le(this->g().coverage(e), rcec_ratio_ * locality_coverage);
- }
-
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> AddRelativeCoverageECCondition(const Graph &g, double rcec_ratio,
- pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And(RelativeCoverageECCondition<Graph>(g, rcec_ratio), condition);
-}
-
-template<class Graph>
-inline bool IsSimpleBulge(const Graph &g, typename Graph::EdgeId e){
- size_t edge_count = g.GetEdgesBetween(g.EdgeStart(e), g.EdgeEnd(e)).size();
-
- return edge_count == g.OutgoingEdgeCount(g.EdgeStart(e)) &&
- edge_count == g.IncomingEdgeCount(g.EdgeEnd(e)) &&
- edge_count >= 2;
-}
-
-template<class Graph>
-class NotBulgeECCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
-public:
-
- NotBulgeECCondition(const Graph &g)
- : base(g) {
-
- }
-
- bool Check(EdgeId e) const {
- if (HasAlternatives(this->g(), e) && !IsSimpleBulge(this->g(), e)){
- DEBUG("edge id = " << this->g().int_id(e)
- << " between = " << this->g().GetEdgesBetween(this->g().EdgeStart(e), this->g().EdgeEnd(e)).size()
- << " between ids: " << this->g().GetEdgesBetween(this->g().EdgeStart(e), this->g().EdgeEnd(e))
- << " outgoing s = " << this->g().OutgoingEdgeCount(this->g().EdgeStart(e))
- << " incoming e = " << this->g().IncomingEdgeCount(this->g().EdgeEnd(e)));
- }
- return !IsSimpleBulge(this->g(), e);
- }
-
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> AddNotBulgeECCondition(const Graph &g,
- pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And(NotBulgeECCondition<Graph>(g), condition);
-}
-
-template<class Graph>
-bool RemoveErroneousEdgesInCoverageOrder(Graph &g,
- pred::TypedPredicate<typename Graph::EdgeId> removal_condition,
- double max_coverage,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
- AddAlternativesPresenceCondition(g, removal_condition),
- removal_handler);
-
- return erroneous_edge_remover.Run(CoverageComparator<Graph>(g),
- CoverageUpperBound<Graph>(g, max_coverage));
-}
-
-template<class Graph>
-bool RemoveErroneousEdgesInLengthOrder(Graph &g,
- pred::TypedPredicate<typename Graph::EdgeId> removal_condition,
- size_t max_length,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- omnigraph::EdgeRemovingAlgorithm<Graph> erroneous_edge_remover(g,
- AddAlternativesPresenceCondition(g, removal_condition),
- removal_handler);
-
- return erroneous_edge_remover.Run(LengthComparator<Graph>(g),
- LengthUpperBound<Graph>(g, max_length));
-}
-
-template<class Graph>
-class SelfConjugateCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- public:
-
- SelfConjugateCondition(const Graph& g)
- : base(g) {
- }
-
- bool Check(EdgeId e) const {
- return e == this->g().conjugate(e);
- }
-
- private:
- DECL_LOGGER("SelfConjugateCondition");
-};
-
-//coverage comparator
-//template<class Graph>
-//class RelativeCoverageCondition : public EdgeCondition<Graph> {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// typedef EdgeCondition<Graph> base;
-//
-// double min_coverage_gap_;
-//
-// bool StrongNeighbourCondition(EdgeId neighbour_edge,
-// EdgeId possible_ec) const {
-// return neighbour_edge == possible_ec
-// || math::gr(this->g().coverage(neighbour_edge),
-// this->g().coverage(possible_ec) * min_coverage_gap_);
-//// || this->g().length(neighbour_edge)
-//// >= neighbour_length_threshold_;
-// }
-//
-// bool CheckAdjacent(const vector<EdgeId>& edges, EdgeId possible_ec) const {
-// FOREACH (EdgeId e, edges) {
-// if (!StrongNeighbourCondition(e, possible_ec))
-// return false;
-// }
-// return true;
-// }
-//
-// public:
-//
-// RelativeCoverageCondition(const Graph& g, double min_coverage_gap)
-// : base(g),
-// min_coverage_gap_(min_coverage_gap) {
-//
-// }
-//
-// bool Check(EdgeId e) const {
-// const Graph& g = this->g();
-// return CheckAdjacent(g.IncidentEdges(g.EdgeStart(e)), e)
-// && CheckAdjacent(g.IncidentEdges(g.EdgeEnd(e)), e);
-// }
-//
-// private:
-// DECL_LOGGER("RelativeCoverageCondition")
-// ;
-//
-//};
-
-//todo refactor
-template<class Graph>
-class ThornCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- size_t uniqueness_length_;
- size_t dijkstra_depth_;
-
- bool Unique(const vector<EdgeId>& edges, bool forward) const {
- return edges.size() == 1 && CheckUniqueness(*edges.begin(), forward);
- }
-
- bool CheckUnique(EdgeId e) const {
- TRACE("Checking conditions for edge start");
- return Unique(vector<EdgeId>(this->g().in_begin(this->g().EdgeStart(e)), this->g().in_end(this->g().EdgeStart(e))), false)
- || Unique(vector<EdgeId>(this->g().out_begin(this->g().EdgeEnd(e)), this->g().out_end(this->g().EdgeEnd(e))), true);
- }
-
- bool CheckThorn(EdgeId e) const {
- if (this->g().EdgeStart(e) == this->g().EdgeEnd(e))
- return false;
- if (this->g().RelatedVertices(this->g().EdgeStart(e),
- this->g().EdgeEnd(e))) {
- return true;
- }
- if (this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) != 2)
- return false;
- if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) != 1)
- return false;
- if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) != 1)
- return false;
- if (this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) != 2)
- return false;
-
- auto dij = DijkstraHelper<Graph>::CreateBoundedDijkstra(this->g(), dijkstra_depth_);
- dij.Run(this->g().EdgeStart(e));
- vector<VertexId> reached = dij.ReachedVertices();
- for (auto it = reached.begin(); it != reached.end(); ++it) {
- if (*it != this->g().EdgeEnd(e)
- && this->g().RelatedVertices(*it, this->g().EdgeEnd(e))) {
- return true;
- }
- }
- return false;
- }
-
- template<class EdgeContainer>
- bool CheckAlternativeCoverage(const EdgeContainer& edges, EdgeId base) const {
- for (EdgeId e: edges) {
- if (e != base && this->g().length(e) < 400
- && this->g().coverage(e) < 15 * this->g().coverage(base)) {
- return false;
- }
- }
- return true;
- }
-
- bool CheckCoverageAround(EdgeId e) const {
- return CheckAlternativeCoverage(
- this->g().IncidentEdges(this->g().EdgeStart(e)), e)
- && CheckAlternativeCoverage(
- this->g().IncidentEdges(this->g().EdgeEnd(e)), e);
- }
-
- bool CheckUniqueness(EdgeId e, bool /*forward*/) const {
- return this->g().length(e) >= uniqueness_length_;
- }
-
- public:
-
- ThornCondition(Graph& g, size_t uniqueness_length, size_t dijkstra_depth)
- : base(g),
- uniqueness_length_(uniqueness_length),
- dijkstra_depth_(dijkstra_depth) {
- }
-
- bool Check(EdgeId e) const {
- bool tmp = (CheckUnique(e) || CheckCoverageAround(e));
- if (tmp)
- tmp &= CheckThorn(e);
- return tmp;
- }
-
- private:
- DECL_LOGGER("ThornCondition")
- ;
-
-};
-
-
-template<class Graph>
-class MultiplicityCounter {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- size_t uniqueness_length_;
- size_t max_depth_;
-
- bool search(VertexId a, VertexId start, EdgeId e, size_t depth,
- std::set<VertexId> &was, pair<size_t, size_t> &result) const {
- if (depth > max_depth_)
- return false;
- if (was.count(a) == 1)
- return true;
- was.insert(a);
- if (graph_.OutgoingEdgeCount(a) == 0
- || graph_.IncomingEdgeCount(a) == 0)
- return false;
- for (auto I = graph_.out_begin(a), E = graph_.out_end(a); I != E; ++I) {
- if (*I == e) {
- if (a != start) {
- return false;
- }
- } else {
- if (graph_.length(*I) >= uniqueness_length_) {
- result.second++;
- } else {
- if (!search(graph_.EdgeEnd(*I), start, e,
- depth + 1 /*graph_.length(*it)*/, was, result))
- return false;
- }
- }
- }
- for (EdgeId in_e : graph_.IncomingEdges(a)) {
- if (in_e == e) {
- if (a != start) {
- return false;
- }
- } else {
- if (graph_.length(in_e) >= uniqueness_length_) {
- result.first++;
- } else {
- if (!search(graph_.EdgeStart(in_e), start, e,
- depth + 1 /*graph_.length(*it)*/, was, result))
- return false;
- }
- }
- }
- return true;
- }
-
-public:
- MultiplicityCounter(const Graph &graph, size_t uniqueness_length,
- size_t max_depth)
- : graph_(graph),
- uniqueness_length_(uniqueness_length),
- max_depth_(max_depth) {
- }
-
- size_t count(EdgeId e, VertexId start) const {
- std::pair<size_t, size_t> result;
- std::set<VertexId> was;
- bool valid = search(start, start, e, 0, was, result);
- if (!valid) {
- return (size_t) (-1);
- }
- if (graph_.EdgeStart(e) == start) {
- if (result.first < result.second) {
- return (size_t) (-1);
- }
- return result.first - result.second;
- } else {
- if (result.first > result.second) {
- return (size_t) (-1);
- }
- return -result.first + result.second;
- }
- }
-};
-
-template<class Graph>
-class MultiplicityCountingCondition : public UniquenessPlausabilityCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pred::TypedPredicate<EdgeId> EdgePredicate;
- typedef UniquenessPlausabilityCondition<Graph> base;
-
- MultiplicityCounter<Graph> multiplicity_counter_;
- EdgePredicate plausiblity_condition_;
-
-public:
- bool CheckUniqueness(EdgeId e, bool forward) const {
- TRACE( "Checking " << this->g().int_id(e) << " for uniqueness in " << (forward ? "forward" : "backward") << " direction");
- VertexId start =
- forward ? this->g().EdgeEnd(e) : this->g().EdgeStart(e);
- bool result = multiplicity_counter_.count(e, start) <= 1;
- TRACE( "Edge " << this->g().int_id(e) << " is" << (result ? "" : " not") << " unique");
- return result;
- }
-
- bool CheckPlausibility(EdgeId e, bool) const {
- return plausiblity_condition_(e);
- }
-
- MultiplicityCountingCondition(const Graph& g, size_t uniqueness_length,
- EdgePredicate plausiblity_condition)
- :
- //todo why 8???
- base(g),
- multiplicity_counter_(g, uniqueness_length, 8),
- plausiblity_condition_(plausiblity_condition) {
-
- }
-
- private:
-
- DECL_LOGGER("MultiplicityCountingCondition")
- ;
-};
-
-
-template<class Graph>
-class ECLoopRemover : public EdgeProcessingAlgorithm<Graph> {
- typedef std::less<typename Graph::EdgeId> Comparator;
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- double ec_threshold_;
- double relative_threshold_;
- const AbstractFlankingCoverage<Graph> &flanking_coverage_;
- EdgeRemover<Graph> edge_remover_;
- size_t coverage_loops_removed = 0;
- size_t dead_loops_removed = 0;
- size_t not_dead_loops_removed = 0;
- size_t coverage_rc_loops_removed = 0;
- size_t dead_rc_loops_removed = 0;
- size_t not_dead_rc_loops_removed = 0;
-
- bool IsLoop(EdgeId e) {
- return this->g().EdgeStart(e) == this->g().EdgeEnd(e);
- }
-
- bool IsRCLoop(EdgeId e) {
- return this->g().EdgeStart(e) == this->g().conjugate(this->g().EdgeEnd(e));
- }
-
- bool IsAnyLoop(EdgeId e) {
- return IsRCLoop(e) || IsLoop(e);
- }
-
- void RemoveHiddenLoopEC(EdgeId e, bool break_on_end) {
- if (IsLoop(e))
- coverage_loops_removed++;
- else
- coverage_rc_loops_removed++;
- if (this->g().length(e) <= this->g().k())
- edge_remover_.DeleteEdge(e);
- else {
- if (break_on_end) {
- auto split_result = this->g().SplitEdge(e, this->g().length(e) - this->g().k());
- edge_remover_.DeleteEdge(split_result.second);
- } else {
- auto split_result = this->g().SplitEdge(e, this->g().k());
- edge_remover_.DeleteEdge(split_result.first);
- }
- }
-
- }
- void RemoveLoopWithNoCheck(EdgeId e) {
- if (IsLoop(e)) {
- if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 1 || this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) == 1)
- dead_loops_removed++;
- else
- not_dead_loops_removed++;
- } else {
- if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 2)
- dead_rc_loops_removed++;
- else
- not_dead_rc_loops_removed++;
-
- }
- edge_remover_.DeleteEdge(e);
- }
-
- bool FindHiddenLoopEC(EdgeId e) {
- if(flanking_coverage_.GetInCov(e) * relative_threshold_ < flanking_coverage_.GetOutCov(e) && flanking_coverage_.GetInCov(e) < ec_threshold_) {
- //start is bad, end is OK.
- RemoveHiddenLoopEC(e, false);
- return true;
- } else if(flanking_coverage_.GetOutCov(e) * relative_threshold_ < flanking_coverage_.GetInCov(e) && flanking_coverage_.GetOutCov(e) < ec_threshold_) {
- //end is bad, start is OK.
- RemoveHiddenLoopEC(e, true);
- return true;
- }
- RemoveLoopWithNoCheck(e);
- return false;
- }
-
- bool ProcessEdge(EdgeId e) {
- if (IsAnyLoop(e)) {
- DEBUG("Susp loop: " << this->g().int_id(e) << endl);
- bool res = FindHiddenLoopEC(e);
- if (res) { DEBUG("was removed"); } else { DEBUG("was not removed"); }
- return res;
- }
- return false;
- }
-
-
-public:
- ECLoopRemover(Graph &g, const AbstractFlankingCoverage<Graph> &flanking_coverage, double ec_threshold, double relative_threshold,
- HandlerF<Graph> removal_handler = 0): base(g),ec_threshold_(ec_threshold),
- relative_threshold_(relative_threshold), flanking_coverage_(flanking_coverage),
- edge_remover_(g, removal_handler){
- }
- void PrintLoopStats(){
- INFO("Loops: accurately removed/deadend removed/other: "<< coverage_loops_removed <<"/" << dead_loops_removed << "/" <<not_dead_loops_removed);
- INFO("RC loops: accurately removed/deadend removed/other: "<< coverage_rc_loops_removed <<"/" << dead_rc_loops_removed << "/" <<not_dead_rc_loops_removed);
- }
-private:
- DECL_LOGGER("ECLoopRemover");
-};
-
-
-template<class Graph>
-class HiddenECRemover: public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-private:
- size_t uniqueness_length_;
- double unreliability_threshold_;
- double ec_threshold_;
- double relative_threshold_;
- const AbstractFlankingCoverage<Graph> &flanking_coverage_;
- EdgeRemover<Graph> edge_remover_;
- MultiplicityCountingCondition<Graph> condition_;
-private:
- void RemoveHiddenEC(EdgeId edge) {
- if (this->g().length(edge) <= this->g().k() || (edge == this->g().conjugate(edge) && this->g().length(edge) <= 2 * this->g().k()))
- edge_remover_.DeleteEdge(edge);
- else {
- auto split_result = this->g().SplitEdge(edge, this->g().k());
- edge_remover_.DeleteEdge(split_result.first);
- }
- }
-
- void RemoveHiddenECWithNoCompression(EdgeId edge) {
- if (this->g().length(edge) <= this->g().k() || (edge == this->g().conjugate(edge) && this->g().length(edge) <= 2 * this->g().k())) {
- edge_remover_.DeleteEdgeWithNoCompression(edge);
- } else {
- auto split_result = this->g().SplitEdge(edge, this->g().k());
- edge_remover_.DeleteEdgeWithNoCompression(split_result.first);
- }
- }
-
- void DisconnectEdges(VertexId v) {
- while(!this->g().IsDeadEnd(v)) {
- RemoveHiddenECWithNoCompression(*(this->g().out_begin(v)));
- }
- }
-
- bool FindHiddenEC(VertexId v) {
- vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
- if(flanking_coverage_.GetInCov(edges[0]) > flanking_coverage_.GetInCov(edges[1])) {
- auto tmp = edges[0];
- edges[0] = edges[1];
- edges[1] = tmp;
- }
-// cout << flanking_coverage_.GetInCov(edges[0]) << " " << flanking_coverage_.GetInCov(edges[1]) << endl;
- if(flanking_coverage_.GetInCov(edges[1]) < unreliability_threshold_) {
- DisconnectEdges(v);
-// cout << "disconnected" << endl;
- return true;
- }
- if(flanking_coverage_.GetInCov(edges[0]) * relative_threshold_ < flanking_coverage_.GetInCov(edges[1]) && flanking_coverage_.GetInCov(edges[0]) < ec_threshold_) {
- RemoveHiddenEC(edges[0]);
-// cout << "success" << endl;
- return true;
- }
- return false;
- }
-
- bool CheckSuspicious(VertexId v) {
- if (this->g().IncomingEdgeCount(v) != 1 || this->g().OutgoingEdgeCount(v) != 2) {
- return false;
- }
- vector<EdgeId> edges(this->g().out_begin(v), this->g().out_end(v));
- return (edges.size() == 2 && this->g().conjugate(edges[0]) == edges[1] && condition_.CheckUniqueness(this->g().GetUniqueIncomingEdge(v), false)) || this->g().length(this->g().GetUniqueIncomingEdge(v)) >= uniqueness_length_;
- }
-
- bool ProcessEdge(EdgeId e) {
- VertexId v = this->g().EdgeEnd(e);
- if(CheckSuspicious(v)) {
-// cout << "client: " << this->g().int_id(v) << endl;
- return FindHiddenEC(v);
- }
- return false;
- }
-
-public:
- HiddenECRemover(Graph& g, size_t uniqueness_length,
- const AbstractFlankingCoverage<Graph> &flanking_coverage,
- double unreliability_threshold, double ec_threshold,
- double relative_threshold,
- std::function<void(EdgeId)> removal_handler = 0)
- : base(g), uniqueness_length_(uniqueness_length),
- unreliability_threshold_(unreliability_threshold * ec_threshold), ec_threshold_(ec_threshold),
- relative_threshold_(relative_threshold), flanking_coverage_(flanking_coverage),
- edge_remover_(g, removal_handler),
- condition_(g, uniqueness_length, pred::AlwaysTrue<EdgeId>()) {
- }
-
-private:
- DECL_LOGGER("HiddenECRemover");
-};
-
-template<class Graph>
-class SelfConjugateDisruptor: public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- EdgeRemover<Graph> edge_remover_;
-protected:
-
- bool ProcessEdge(EdgeId e) override {
- if (e == this->g().conjugate(e)) {
- TRACE("Disrupting self-conjugate edge " << this->g().str(e));
- EdgeId to_del = e;
- size_t len = this->g().length(e);
- if (len > 1) {
- to_del = this->g().SplitEdge(e, len / 2).second;
- }
- edge_remover_.DeleteEdge(to_del);
- return true;
- }
- return false;
- }
-
-public:
- SelfConjugateDisruptor(Graph& g,
- std::function<void(EdgeId)> removal_handler = 0)
- : base(g, true), edge_remover_(g, removal_handler) {
- }
-
-private:
- DECL_LOGGER("SelfConjugateDisruptor");
-};
-}
diff --git a/src/modules/algorithms/simplification/parallel_simplification_algorithms.hpp b/src/modules/algorithms/simplification/parallel_simplification_algorithms.hpp
deleted file mode 100644
index bea146c..0000000
--- a/src/modules/algorithms/simplification/parallel_simplification_algorithms.hpp
+++ /dev/null
@@ -1,820 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "cleaner.hpp"
-#include "bulge_remover.hpp"
-#include "dev_support/standard_base.hpp"
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
-#include "assembly_graph/graph_core/construction_helper.hpp"
-#include "assembly_graph/graph_support/marks_and_locks.hpp"
-#include "compressor.hpp"
-
-namespace debruijn {
-
-namespace simplification {
-
-// bool EnableParallel() {
-// if (simplif_cfg_.presimp.parallel) {
-// INFO("Trying to enable parallel presimplification.");
-// if (gp_.g.AllHandlersThreadSafe()) {
-// return true;
-// } else {
-// WARN("Not all handlers are threadsafe, switching to non-parallel presimplif");
-// //gp.g.PrintHandlersNames();
-// }
-// }
-// return false;
-// }
-
-template<class Graph>
-class ParallelTipClippingFunctor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- size_t length_bound_;
- double coverage_bound_;
- HandlerF handler_f_;
-
- size_t LockingIncomingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.IncomingEdgeCount(v);
- }
-
- size_t LockingOutgoingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.OutgoingEdgeCount(v);
- }
-
- bool IsIncomingTip(EdgeId e) const {
- return g_.length(e) <= length_bound_ && math::le(g_.coverage(e), coverage_bound_)
- && LockingIncomingCount(g_.EdgeStart(e)) + LockingOutgoingCount(g_.EdgeStart(e)) == 1;
- }
-
- void RemoveEdge(EdgeId e) {
- //even full tip locking can't lead to deadlock
- VertexLockT lock1(g_.EdgeStart(e));
- VertexLockT lock2(g_.EdgeEnd(e));
- g_.DeleteEdge(e);
- }
-
-public:
-
- ParallelTipClippingFunctor(Graph& g, size_t length_bound, double coverage_bound, HandlerF handler_f = 0)
- : g_(g),
- length_bound_(length_bound),
- coverage_bound_(coverage_bound),
- handler_f_(handler_f) {
-
- }
-
- bool Process(VertexId v) {
- if (LockingOutgoingCount(v) == 0)
- return false;
-
- vector<EdgeId> tips;
- //don't need lock here after the previous check
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (IsIncomingTip(e)) {
- tips.push_back(e);
- }
- }
-
- //if all of the edges are tips, leave the longest one
- if (!tips.empty() && tips.size() == g_.IncomingEdgeCount(v)) {
- sort(tips.begin(), tips.end(), omnigraph::LengthComparator<Graph>(g_));
- tips.pop_back();
- }
-
- for (EdgeId e : tips) {
- if (handler_f_) {
- handler_f_(e);
- }
- //don't need any synchronization here!
- RemoveEdge(e);
- }
- return false;
- }
-
- bool ShouldFilterConjugate() const {
- return false;
- }
-};
-
-template<class Graph>
-class ParallelSimpleBRFunctor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- size_t max_length_;
- double max_coverage_;
- double max_relative_coverage_;
- size_t max_delta_;
- double max_relative_delta_;
- std::function<void(EdgeId)> handler_f_;
-
- bool LengthDiffCheck(size_t l1, size_t l2, size_t delta) const {
- return l1 <= l2 + delta && l2 <= l1 + delta;
- }
-
- EdgeId Alternative(EdgeId e, const vector<EdgeId>& edges) const {
- size_t delta = omnigraph::CountMaxDifference(max_delta_, g_.length(e), max_relative_delta_);
- for (auto it = edges.rbegin(); it != edges.rend(); ++it) {
- EdgeId candidate = *it;
- if (g_.EdgeEnd(candidate) == g_.EdgeEnd(e) && candidate != e && candidate != g_.conjugate(e)
- && LengthDiffCheck(g_.length(candidate), g_.length(e), delta)) {
- return candidate;
- }
- }
- return EdgeId(0);
- }
-
- bool ProcessEdges(const vector<EdgeId>& edges) {
- for (EdgeId e : edges) {
- if (g_.length(e) <= max_length_ && math::le(g_.coverage(e), max_coverage_)) {
- EdgeId alt = Alternative(e, edges);
- if (alt != EdgeId(0) && math::ge(g_.coverage(alt) * max_relative_coverage_, g_.coverage(e))) {
- //todo: does not work with multiple threads for now :)
- //Reasons: id distribution, kmer-mapping
- handler_f_(e);
- g_.GlueEdges(e, alt);
- return true;
- }
- }
- }
- return false;
- }
-
- vector<VertexId> MultiEdgeDestinations(VertexId v) const {
- vector<VertexId> answer;
- set<VertexId> destinations;
- for (EdgeId e : g_.OutgoingEdges(v)) {
- VertexId end = g_.EdgeEnd(e);
- if (destinations.count(end) > 0) {
- answer.push_back(end);
- }
- destinations.insert(end);
- }
- return answer;
- }
-
- VertexId SingleMultiEdgeDestination(VertexId v) const {
- vector<VertexId> dests = MultiEdgeDestinations(v);
- if (dests.size() == 1) {
- return dests.front();
- } else {
- return VertexId(0);
- }
- }
-
- void RemoveBulges(VertexId v) {
- bool flag = true;
- while (flag) {
- vector<EdgeId> edges(g_.out_begin(v), g_.out_end(v));
- if (edges.size() == 1)
- return;
- sort(edges.begin(), edges.end(), omnigraph::CoverageComparator<Graph>(g_));
- flag = ProcessEdges(edges);
- }
- }
-
- bool CheckVertex(VertexId v) const {
- VertexLockT lock(v);
- return MultiEdgeDestinations(v).size() == 1 && MultiEdgeDestinations(g_.conjugate(v)).size() == 0;
- }
-
- size_t MinId(VertexId v) const {
- return std::min(v.int_id(), g_.conjugate(v).int_id());
- }
-
- bool IsMinimal(VertexId v1, VertexId v2) const {
- return MinId(v1) < MinId(v2);
- }
-
-public:
-
- ParallelSimpleBRFunctor(Graph& g, size_t max_length, double max_coverage, double max_relative_coverage, size_t max_delta, double max_relative_delta,
- std::function<void(EdgeId)> handler_f = 0)
- : g_(g),
- max_length_(max_length),
- max_coverage_(max_coverage),
- max_relative_coverage_(max_relative_coverage),
- max_delta_(max_delta),
- max_relative_delta_(max_relative_delta),
- handler_f_(handler_f) {
-
- }
-
- bool operator()(VertexId v/*, need number of vertex for stable id distribution*/) {
- vector<VertexId> multi_dest;
-
- {
- VertexLockT lock(v);
- multi_dest = MultiEdgeDestinations(v);
- }
-
- if (multi_dest.size() == 1 && IsMinimal(v, multi_dest.front())) {
- VertexId dest = multi_dest.front();
- if (CheckVertex(v) && CheckVertex(g_.conjugate(dest))) {
- VertexLockT lock1(v);
- VertexLockT lock2(dest);
- RemoveBulges(v);
- }
- }
- return false;
- }
-
- bool ShouldFilterConjugate() const {
- return false;
- }
-};
-
-template<class Graph>
-class CriticalEdgeMarker {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
-
- Graph& g_;
- size_t chunk_cnt_;
- omnigraph::GraphElementMarker<EdgeId> edge_marker_;
-
- void ProcessVertex(VertexId v) {
- if (g_.OutgoingEdgeCount(v) > 0) {
- auto max_cov_it =
- std::max_element(g_.out_begin(v), g_.out_end(v), omnigraph::CoverageComparator<Graph>(g_));
- DEBUG("Marking edge " << g_.str(*max_cov_it));
- edge_marker_.mark(*max_cov_it);
- }
- }
-
- template<class It>
- void ProcessVertices(It begin, It end) {
- for (auto it = begin; !(it == end); ++it) {
- ProcessVertex(*it);
- }
- }
-
-public:
-
- CriticalEdgeMarker(Graph& g, size_t chunk_cnt) : g_(g), chunk_cnt_(chunk_cnt) {
- }
-
- void PutMarks() {
- auto chunk_iterators = omnigraph::IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt_);
-
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- ProcessVertices(chunk_iterators[i], chunk_iterators[i + 1]);
- }
- }
-
- void ClearMarks() {
- auto chunk_iterators = omnigraph::IterationHelper<Graph, EdgeId>(g_).Chunks(chunk_cnt_);
-
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- for (auto it = chunk_iterators[i]; it != chunk_iterators[i + 1]; ++ it) {
- edge_marker_.unmark(*it);
- }
- }
- }
-private:
- DECL_LOGGER("CriticalEdgeMarker");
-};
-
-template<class Graph>
-class ParallelLowCoverageFunctor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- typename Graph::HelperT helper_;
- pred::TypedPredicate<EdgeId> ec_condition_;
- HandlerF handler_f_;
-
- omnigraph::GraphElementMarker<EdgeId> edge_marker_;
- vector<EdgeId> edges_to_remove_;
-
- void UnlinkEdgeFromStart(EdgeId e) {
- VertexId start = g_.EdgeStart(e);
- VertexLockT lock(start);
- helper_.DeleteLink(start, e);
- }
-
- void UnlinkEdge(EdgeId e) {
- UnlinkEdgeFromStart(e);
- if (g_.conjugate(e) != e)
- UnlinkEdgeFromStart(g_.conjugate(e));
- }
-
-public:
-
- //should be launched with conjugate copies filtered
- ParallelLowCoverageFunctor(Graph& g, size_t max_length, double max_coverage, HandlerF handler_f = 0)
- : g_(g),
- helper_(g_.GetConstructionHelper()),
- ec_condition_(pred::And(pred::And(omnigraph::LengthUpperBound<Graph>(g, max_length),
- omnigraph::CoverageUpperBound<Graph>(g, max_coverage)),
- omnigraph::AlternativesPresenceCondition<Graph>(g))),
- handler_f_(handler_f) {}
-
- bool IsOfInterest(EdgeId e) const {
- return !edge_marker_.is_marked(e) && ec_condition_(e);
- }
-
- void PrepareForProcessing(size_t /*interesting_cnt*/) {
- }
-
- //no conjugate copies here!
- bool Process(EdgeId e, size_t /*idx*/) {
- if (handler_f_)
- handler_f_(e);
- DEBUG("Removing edge " << g_.str(e));
- g_.FireDeleteEdge(e);
- UnlinkEdge(e);
- helper_.DeleteUnlinkedEdge(e);
- return true;
- }
-
- bool ShouldFilterConjugate() const {
- return true;
- }
-// bool operator()(EdgeId e) {
-// if (ec_condition_->Check(e)) {
-// edges_to_remove_.push_back(e);
-// }
-// return false;
-// }
-//
-// void RemoveCollectedEdges() {
-// omnigraph::SmartSetIterator<Graph, EdgeId> to_delete(g_, edges_to_remove_.begin(), edges_to_remove_.end());
-// while (!to_delete.IsEnd()) {
-// EdgeId e = *to_delete;
-// handler_f_(e);
-// g_.DeleteEdge(e);
-// ++to_delete;
-// }
-// }
-private:
- DECL_LOGGER("ParallelLowCoverageFunctor");
-};
-
-template<class Graph>
-class ParallelCompressor {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::EdgeData EdgeData;
- typedef typename Graph::VertexId VertexId;
- typedef omnigraph::GraphElementLock<VertexId> VertexLockT;
-
- Graph& g_;
- typename Graph::HelperT helper_;
- restricted::IdSegmentStorage segment_storage_;
-
- bool IsBranching(VertexId v) const {
-// VertexLockT lock(v);
- return !g_.CheckUniqueOutgoingEdge(v) || !g_.CheckUniqueIncomingEdge(v);
- }
-
- size_t LockingIncomingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.IncomingEdgeCount(v);
- }
-
- size_t LockingOutgoingCount(VertexId v) const {
- VertexLockT lock(v);
- return g_.OutgoingEdgeCount(v);
- }
-
- vector<VertexId> LockingNextVertices(VertexId v) const {
- VertexLockT lock(v);
- vector<VertexId> answer;
- for (EdgeId e : g_.OutgoingEdges(v)) {
- answer.push_back(g_.EdgeEnd(e));
- }
- return answer;
- }
-
- vector<VertexId> FilterBranchingVertices(const vector<VertexId>& vertices) const {
- vector<VertexId> answer;
- for (VertexId v : vertices) {
- VertexLockT lock(v);
- if (!IsBranching(v)) {
- answer.push_back(v);
- }
- }
- return answer;
- }
-
- //correctly handles self-conjugate case
- bool IsMinimal(VertexId v1, VertexId v2) const {
- return !(g_.conjugate(v2) < v1);
- }
-
- //returns true if we need to go further, false if we should stop for any reason
- //to_compress is non-empty only if compression needs to be done
- //don't need additional checks for v == init | conjugate(init), because init is branching!
- //fixme what about plasmids?! =)
- bool ProcessNextAndGo(VertexId& v, VertexId init, vector<VertexId>& to_compress) {
- VertexLockT lock(v);
- if (!CheckConsistent(v)) {
- to_compress.clear();
- return false;
- }
- if (IsBranching(v)) {
- if (!IsMinimal(init, v)) {
- to_compress.clear();
- }
- return false;
- } else {
- to_compress.push_back(v);
- v = g_.EdgeEnd(g_.GetUniqueOutgoingEdge(v));
- return true;
- }
- }
-
- void UnlinkEdge(VertexId v, EdgeId e) {
- VertexLockT lock(v);
- helper_.DeleteLink(v, e);
- }
-
- void UnlinkEdges(VertexId v) {
- VertexLockT lock(v);
- helper_.DeleteLink(v, g_.GetUniqueOutgoingEdge(v));
- helper_.DeleteLink(g_.conjugate(v), g_.GetUniqueOutgoingEdge(g_.conjugate(v)));
- }
-
- //fixme duplication with abstract conj graph
- //not locking!
- vector<EdgeId> EdgesToDelete(const vector<EdgeId> &path) const {
- set<EdgeId> edgesToDelete;
- edgesToDelete.insert(path[0]);
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- if (edgesToDelete.find(g_.conjugate(e)) == edgesToDelete.end())
- edgesToDelete.insert(e);
- }
- return vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
- }
-
- //not locking!
- //fixme duplication with abstract conj graph
- vector<VertexId> VerticesToDelete(const vector<EdgeId> &path) const {
- set<VertexId> verticesToDelete;
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- VertexId v = g_.EdgeStart(e);
- if (verticesToDelete.find(g_.conjugate(v)) == verticesToDelete.end())
- verticesToDelete.insert(v);
- }
- return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
- }
- //todo end duplication with abstract conj graph
-
- //not locking!
- vector<EdgeId> CollectEdges(const vector<VertexId>& to_compress) const {
- vector<EdgeId> answer;
- answer.push_back(g_.GetUniqueIncomingEdge(to_compress.front()));
- for (VertexId v : to_compress) {
- answer.push_back(g_.GetUniqueOutgoingEdge(v));
- }
- return answer;
- }
-
- void CallHandlers(const vector<EdgeId>& edges, EdgeId new_edge) const {
- g_.FireMerge(edges, new_edge);
- g_.FireDeletePath(EdgesToDelete(edges), VerticesToDelete(edges));
- g_.FireAddEdge(new_edge);
- }
-
- EdgeData MergedData(const vector<EdgeId>& edges) const {
- vector<const EdgeData*> to_merge;
- for (EdgeId e : edges) {
- to_merge.push_back(&(g_.data(e)));
- }
- return g_.master().MergeData(to_merge);
- }
-
- EdgeId SyncAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId new_edge = helper_.AddEdge(data, id_distributor);
- {
- VertexLockT lock(v1);
- helper_.LinkOutgoingEdge(v1, new_edge);
- }
- if (g_.conjugate(new_edge) != new_edge) {
- VertexLockT lock(v2);
- helper_.LinkIncomingEdge(v2, new_edge);
- }
- return new_edge;
- }
-
- void ProcessBranching(VertexId next, VertexId init, size_t idx) {
- vector<VertexId> to_compress;
- while (ProcessNextAndGo(next, init, to_compress)) {
- }
-
- if (!to_compress.empty()) {
- //here we are sure that we are the ones processing the path,
- //so we can collect the edges without any trouble (and actually without locks; todo: check!)
- vector<EdgeId> edges = CollectEdges(to_compress);
-
- restricted::ListIdDistributor<restricted::SegmentIterator> id_distributor = segment_storage_.GetSegmentIdDistributor(2 * idx, 2 * idx + 1);
-
- EdgeId new_edge = SyncAddEdge(g_.EdgeStart(edges.front()), g_.EdgeEnd(edges.back()), MergeSequences(g_, edges), id_distributor);
-
- CallHandlers(edges, new_edge);
-
- VertexId final = g_.EdgeEnd(edges.back());
- UnlinkEdge(init, edges.front());
- for (VertexId v : VerticesToDelete(edges/*to_compress*/)) {
- UnlinkEdges(v);
- }
-
- if (g_.conjugate(new_edge) != new_edge) {
- UnlinkEdge(g_.conjugate(final), g_.conjugate(edges.back()));
- }
-
- for (EdgeId e : EdgesToDelete(edges)) {
- helper_.DeleteUnlinkedEdge(e);
- }
- }
- }
-
- //vertex is not consistent if the path has already been compressed or is currently being compressed
- //not needed here, but could also check whether the vertex is fully isolated
- bool CheckConsistent(VertexId v) const {
- //todo change to incoming edge count
- return g_.OutgoingEdgeCount(g_.conjugate(v)) > 0;
- }
-
- //long, but safe way to get left neighbour
- //heavily relies on the current graph structure!
- VertexId LockingGetInit(VertexId v) {
- VertexLockT lock(v);
- if (!CheckConsistent(v))
- return VertexId(0);
-
- //works even if this edge is already unlinked from the vertex =)
- VERIFY(g_.CheckUniqueIncomingEdge(v));
- return g_.EdgeStart(g_.GetUniqueIncomingEdge(v));
- }
-
-public:
-
- ParallelCompressor(Graph& g)
- : g_(g),
- helper_(g_.GetConstructionHelper()) {
-
- }
-
- //returns true iff v is the "leftmost" vertex to compress in the chain
- bool IsOfInterest(VertexId v) const {
- return !IsBranching(v) && IsBranching(g_.EdgeStart(g_.GetUniqueIncomingEdge(v)));
- }
-
- void PrepareForProcessing(size_t interesting_cnt) {
- segment_storage_ = g_.GetGraphIdDistributor().Reserve(interesting_cnt * 2);
- }
-
- bool Process(VertexId v, size_t idx) {
- VertexId init = LockingGetInit(v);
- if (init != VertexId(0))
- ProcessBranching(v, init, idx);
- return false;
- }
-
- bool ShouldFilterConjugate() const {
- return false;
- }
-
-};
-
-
-//todo add conjugate filtration
-template<class Graph, class ElementType>
-class AlgorithmRunner {
- const Graph& g_;
-
- template<class Algo, class It>
- bool ProcessBucket(Algo& algo, It begin, It end) {
- bool changed = false;
- for (auto it = begin; it != end; ++it) {
- changed |= algo.Process(*it);
- }
- return changed;
- }
-
-public:
-
- const Graph& g() const {
- return g_;
- }
-
- AlgorithmRunner(Graph& g)
- : g_(g) {
-
- }
-
- template<class Algo, class ItVec>
- bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
- DEBUG("Running from " << chunk_iterators.size() - 1 << "chunks");
- VERIFY(chunk_iterators.size() > 1);
- bool changed = false;
- #pragma omp parallel for schedule(guided) reduction(|:changed)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- changed |= ProcessBucket(algo, chunk_iterators[i], chunk_iterators[i + 1]);
- }
- DEBUG("Finished");
- return changed;
- }
-private:
- DECL_LOGGER("AlgorithmRunner")
- ;
-};
-
-template<class Graph, class ElementType>
-class TwoStepAlgorithmRunner {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
- const bool filter_conjugate_;
- std::vector<std::vector<ElementType>> elements_of_interest_;
-
- template<class Algo>
- bool ProcessBucket(Algo& algo, const std::vector<ElementType>& bucket, size_t idx_offset) const {
- bool changed = false;
- for (ElementType el : bucket) {
- changed |= algo.Process(el, idx_offset++);
- }
- return changed;
- }
-
- template<class Algo>
- bool Process(Algo& algo) const {
- std::vector<size_t> cumulative_bucket_sizes;
- cumulative_bucket_sizes.push_back(0);
- for (const auto& bucket : elements_of_interest_) {
- cumulative_bucket_sizes.push_back(cumulative_bucket_sizes.back() + bucket.size());
- }
- DEBUG("Preparing for processing");
- algo.PrepareForProcessing(cumulative_bucket_sizes.back());
- bool changed = false;
- DEBUG("Processing buckets");
- #pragma omp parallel for schedule(guided) reduction(|:changed)
- for (size_t i = 0; i < elements_of_interest_.size(); ++i) {
- changed |= ProcessBucket(algo, elements_of_interest_[i], cumulative_bucket_sizes[i]);
- }
- return changed;
- }
-
- template<class Algo>
- void CountElement(Algo& algo, ElementType el, size_t bucket) {
- if (filter_conjugate_ && g_.conjugate(el) < el)
- return;
- if (algo.IsOfInterest(el)) {
- TRACE("Element " << g_.str(el) << " is of interest");
- elements_of_interest_[bucket].push_back(el);
- } else {
- TRACE("Element " << g_.str(el) << " is not interesting");
- }
- }
-
- template<class Algo, class It>
- void CountAll(Algo& algo, It begin, It end, size_t bucket) {
- for (auto it = begin; !(it == end); ++it) {
- CountElement(algo, *it, bucket);
- }
- }
-
-public:
-
- const Graph& g() const {
- return g_;
- }
-
- //conjugate elements are filtered based on ids
- //should be used only if both conjugate elements are simultaneously either interesting or not
- //fixme filter_conjugate is redundant
- TwoStepAlgorithmRunner(Graph& g, bool filter_conjugate)
- : g_(g),
- filter_conjugate_(filter_conjugate) {
-
- }
-
- template<class Algo, class ItVec>
- bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators) {
- DEBUG("Started running from " << chunk_iterators.size() - 1 << " chunks");
- VERIFY(algo.ShouldFilterConjugate() == filter_conjugate_);
- VERIFY(chunk_iterators.size() > 1);
- elements_of_interest_.clear();
- elements_of_interest_.resize(chunk_iterators.size() - 1);
- DEBUG("Searching elements of interest");
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- CountAll(algo, chunk_iterators[i], chunk_iterators[i + 1], i);
- }
- DEBUG("Processing");
- return Process(algo);
- }
-
-// template<class Algo, class It>
-// void RunFromIterator(Algo& algo, It begin, It end) {
-// RunFromChunkIterators(algo, std::vector<It> { begin, end });
-// }
-private:
- DECL_LOGGER("TwoStepAlgorithmRunner")
- ;
-};
-
-template<class Graph, class ElementType>
-class SemiParallelAlgorithmRunner {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph& g_;
-
-public:
-
- const Graph& g() const {
- return g_;
- }
-
- SemiParallelAlgorithmRunner(Graph& g)
- : g_(g) {
-
- }
-
- template<class Algo, class ItVec, class Comparator = std::less<ElementType>>
- bool RunFromChunkIterators(Algo& algo, const ItVec& chunk_iterators,
- const Comparator& comp = Comparator()) {
- VERIFY(chunk_iterators.size() > 1);
- omnigraph::SmartSetIterator<Graph, ElementType, Comparator> it(g_, false, comp);
-
- FillInterestingFromChunkIterators(chunk_iterators, it,
- std::bind(&Algo::IsOfInterest, std::ref(algo), std::placeholders::_1));
-
- bool changed = false;
- for (; !it.IsEnd(); ++it) {
- changed |= algo.Process(*it);
- }
- return changed;
- }
-
-private:
- DECL_LOGGER("SemiParallelAlgorithmRunner")
- ;
-};
-
-//todo generalize to use for other algorithms if needed
-template<class Graph>
-class SemiParallelEdgeRemovingAlgorithm {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- Graph& g_;
- pred::TypedPredicate<EdgeId> condition_;
- omnigraph::EdgeRemover<Graph> edge_remover_;
-
-public:
- SemiParallelEdgeRemovingAlgorithm(Graph& g,
- pred::TypedPredicate<EdgeId> condition,
- std::function<void(EdgeId)> removal_handler = 0) :
- g_(g), condition_(condition), edge_remover_(g, removal_handler) {
- }
-
- bool IsOfInterest(EdgeId e) const {
- return condition_(e);
- }
-
- bool Process(EdgeId e) {
- edge_remover_.DeleteEdge(e);
- return true;
- }
-};
-
-template<class Graph, class AlgoRunner, class Algo>
-bool RunVertexAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
- return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper<Graph, typename Graph::VertexId>(g).Chunks(chunk_cnt));
-}
-
-template<class Graph, class AlgoRunner, class Algo>
-bool RunEdgeAlgorithm(Graph& g, AlgoRunner& runner, Algo& algo, size_t chunk_cnt) {
- return runner.RunFromChunkIterators(algo, omnigraph::IterationHelper<Graph, typename Graph::EdgeId>(g).Chunks(chunk_cnt));
-}
-
-}
-
-}
diff --git a/src/modules/algorithms/simplification/relative_coverage_remover.hpp b/src/modules/algorithms/simplification/relative_coverage_remover.hpp
deleted file mode 100644
index bc6da7e..0000000
--- a/src/modules/algorithms/simplification/relative_coverage_remover.hpp
+++ /dev/null
@@ -1,674 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-#include "visualization/graph_colorer.hpp"
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-
-namespace omnigraph {
-
-namespace simplification {
-
-template<class EdgeContainer>
-void SingleEdgeAdapter(
- const EdgeContainer& edges,
- std::function<void(typename EdgeContainer::value_type)> single_edge_handler_f) {
- for (auto e : edges) {
- single_edge_handler_f(e);
- }
-}
-
-namespace relative_coverage {
-
-template<class Graph>
-class Component {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- set<EdgeId> edges_;
- set<VertexId> inner_vertices_;
- set<VertexId> border_;
- set<VertexId> terminating_vertices_;
- //maybe use something more sophisticated in the future
- size_t cumm_length_;
- bool contains_deadends_;
-
- //if edge start == edge end == v, returns v
- VertexId OppositeEnd(EdgeId e, VertexId v) const {
- VERIFY(g_.EdgeStart(e) == v
- || g_.EdgeEnd(e) == v);
-// VERIFY(remover_.g.EdgeStart(e) != remover_.g.EdgeEnd(e));
- if (g_.EdgeStart(e) == v) {
- return g_.EdgeEnd(e);
- } else {
- return g_.EdgeStart(e);
- }
- }
-
- void RemoveFromBorder(VertexId v) {
- size_t cnt = border_.erase(v);
- VERIFY(cnt);
- }
-
-public:
-
- Component(const Graph& g, EdgeId e) : g_(g), cumm_length_(0), contains_deadends_(false) {
- edges_.insert(e);
- cumm_length_ += g_.length(e);
- border_.insert(g.EdgeStart(e));
- border_.insert(g.EdgeEnd(e));
- }
-
- void MakeInner(VertexId v) {
- VERIFY(border_.count(v) > 0);
- if (g_.IsDeadEnd(v) || g_.IsDeadStart(v)) {
- contains_deadends_ = true;
- }
- inner_vertices_.insert(v);
- for (EdgeId e : g_.IncidentEdges(v)) {
- //seems to correctly handle loops
- if (edges_.count(e) == 0) {
- edges_.insert(e);
- cumm_length_ += g_.length(e);
- VertexId other_end = OppositeEnd(e, v);
- if (inner_vertices_.count(other_end) == 0) {
- border_.insert(other_end);
- }
- }
- }
- RemoveFromBorder(v);
- }
-
- void TerminateOnVertex(VertexId v) {
- terminating_vertices_.insert(v);
- RemoveFromBorder(v);
- }
-
- VertexId NextBorderVertex() const {
- return *border_.begin();
- }
-
- bool IsBorderEmpty() const {
- return border_.empty();
- }
-
- const set<EdgeId>& edges() const {
- return edges_;
- }
-
- bool contains(EdgeId e) const {
- return edges_.count(e) > 0;
- }
-
- const set<VertexId>& terminating_vertices() const {
- return terminating_vertices_;
- }
-
- set<EdgeId> terminating_edges() const {
- set<EdgeId> answer;
- for (VertexId v : terminating_vertices()) {
- for (EdgeId e : g_.IncidentEdges(v)) {
- if (contains(e)) {
- answer.insert(e);
- }
- }
- }
- return answer;
- }
-
- //terminating edges, going into the component
- set<EdgeId> terminating_in_edges() const {
- set<EdgeId> answer;
- for (VertexId v : terminating_vertices()) {
- for (EdgeId e : g_.OutgoingEdges(v)) {
- if (contains(e)) {
- answer.insert(e);
- }
- }
- }
- return answer;
- }
-
- //terminating edges, going out of the component
- set<EdgeId> terminating_out_edges() const {
- set<EdgeId> answer;
- for (VertexId v : terminating_vertices()) {
- for (EdgeId e : g_.IncomingEdges(v)) {
- if (contains(e)) {
- answer.insert(e);
- }
- }
- }
- return answer;
- }
-
- const Graph& g() const {
- return g_;
- }
-
- size_t inner_vertex_cnt() const {
- return inner_vertices_.size();
- }
-
- size_t length() const {
- return cumm_length_;
- }
-
- bool contains_deadends() const {
- return contains_deadends_;
- }
-};
-
-template<class Graph>
-class RelativeCoverageHelper {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
-
- const Graph& g_;
- LocalCoverageFT local_coverage_f_;
- double min_coverage_gap_;
-
-public:
- RelativeCoverageHelper(const Graph& g, LocalCoverageFT local_coverage_f,
- double min_coverage_gap)
- : g_(g),
- local_coverage_f_(local_coverage_f),
- min_coverage_gap_(min_coverage_gap) {
- VERIFY(math::gr(min_coverage_gap, 1.));
- }
-
- double LocalCoverage(EdgeId e, VertexId v) const {
- DEBUG("Local coverage of edge " << g_.str(e) << " around vertex " << g_.str(v) << " was " << local_coverage_f_(e, v));
- return local_coverage_f_(e, v);
- }
-
- template<class EdgeContainer>
- double MaxLocalCoverage(const EdgeContainer& edges, VertexId v) const {
- double answer = 0.0;
- for (EdgeId e : edges) {
- answer = max(answer, LocalCoverage(e, v));
- }
- return answer;
- }
-
- template<class EdgeContainer>
- bool CheckAnyHighlyCovered(const EdgeContainer& edges, VertexId v,
- double base_coverage) const {
- return math::gr(MaxLocalCoverage(edges, v),
- base_coverage * min_coverage_gap_);
- }
-
- double RelativeCoverageToReport(VertexId v, double base_coverage) const {
- return std::min(MaxLocalCoverage(g_.OutgoingEdges(v), v),
- MaxLocalCoverage(g_.IncomingEdges(v), v))
- / base_coverage;
- }
-
-private:
- DECL_LOGGER("RelativeCoverageHelper");
-};
-
-template<class Graph>
-class LongestPathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Component<Graph>& component_;
- const Graph& g_;
- map<VertexId, int> max_distance_;
- vector<VertexId> vertex_stack_;
- bool cycle_detected_;
-
- //distance is changed!
- bool TryGetMaxDistance(VertexId v, int& distance) {
- if (max_distance_.count(v) > 0) {
- distance = max_distance_[v];
- return true;
- }
-
- //minus infinity for incoming tips
- distance = std::numeric_limits<int>::min();
- for (EdgeId e : g_.IncomingEdges(v)) {
- VertexId start = g_.EdgeStart(e);
- if (component_.contains(e)) {
- if (max_distance_.count(start) == 0) {
- if (std::find(vertex_stack_.begin(), vertex_stack_.end(), start) != vertex_stack_.end()) {
- cycle_detected_ = true;
- }
- vertex_stack_.push_back(start);
- return false;
- } else {
- distance = std::max(distance, max_distance_[start] + int(g_.length(e)));
- }
- }
- }
- //todo think...
- //currently the whole length of a zig-zag path
- //through several terminal vertices is counted
- if (component_.terminating_vertices().count(v) > 0) {
- distance = std::max(distance, 0);
- }
- return true;
- }
-
- void ProcessVertex(VertexId init_v) {
- vertex_stack_.push_back(init_v);
- while (!vertex_stack_.empty()) {
- if (cycle_detected_)
- return;
-
- VertexId v = vertex_stack_.back();
- int max_dist = 0;
- if (TryGetMaxDistance(v, max_dist)) {
- max_distance_[v] = max_dist;
- vertex_stack_.pop_back();
- }
- }
- }
-
-public:
- LongestPathFinder(const Component<Graph>& component)
- : component_(component), g_(component.g()), cycle_detected_(false) {
- }
-
- //returns -1u if the component contains a cycle or there is no path between terminating vertices
- size_t Find() {
- int answer = 0;
- for (VertexId v : component_.terminating_vertices()) {
- ProcessVertex(v);
- if (cycle_detected_)
- return -1u;
- VERIFY(max_distance_.count(v) > 0);
- answer = std::max(answer, get(max_distance_, v));
- }
- VERIFY(answer >= 0);
- if (answer == 0)
- return -1u;
- return size_t(answer);
- }
-};
-
-template<class Graph>
-class ComponentChecker {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- size_t vertex_count_limit_;
- size_t length_bound_;
- size_t tip_allowing_length_bound_;
- size_t longest_connecting_path_bound_;
- double max_coverage_;
-
- bool CoverageCheck(const Component<Graph>& component) const {
- for (EdgeId e : component.edges()) {
- if (math::gr(g_.coverage(e), max_coverage_)) {
- TRACE("Too high coverage! Component contains highly covered edge " << g_.str(e)
- << " of coverage " << g_.coverage(e) << " while threshold was " << max_coverage_);
- return false;
- }
- }
- return true;
- }
-
-public:
- ComponentChecker(const Graph& g, size_t vertex_count_limit, size_t length_bound,
- size_t tip_allowing_length_bound,
- size_t longest_connecting_path_bound,
- double max_coverage)
- : g_(g), vertex_count_limit_(vertex_count_limit),
- length_bound_(length_bound),
- tip_allowing_length_bound_(tip_allowing_length_bound),
- longest_connecting_path_bound_(longest_connecting_path_bound),
- max_coverage_(max_coverage) {
- }
-
- bool SizeCheck(const Component<Graph>& component) const {
- if (component.inner_vertex_cnt() > vertex_count_limit_) {
- TRACE("Too many vertices : " << component.inner_vertex_cnt() << " ! More than " << vertex_count_limit_);
- return false;
- }
- return true;
- }
-
- bool FullCheck(const Component<Graph>& component) const {
- TRACE("Performing full check of the component");
- size_t longest_connecting_path = LongestPathFinder<Graph>(component).Find();
- if (longest_connecting_path != -1u) {
- if (longest_connecting_path >= longest_connecting_path_bound_) {
- TRACE("Length of longest path: " << longest_connecting_path << "; threshold: " << longest_connecting_path_bound_);
- return false;
- }
- } else {
- TRACE("Failed to find longest connecting path (check for cycles)");
- }
- if (!component.contains_deadends()
- && component.length() > length_bound_) {
- TRACE("Too long component of length " << component.length() << "! Longer than length bound " << length_bound_);
- return false;
- } else if (component.length() > tip_allowing_length_bound_) {
- TRACE("Too long component of length " << component.length() << "! Longer than tip allowing length bound " << tip_allowing_length_bound_);
- return false;
- }
-
- return SizeCheck(component) && CoverageCheck(component);
- }
-
-private:
- DECL_LOGGER("RelativelyLowCoveredComponentChecker");
-};
-
-//Removes last (k+1)-mer of graph edge
-template<class Graph>
-class EdgeDisconnector {
- typedef typename Graph::EdgeId EdgeId;
- Graph& g_;
- EdgeRemover<Graph> edge_remover_;
-
-public:
- EdgeDisconnector(Graph& g,
- HandlerF<Graph> removal_handler = nullptr):
- g_(g), edge_remover_(g, removal_handler) {
- }
-
- EdgeId operator()(EdgeId e) {
- VERIFY(g_.length(e) > 1);
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(e, 1);
- edge_remover_.DeleteEdge(split_res.first);
- return split_res.first;
- }
-};
-
-//todo make parallel
-template<class Graph>
-class RelativeCoverageDisconnector: public EdgeProcessingAlgorithm<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
- typedef EdgeProcessingAlgorithm<Graph> base;
-
- const RelativeCoverageHelper<Graph> rel_helper_;
- EdgeDisconnector<Graph> disconnector_;
- size_t cnt_;
-public:
- RelativeCoverageDisconnector(Graph& g,
- LocalCoverageFT local_coverage_f, double diff_mult) :
- base(g, false),
- rel_helper_(g, local_coverage_f, diff_mult),
- disconnector_(g),
- cnt_(0) {
- }
-
- ~RelativeCoverageDisconnector() {
- DEBUG("Disconnected edge cnt " << cnt_);
- }
-
-protected:
- bool ProcessEdge(EdgeId edge) {
- DEBUG("Processing edge " << this->g().int_id(edge));
- VertexId v = this->g().EdgeStart(edge);
- double coverage_edge_around_v = rel_helper_.LocalCoverage(edge, v);
- DEBUG("Local flanking coverage - " << coverage_edge_around_v);
- DEBUG("Max local coverage incoming - " << rel_helper_.MaxLocalCoverage(this->g().IncomingEdges(v), v));
- DEBUG("Max local coverage outgoing - " << rel_helper_.MaxLocalCoverage(this->g().OutgoingEdges(v), v));
- if (this->g().length(edge) > 1 &&
- rel_helper_.CheckAnyHighlyCovered(this->g().IncomingEdges(v), v, coverage_edge_around_v) &&
- rel_helper_.CheckAnyHighlyCovered(this->g().OutgoingEdges(v), v, coverage_edge_around_v)) {
- DEBUG("Disconnecting");
- disconnector_(edge);
- cnt_++;
- return true;
- } else {
- DEBUG("No need to disconnect");
- return false;
- }
- }
-
-private:
-
- DECL_LOGGER("RelativeCoverageDisconnector");
-};
-
-template<class Graph>
-class ComponentSearcher {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
- const RelativeCoverageHelper<Graph>& rel_helper_;
- const ComponentChecker<Graph>& checker_;
- Component<Graph> component_;
-
-public:
- ComponentSearcher(const Graph& g,
- const RelativeCoverageHelper<Graph>& rel_helper,
- const ComponentChecker<Graph>& checker,
- EdgeId first_edge)
- : g_(g), rel_helper_(rel_helper), checker_(checker),
- component_(g_, first_edge) {
- }
-
- bool FindComponent() {
- while (!component_.IsBorderEmpty()) {
- if (!checker_.SizeCheck(component_))
- return false;
-
- VertexId v = component_.NextBorderVertex();
-
- TRACE("Checking if vertex " << g_.str(v) << " is terminating.");
- //checking if there is a sufficient coverage gap
- if (!IsTerminateVertex(v)) {
- TRACE("Not terminating, adding neighbourhood");
- component_.MakeInner(v);
- if (component_.terminating_vertices().count(v) > 0) {
- TRACE("Terminating vertex classified as non-terminating");
- return false;
- }
- } else {
- TRACE("Terminating");
- component_.TerminateOnVertex(v);
- }
- }
-
- return checker_.FullCheck(component_);
- }
-
- const Component<Graph>& component() const {
- return component_;
- }
-
-private:
-
- bool IsTerminateVertex(VertexId v) const {
- double base_coverage = rel_helper_.MaxLocalCoverage(
- RetainEdgesFromComponent(g_.IncidentEdges(v)), v);
- return CheckAnyFilteredHighlyCovered(g_.OutgoingEdges(v),
- v, base_coverage)
- && CheckAnyFilteredHighlyCovered(
- g_.IncomingEdges(v), v, base_coverage);
- }
-
- template<class EdgeContainer>
- bool CheckAnyFilteredHighlyCovered(const EdgeContainer& edges,
- VertexId v,
- double base_coverage) const {
- return rel_helper_.CheckAnyHighlyCovered(
- FilterEdgesFromComponent(edges), v, base_coverage);
- }
-
- template<class EdgeContainer>
- vector<EdgeId> FilterEdgesFromComponent(
- const EdgeContainer& edges) const {
- vector<EdgeId> answer;
- for (EdgeId e : edges) {
- if (!component_.contains(e)) {
- answer.push_back(e);
- }
- }
- return answer;
- }
-
- template<class EdgeContainer>
- vector<EdgeId> RetainEdgesFromComponent(
- const EdgeContainer& edges) const {
- vector<EdgeId> answer;
- for (EdgeId e : edges) {
- if (component_.contains(e)) {
- answer.push_back(e);
- }
- }
- return answer;
- }
-
- DECL_LOGGER("RelativelyLowCoveredComponentSearcher")
- ;
-};
-
- //currently works with conjugate graphs only (due to the assumption in the outer loop)
-template<class Graph>
-class RelativeCoverageComponentRemover : public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<double(EdgeId, VertexId)> LocalCoverageFT;
- typedef typename ComponentRemover<Graph>::HandlerF HandlerF;
- typedef pred::TypedPredicate<EdgeId> ProceedConditionT;
-
- RelativeCoverageHelper<Graph> rel_helper_;
- size_t length_bound_;
- size_t tip_allowing_length_bound_;
- size_t longest_connecting_path_bound_;
- double max_coverage_;
- //bound on the number of inner vertices
- size_t vertex_count_limit_;
- std::string vis_dir_;
- ComponentRemover<Graph> component_remover_;
-
- size_t fail_cnt_;
- size_t succ_cnt_;
-
- void VisualizeNontrivialComponent(const set<typename Graph::EdgeId>& edges, bool success) {
- auto colorer = omnigraph::visualization::DefaultColorer(this->g());
- auto edge_colorer = make_shared<visualization::CompositeEdgeColorer<Graph>>("black");
- edge_colorer->AddColorer(colorer);
- edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(this->g(), edges, "green"));
- // shared_ptr<visualization::GraphColorer<Graph>>
- auto resulting_colorer = make_shared<visualization::CompositeGraphColorer<Graph>>(colorer, edge_colorer);
-
- StrGraphLabeler<Graph> str_labeler(this->g());
- CoverageGraphLabeler<Graph> cov_labler(this->g());
- CompositeLabeler<Graph> labeler(str_labeler, cov_labler);
-
- if (edges.size() > 1) {
- set<typename Graph::VertexId> vertices;
- for (auto e : edges) {
- vertices.insert(this->g().EdgeStart(e));
- vertices.insert(this->g().EdgeEnd(e));
- }
-
-
- auto filename = success ? vis_dir_ + "/success/" + ToString(succ_cnt_++) : vis_dir_ + "/fail/" + ToString(fail_cnt_++);
- visualization::WriteComponent(
- ComponentCloser<Graph>(this->g(), 0).CloseComponent(GraphComponent<Graph>(this->g(), vertices.begin(), vertices.end())),
- filename + ".dot", colorer, labeler);
- }
- }
-
-public:
- RelativeCoverageComponentRemover(
- Graph& g, LocalCoverageFT local_coverage_f,
- double min_coverage_gap,
- size_t length_bound,
- size_t tip_allowing_length_bound,
- size_t longest_connecting_path_bound,
- double max_coverage = std::numeric_limits<double>::max(),
- HandlerF handler_function = 0, size_t vertex_count_limit = 10,
- std::string vis_dir = "")
- : base(g),
- rel_helper_(g, local_coverage_f, min_coverage_gap),
- length_bound_(length_bound),
- tip_allowing_length_bound_(tip_allowing_length_bound),
- longest_connecting_path_bound_(longest_connecting_path_bound),
- max_coverage_(max_coverage),
- vertex_count_limit_(vertex_count_limit),
- vis_dir_(vis_dir),
- component_remover_(g, handler_function),
- fail_cnt_(0),
- succ_cnt_(0) {
- VERIFY(math::gr(min_coverage_gap, 1.));
- VERIFY(tip_allowing_length_bound >= length_bound);
- TRACE("Coverage gap " << min_coverage_gap);
- if (!vis_dir_.empty()) {
- path::make_dirs(vis_dir_);
- path::make_dirs(vis_dir_ + "/success/");
- path::make_dirs(vis_dir_ + "/fail/");
- }
- }
-
-protected:
-
- bool ProcessEdge(EdgeId e) {
- TRACE("Processing edge " << this->g().str(e));
-
- //here we use that the graph is conjugate!
- VertexId v = this->g().EdgeStart(e);
- if (this->g().IsDeadEnd(v) && this->g().IsDeadStart(v)) {
- TRACE("Isolated");
- return false;
- }
- if (this->g().IsDeadEnd(v) || this->g().IsDeadStart(v)) {
- TRACE("Tip");
- return false;
- }
-
- double local_cov = rel_helper_.LocalCoverage(e, v);
-
- TRACE("Local coverage around start " << this->g().str(v) << " is " << local_cov);
-
- //since min_coverage_gap_ > 1, we don't need to think about e here
- TRACE("Checking presence of highly covered edges around start")
- if (rel_helper_.CheckAnyHighlyCovered(this->g().OutgoingEdges(v), v, local_cov)
- && rel_helper_.CheckAnyHighlyCovered(this->g().IncomingEdges(v), v,
- local_cov)) {
- TRACE("Looking for component");
- ComponentChecker<Graph> checker(this->g(), vertex_count_limit_, length_bound_,
- tip_allowing_length_bound_,
- longest_connecting_path_bound_, max_coverage_);
- //case of e being loop is handled implicitly!
- ComponentSearcher<Graph> component_searcher(
- this->g(), rel_helper_, checker, e);
- if (component_searcher.FindComponent()) {
- TRACE("Deleting component");
- const Component<Graph>& component = component_searcher.component();
- component_remover_.DeleteComponent(component.edges());
- return true;
- } else {
- TRACE("Failed to find component");
- if (!vis_dir_.empty()) {
- TRACE("Outputting image");
- VisualizeNontrivialComponent(component_searcher.component().edges(), false);
- }
- }
- } else {
- TRACE("No highly covered edges around");
- }
-
- return false;
- }
-
-private:
- DECL_LOGGER("RelativeCoverageComponentRemover");
-};
-
-}
-}
-
-}
diff --git a/src/modules/algorithms/simplification/tip_clipper.hpp b/src/modules/algorithms/simplification/tip_clipper.hpp
deleted file mode 100644
index a4b7db3..0000000
--- a/src/modules/algorithms/simplification/tip_clipper.hpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "math/xmath.h"
-#include "dev_support/func.hpp"
-#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-#include "data_structures/sequence/sequence.hpp"
-
-#include <set>
-
-namespace omnigraph {
-
-template<class Graph>
-class RelativeCoverageTipCondition: public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- const double max_relative_coverage_;
-
- template<class IteratorType>
- double MaxCompetitorCoverage(EdgeId tip, IteratorType begin, IteratorType end) const {
- const Graph &g = this->g();
- double result = 0;
- for (auto it = begin; it != end; ++it) {
- EdgeId e = *it;
- //update if competitor edge is not loop
- if (e != tip && g.EdgeStart(e) != g.EdgeEnd(e))
- result = std::max(result, g.coverage(*it));
- }
- return result;
- }
-
- double MaxCompetitorCoverage(EdgeId tip) const {
- const Graph &g = this->g();
- VertexId start = g.EdgeStart(tip), end = g.EdgeEnd(tip);
- auto out = g.OutgoingEdges(start);
- auto in = g.IncomingEdges(end);
- return std::max(
- MaxCompetitorCoverage(tip, out.begin(), out.end()),
- MaxCompetitorCoverage(tip, in.begin(), in.end()));
-// return std::max(
-// MaxCompetitorCoverage(tip, g.out_begin(start),
-// g.out_end(start)),
-// MaxCompetitorCoverage(tip, g.in_begin(end), g.in_end(end)));
- }
-
-public:
-
- RelativeCoverageTipCondition(const Graph& g, double max_relative_coverage) :
- base(g), max_relative_coverage_(max_relative_coverage) {
- }
-
- bool Check(EdgeId e) const override {
- //+1 is a trick to deal with edges of 0 coverage from iterative run
- double max_coverage = MaxCompetitorCoverage(e) + 1;
- return math::le(this->g().coverage(e),
- max_relative_coverage_ * max_coverage);
- }
-};
-
-template<class Graph>
-class TipCondition : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- /**
- * This method checks whether the given vertex topologically looks like the end of a tip
- * @param v vertex to be checked
- * @return true if the vertex is judged to be the end of a tip, false otherwise.
- */
- bool IsTip(VertexId v) const {
- return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
- }
-
-public:
- TipCondition(const Graph& g) : base(g) {
- }
-
- /**
- * This method checks whether the given edge topologically looks like a tip.
- * @param e edge to be checked
- * @return true if the edge is judged to be a tip, false otherwise.
- */
- bool Check(EdgeId e) const override {
- return (IsTip(this->g().EdgeEnd(e)) || IsTip(this->g().EdgeStart(e)))
- && (this->g().OutgoingEdgeCount(this->g().EdgeStart(e))
- + this->g().IncomingEdgeCount(this->g().EdgeEnd(e)) > 2);
- }
-
-};
-
-
-template<class Graph>
-class MismatchTipCondition : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- size_t max_diff_;
-
- size_t Hamming(EdgeId edge1, EdgeId edge2) const {
- size_t cnt = 0;
- Sequence seq1 = this->g().EdgeNucls(edge1);
- Sequence seq2 = this->g().EdgeNucls(edge2);
- size_t len = std::min(seq1.size(), seq2.size());
- for(size_t i = this->g().k(); i < len; i++) {
- if(seq1[i] != seq2[i])
- cnt++;
- }
- return cnt;
- }
-
- bool InnerCheck(EdgeId e) const {
- size_t len = this->g().length(e);
- for (auto alt : this->g().OutgoingEdges(this->g().EdgeStart(e))) {
- if (e != alt && len < this->g().length(alt) && Hamming(e, alt) <= max_diff_) {
- return true;
- }
- }
- return false;
- }
-
-public:
- MismatchTipCondition(const Graph& g, size_t max_diff) :
- base(g), max_diff_(max_diff) {
- }
-
- bool Check(EdgeId e) const override {
- return InnerCheck(e) || InnerCheck(this->g().conjugate(e));
- }
-
-};
-
-template<class Graph>
-class ATCondition: public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
- const double max_AT_percentage_;
- const size_t max_tip_length_;
- const bool check_tip_ ;
-
-public:
-
- ATCondition(const Graph& g, double max_AT_percentage, size_t max_tip_length, bool check_tip) :
- base(g), max_AT_percentage_(max_AT_percentage), max_tip_length_(max_tip_length), check_tip_(check_tip) {
- DEBUG("check_tip: " << check_tip_);
- }
-
- bool Check(EdgeId e) const {
-        size_t start = 0;
-        //TODO: Do we need this length check?
-        if (this->g().length(e) > max_tip_length_)
-            return false;
- size_t end = this->g().length(e) + this->g().k();
- if (check_tip_) {
- if (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e)) == 0)
- start = this->g().k();
- else if (this->g().IncomingEdgeCount(this->g().EdgeStart(e)) == 0)
- end = this->g().length(e);
- else return false;
- }
-        std::array<size_t, 4> counts{};
- const Sequence &s_edge = this->g().EdgeNucls(e);
-
- for (size_t position = start; position < end; position ++) {
- counts[s_edge[position]] ++;
- }
- size_t curm = *std::max_element(counts.begin(), counts.end());
- if (curm > max_AT_percentage_ * double(end - start)) {
-            DEBUG("deleting edge " << s_edge.str());
-            DEBUG("curm: " << curm);
-            DEBUG("start " << start << " end " << end << " cutoff " << max_AT_percentage_ * double(this->g().length(e)));
-
- return true;
- } else {
- return false;
- }
- }
-
-private:
- DECL_LOGGER("ATCondition")
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> AddTipCondition(const Graph& g,
- pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And(TipCondition<Graph>(g), condition);
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId>
-NecessaryTipCondition(const Graph& g, size_t max_length, double max_coverage) {
- return AddTipCondition(g, pred::And(LengthUpperBound<Graph>(g, max_length),
- CoverageUpperBound<Graph>(g, max_coverage)));
-}
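-
-//Usage sketch (illustrative only): composing a typical tip-clipping predicate
-//from the helpers above. The length and coverage thresholds are arbitrary
-//examples, not project defaults.
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId>
-ExampleTipClippingCondition(const Graph &g) {
-    const size_t max_tip_length = 100;     //assumption for illustration
-    const double max_tip_coverage = 10.0;  //assumption for illustration
-    return NecessaryTipCondition(g, max_tip_length, max_tip_coverage);
-}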
-
-template<class Graph>
-class DeadEndCondition : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
-    /**
-     * Checks whether the given vertex is a dead end, i.e. has no incoming or no outgoing edges.
-     * @param v vertex to be checked
-     * @return true if the vertex is judged to be a dead end, false otherwise.
-     */
- bool IsDeadEnd(VertexId v) const {
- return this->g().IncomingEdgeCount(v) * this->g().OutgoingEdgeCount(v) == 0;
- }
-
-public:
- DeadEndCondition(const Graph& g) : base(g) {
- }
-
-    /**
-     * Checks whether the given edge starts or ends at a dead-end vertex
-     * (while not being a completely isolated edge).
-     * @param e edge to be checked
-     * @return true if the edge is judged to be a dead end, false otherwise.
-     */
-    //Careful - no alternative path check!
- bool Check(EdgeId e) const {
- return (IsDeadEnd(this->g().EdgeEnd(e)) || IsDeadEnd(this->g().EdgeStart(e)))
- && (this->g().OutgoingEdgeCount(this->g().EdgeEnd(e))
- + this->g().IncomingEdgeCount(this->g().EdgeStart(e)) >= 1);
- }
-
- private:
- DECL_LOGGER("DeadEndCondition");
-
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> AddDeadEndCondition(const Graph& g,
- pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And(DeadEndCondition<Graph>(g), condition);
-}
-
-//template<class Graph>
-//bool ClipTips(
-// Graph& g,
-// size_t max_length,
-// shared_ptr<Predicate<typename Graph::EdgeId>> condition
-// = make_shared<func::AlwaysTrue<typename Graph::EdgeId>>(),
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// omnigraph::EdgeRemovingAlgorithm<Graph> tc(g,
-// AddTipCondition(g, condition),
-// removal_handler);
-//
-// return tc.Run(LengthComparator<Graph>(g),
-// make_shared<LengthUpperBound<Graph>>(g, max_length));
-//}
-
-} // namespace omnigraph
diff --git a/src/modules/assembly_graph/CMakeLists.txt b/src/modules/assembly_graph/CMakeLists.txt
deleted file mode 100644
index 41031ef..0000000
--- a/src/modules/assembly_graph/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(graph_support CXX)
-
-add_library(graph_support STATIC
-            components/connected_component.cpp
-            paths/bidirectional_path.cpp
-            graph_support/scaff_supplementary.cpp
-            graph_alignment/edge_index_refiller.cpp)
-target_link_libraries(graph_support hattrie)
diff --git a/src/modules/assembly_graph/components/connected_component.hpp b/src/modules/assembly_graph/components/connected_component.hpp
deleted file mode 100644
index abc396e..0000000
--- a/src/modules/assembly_graph/components/connected_component.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//
-// Created by lab42 on 8/24/15.
-//
-#pragma once
-#include <map>
-//#include "path_extend/bidirectional_path.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-
-namespace debruijn_graph{
-
-class ConnectedComponentCounter {
-public:
- mutable std::map<EdgeId, size_t> component_ids_;
- mutable std::map<size_t, size_t> component_edges_quantity_;
- mutable std::map<size_t, size_t> component_total_len_;
- const Graph &g_;
- ConnectedComponentCounter(const Graph &g):g_(g) {}
- void CalculateComponents() const;
-// size_t GetComponent(path_extend::BidirectionalPath * p) const;
- size_t GetComponent(EdgeId & e) const;
- bool IsFilled() const {
- return (component_ids_.size() != 0);
- }
-
-};
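-
-//Usage sketch (illustrative only): fill the per-edge component mapping once,
-//then query it for individual edges.
-inline size_t ComponentOfEdgeExample(const ConnectedComponentCounter &counter, EdgeId e) {
-    if (!counter.IsFilled())
-        counter.CalculateComponents();
-    return counter.GetComponent(e);
-}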
-}
diff --git a/src/modules/assembly_graph/components/graph_component.hpp b/src/modules/assembly_graph/components/graph_component.hpp
deleted file mode 100644
index e92831b..0000000
--- a/src/modules/assembly_graph/components/graph_component.hpp
+++ /dev/null
@@ -1,198 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-
-namespace omnigraph {
-//todo make handler!!!
-template<class Graph>
-class GraphComponent {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename std::set<VertexId>::const_iterator vertex_iterator;
- typedef typename std::set<EdgeId>::const_iterator edge_iterator;
- const Graph& graph_;
- std::set<VertexId> vertices_;
- std::set<EdgeId> edges_;
- std::set<VertexId> sinks_;
- std::set<VertexId> sources_;
- std::string name_;
-
-
- template<class VertexIt>
- void FillVertices(VertexIt begin, VertexIt end) {
- for (auto it = begin; it != end; ++it) {
- vertices_.insert(*it);
- }
- }
-
- template<class VertexIt>
- void FillVertices(VertexIt begin, VertexIt end, bool add_conjugate) {
- for (auto it = begin; it != end; ++it) {
- vertices_.insert(*it);
- if (add_conjugate)
- vertices_.insert(graph_.conjugate(*it));
- }
- }
-
- void FillEdges() {
- for (auto v_it = vertices_.begin(); v_it != vertices_.end(); ++v_it) {
- TRACE("working with vertex " << graph_.str(*v_it));
- for (EdgeId e : graph_.OutgoingEdges(*v_it)) {
- VertexId edge_end = graph_.EdgeEnd(e);
- TRACE(graph_.coverage(e) << " " << graph_.length(e));
- if (vertices_.count(edge_end) > 0) {
- edges_.insert(e);
- TRACE("Edge added");
- }
- }
- }
- }
-
- template<class VertexIt>
- void Fill(VertexIt begin, VertexIt end) {
- FillVertices(begin, end);
- FillEdges();
- FindSinksAndSources();
- }
-
- template<class VertexIt>
- void Fill(VertexIt begin, VertexIt end, bool add_conjugate) {
- FillVertices(begin, end, add_conjugate);
- FillEdges();
- FindSinksAndSources();
- }
-
- void FindSinksAndSources() {
- for(auto v : vertices_) {
- for(auto e : graph_.IncomingEdges(v)) {
- if(!contains(e) && !(contains(graph_.EdgeStart(e)))) {
- sources_.insert(v);
- break;
- }
- }
-
- for(auto e : graph_.OutgoingEdges(v)) {
- if(!contains(e) && !(contains(graph_.EdgeEnd(e)))) {
- sinks_.insert(v);
- break;
- }
- }
- }
- }
-
-public:
- template<class VertexIt>
- GraphComponent(const Graph &g, VertexIt begin, VertexIt end, const string &name = "") :
- graph_(g), name_(name) {
- Fill(begin, end);
- }
-
- //todo refactor and get rid of hack
- template<class VertexIt>
- GraphComponent(const Graph &g, VertexIt begin, VertexIt end,
- bool add_conjugate, const string &name = "") : graph_(g), name_(name) {
- Fill(begin, end, add_conjugate);
- }
-
- //Full graph component
- GraphComponent(const Graph &g, bool fill = true, const string &name = "") : graph_(g), name_(name) {
- if(fill) {
- Fill(g.begin(), g.end());
- }
- }
-
- //may be used for conjugate closure
- GraphComponent(const GraphComponent& component, bool add_conjugate, const string &name = "") : graph_(component.graph_), name_(name)
-// vertices_(component.vertices_.begin(), component.vertices_.end()),
-// edges_(component.edges_.begin(), component.edges_.end())
- {
- Fill(component.v_begin(), component.v_end(), add_conjugate);
- }
-
- GraphComponent<Graph> &operator=(const GraphComponent<Graph> &that) {
- VERIFY(&this->graph_ == &that.graph_);
- this->vertices_ = that.vertices_;
- this->edges_ = that.edges_;
- this->name_ = that.name_;
- return *this;
- }
-
- const Graph& g() const {
- return graph_;
- }
-
- string name() const {
- return name_;
- }
-
- size_t v_size() const {
- return vertices_.size();
- }
-
- size_t e_size() const {
- return edges_.size();
- }
-
- bool contains(EdgeId e) const {
- return edges_.count(e) > 0;
- }
-
- bool contains(VertexId v) const {
- return vertices_.count(v) > 0;
- }
-
- edge_iterator e_begin() const {
- return edges_.begin();
- }
- edge_iterator e_end() const {
- return edges_.end();
- }
-
- const std::set<EdgeId>& edges() const {
- return edges_;
- }
-
- const std::set<VertexId>& vertices() const{
- return vertices_;
- }
-
- vertex_iterator v_begin() const {
- return vertices_.begin();
- }
- vertex_iterator v_end() const {
- return vertices_.end();
- }
-
- const std::set<VertexId>& sinks() const {
- return sinks_;
- }
-
- const std::set<VertexId>& sources() const {
- return sources_;
- }
-
- bool IsBorder(VertexId v) const {
- if(vertices_.count(v) == 0)
- return false;
- for (EdgeId e : graph_.IncidentEdges(v)) {
- if (vertices_.count(graph_.EdgeStart(e)) == 0
- || vertices_.count(graph_.EdgeEnd(e)) == 0) {
- return true;
- }
- }
- return false;
- }
-
-};
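-
-//Usage sketch (illustrative only): building a component from a vertex range
-//and inspecting its border. Assumes a Graph type matching the interface used above.
-template<class Graph, class VertexIt>
-size_t CountBorderVerticesExample(const Graph &g, VertexIt begin, VertexIt end) {
-    GraphComponent<Graph> component(g, begin, end, "example");
-    size_t border = 0;
-    for (auto v : component.vertices())
-        if (component.IsBorder(v))
-            ++border;
-    return border;
-}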
-
-}
-
-
-
diff --git a/src/modules/assembly_graph/components/splitters.hpp b/src/modules/assembly_graph/components/splitters.hpp
deleted file mode 100644
index 3bb8f41..0000000
--- a/src/modules/assembly_graph/components/splitters.hpp
+++ /dev/null
@@ -1,921 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dev_support/standard_base.hpp"
-#include "graph_component.hpp"
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
-#include "component_filters.hpp"
-
-namespace omnigraph {
-
-
-template<typename Element>
-class JSIterator {
-public:
-
- virtual Element Next() = 0;
-
- virtual bool HasNext() = 0;
-
- virtual ~JSIterator() {
- }
-};
-
-template<class Graph>
-class GraphSplitter : public JSIterator<GraphComponent<Graph>>{
-private:
- const Graph& graph_;
-public:
- GraphSplitter(const Graph& graph)
- : graph_(graph) {
- }
-
- const Graph& graph() const {
- return graph_;
- }
-};
-
-template<class Graph>
-class PrecountedComponentSplitter : public GraphSplitter<Graph> {
- bool HasNext_;
- GraphComponent<Graph> component_;
-public:
-
- template<class It>
- PrecountedComponentSplitter(const Graph &graph, It begin, It end)
- : GraphSplitter<Graph>(graph), HasNext_(false),
- component_(graph, begin, end) {
- }
-
- template<class It>
- PrecountedComponentSplitter(GraphComponent<Graph> component)
- : GraphSplitter<Graph>(component.g()), HasNext_(false),
- component_(component) {
- }
-
- GraphComponent<Graph> Next() {
- HasNext_ = false;
- return component_;
- }
-
-// virtual bool CheckPutVertex(VertexId /*vertex*/, EdgeId edge, size_t /*length*/) const {
-// return edges_.count(edge) != 0;
-// }
- bool HasNext() {
- return HasNext_;
- }
-};
-
-template<typename Element>
-class RelaxingIterator : public JSIterator<Element> {
-public:
- template<typename It>
- void Relax(It begin, It end) {
- Relax(vector<Element>(begin, end));
- }
-
-// virtual bool CheckProcessVertex(VertexId /*vertex*/, size_t distance) {
-// return distance <= bound_;
-// }
- virtual void Relax(const vector<Element> &v) = 0;
-
- virtual void Relax(Element) = 0;
-
- virtual ~RelaxingIterator() {
- }
-};
-
-template<class Collection>
-class CollectionIterator : public RelaxingIterator<typename Collection::value_type> {
-private:
- typedef typename Collection::value_type Element;
- typedef typename Collection::const_iterator Iter;
- shared_ptr<Collection> storage_;
- Iter current_;
- const Iter end_;
- set<Element> relaxed_;
-public:
- CollectionIterator(const Collection &collection)
- : current_(collection.begin()), end_(collection.end()) {
- }
-
-// virtual bool CheckPutVertex(VertexId vertex, EdgeId /*edge*/, size_t /*length*/) const {
-// return subgraph_.count(vertex) != 0;
-// }
- CollectionIterator(shared_ptr<Collection> collection)
- : storage_(collection), current_(collection->begin()), end_(collection->end()) {
- }
-
- CollectionIterator(Iter begin, Iter end)
- : current_(begin), end_(end) {
- }
-
- Element Next() {
-        //Note: HasNext() is called for its side effect as well -- it advances current_ past relaxed elements.
-        if (!HasNext()) {
-            VERIFY_MSG(false, "Next() called on an exhausted CollectionIterator");
-        }
- Element next = *current_;
- ++current_;
- return next;
- }
-
-//public:
-// ErrorComponentSplitter(const Graph &graph, const set<EdgeId> &black_edges) :
-// base(graph), black_edges_(black_edges), iterator_(
-// graph.SmartEdgeBegin()) {
-// TRACE("ErrorComponentSplitter created and SmartIterator initialized");
-// }
-//
-// virtual ~ErrorComponentSplitter() {
-// }
-//
-// vector<VertexId> FindComponent(VertexId start_vertex) {
-// ComponentFinder<Graph> cf(this->graph(), black_edges_);
-// cf.run(start_vertex);
-// return cf.ReachedVertices();
-// }
-//
-// vector<VertexId> FindNeighbourhood(VertexId start, size_t bound) {
-// NeighbourhoodFinder<Graph> nf(this->graph(), black_edges_, bound);
-// nf.run(start);
-// return nf.ReachedVertices();
-// }
-//
-// size_t FindDiameter(const vector<VertexId> &component) {
-// set < VertexId > component_set(component.begin(), component.end());
-// size_t result = 0;
-// VertexId current = *(component.begin());
-// for (size_t i = 0; i < 4; i++) {
-// pair<VertexId, size_t> next = GetFarthest(current, component_set);
-// current = next.first;
-// result = next.second;
-// }
-// return result;
-// }
-//
-// pair<VertexId, size_t> GetFarthest(VertexId v,
-// const set<VertexId> &component) {
-// SubgraphDijkstra<Graph> sd(this->graph(), component);
-// sd.run(v);
-// pair<VertexId, size_t> result(v, 0);
-// auto bounds = sd.GetDistances();
-// for (auto it = bounds.first; it != bounds.second; ++it) {
-// if (it->second > result.second) {
-// result = *it;
-// }
-// }
-// return result;
-// }
-//
-// virtual vector<VertexId> NextComponent() {
-// TRACE("Construction of next component started");
-// if (Finished()) {
-// VERIFY(false);
-// return vector<VertexId>();
-// }
-// EdgeId next = *iterator_;
-// ++iterator_;
-// vector < VertexId > component = FindComponent(
-// this->graph().EdgeEnd(next));
-// TRACE("Error edges component constructed. It contains "
-// << component.size() << " vertices");
-// size_t component_size = FindDiameter(component);
-// TRACE("Diameter of component is " << component_size);
-// vector < VertexId > neighbourhood = FindNeighbourhood(
-// this->graph().EdgeEnd(next), (size_t) math::round(1.5 * (double) component_size));
-// TRACE("Error edges component neighborhood constructed. It contains "
-// << neighbourhood.size() << " vertices");
-// visited_.insert(component.begin(), component.end());
-// return neighbourhood;
-// }
-//
-// virtual bool Finished() {
-// while (!iterator_.IsEnd()) {
-// if (black_edges_.find(*iterator_) != black_edges_.end()
-// && visited_.find(this->graph().EdgeEnd(*iterator_))
-// == visited_.end()) {
-// return false;
-// }
-// ++iterator_;
-// }
-// return true;
-// }
- bool HasNext() {
- while(current_ != end_ && relaxed_.count(*current_) == 1) {
- ++current_;
- }
- return current_ != end_;
- }
-
- void Relax(Element e) {
- relaxed_.insert(e);
- }
-
-//template<class Graph>
-//class ShortEdgeComponentNeighbourhoodFinder: public UnorientedDijkstra<Graph> {
-//private:
-// typedef UnorientedDijkstra<Graph> base;
-//protected:
-// typedef typename base::VertexId VertexId;
-// typedef typename base::EdgeId EdgeId;
-// typedef typename base::DistanceType distance_t;
-//private:
-// distance_t bound_;
-//public:
-// ShortEdgeComponentNeighbourhoodFinder(const Graph &graph, distance_t bound) :
-// UnorientedDijkstra<Graph>(graph), bound_(bound) {
-// }
-//
-// virtual bool CheckProcessVertexVertexId (VertexId /*vertex*/, distance_t distance) {
-// return distance == 0;
-// }
-//
-// virtual distance_t GetLength(EdgeId edge) const {
-// if (this->graph().length(edge) <= bound_)
-// return 0;
-// else
-// return 1;
-// }
- void Relax(const vector<Element> &v) {
- for (auto it = v.begin(); it != v.end(); ++it)
- Relax(*it);
- }
-
- virtual ~CollectionIterator() {
- }
-};
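-
-//Usage sketch (illustrative only): iterating a collection while relaxing
-//(skipping) elements that some other stage has already handled.
-template<class Collection, class Visitor>
-void VisitUnhandledExample(const Collection &collection,
-                           const set<typename Collection::value_type> &handled,
-                           Visitor visit) {
-    CollectionIterator<Collection> it(collection);
-    it.Relax(handled.begin(), handled.end());
-    while (it.HasNext())
-        visit(it.Next());
-}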
-
-template<class Graph>
-class PathIterator : public RelaxingIterator<typename Graph::VertexId> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- vector<VertexId> path_;
- size_t current_;
-
- static vector<VertexId> ExtractVertices(const Graph &graph, const vector<EdgeId> &path) {
- vector<VertexId> result;
- for(size_t i = 0; i < path.size(); i++) {
- if(i == 0 || path[i] != path[i - 1]) {
- result.push_back(graph.EdgeStart(path[i]));
- result.push_back(graph.EdgeEnd(path[i]));
- }
- }
- return result;
- }
-
-public:
- PathIterator(const Graph &graph, const vector<EdgeId> &path)
- : graph_(graph), path_(ExtractVertices(graph, path)), current_(0) {
- }
-
- VertexId Next() {
- if(!HasNext()) {
- VERIFY(HasNext());
- }
- VertexId next = path_[current_];
- Relax(next);
- return next;
- }
-
- bool HasNext() {
- return current_ < path_.size();
- }
-
-    void Relax(const vector<VertexId> &v) {
-        set<VertexId> toRelax(v.begin(), v.end());
-        //guard against running past the end of the path
-        while (current_ < path_.size() && toRelax.count(path_[current_]) == 1)
-            current_++;
-    }
-
-//public:
-// CountingDijkstra(const Graph &graph, size_t max_size,
-// size_t edge_length_bound) :
-// base(graph), max_size_(max_size), edge_length_bound_(
-// edge_length_bound), current_(0) {
-// }
-//
-// virtual bool CheckPutVertex(VertexId /*vertex*/, EdgeId edge,
-// distance_t /*length*/) const {
-// if (current_ < max_size_) {
-// ++current_;
-// }
-// if (current_ < max_size_ && GetLength(edge) < inf) {
-// return true;
-// }
-// return false;
-// }
-//
-// virtual bool CheckProcessVertex(VertexId /*vertex*/, distance_t /*distance*/) {
-// return current_ < max_size_;
-// }
-//
-// virtual void init(VertexId /*start*/) {
-// current_ = 0;
-// }
-//
-// virtual size_t GetLength(EdgeId edge) const {
-// if (this->graph().length(edge) <= edge_length_bound_)
-// //todo change back
-//// return 1;
-// return this->graph().length(edge);
-// else
-// return inf;
-// }
- void Relax(VertexId e) {
- Relax(vector<VertexId>({e}));
- }
-};
-
-template<class Graph>
-class AbstractNeighbourhoodFinder {
-private:
- const Graph &graph_;
-public:
- AbstractNeighbourhoodFinder(const Graph &graph) : graph_(graph) {
- }
-
- const Graph &graph() const {
- return graph_;
- }
-
- virtual GraphComponent<Graph> Find(typename Graph::VertexId v) = 0;
-
- virtual vector<typename Graph::VertexId> InnerVertices(const GraphComponent<Graph> &component) = 0;
-
- virtual ~AbstractNeighbourhoodFinder() {
- }
-};
-
-template<class Graph, typename distance_t = size_t>
-class ComponentCloser {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph &graph_;
- size_t edge_length_bound_;
-
-public:
- ComponentCloser(const Graph &graph, size_t edge_length_bound)
- : graph_(graph),
- edge_length_bound_(edge_length_bound) {
- }
-
- void CloseComponent(set<VertexId> &component) const {
- set<VertexId> additional_vertices;
- for (auto it = component.begin(); it != component.end(); ++it) {
- for (EdgeId e : graph_.OutgoingEdges(*it)) {
- if (graph_.length(e) >= edge_length_bound_) {
- additional_vertices.insert(graph_.EdgeEnd(e));
- }
- }
- for (EdgeId e : graph_.IncomingEdges(*it)) {
- if (graph_.length(e) >= edge_length_bound_) {
- additional_vertices.insert(graph_.EdgeStart(e));
- }
- }
- }
- component.insert(additional_vertices.begin(),
- additional_vertices.end());
- }
-
- GraphComponent<Graph> CloseComponent(const GraphComponent<Graph>& component) const {
- set<VertexId> vertices(component.v_begin(), component.v_end());
- CloseComponent(vertices);
- return GraphComponent<Graph>(graph_, vertices.begin(), vertices.end());
- }
-};
-
-//This class finds a neighbourhood of a set of vertices. Vertices connected only by edges longer than edge_length_bound_ (500 by default) are not considered adjacent.
-template<class Graph>
-class ReliableNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- set<VertexId> FindNeighbours(const set<VertexId> &s) {
- set<VertexId> result(s.begin(), s.end());
- for (VertexId v : result) {
- for (EdgeId e : this->graph().IncidentEdges(v)) {
- if(this->graph().length(e) <= edge_length_bound_) {
- result.insert(this->graph().EdgeEnd(e));
- result.insert(this->graph().EdgeStart(e));
- }
- }
- }
- return result;
- }
-
- set<VertexId> FindNeighbours(const set<VertexId> &s, size_t eps) {
- set<VertexId> result = s;
- for(size_t i = 0; i < eps; i++) {
- result = FindNeighbours(result);
- }
- return result;
- }
-
-    set<VertexId> FindBorder(const GraphComponent<Graph> &component) {
- set<VertexId> result;
- for(auto it = component.vertices().begin(); it != component.vertices().end(); ++it) {
- if(component.IsBorder(*it)) {
- result.insert(*it);
- }
- }
- return result;
- }
-
-public:
- static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
- static const size_t DEFAULT_MAX_SIZE = 100;
-
- const size_t edge_length_bound_;
- const size_t max_size_;
-
- ReliableNeighbourhoodFinder(const Graph &graph, size_t edge_length_bound =
- DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = DEFAULT_MAX_SIZE)
- : AbstractNeighbourhoodFinder<Graph>(graph),
- edge_length_bound_(edge_length_bound),
- max_size_(max_size) {
- }
-
- GraphComponent<Graph> Find(typename Graph::VertexId v) {
- auto cd = DijkstraHelper<Graph>::CreateCountingDijkstra(this->graph(), max_size_,
- edge_length_bound_);
- cd.Run(v);
- vector<VertexId> result_vector = cd.ReachedVertices();
- set<VertexId> result(result_vector.begin(), result_vector.end());
- ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
- cc.CloseComponent(result);
- return GraphComponent<Graph>(this->graph(), result.begin(),
- result.end());
- }
-
- vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) {
- set<VertexId> border = FindNeighbours(FindBorder(component), 2);
- std::vector<VertexId> result;
-        std::set_difference(component.vertices().begin(), component.vertices().end(),
-                            border.begin(), border.end(),
-                            std::inserter(result, result.end()));
- return vector<VertexId>(result.begin(), result.end());
- }
-};
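-
-//Usage sketch (illustrative only): extracting the neighbourhood of a single
-//vertex and counting its inner vertices. Assumes a Graph type with the
-//interface used above.
-template<class Graph>
-std::pair<size_t, size_t> NeighbourhoodSizesExample(const Graph &g,
-                                                    typename Graph::VertexId v) {
-    ReliableNeighbourhoodFinder<Graph> finder(g);
-    GraphComponent<Graph> component = finder.Find(v);
-    return std::make_pair(component.v_size(), finder.InnerVertices(component).size());
-}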
-
-template<class Graph>
-class PathNeighbourhoodFinder : public AbstractNeighbourhoodFinder<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- VertexId OtherEnd(EdgeId e, VertexId v) const {
- if (this->graph().EdgeStart(e) == v)
- return this->graph().EdgeEnd(e);
- else
- return this->graph().EdgeStart(e);
- }
-
- bool Go(VertexId v, size_t curr_depth, set<VertexId>& grey, set<VertexId>& black) const {
- //allows single vertex to be visited many times with different depth values
-        //a single vertex may be visited multiple times with different depth values
- if (curr_depth >= max_depth_) {
- TRACE("Too deep");
- return true;
- }
- if (grey.size() >= max_size_) {
- TRACE("Too many vertices");
- return false;
- }
-
- TRACE("Started processing of vertex " << this->graph().str(v));
- grey.insert(v);
-
- TRACE("Sorting incident edges");
- vector<EdgeId> incident_path;
- vector<EdgeId> incident_non_path;
- for (EdgeId e : this->graph().IncidentEdges(v)) {
- if (path_edges_.count(e) != 0) {
- /*condition not to go backward*/
- if (this->graph().EdgeStart(e) == v) {
- incident_path.push_back(e);
- }
- } else {
- incident_non_path.push_back(e);
- }
- }
-
- for (EdgeId e : incident_non_path) {
- if (this->graph().length(e) > edge_length_bound_) {
- TRACE("Edge " << this->graph().str(e) << " is too long");
- continue;
- }
- TRACE("Going along edge " << this->graph().str(e));
- if (!Go(OtherEnd(e, v), curr_depth + 1, grey, black))
- return false;
- }
-
- TRACE("End processing of vertex " << this->graph().str(v));
- black.insert(v);
-
- for (EdgeId e : incident_path) {
- if (grey.count(OtherEnd(e, v)) != 0)
- continue;
- TRACE("Going along next path edge " << this->graph().str(e));
- if (!Go(OtherEnd(e, v), 0, grey, black))
- return false;
- }
-
- return true;
- }
-
-public:
- static const size_t DEFAULT_EDGE_LENGTH_BOUND = 500;
- static const size_t DEFAULT_MAX_DEPTH = 2;
- static const size_t DEFAULT_MAX_SIZE = 20;
-
- set<EdgeId> path_edges_;
- const size_t edge_length_bound_;
- const size_t max_size_;
- const size_t max_depth_;
-
- set<VertexId> last_inner_;
-
- PathNeighbourhoodFinder(const Graph &graph, const vector<EdgeId>& path, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = DEFAULT_MAX_SIZE, size_t max_depth = DEFAULT_MAX_DEPTH)
- : AbstractNeighbourhoodFinder<Graph>(graph),
- path_edges_(path.begin(), path.end()),
- edge_length_bound_(edge_length_bound),
- max_size_(max_size),
- max_depth_(max_depth) {
- }
-
-
- GraphComponent<Graph> Find(VertexId v) {
- TRACE("Starting from vertex " << this->graph().str(v));
- last_inner_.clear();
- set<VertexId> grey;
- set<VertexId> black;
- Go(v, 0, grey, black);
- last_inner_ = black;
- last_inner_.insert(v);
- ComponentCloser<Graph>(this->graph(), 0).CloseComponent(grey);
- return GraphComponent<Graph>(this->graph(), grey.begin(), grey.end());
- }
-
- vector<VertexId> InnerVertices(const GraphComponent<Graph> &/*component*/) {
- return vector<VertexId>(last_inner_.begin(), last_inner_.end());
- }
-private:
- DECL_LOGGER("PathNeighbourhoodFinder");
-};
-
-//todo delete and think if we really need hierarchy
-template<class Graph>
-class ShortEdgeComponentFinder : public AbstractNeighbourhoodFinder<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-public:
- static const size_t DEFAULT_EDGE_LENGTH_BOUND = 100;
-
- const size_t edge_length_bound_;
-
- ShortEdgeComponentFinder(const Graph &graph, size_t edge_length_bound = DEFAULT_EDGE_LENGTH_BOUND)
- : AbstractNeighbourhoodFinder<Graph>(graph),
- edge_length_bound_(edge_length_bound) {
- }
-
- GraphComponent<Graph> Find(VertexId v) {
- auto cd = DijkstraHelper<Graph>::CreateShortEdgeDijkstra(this->graph(), edge_length_bound_);
- cd.Run(v);
- set<VertexId> result = cd.ProcessedVertices();
- return GraphComponent<Graph>(this->graph(), result.begin(),
- result.end());
- }
-
- vector<VertexId> InnerVertices(const GraphComponent<Graph> &component) {
- return vector<VertexId>(component.v_begin(), component.v_end());
- }
-};
-
-template<class Graph>
-class FilteringSplitterWrapper : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- shared_ptr<GraphSplitter<Graph>> inner_splitter_;
- shared_ptr<GraphComponentFilter<Graph>> checker_;
- boost::optional<GraphComponent<Graph>> next_;
-public:
- FilteringSplitterWrapper(
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphComponentFilter<Graph>> checker)
- : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
- checker_(checker) {
- }
-
- GraphComponent<Graph> Next() {
- if (!HasNext()) {
- VERIFY(false);
- return omnigraph::GraphComponent<Graph>(this->graph());
- }
- GraphComponent<Graph> result = next_.get();
- next_ = boost::optional<GraphComponent<Graph>>();
- return result;
- }
-
- bool HasNext() {
- while (!next_ && inner_splitter_->HasNext()) {
- GraphComponent<Graph> ne = inner_splitter_->Next();
- if (checker_->Check(ne)) {
- next_ = ne;
- }
- }
- return next_;
- }
-private:
- DECL_LOGGER("FilteringSplitterWrapper");
-};
-
-//TODO split combined component into several.
-template<class Graph>
-class CollectingSplitterWrapper : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- shared_ptr<GraphSplitter<Graph>> inner_splitter_;
- shared_ptr<GraphComponentFilter<Graph>> checker_;
- boost::optional<GraphComponent<Graph>> next_;
- set<VertexId> filtered_;
-public:
- CollectingSplitterWrapper(
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphComponentFilter<Graph>> checker)
- : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
- checker_(checker) {
- }
-
- GraphComponent<Graph> Next() {
- if (!HasNext()) {
- VERIFY(false);
- return omnigraph::GraphComponent<Graph>(this->graph());
- } else {
- if(next_) {
- GraphComponent<Graph> result = next_.get();
- next_ = boost::optional<GraphComponent<Graph>>();
- return result;
- } else {
- GraphComponent<Graph> result(this->graph(), filtered_.begin(), filtered_.end(), false, "filtered");
- filtered_.clear();
- return result;
- }
- }
- }
-
- bool HasNext() {
- while (!next_ && inner_splitter_->HasNext()) {
- GraphComponent<Graph> ne = inner_splitter_->Next();
- if (checker_->Check(ne)) {
- next_ = ne;
- } else {
- filtered_.insert(ne.v_begin(), ne.v_end());
- }
- }
- return next_ || !filtered_.empty();
- }
-private:
-    DECL_LOGGER("CollectingSplitterWrapper");
-};
-
-template<class Graph>
-class CondensingSplitterWrapper : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- shared_ptr<GraphSplitter<Graph>> inner_splitter_;
- shared_ptr<GraphComponentFilter<Graph>> checker_;
- boost::optional<GraphComponent<Graph>> next_;
-
- string CutName(const string &name, size_t max_length) {
- VERIFY(max_length >= 7);
- size_t length = name.size();
- if (length <= max_length)
- return name;
- else {
- return name.substr(0, (max_length - 5) / 2) + "....." + name.substr(length - (max_length - 5) / 2, (max_length - 5) / 2);
- }
- }
-
- GraphComponent<Graph> ConstructComponent() {
- GraphComponent<Graph> next = inner_splitter_->Next();
- if (checker_->Check(next)) {
- return next;
- }
- set<VertexId> vertices(next.v_begin(), next.v_end());
- string name = next.name();
- for(size_t i = 0; i < 10 && inner_splitter_->HasNext(); i++) {
- next = inner_splitter_->Next();
- if (checker_->Check(next)) {
- next_ = next;
- break;
- } else {
- vertices.insert(next.v_begin(), next.v_end());
- if (next.name() != "") {
- name += ";";
- name += next.name();
- }
- }
- }
- return GraphComponent<Graph>(this->graph(), vertices.begin(), vertices.end(), CutName(name, 60));
- }
-
-public:
- CondensingSplitterWrapper(
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphComponentFilter<Graph>> checker)
- : GraphSplitter<Graph>(inner_splitter->graph()), inner_splitter_(inner_splitter),
- checker_(checker) {
- }
-
- GraphComponent<Graph> Next() {
- if (!HasNext()) {
- VERIFY(false);
- return omnigraph::GraphComponent<Graph>(this->graph());
- }
- if(next_) {
- GraphComponent<Graph> result = next_.get();
- next_ = boost::optional<GraphComponent<Graph>>();
- return result;
- } else {
- return ConstructComponent();
- }
- }
-
-    bool HasNext() {
-        return next_ || inner_splitter_->HasNext();
-    }
-private:
-    DECL_LOGGER("CondensingSplitterWrapper");
-};
-
-template<class Graph>
-class NeighbourhoodFindingSplitter : public GraphSplitter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator_;
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder_;
-
-public:
- NeighbourhoodFindingSplitter(
- const Graph& graph,
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator,
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> neighbourhood_finder)
- : GraphSplitter<Graph>(graph),
- inner_iterator_(inner_iterator),
- neighbourhood_finder_(neighbourhood_finder) {
- }
-
- NeighbourhoodFindingSplitter(
- const Graph& graph,
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator)
- : GraphSplitter<Graph>(graph),
- inner_iterator_(inner_iterator),
- neighbourhood_finder_(
- make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
- }
-
- NeighbourhoodFindingSplitter(const Graph& graph)
- : GraphSplitter<Graph>(graph),
- inner_iterator_(
- make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end())),
- neighbourhood_finder_(make_shared<ReliableNeighbourhoodFinder<Graph>>(graph)) {
- }
-
- GraphComponent<Graph> Next() {
- VertexId next_vertex = inner_iterator_->Next();
- GraphComponent<Graph> result = neighbourhood_finder_->Find(next_vertex);
- vector<VertexId> to_relax = neighbourhood_finder_->InnerVertices(result);
- to_relax.push_back(next_vertex);
- inner_iterator_->Relax(to_relax);
- return result;
- }
-
- bool HasNext() {
- return inner_iterator_->HasNext();
- }
-};
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> ReliableSplitter(const Graph &graph,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
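-
-//Usage sketch (illustrative only): the common consumption pattern shared by
-//every splitter in this file -- HasNext()/Next() until exhaustion.
-template<class Graph>
-size_t TotalComponentVerticesExample(const Graph &g) {
-    auto splitter = ReliableSplitter(g);
-    size_t total_vertices = 0;
-    while (splitter->HasNext())
-        total_vertices += splitter->Next().v_size();
-    return total_vertices;
-}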
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> ConnectedSplitter(const Graph &graph,
- size_t edge_length_bound = 1000000,
- size_t max_size = 1000000) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound, max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> ReliableSplitterAlongPath(
- const Graph &graph, const vector<typename Graph::EdgeId>& path, size_t edge_length_bound = PathNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND,
- size_t max_size = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t max_depth = PathNeighbourhoodFinder<Graph>::DEFAULT_MAX_DEPTH) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
- PathIterator<Graph>>(graph, path);
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<PathNeighbourhoodFinder<Graph>>(graph, path,
- edge_length_bound, max_size, max_depth);
-
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> LongEdgesExclusiveSplitter(
- const Graph &graph, size_t bound =
- ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<
- CollectionIterator<set<VertexId>>>(graph.begin(), graph.end());
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
- ShortEdgeComponentFinder<Graph>>(graph, bound);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph,
- inner_iterator, nf);
-}
-
-template<class Graph, typename Collection>
-shared_ptr<GraphSplitter<Graph>> StandardSplitter(
- const Graph &graph, const Collection &collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
- ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
- max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
-}
-
-template<class Graph, typename Collection>
-shared_ptr<GraphSplitter<Graph>> StandardSplitter(
- const Graph &graph, shared_ptr<Collection> collection, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- typedef typename Graph::VertexId VertexId;
- shared_ptr<RelaxingIterator<VertexId>> inner_iterator = make_shared<CollectionIterator<Collection>>(collection);
- shared_ptr<AbstractNeighbourhoodFinder<Graph>> nf = make_shared<
- ReliableNeighbourhoodFinder<Graph>>(graph, edge_length_bound,
- max_size);
- return make_shared<NeighbourhoodFindingSplitter<Graph>>(graph, inner_iterator, nf);
-}
-
-template<class Graph>
-shared_ptr<GraphSplitter<Graph>> WholeGraphSplitter(
-        const Graph &graph, size_t max_size,
-        size_t edge_length_bound) {
-    //splits the whole graph: reliable splitter over all vertices
-    return ReliableSplitter(graph, edge_length_bound, max_size);
-}
-
-template<class Graph>
-GraphComponent<Graph> VertexNeighborhood(
- const Graph &graph, typename Graph::VertexId vertex, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- vector<typename Graph::VertexId> vv = {vertex};
- shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
- return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
-}
-
-//TODO: add a method that guarantees the picture contains a given set of edges; maybe refactor this into drawing instead of splitting.
-template<class Graph>
-GraphComponent<Graph> EdgeNeighborhood(
- const Graph &graph, typename Graph::EdgeId edge, size_t max_size = ReliableNeighbourhoodFinder<Graph>::DEFAULT_MAX_SIZE,
- size_t edge_length_bound = ReliableNeighbourhoodFinder<Graph>::DEFAULT_EDGE_LENGTH_BOUND) {
- vector<typename Graph::VertexId> vv = {graph.EdgeStart(edge)};
- shared_ptr<vector<typename Graph::VertexId>> sh_vv = make_shared<vector<typename Graph::VertexId>>(vv);
- return StandardSplitter<Graph>(graph, sh_vv, max_size, edge_length_bound)->Next();
-}
-
-}
diff --git a/src/modules/assembly_graph/graph_alignment/edge_index.hpp b/src/modules/assembly_graph/graph_alignment/edge_index.hpp
deleted file mode 100644
index 187ea94..0000000
--- a/src/modules/assembly_graph/graph_alignment/edge_index.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "assembly_graph/graph_core/graph.hpp"
-#include "assembly_graph/graph_core/action_handlers.hpp"
-#include "data_structures/indices/edge_position_index.hpp"
-#include "edge_index_refiller.hpp"
-
-namespace debruijn_graph {
-
-/**
- * EdgeIndex stores information about the location of certain k-mers in the graph. It delegates all
- * container operations to inner_index_ and all update handling to
- * updater_, which is an EdgeInfoUpdater.
- */
-template<class Graph>
-class EdgeIndex: public omnigraph::GraphActionHandler<Graph> {
-
-public:
- typedef typename Graph::EdgeId EdgeId;
- using InnerIndex = KmerFreeEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, DefaultStoring>;
- typedef Graph GraphT;
- typedef typename InnerIndex::KMer KMer;
- typedef typename InnerIndex::KMerIdx KMerIdx;
- typedef typename InnerIndex::Value Value;
-
-private:
- InnerIndex inner_index_;
- EdgeInfoUpdater<InnerIndex, Graph> updater_;
- EdgeIndexRefiller refiller_;
- bool delete_index_;
-
-public:
- EdgeIndex(const Graph& g, const std::string &workdir)
- : omnigraph::GraphActionHandler<Graph>(g, "EdgeIndex"),
- inner_index_(g, workdir),
- updater_(g, inner_index_),
- delete_index_(true) {
- }
-
- virtual ~EdgeIndex() {
- TRACE("~EdgeIndex OK")
- }
-
- InnerIndex &inner_index() {
- return inner_index_;
- }
-
- size_t k() const {
- return inner_index_.k();
- }
-
- const InnerIndex &inner_index() const {
- VERIFY(this->IsAttached());
- return inner_index_;
- }
-
- void HandleAdd(EdgeId e) override {
- updater_.UpdateKmers(e);
- }
-
- void HandleDelete(EdgeId e) override {
- updater_.DeleteKmers(e);
- }
-
- bool contains(const KMer& kmer) const {
- VERIFY(this->IsAttached());
- return inner_index_.contains(inner_index_.ConstructKWH(kmer));
- }
-
- const pair<EdgeId, size_t> get(const KMer& kmer) const {
- VERIFY(this->IsAttached());
- auto kwh = inner_index_.ConstructKWH(kmer);
- if (!inner_index_.contains(kwh)) {
- return make_pair(EdgeId(0), -1u);
- } else {
- EdgeInfo<EdgeId> entry = inner_index_.get_value(kwh);
- return std::make_pair(entry.edge_id, (size_t)entry.offset);
- }
- }
-
- void Refill() {
- clear();
- refiller_.Refill(inner_index_, this->g());
- INFO("Index refilled");
- }
-
- void Update() {
- updater_.UpdateAll();
- }
-
- void clear() {
- inner_index_.clear();
- }
-
-};
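-
-//Usage sketch (illustrative only): locating a k-mer in the graph through the
-//index. Assumes the index is attached and filled (e.g. via Refill()).
-template<class Graph>
-std::pair<typename Graph::EdgeId, size_t>
-LocateKmerExample(const EdgeIndex<Graph> &index,
-                  const typename EdgeIndex<Graph>::KMer &kmer) {
-    typedef typename Graph::EdgeId EdgeId;
-    if (!index.contains(kmer))
-        return std::make_pair(EdgeId(0), size_t(-1));
-    return index.get(kmer);
-}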
-}
diff --git a/src/modules/assembly_graph/graph_alignment/edge_index_refiller.cpp b/src/modules/assembly_graph/graph_alignment/edge_index_refiller.cpp
deleted file mode 100644
index d008b5a..0000000
--- a/src/modules/assembly_graph/graph_alignment/edge_index_refiller.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2016 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "data_structures/indices/edge_index_builders.hpp"
-#include "data_structures/indices/edge_multi_index.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-
-#include "edge_index_refiller.hpp"
-
-namespace debruijn_graph {
-
-using EdgeIndex = KmerFreeEdgeIndex<ConjugateDeBruijnGraph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>>;
-
-template<>
-void EdgeIndexRefiller::Refill(EdgeIndex &index,
- const ConjugateDeBruijnGraph &g) {
- typedef typename EdgeIndexHelper<EdgeIndex>::GraphPositionFillingIndexBuilderT IndexBuilder;
- IndexBuilder().BuildIndexFromGraph(index, g);
-}
-
-using PacIndex = DeBruijnEdgeMultiIndex<ConjugateDeBruijnGraph::EdgeId>;
-
-template<>
-void EdgeIndexRefiller::Refill(PacIndex &index,
- const ConjugateDeBruijnGraph &g) {
- typedef typename debruijn_graph::EdgeIndexHelper<PacIndex>::GraphPositionFillingIndexBuilderT Builder;
- Builder().BuildIndexFromGraph(index, g);
-}
-
-}
diff --git a/src/modules/assembly_graph/graph_alignment/kmer_map.hpp b/src/modules/assembly_graph/graph_alignment/kmer_map.hpp
deleted file mode 100644
index e2d0f12..0000000
--- a/src/modules/assembly_graph/graph_alignment/kmer_map.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2016 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __KMER_MAP_HPP__
-#define __KMER_MAP_HPP__
-
-#include "data_structures/sequence/runtime_k.hpp"
-
-#include <htrie/hat-trie.h>
-#include <boost/iterator/iterator_facade.hpp>
-
-namespace debruijn_graph {
-class KMerMap {
- typedef runtime_k::RtSeq Kmer;
- typedef runtime_k::RtSeq Seq;
- typedef typename Seq::DataType RawSeqData;
-
- value_t* internal_tryget(const Kmer &key) const {
- return hattrie_tryget(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
- }
-
- value_t* internal_get(const Kmer &key) const {
- return hattrie_get(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
- }
-
- int internal_erase(const Kmer &key) {
- return hattrie_del(mapping_, (const char *)key.data(), rawcnt_ * sizeof(RawSeqData));
- }
-
- class iterator : public boost::iterator_facade<iterator,
- const std::pair<Kmer, Seq>,
- std::forward_iterator_tag,
- const std::pair<Kmer, Seq>> {
- public:
- iterator(unsigned k, hattrie_iter_t *start = nullptr)
- : k_(k), iter_(start, [](hattrie_iter_t *p) { hattrie_iter_free(p); }) {}
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() {
- hattrie_iter_next(iter_.get());
- }
-
- bool equal(const iterator &other) const {
- // Special case: NULL and finished are equal
- if (iter_.get() == nullptr || hattrie_iter_finished(iter_.get()))
- return other.iter_.get() == nullptr || hattrie_iter_finished(other.iter_.get());
-
- if (other.iter_.get() == nullptr)
- return false;
-
- return hattrie_iter_equal(iter_.get(), other.iter_.get());
- }
-
- const std::pair<Kmer, Seq> dereference() const {
- size_t len;
- Kmer k(k_, (const RawSeqData*)hattrie_iter_key(iter_.get(), &len));
- Seq s(k_, (const RawSeqData*)(*hattrie_iter_val(iter_.get())));
- return std::make_pair(k, s);
- }
-
- unsigned k_;
- std::shared_ptr<hattrie_iter_t> iter_;
- };
-
- public:
- KMerMap(unsigned k)
- : k_(k), mapping_(hattrie_create()) {
- rawcnt_ = (unsigned)Seq::GetDataSize(k_);
- }
-
- ~KMerMap() {
- clear();
- hattrie_free(mapping_);
- }
-
- void erase(const Kmer &key) {
- value_t *vp = internal_tryget(key);
- if (vp == nullptr)
- return;
-
- RawSeqData *value = reinterpret_cast<RawSeqData*>(*vp);
- delete[] value;
- int res = internal_erase(key);
- VERIFY_MSG(res == 0, "Failed to delete from kmer mapper");
- }
-
- void set(const Kmer &key, const Seq &value) {
- value_t *vp = internal_tryget(key);
- RawSeqData *rawvalue = nullptr;
- if (vp == nullptr) {
- vp = internal_get(key);
- rawvalue = new RawSeqData[rawcnt_];
- *vp = reinterpret_cast<uintptr_t>(rawvalue);
- } else {
- rawvalue = reinterpret_cast<RawSeqData*>(*vp);
- }
-
- memcpy(rawvalue, value.data(), rawcnt_ * sizeof(RawSeqData));
- }
-
- bool count(const Kmer &key) const {
- return internal_tryget(key) != nullptr;
- }
-
- const RawSeqData *find(const Kmer &key) const {
- value_t *vp = internal_tryget(key);
- if (vp == nullptr)
- return nullptr;
-
- return reinterpret_cast<const RawSeqData*>(*vp);
- }
-
- void clear() {
- // Delete all the values
- auto *iter = hattrie_iter_begin(mapping_, false);
- while (!hattrie_iter_finished(iter)) {
- RawSeqData *value = (RawSeqData*)(*hattrie_iter_val(iter));
- delete[] value;
- hattrie_iter_next(iter);
- }
- hattrie_iter_free(iter);
- // Delete the mapping and all the keys
- hattrie_clear(mapping_);
- }
-
- size_t size() const {
- return hattrie_size(mapping_);
- }
-
- iterator begin() const {
- return iterator(k_, hattrie_iter_begin(mapping_, false));
- }
-
- iterator end() const {
- return iterator(k_);
- }
-
- private:
- unsigned k_;
- unsigned rawcnt_;
- hattrie_t *mapping_;
-};
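-
-//Usage sketch (illustrative only): storing and looking up a k-mer -> sequence
-//mapping. Both key and value must be k-mers of the map's k.
-inline bool RemapKmerExample(KMerMap &map, const runtime_k::RtSeq &from,
-                             const runtime_k::RtSeq &to) {
-    map.set(from, to);
-    return map.count(from) && map.find(from) != nullptr;
-}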
-
-}
-
-#endif // __KMER_MAP_HPP__
diff --git a/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp b/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp
deleted file mode 100644
index 0f67d38..0000000
--- a/src/modules/assembly_graph/graph_alignment/kmer_mapper.hpp
+++ /dev/null
@@ -1,239 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "data_structures/sequence/sequence_tools.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
-#include "utils/adt/kmer_vector.hpp"
-#include "edge_index.hpp"
-
-#include "kmer_map.hpp"
-
-#include <set>
-#include <cstdlib>
-
-namespace debruijn_graph {
-template<class Graph>
-class KmerMapper : public omnigraph::GraphActionHandler<Graph> {
- typedef omnigraph::GraphActionHandler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef runtime_k::RtSeq Kmer;
- typedef runtime_k::RtSeq Seq;
- typedef typename Seq::DataType RawSeqData;
-
- unsigned k_;
- KMerMap mapping_;
- bool verification_on_;
- bool normalized_;
-
- bool CheckAllDifferent(const Sequence &old_s, const Sequence &new_s) const {
- std::set<Kmer> kmers;
- Kmer kmer = old_s.start<Kmer>(k_) >> 0;
- for (size_t i = k_ - 1; i < old_s.size(); ++i) {
- kmer <<= old_s[i];
- kmers.insert(kmer);
- }
- kmer = new_s.start<Kmer>(k_) >> 0;
- for (size_t i = k_ - 1; i < new_s.size(); ++i) {
- kmer <<= new_s[i];
- kmers.insert(kmer);
- }
- return kmers.size() == old_s.size() - k_ + 1 + new_s.size() - k_ + 1;
- }
-
-public:
- KmerMapper(const Graph &g, bool verification_on = true) :
- base(g, "KmerMapper"), k_(unsigned(g.k() + 1)), mapping_(k_), verification_on_(verification_on), normalized_(false) {
- }
-
- virtual ~KmerMapper() {}
-
- unsigned get_k() const { return k_; }
-
- auto begin() const -> decltype(mapping_.begin()) {
- return mapping_.begin();
- }
-
- auto end() const -> decltype(mapping_.end()) {
- return mapping_.end();
- }
-
- void Normalize() {
- if (normalized_)
- return;
-
- KMerVector<Kmer> all(k_, size());
- for (auto it = begin(); it != end(); ++it)
- all.push_back(it->first);
-
- for (auto it = all.begin(); it != all.end(); ++it) {
- Seq val(k_, it.data());
- Normalize(val);
- }
- normalized_ = true;
- }
-
- void Revert(const Kmer &kmer) {
- Kmer old_value = Substitute(kmer);
- if (old_value != kmer) {
- mapping_.erase(kmer);
- mapping_.set(old_value, kmer);
- normalized_ = false;
- }
- }
-
- void Normalize(const Kmer &kmer) {
- mapping_.set(kmer, Substitute(kmer));
- }
-
- bool CheckCanRemap(const Sequence &old_s, const Sequence &new_s) const {
- if (!CheckAllDifferent(old_s, new_s))
- return false;
-
- size_t old_length = old_s.size() - k_ + 1;
- size_t new_length = new_s.size() - k_ + 1;
- UniformPositionAligner aligner(old_s.size() - k_ + 1,
- new_s.size() - k_ + 1);
- Kmer old_kmer = old_s.start<Kmer>(k_);
- old_kmer >>= 0;
- for (size_t i = k_ - 1; i < old_s.size(); ++i) {
- old_kmer <<= old_s[i];
- size_t old_kmer_offset = i - k_ + 1;
- size_t new_kmer_offest = aligner.GetPosition(old_kmer_offset);
- if (old_kmer_offset * 2 + 1 == old_length && new_length % 2 == 0) {
- Kmer middle(k_ - 1, new_s, new_length / 2);
- if (typename Kmer::less2()(middle, !middle)) {
- new_kmer_offest = new_length - 1 - new_kmer_offest;
- }
- }
- Kmer new_kmer(k_, new_s, new_kmer_offest);
- if (mapping_.count(new_kmer)) {
- if (Substitute(new_kmer) != old_kmer) {
- return false;
- }
- }
- }
- return true;
- }
-
- void RemapKmers(const Sequence &old_s, const Sequence &new_s) {
- VERIFY(this->IsAttached());
- size_t old_length = old_s.size() - k_ + 1;
- size_t new_length = new_s.size() - k_ + 1;
- UniformPositionAligner aligner(old_s.size() - k_ + 1,
- new_s.size() - k_ + 1);
- Kmer old_kmer = old_s.start<Kmer>(k_);
-
- for (size_t i = k_ - 1; i < old_s.size(); ++i) {
-            // The first k-mer is already loaded; only shift in a new nucleotide on subsequent iterations
- if (i != k_ - 1) {
- old_kmer <<= old_s[i];
- }
-
- size_t old_kmer_offset = i - k_ + 1;
- size_t new_kmer_offest = aligner.GetPosition(old_kmer_offset);
- if (old_kmer_offset * 2 + 1 == old_length && new_length % 2 == 0) {
- Kmer middle(k_-1, new_s, new_length / 2);
- if (typename Kmer::less2()(middle, !middle)) {
- new_kmer_offest = new_length - 1 - new_kmer_offest;
- }
- }
- Kmer new_kmer(k_, new_s, new_kmer_offest);
- if (mapping_.count(new_kmer)) {
- if (verification_on_)
- VERIFY(Substitute(new_kmer) == old_kmer);
- mapping_.erase(new_kmer);
- }
- if (old_kmer != new_kmer) {
- mapping_.set(old_kmer, new_kmer);
- normalized_ = false;
- }
- }
- }
-
- void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) override {
- VERIFY(this->g().EdgeNucls(new_edge) == this->g().EdgeNucls(edge2));
- RemapKmers(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2));
- }
-
- Kmer Substitute(const Kmer &kmer) const {
- VERIFY(this->IsAttached());
- Kmer answer = kmer;
- const auto *rawval = mapping_.find(answer);
- while (rawval != nullptr) {
- Seq val(k_, rawval);
- if (verification_on_)
- VERIFY(answer != val);
-
- answer = val;
- rawval = mapping_.find(answer);
- }
- return answer;
- }
-
- bool CanSubstitute(const Kmer &kmer) const {
- const auto *rawval = mapping_.find(kmer);
- return rawval != nullptr;
- }
-
- void BinWrite(std::ostream &file) const {
- uint32_t sz = (uint32_t)size();
- file.write((const char *) &sz, sizeof(uint32_t));
-
- for (auto iter = begin(); iter != end(); ++iter) {
- Kmer::BinWrite(file, iter->first);
- Kmer::BinWrite(file, iter->second);
- }
- }
-
- void BinRead(std::istream &file) {
- clear();
-
- uint32_t size;
- file.read((char *) &size, sizeof(uint32_t));
- for (uint32_t i = 0; i < size; ++i) {
- Kmer key(k_);
- Seq value(k_);
- Kmer::BinRead(file, &key);
- Seq::BinRead(file, &value);
- mapping_.set(key, value);
- }
- normalized_ = false;
- }
-
- bool CompareTo(KmerMapper<Graph> const &m) {
- if (size() != m.size()) {
- INFO("Unequal sizes");
- return false;
- }
-
- for (auto iter = begin(); iter != end(); ++iter) {
-            //KMerMap::find returns a raw pointer to the stored raw value (or nullptr)
-            const auto *cmp = m.mapping_.find(iter->first);
-            if (cmp == nullptr || Seq(k_, cmp) != iter->second) {
- return false;
- }
- }
- return true;
- }
-
- void clear() {
- normalized_ = false;
- return mapping_.clear();
- }
-
- size_t size() const {
- return mapping_.size();
- }
-
-    // turn_on == true disables all verification checks
- void SetUnsafeMode(bool turn_on) {
- verification_on_ = !turn_on;
- }
-};
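-
-//Usage sketch (illustrative only): resolving a k-mer through the mapper,
-//following substitution chains to the final representative. Assumes the
-//mapper is attached to its graph.
-template<class Graph>
-runtime_k::RtSeq ResolveKmerExample(const KmerMapper<Graph> &mapper,
-                                    const runtime_k::RtSeq &kmer) {
-    if (!mapper.CanSubstitute(kmer))
-        return kmer;
-    return mapper.Substitute(kmer);
-}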
-
-}
diff --git a/src/modules/assembly_graph/graph_alignment/kmer_mapper_logger.hpp b/src/modules/assembly_graph/graph_alignment/kmer_mapper_logger.hpp
deleted file mode 100644
index bb9ebe2..0000000
--- a/src/modules/assembly_graph/graph_alignment/kmer_mapper_logger.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * kmer_mapper_logger.hpp
- *
- * Created on: Nov 27, 2012
- * Author: alex
- */
-
-#ifndef KMER_MAPPER_LOGGER_H_
-#define KMER_MAPPER_LOGGER_H_
-
-#include "data_structures/sequence/sequence.hpp"
-#include "assembly_graph/graph_core/action_handlers.hpp"
-#include "dev_support/standard_base.hpp"
-
-namespace debruijn {
-
-template<class Graph>
-class KmerMapperLogger : public omnigraph::GraphActionHandler<Graph> {
-public:
- typedef pair<Sequence, Sequence> MappedSeq;
- typedef typename Graph::EdgeId EdgeId;
-
- KmerMapperLogger(Graph& graph) : GraphActionHandler<Graph>(graph, "KmerMapperLogger") {}
- virtual ~KmerMapperLogger() {}
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- log_.push_back(MappedSeq(this->g().EdgeNucls(edge1), this->g().EdgeNucls(edge2)));
- }
-
- const vector<MappedSeq>& log() const {
- return log_;
- }
-
- vector<MappedSeq> log_;
-};
-
-} /* namespace debruijn */
-#endif /* KMER_MAPPER_LOGGER_H_ */
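The KmerMapperLogger deleted above is a passive observer: on every glue event it only records the nucleotide sequences of the two edges being identified. A reduced sketch of that pattern outside the omnigraph::GraphActionHandler hierarchy (GlueLogger and the plain string arguments are illustrative stand-ins, not SPAdes types):

    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    // Minimal glue-event listener: it just remembers what was glued.
    class GlueLogger {
    public:
        using MappedSeq = std::pair<std::string, std::string>;

        void HandleGlue(const std::string& nucls1, const std::string& nucls2) {
            log_.push_back({nucls1, nucls2});
        }

        const std::vector<MappedSeq>& log() const { return log_; }

    private:
        std::vector<MappedSeq> log_;
    };

    int main() {
        GlueLogger logger;
        logger.HandleGlue("ACGTACGT", "ACGAACGT");  // pretend two edges were glued
        for (const auto& entry : logger.log())
            std::cout << entry.first << " -> " << entry.second << "\n";
        return 0;
    }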
diff --git a/src/modules/assembly_graph/graph_alignment/long_read_mapper.hpp b/src/modules/assembly_graph/graph_alignment/long_read_mapper.hpp
deleted file mode 100644
index 654bc21..0000000
--- a/src/modules/assembly_graph/graph_alignment/long_read_mapper.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * long_read_mapper.hpp
- *
- * Created on: Jun 17, 2013
- * Author: andrey
- */
-
-#ifndef LONG_READ_MAPPER_HPP_
-#define LONG_READ_MAPPER_HPP_
-
-#include "assembly_graph/graph_alignment/long_read_storage.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
-
-namespace debruijn_graph {
-
-class AbstractLongReadMapper: public SequenceMapperListener {
-public:
- AbstractLongReadMapper(conj_graph_pack& gp, PathStorage<conj_graph_pack::graph_t>& storage)
- : gp_(gp), storage_(storage), path_finder_(gp_.g) {
- }
-
- void StartProcessLibrary(size_t threads_count) override {
- for (size_t i = 0; i < threads_count; ++i)
- buffer_storages_.emplace_back(gp_.g);
- }
-
- void StopProcessLibrary() override {
- for (size_t i = 0; i < buffer_storages_.size(); ++i) {
- MergeBuffer(i);
- }
- buffer_storages_.clear();
- }
-
- void MergeBuffer(size_t thread_index) override {
- DEBUG("Merge buffer " << thread_index << " with size " << buffer_storages_[thread_index].size());
- storage_.AddStorage(buffer_storages_[thread_index]);
- buffer_storages_[thread_index].Clear();
- DEBUG("Now size " << storage_.size());
- }
-
- void ProcessPairedRead(size_t ,
- const io::PairedReadSeq&,
- const MappingPath<EdgeId>& ,
- const MappingPath<EdgeId>&) override {
- //nothing to do
- }
-
- void ProcessPairedRead(size_t ,
- const io::PairedRead&,
- const MappingPath<EdgeId>& ,
- const MappingPath<EdgeId>&) override {
- //nothing to do
- }
-
- void ProcessSingleRead(size_t thread_index,
- const io::SingleRead&,
- const MappingPath<EdgeId>& read) override {
- ProcessSingleRead(thread_index, read);
- }
-
- void ProcessSingleRead(size_t thread_index,
- const io::SingleReadSeq&,
- const MappingPath<EdgeId>& read) override {
- ProcessSingleRead(thread_index, read);
- }
-
- PathStorage<conj_graph_pack::graph_t>& GetPaths() {
- return storage_;
- }
-
-private:
-
- virtual void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& read) = 0;
-
-protected:
- conj_graph_pack& gp_;
- PathStorage<conj_graph_pack::graph_t>& storage_;
- ReadPathFinder<conj_graph_pack::graph_t> path_finder_;
- std::vector<PathStorage<conj_graph_pack::graph_t> > buffer_storages_;
-
-};
-
-class SimpleLongReadMapper: public AbstractLongReadMapper {
-public:
- SimpleLongReadMapper(conj_graph_pack& gp, PathStorage<conj_graph_pack::graph_t>& storage)
- : AbstractLongReadMapper(gp, storage) {
- }
-
-private:
-
- void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& read) override {
- vector<EdgeId> path = path_finder_.FindReadPath(read);
- buffer_storages_[thread_index].AddPath(path, 1, false);
- }
-};
-
-class GappedLongReadMapper : public AbstractLongReadMapper {
-private:
- typedef MappingPathFixer<Graph> GraphMappingPathFixer;
- const GraphMappingPathFixer path_fixer_;
- const double MIN_MAPPED_RATIO = 0.3;
- const size_t MIN_MAPPED_LENGTH = 100;
-public:
- GappedLongReadMapper(conj_graph_pack& gp, PathStorage<conj_graph_pack::graph_t>& storage)
- : AbstractLongReadMapper(gp, storage), path_fixer_(gp.g) {
- }
-
-private:
-
- size_t CountMappedEdgeSize(EdgeId edge, const MappingPath<EdgeId>& mapping_path, size_t& mapping_index) const {
- while(mapping_path[mapping_index].first != edge) {
- mapping_index++;
- }
- size_t start_idx = mapping_index;
-
- while(mapping_path[mapping_index].first == edge) {
- mapping_index++;
- if(mapping_index >= mapping_path.size()) {
- break;
- }
- }
- size_t end_idx = mapping_index;
- size_t total_len = 0;
- for(size_t i = start_idx; i < end_idx; ++i) {
- total_len += mapping_path[i].second.initial_range.size();
- }
-
- return total_len;
- }
-
- vector<EdgeId> FilterBadMappings(const vector<EdgeId>& corrected_path, const MappingPath<EdgeId>& mapping_path) const {
- vector<EdgeId> new_corrected_path;
- size_t mapping_index = 0;
- for (auto edge : corrected_path) {
- size_t mapping_size = CountMappedEdgeSize(edge, mapping_path, mapping_index);
- size_t edge_len = gp_.g.length(edge);
- //VERIFY(edge_len >= mapping_size);
- if (mapping_size > MIN_MAPPED_LENGTH ||
- math::gr((double) mapping_size / (double) edge_len, MIN_MAPPED_RATIO)) {
- new_corrected_path.push_back(edge);
- }
- }
- return new_corrected_path;
- }
-
-
- void ProcessSingleRead(size_t thread_index, const MappingPath<EdgeId>& read) override {
- vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
- read.simple_path());
- corrected_path = FilterBadMappings(corrected_path, read);
- vector<vector<EdgeId>> paths = FindReadPathWithGaps(read, corrected_path);
- for(auto path : paths) {
- buffer_storages_[thread_index].AddPath(path, 1, false);
- }
- }
-
- vector<vector<EdgeId>> FindReadPathWithGaps(const MappingPath<EdgeId>& mapping_path, vector<EdgeId>& corrected_path) const {
- if (mapping_path.size() == 0) {
- TRACE("read unmapped");
- return vector<vector<EdgeId>>();
- }
- vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
- return SplitUnfixedPoints(fixed_path);
- }
-
- vector<vector<EdgeId>> SplitUnfixedPoints(vector<EdgeId>& path) const {
- vector<vector<EdgeId>> result;
- size_t prev_start = 0;
- for (size_t i = 1; i < path.size(); ++i) {
- if (gp_.g.EdgeEnd(path[i - 1]) != gp_.g.EdgeStart(path[i])) {
- result.push_back(vector<EdgeId>(path.begin() + prev_start, path.begin() + i));
- prev_start = i;
- }
- }
- result.push_back(vector<EdgeId>(path.begin() + prev_start, path.end()));
- return result;
- }
-};
-
-
-}/*longreads*/
-
-#endif /* LONG_READ_MAPPER_HPP_ */
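The GappedLongReadMapper deleted above first drops poorly mapped edges (FilterBadMappings) and then, in SplitUnfixedPoints, cuts the corrected path wherever two consecutive edges are no longer adjacent in the graph, so every surviving piece is a valid walk. The splitting step in isolation, sketched with integer edge ids and a caller-supplied adjacency predicate in place of g.EdgeEnd()/g.EdgeStart() (the names are illustrative):

    #include <functional>
    #include <iostream>
    #include <vector>

    // Split a path into maximal runs of consecutively adjacent edges.
    std::vector<std::vector<int>> SplitUnfixedPoints(
            const std::vector<int>& path,
            const std::function<bool(int, int)>& adjacent) {
        std::vector<std::vector<int>> result;
        if (path.empty())
            return result;
        size_t prev_start = 0;
        for (size_t i = 1; i < path.size(); ++i) {
            if (!adjacent(path[i - 1], path[i])) {
                result.emplace_back(path.begin() + prev_start, path.begin() + i);
                prev_start = i;
            }
        }
        result.emplace_back(path.begin() + prev_start, path.end());
        return result;
    }

    int main() {
        // Toy adjacency: edges are adjacent iff their ids are consecutive.
        auto adjacent = [](int a, int b) { return b == a + 1; };
        auto pieces = SplitUnfixedPoints({1, 2, 3, 7, 8, 12}, adjacent);
        for (const auto& piece : pieces) {          // three pieces: 1 2 3 | 7 8 | 12
            for (int e : piece) std::cout << e << " ";
            std::cout << "\n";
        }
        return 0;
    }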
diff --git a/src/modules/assembly_graph/graph_alignment/long_read_storage.hpp b/src/modules/assembly_graph/graph_alignment/long_read_storage.hpp
deleted file mode 100644
index 44bf89e..0000000
--- a/src/modules/assembly_graph/graph_alignment/long_read_storage.hpp
+++ /dev/null
@@ -1,376 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * long_edge_storage.hpp
- *
- * Created on: Feb 7, 2013
- * Author: lab42
- */
-
-#pragma once
-
-#include <algorithm>
-
-namespace debruijn_graph {
-
-template<class Graph>
-class PathInfo {
-public:
- typedef typename Graph::EdgeId EdgeId;
- vector<EdgeId> path;
-
-private:
- mutable size_t w;
-
-public:
- vector<EdgeId> getPath() const {
- return path;
- }
-
- size_t getWeight() const {
- return w;
- }
-
- void increaseWeight(int addition = 1) const {
- w += addition;
- }
-
- bool operator<(const PathInfo<Graph> &other) const {
- return path < other.path;
- }
-
- PathInfo(const vector<EdgeId> &p, size_t weight = 0) :
- path(p), w(weight) {
- }
- PathInfo(const PathInfo<Graph> &other) {
- path = other.path;
- w = other.w;
- }
-
- string str(Graph &g_) {
- stringstream s;
- for(auto iter = path.begin(); iter != path.end(); iter ++ ){
- s << g_.int_id(*iter) << " ";
- }
- return s.str();
- }
-
-};
-
-template<class Graph>
-class PathStorage {
- friend class PathInfo<Graph> ;
- typedef typename Graph::EdgeId EdgeId;
- typedef map<EdgeId, set<PathInfo<Graph> > > InnerIndex;
-private:
- Graph &g_;
- InnerIndex inner_index_;
- const size_t kLongEdgeForStats = 500;
-
- void HiddenAddPath(const vector<EdgeId> &p, int w){
- if (p.size() == 0 ) return;
- for (typename set<PathInfo<Graph> >::iterator iter = inner_index_[p[0]].begin(); iter != inner_index_[p[0]].end(); ++iter) {
-
- if (iter->path == p) {
- iter->increaseWeight(w);
- return;
- }
- }
- inner_index_[p[0]].insert(PathInfo<Graph>(p, w));
- size_++;
- }
-
-public:
-
- PathStorage(Graph &g)
- : g_(g),
- inner_index_(),
- size_(0) {
- }
- PathStorage(const PathStorage & p)
- : g_(p.g_),
- inner_index_(),
- size_(0) {
- for (auto iter = p.inner_index_.begin(); iter != p.inner_index_.end();
- iter++) {
- for (auto j_iter = iter->second.begin();
- j_iter != iter->second.end(); j_iter++) {
- this->AddPath(j_iter->path, (int) j_iter->getWeight());
- }
- }
- }
- void ReplaceEdges(map<EdgeId, EdgeId> &old_to_new){
- map<int, EdgeId> tmp_map;
-// for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter ){
-// tmp_map[g_.int_id(*iter)] = *iter;
-// }
- InnerIndex new_index;
- for (auto iter = inner_index_.begin(); iter != inner_index_.end(); iter++) {
- auto tmp = iter->second;
- EdgeId new_first;
- if (old_to_new.find(iter->first) == old_to_new.end())
- new_first = iter->first;
- else {
- DEBUG("new first edge: "<< g_.int_id(old_to_new[iter->first]) << " with " << tmp.size() << " edges ");
- new_first = old_to_new[iter->first];
- }
- set<PathInfo<Graph> > new_tmp;
- for (auto j_iter = tmp.begin(); j_iter != tmp.end(); j_iter++) {
- PathInfo<Graph> pi = *(j_iter);
- for (size_t k = 0; k < pi.path.size(); k++)
- if (old_to_new.find(pi.path[k]) != old_to_new.end()) {
-// INFO(g_.int_id(old_to_new[pi.path[k]]));
- pi.path[k] = old_to_new[pi.path[k]];
- }
- DEBUG(pi.str(g_));
- new_tmp.insert(pi);
-
- }
- if (new_first != iter->first) {
- TRACE("and mmew_tmp.size: "<< new_tmp.size());
- }
- if (new_index.find(new_first) == new_index.end()) {
- new_index[new_first] = new_tmp;
- } else {
- for (auto j_iter = new_tmp.begin(); j_iter != new_tmp.end(); j_iter++) {
- new_index[new_first].insert(*j_iter);
- }
- }
-
- }
-
- inner_index_ = new_index;
- }
-
- void AddPath(const vector<EdgeId> &p, int w, bool add_rc = false) {
- HiddenAddPath(p, w);
- if (add_rc) {
- vector<EdgeId> rc_p(p.size());
- for (size_t i = 0; i < p.size(); i++)
- rc_p[i] = g_.conjugate(p[p.size() - 1 - i]);
- HiddenAddPath(rc_p, w);
- }
- }
- void DumpToFile(const string filename) const{
- map <EdgeId, EdgeId> auxilary;
- DumpToFile(filename, auxilary);
- }
- void DumpToFile(const string filename, map<EdgeId, EdgeId> &replacement, size_t stats_weight_cutoff = 1, bool need_log = false) const {
- ofstream filestr(filename);
- set<EdgeId> continued_edges;
-
- for(auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter){
- filestr<< iter->second.size() << endl;
- int non1 = 0;
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- filestr << " Weight: " << j_iter->getWeight();
- if (j_iter->getWeight() > stats_weight_cutoff)
- non1++;
-
- filestr << " length: " << j_iter->path.size() << " ";
- for (auto p_iter = j_iter->path.begin(); p_iter != j_iter->path.end(); ++p_iter) {
- if (p_iter != j_iter->path.end() - 1 && j_iter->getWeight() > stats_weight_cutoff) {
- continued_edges.insert(*p_iter);
- }
-
- filestr << g_.int_id(*p_iter) << "(" << g_.length(*p_iter) << ") ";
- }
- filestr << endl;
- }
- filestr << endl;
- }
-
- int noncontinued = 0;
- int long_gapped = 0;
- int continued = 0;
- if (need_log) {
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (g_.length(*iter) > kLongEdgeForStats) {
- if (!g_.IsDeadEnd(g_.EdgeEnd(*iter))) {
- if (continued_edges.find(*iter) == continued_edges.end()) {
- if ((replacement.find(*iter) != replacement.end() &&
- continued_edges.find(replacement[*iter]) != continued_edges.end())) {
- TRACE("found in teplacement, edges " << g_.int_id(*iter) << " " <<
- g_.int_id(replacement[*iter]) << " skipping ");
- continue;
- }
- TRACE("noncontinued end left " << g_.int_id(*iter));
- noncontinued++;
- } else
- continued++;
- } else {
- TRACE("dead end left " << g_.int_id(*iter));
- long_gapped++;
- }
- }
- }
- INFO("After PacBio (long reads) aligning, for edges longer than " << kLongEdgeForStats << ":");
- INFO("No continuation found for " << noncontinued + long_gapped << " edges of " <<
- noncontinued + continued + long_gapped);
- }
- }
-
- vector<PathInfo<Graph> > GetAllPaths() const {
- vector<PathInfo<Graph> > res;
- for (auto iter = inner_index_.begin(); iter != inner_index_.end();
- ++iter) {
- for (auto j_iter = iter->second.begin();
- j_iter != iter->second.end(); ++j_iter) {
-
- res.push_back(*j_iter);
- }
- }
- return res;
- }
-
-
- vector<PathInfo<Graph> > GetAllPathsNoConjugate() {
- vector<PathInfo<Graph> > res;
-
- std::set< PathInfo<Graph> > added;
- for (auto iter = inner_index_.begin(); iter != inner_index_.end(); ++iter) {
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- if (added.count(*j_iter) > 0) {
- continue;
- }
-
- added.insert(*j_iter);
- vector<EdgeId> rc_p(j_iter->path.size()) ;
- for (size_t i = 0; i < j_iter->path.size(); i++) {
- rc_p[i] = g_.conjugate(j_iter->path[j_iter->path.size() - 1 - i]);
- }
- added.insert(PathInfo<Graph>(rc_p, j_iter->getWeight()));
-
- res.push_back(*j_iter);
- }
- }
- return res;
- }
-
-
- void LoadFromFile(const string s, bool force_exists = true) {
- FILE* file = fopen(s.c_str(), "r");
- if (force_exists) {
- VERIFY(file != NULL);
- } else if (file == NULL) {
- INFO("Long reads not found, skipping");
- return;
- }
- fclose(file);
-
- INFO("Loading long reads alignment...");
- ifstream filestr(s);
- INFO("loading from " << s);
- map<size_t, EdgeId> tmp_map;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- tmp_map[g_.int_id(*iter)] = *iter;
- }
- int fl;
-
- file = fopen((s).c_str(), "r");
- char ss[14];
- while (!feof(file)) {
- int n;
-
- fl = fscanf(file, "%d\n", &n);
- if (fl != 1)
- break;
- TRACE(n);
- for (int i = 0; i < n; i++) {
-
- int w = -1, l = -1;
- fl = fscanf(file, "Weight: %d length: %d", &w, &l);
- TRACE(w << " " << l);
- VERIFY(fl == 2);
- vector<EdgeId> p;
- for (int j = 0; j < l; j++) {
- size_t e;
- int x;
- fl = fscanf(file, "%zu(%d)", &e, &x);
- VERIFY(fl == 2);
- VERIFY(tmp_map.find(e) != tmp_map.end());
- p.push_back(tmp_map[e]);
- }
- fl = fscanf(file, "%[^\n]\n", ss);
- TRACE(ss[0]);
- AddPath(p, w);
- }
- }
- fclose(file);
- INFO("Loading finished.");
- }
-
- void AddStorage(PathStorage<Graph> & to_add) {
-
- for(auto iter = to_add.inner_index_.begin(); iter != to_add.inner_index_.end(); iter++) {
- for(auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter ++) {
- this->AddPath(j_iter->path, (int) j_iter->getWeight());
- }
- }
- }
-
- void Clear() {
- inner_index_.clear();
- size_ = 0;
- }
-
- size_t size() {
- return size_;
- }
-
-// typename InnerIndex::iterator begin() const {
-// return inner_index.begin();
-// }
-//
-// typename InnerIndex::iterator end() const {
-// return inner_index.end();
-// }
-// typename InnerIndex::iterator operator*(){
-// return this->first;
-// }
-private:
- size_t size_;
-};
-
-template<class Graph>
-class LongReadContainer {
- Graph& g_;
- vector<PathStorage<Graph>> data_;
-
-public:
-
- LongReadContainer(Graph& g, size_t count = 0): g_(g) {
- for (size_t i = 0; i < count; ++i) {
- data_.emplace_back(g_);
- }
- }
-
- PathStorage<Graph>& operator[](size_t index) {
- return data_[index];
- }
-
- const PathStorage<Graph>& operator[](size_t index) const {
- return data_[index];
- }
-
- size_t size() const {
- return data_.size();
- }
-
- void Clear() {
- for (auto& storage : data_) {
- storage.Clear();
- }
- }
-
-};
-
-
-}
-
-
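The PathStorage deleted above deduplicates long-read paths: HiddenAddPath scans the bucket of the path's first edge and either bumps the weight of an identical stored path or inserts a new entry, optionally also registering the reverse-complement path. The same bookkeeping condensed into a toy class keyed by the whole path (integer edge ids and plain reversal stand in for EdgeId and g.conjugate(); a sketch of the idea, not the SPAdes data layout):

    #include <iostream>
    #include <map>
    #include <vector>

    class ToyPathStorage {
    public:
        void AddPath(const std::vector<int>& path, int weight, bool add_rc = false) {
            if (path.empty())
                return;
            weights_[path] += weight;  // same path seen again: weight accumulates
            if (add_rc) {
                // The real storage adds the conjugate path; here we just reverse ids.
                std::vector<int> rc(path.rbegin(), path.rend());
                weights_[rc] += weight;
            }
        }

        int WeightOf(const std::vector<int>& path) const {
            auto it = weights_.find(path);
            return it == weights_.end() ? 0 : it->second;
        }

        size_t size() const { return weights_.size(); }

    private:
        std::map<std::vector<int>, int> weights_;
    };

    int main() {
        ToyPathStorage storage;
        storage.AddPath({1, 2, 3}, 1);
        storage.AddPath({1, 2, 3}, 1);      // duplicate path
        storage.AddPath({4, 5}, 1, true);   // also registers the "conjugate" 5,4
        std::cout << storage.WeightOf({1, 2, 3}) << " " << storage.size() << "\n";  // 2 3
        return 0;
    }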
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp
deleted file mode 100644
index 0a1c55a..0000000
--- a/src/modules/assembly_graph/graph_alignment/pacbio/pac_index.hpp
+++ /dev/null
@@ -1,824 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "data_structures/indices/edge_multi_index.hpp"
-#include "assembly_graph/graph_alignment/edge_index_refiller.hpp"
-#include "assembly_graph/paths/mapping_path.hpp"
-#include "assembly_graph/paths/path_processor.hpp"
-// FIXME: Layering violation, get rid of this
-#include "pipeline/config_struct.hpp"
-#include "pacbio_read_structures.hpp"
-#include "pipeline/config_struct.hpp"
-
-#include <algorithm>
-
-namespace pacbio {
-enum {
- UNDEF_COLOR = -1,
- DELETED_COLOR = - 2
-};
-
-template<class Graph>
-class PacBioMappingIndex {
-public:
- typedef map<typename Graph::EdgeId, vector<MappingInstance> > MappingDescription;
- typedef pair<typename Graph::EdgeId, vector<MappingInstance> > ClusterDescription;
- typedef set<KmerCluster<Graph> > ClustersSet;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> Index;
- typedef typename Index::KeyWithHash KeyWithHash;
-
-private:
- DECL_LOGGER("PacIndex")
-
- const Graph &g_;
- size_t pacbio_k;
- size_t debruijn_k;
- const static int short_edge_cutoff = 0;
- const static size_t min_cluster_size = 8;
- const static int max_similarity_distance = 500;
-
-// Debug stats
- int good_follow = 0;
- int half_bad_follow = 0;
- int bad_follow = 0;
-
- set<Sequence> banned_kmers;
- debruijn_graph::DeBruijnEdgeMultiIndex<typename Graph::EdgeId> tmp_index;
- map<pair<VertexId, VertexId>, vector<size_t> > distance_cashed;
- size_t read_count;
- bool ignore_map_to_middle;
- debruijn_graph::config::debruijn_config::pacbio_processor pb_config_;
-public:
- MappingDescription Locate(const Sequence &s) const;
-
- PacBioMappingIndex(const Graph &g, size_t k, size_t debruijn_k_, bool ignore_map_to_middle, string out_dir, debruijn_graph::config::debruijn_config::pacbio_processor pb_config )
- : g_(g),
- pacbio_k(k),
- debruijn_k(debruijn_k_),
- tmp_index((unsigned) pacbio_k, out_dir), ignore_map_to_middle(ignore_map_to_middle), pb_config_(pb_config) {
- DEBUG("PB Mapping Index construction started");
- debruijn_graph::EdgeIndexRefiller().Refill(tmp_index, g_);
- INFO("Index constructed");
- FillBannedKmers();
- read_count = 0;
- }
- ~PacBioMappingIndex(){
- DEBUG("good/ugly/bad counts:" << good_follow << " "<<half_bad_follow << " " << bad_follow);
- }
-
- void FillBannedKmers() {
- for (int i = 0; i < 4; i++) {
- auto base = nucl((unsigned char) i);
- for (int j = 0; j < 4; j++) {
- auto other = nucl((unsigned char) j);
- for (size_t other_pos = 0; other_pos < pacbio_k; other_pos++) {
- string s = "";
- for (size_t k = 0; k < pacbio_k; k++) {
- if (k != other_pos)
- s += base;
- else
- s += other;
- }
- banned_kmers.insert(Sequence(s));
- }
- }
- }
- }
-
- bool similar(const MappingInstance &a, const MappingInstance &b,
- int shift = 0) const {
- if (b.read_position + shift < a.read_position) {
- return similar(b, a, -shift);
- } else if (b.read_position == a.read_position) {
- return (abs(int(b.edge_position) + shift - int(a.edge_position)) < 2);
- } else {
- return ((b.edge_position + shift - a.edge_position >= (b.read_position - a.read_position) * pb_config_.compression_cutoff) &&
- ((b.edge_position + shift - a.edge_position) * pb_config_.compression_cutoff <= (b.read_position - a.read_position)));
- }
- }
-
- void dfs_cluster(vector<int> &used, vector<MappingInstance> &to_add,
- const int cur_ind,
- const typename MappingDescription::iterator iter) const {
- size_t len = iter->second.size();
- for (size_t k = 0; k < len; k++) {
- if (!used[k] && similar(iter->second[cur_ind], iter->second[k])) {
- to_add.push_back(iter->second[k]);
- used[k] = 1;
- dfs_cluster(used, to_add, (int) k, iter);
- }
- }
- }
-
- void dfs_cluster_norec(vector<int> &used, vector<MappingInstance> &to_add,
- const size_t cur_ind,
- const typename MappingDescription::iterator iter, vector<vector<size_t> > &similarity_list) const {
- std::deque<size_t> stack;
- stack.push_back(cur_ind);
- used[cur_ind] = 1;
- while (stack.size() > 0) {
- size_t k = stack.back();
- stack.pop_back();
- to_add.push_back(iter->second[k]);
-
- for (size_t i = 0; i < similarity_list[k].size(); i++) {
- if (!used[similarity_list[k][i]]) {
- stack.push_back(similarity_list[k][i]);
- used[similarity_list[k][i]] = 1;
- }
- }
- }
- }
-
- ClustersSet GetOrderClusters(const Sequence &s) const {
- MappingDescription descr = Locate(s);
- ClustersSet res;
- TRACE(read_count << " read_count");
-
- DEBUG(descr.size() <<" clusters");
- for (auto iter = descr.begin(); iter != descr.end(); ++iter) {
- size_t edge_id = g_.int_id(iter->first);
- DEBUG(edge_id);
- sort(iter->second.begin(), iter->second.end(), ReadPositionComparator());
- set<vector<MappingInstance> > edge_cluster_set;
- size_t len = iter->second.size();
- vector<vector<size_t> > similarity_list(len);
- int cnt = 0;
- for (size_t i = 0; i < len; i++){
- for (size_t j = i + 1; j < len; j++){
- if (iter->second[i].read_position + max_similarity_distance < iter->second[j].read_position) {
- break;
- }
- if (similar(iter->second[i], iter->second[j])) {
- similarity_list[i].push_back(j);
- cnt ++;
- if (cnt % 10000 == 0) {
- DEBUG(cnt);
- }
- }
- }
- }
-
- DEBUG(len <<" kmers in cluster");
- vector<int> used(len);
- for (size_t i = 0; i < len; i++) {
- if (!used[i]) {
- vector<size_t> new_cluster(len);
- vector<size_t> prev(len);
- for(size_t j = i; j < len; j++) {
- if (!used[j]) {
- if (new_cluster[j] == 0) new_cluster[j] = 1, prev[j] = size_t(-1);
- for(size_t k = 0; k < similarity_list[j].size(); k++) {
- size_t next_ind = similarity_list[j][k];
- if (!used[next_ind]) {
- if (new_cluster[next_ind] < new_cluster[j] + 1){
- new_cluster[next_ind] = new_cluster[j] + 1;
- prev[next_ind] = j;
- }
- }
- }
- }
- }
- size_t maxx = 0;
- size_t maxj = i;
- for(size_t j = i; j < len; j++) {
- if (new_cluster[j] > maxx) maxj = j, maxx = new_cluster[j];
- }
- vector<MappingInstance> to_add;
- size_t real_maxj = maxj, first_j = maxj;
- while (maxj != size_t(-1)) {
- to_add.push_back(iter->second[maxj]);
- first_j = maxj;
- maxj = prev[maxj];
- }
- for (auto j = first_j; j < real_maxj; j++)
- used[j] = 1;
- reverse(to_add.begin(), to_add.end());
- TRACE("adding cluster "" edge "<< edge_id << " len " <<to_add.size() )
- res.insert(KmerCluster<Graph>(iter->first, to_add));
- }
- }
- }
- FilterClusters(res);
- return res;
- }
- //filter clusters that are too small or fully located on a vertex or dominated by some other cluster.
- void FilterClusters(ClustersSet &clusters) const {
- for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
- size_t edge_id = g_.int_id(i_iter->edgeId);
-
- int len = (int) g_.length(i_iter->edgeId);
- auto sorted_by_edge = i_iter->sorted_positions;
- sort(sorted_by_edge.begin(), sorted_by_edge.end());
- double good = 0;
- DEBUG("filtering cluster of size " << sorted_by_edge.size());
- DEBUG(edge_id <<" : edgeId");
- for (auto iter = sorted_by_edge.begin();
- iter < sorted_by_edge.end(); iter++) {
- if (iter->IsUnique())
- good++;
- //good += 1.0 / (iter->quality * iter->quality);
- }
- DEBUG("good " << good);
-
- if (good < min_cluster_size || (len < short_edge_cutoff)) {
- if (len < short_edge_cutoff) {
- DEBUG("Life is too long, and edge is too short!");
- }
- auto tmp_iter = i_iter;
- tmp_iter++;
- clusters.erase(i_iter);
- i_iter = tmp_iter;
- } else {
- if (sorted_by_edge[0].edge_position >= len
- || sorted_by_edge[i_iter->size - 1].edge_position
- <= int(debruijn_k) - int(pacbio_k)) {
- DEBUG("All anchors in vertex");
- auto tmp_iter = i_iter;
- tmp_iter++;
- clusters.erase(i_iter);
- i_iter = tmp_iter;
- } else {
- i_iter++;
- }
- }
- }
- for (auto i_iter = clusters.begin(); i_iter != clusters.end();) {
- size_t edge_id = g_.int_id(i_iter->edgeId);
- auto sorted_by_edge = i_iter->sorted_positions;
-
- DEBUG("filtering with cluster edge, stage 2 "<< edge_id << " len " << sorted_by_edge.size() << " clusters still alive: "<< clusters.size());
- for (auto j_iter = clusters.begin(); j_iter != clusters.end();) {
- if (i_iter != j_iter) {
- if (dominates(*i_iter, *j_iter)) {
- TRACE("cluster is dominated");
- auto tmp_iter = j_iter;
- tmp_iter++;
- TRACE("cluster on edge " << g_.int_id(j_iter->edgeId));
- TRACE("erased - dominated");
- clusters.erase(j_iter);
- j_iter = tmp_iter;
- } else {
- j_iter++;
- }
- } else {
- j_iter++;
- }
- }
- DEBUG("cluster size "<< i_iter->sorted_positions.size() << "survived filtering");
- i_iter++;
- }
- }
-
- // is "non strictly dominates" required?
- inline bool dominates(const KmerCluster<Graph> &a,
- const KmerCluster<Graph> &b) const {
- size_t a_size = a.size;
- size_t b_size = b.size;
- if ((double) a_size < (double) b_size * pb_config_.domination_cutoff
- || a.sorted_positions[a.first_trustable_index].read_position
- > b.sorted_positions[b.first_trustable_index].read_position
- || a.sorted_positions[a.last_trustable_index].read_position
- < b.sorted_positions[b.last_trustable_index].read_position) {
- return false;
- } else {
- return true;
- }
- }
-
- vector<EdgeId> FillGapsInCluster(vector<pair<size_t, typename ClustersSet::iterator> > &cur_cluster,
- const Sequence &s) {
- vector<EdgeId> cur_sorted;
- EdgeId prev_edge = EdgeId(0);
-
- for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
- ++iter) {
- EdgeId cur_edge = iter->second->edgeId;
- if (prev_edge != EdgeId(0)) {
-//Need to find sequence of edges between clusters
- VertexId start_v = g_.EdgeEnd(prev_edge);
- VertexId end_v = g_.EdgeStart(cur_edge);
- auto prev_iter = iter - 1;
- MappingInstance cur_first_index =
- iter->second->sorted_positions[iter->second
- ->first_trustable_index];
- MappingInstance prev_last_index = prev_iter->second
- ->sorted_positions[prev_iter->second
- ->last_trustable_index];
-
- if (start_v != end_v ||
- (start_v == end_v &&
- (double) (cur_first_index.read_position - prev_last_index.read_position) >
- (double) (cur_first_index.edge_position + (int) g_.length(prev_edge) - prev_last_index.edge_position) * 1.3)) {
- DEBUG(" traversing tangled hregion between "<< g_.int_id(prev_edge)<< " " << g_.int_id(cur_edge));
- DEBUG(" first pair" << cur_first_index.str() << " edge_len" << g_.length(cur_edge));
- DEBUG(" last pair" << prev_last_index.str() << " edge_len" << g_.length(prev_edge));
- string s_add = "";
- string e_add = "";
- int seq_end = cur_first_index.read_position;
- int seq_start = prev_last_index.read_position;
- string tmp = g_.EdgeNucls(prev_edge).str();
- s_add = tmp.substr(prev_last_index.edge_position,
- g_.length(prev_edge) - prev_last_index.edge_position);
- tmp = g_.EdgeNucls(cur_edge).str();
- e_add = tmp.substr(0, cur_first_index.edge_position);
- pair<int, int> limits = GetPathLimits(*(prev_iter->second),
- *(iter->second),
- (int) s_add.length(),
- (int) e_add.length());
- if (limits.first == -1)
- return vector<EdgeId>(0);
-
- vector<EdgeId> intermediate_path = BestScoredPath(s, start_v, end_v, limits.first, limits.second, seq_start, seq_end, s_add, e_add);
- if (intermediate_path.size() == 0) {
- DEBUG("Tangled region between edgees "<< g_.int_id(prev_edge) << " " << g_.int_id(cur_edge) << " is not closed, additions from edges: " << int(g_.length(prev_edge)) - int(prev_last_index.edge_position) <<" " << int(cur_first_index.edge_position) - int(debruijn_k - pacbio_k ) << " and seq "<< - seq_start + seq_end);
- if (pb_config_.additional_debug_info) {
- DEBUG(" escpected gap length: " << -int(g_.length(prev_edge)) + int(prev_last_index.edge_position) - int(cur_first_index.edge_position) + int(debruijn_k - pacbio_k ) - seq_start + seq_end);
- omnigraph::PathStorageCallback<Graph> callback(g_);
- ProcessPaths(g_, 0, 4000,
- start_v, end_v,
- callback);
- vector<vector<EdgeId> > paths = callback.paths();
- stringstream s_buf;
- for (auto p_iter = paths.begin();
- p_iter != paths.end(); p_iter++) {
- size_t tlen = 0;
- for (auto path_iter = p_iter->begin();
- path_iter != p_iter->end();
- path_iter++) {
- tlen += g_.length(*path_iter);
- }
- s_buf << tlen << " ";
- }
- DEBUG(s_buf.str());
- }
- return intermediate_path;
- }
- for (auto j_iter = intermediate_path.begin(); j_iter != intermediate_path.end(); j_iter++) {
- cur_sorted.push_back(*j_iter);
- }
- }
- }
- cur_sorted.push_back(cur_edge);
- prev_edge = cur_edge;
- }
- return cur_sorted;
- }
-
- bool TopologyGap(EdgeId first, EdgeId second, bool oriented) const {
- bool res = (g_.IsDeadStart(g_.EdgeStart(first)) && g_.IsDeadEnd(g_.EdgeEnd(second)));
- if (!oriented)
- res |= g_.IsDeadEnd(g_.EdgeEnd(first)) && g_.IsDeadStart(g_.EdgeStart(second));
- return res;
- }
-
- vector<int> GetWeightedColors(ClustersSet &mapping_descr, Sequence &s) {
- int len = (int) mapping_descr.size();
- DEBUG("getting colors, table size "<< len);
- vector<vector<int> > cons_table(len);
-
- vector<int> colors(len);
- vector<int> cluster_size(len);
- vector<int> max_size(len);
- vector<int> prev(len);
-
- for (int i = 0; i < len; i++) {
- cons_table[i].resize(len);
- cons_table[i][i] = 0;
- prev[i] = -1;
- }
- int i = 0;
-
- for (int i = 0; i < len; i++) {
-//-1 not initialized, -2 - removed as trash
- colors[i] = UNDEF_COLOR;
- }
- for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
- ++i_iter, ++i) {
- cluster_size[i] = i_iter->size;
- }
- i = 0;
- if (len > 1) {
- TRACE(len << "clusters");
- }
-
- for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
- ++i_iter, ++i) {
- int j = i;
- for (auto j_iter = i_iter;
- j_iter != mapping_descr.end(); ++j_iter, ++j) {
- if (i_iter == j_iter)
- continue;
- cons_table[i][j] = IsConsistent(s, *i_iter, *j_iter);
- }
- }
- i = 0;
- int cur_color = 0;
-
- while (true) {
- for (i = 0; i < len; i++) {
- max_size[i] = 0;
- prev[i] = -1;
- }
- i = 0;
- for (auto i_iter = mapping_descr.begin(); i_iter != mapping_descr.end();
- ++i_iter, ++i) {
- if (colors[i] != UNDEF_COLOR) continue;
- max_size[i] = cluster_size[i];
- for (int j = 0; j < i; j ++) {
- if (colors[j] != -1) continue;
- if (cons_table[j][i] && max_size[i] < cluster_size[i] + max_size[j]) {
- max_size[i] = max_size[j] + cluster_size[i];
- prev[i] = j;
- }
- }
- }
- int maxx = 0;
- int maxi = -1;
- for (int j = 0; j < len; j++) {
- if (max_size[j] > maxx) {
- maxx = max_size[j];
- maxi = j;
- }
- }
- if (maxi == -1) {
- break;
- }
- colors[maxi] = cur_color;
- int real_maxi = maxi, min_i = maxi;
-
- while (prev[maxi] != -1) {
- min_i = maxi;
- maxi = prev[maxi];
- colors[maxi] = cur_color;
- }
- while (real_maxi >= min_i) {
- if (colors[real_maxi] == UNDEF_COLOR) {
- colors[real_maxi] = DELETED_COLOR;
- }
- real_maxi --;
- }
- cur_color ++;
-
- }
- return colors;
- }
-
-
-
-
- OneReadMapping<Graph> GetReadAlignment(Sequence &s) {
- ClustersSet mapping_descr = GetOrderClusters(s);
- DEBUG("clusters got");
- int len = (int) mapping_descr.size();
- vector<size_t> real_length;
-
- vector<int> colors = GetWeightedColors(mapping_descr, s);
- vector<vector<EdgeId> > sortedEdges;
- vector<typename ClustersSet::iterator> start_clusters, end_clusters;
- vector<GapDescription<Graph> > illumina_gaps;
- vector<int> used(len);
- size_t used_seed_count = 0;
- auto iter = mapping_descr.begin();
- for (int i = 0; i < len; i++, iter ++) {
- used[i] = 0;
- DEBUG(colors[i] <<" " << iter->str(g_));
- }
- for (int i = 0; i < len; i++) {
- if (!used[i]) {
- DEBUG("starting new subread");
- size_t cur_seed_count = 0;
- vector<pair<size_t, typename ClustersSet::iterator> > cur_cluster;
- used[i] = 1;
- int j = 0;
- int cur_color = colors[i];
- if (cur_color == DELETED_COLOR)
- continue;
- for (auto i_iter = mapping_descr.begin();
- i_iter != mapping_descr.end(); ++i_iter, ++j) {
- if (colors[j] == cur_color) {
- cur_cluster.push_back(
- make_pair(
- i_iter->average_read_position,
- i_iter));
- used[j] = 1;
- cur_seed_count += i_iter->sorted_positions.size();
- }
- }
- sort(cur_cluster.begin(), cur_cluster.end(),
- pair_iterator_less<typename ClustersSet::iterator>());
- VERIFY(cur_cluster.size() > 0);
- //if (cur_seed_count > used_seed_count)
- used_seed_count += cur_seed_count;
- auto cur_cluster_start = cur_cluster.begin();
- for (auto iter = cur_cluster.begin(); iter != cur_cluster.end();
- ++iter) {
- auto next_iter = iter + 1;
- if (next_iter == cur_cluster.end()
- || !IsConsistent(s, *(iter->second),
- *(next_iter->second))) {
- if (next_iter != cur_cluster.end()) {
- DEBUG("clusters splitted:");
- DEBUG("on "<< iter->second->str(g_));
- DEBUG("and " << next_iter->second->str(g_));
- }
- vector<pair<size_t, typename ClustersSet::iterator> > splitted_cluster(
- cur_cluster_start, next_iter);
- vector<EdgeId> cur_sorted = FillGapsInCluster(
- splitted_cluster, s);
- if (cur_sorted.size() > 0) {
- start_clusters.push_back(cur_cluster_start->second);
- end_clusters.push_back(iter->second);
- sortedEdges.push_back(cur_sorted);
- }
- cur_cluster_start = next_iter;
- } else {
- DEBUG("connected consequtive clusters:");
- DEBUG("on "<< iter->second->str(g_));
- DEBUG("and " << next_iter->second->str(g_));
-
- }
-
- }
- }
- }
- DEBUG("adding gaps between subreads");
- int alignments = int(sortedEdges.size());
- for (int i = 0; i < alignments; i++) {
- for (int j = 0; j < alignments; j++) {
- EdgeId before_gap = sortedEdges[j][sortedEdges[j].size() - 1];
- EdgeId after_gap = sortedEdges[i][0];
-//do not add "gap" for rc-jumping
- if (before_gap != after_gap
- && before_gap != g_.conjugate(after_gap)) {
- if (i != j && TopologyGap(before_gap, after_gap, true)) {
- if (start_clusters[j]->CanFollow(*end_clusters[i])) {
- illumina_gaps.push_back(
- GapDescription<Graph>(*end_clusters[i],
- *start_clusters[j], s,
- (int) pacbio_k));
- }
-
- }
- }
- }
- }
- return OneReadMapping<Graph>(sortedEdges, illumina_gaps, real_length, used_seed_count);
- }
-
- std::pair<int, int> GetPathLimits(const KmerCluster<Graph> &a,
- const KmerCluster<Graph> &b,
- int s_add_len, int e_add_len) {
- int start_pos = a.sorted_positions[a.last_trustable_index].read_position;
- int end_pos = b.sorted_positions[b.first_trustable_index].read_position;
- int seq_len = -start_pos + end_pos;
- //int new_seq_len =
-//TODO::something more reasonable
- int path_min_len = max(int(floor((seq_len - int(debruijn_k)) * pb_config_.path_limit_pressing)), 0);
- int path_max_len = (int) ((double) (seq_len + (int) debruijn_k) * pb_config_.path_limit_stretching);
- if (seq_len < 0) {
- DEBUG("suspicious negative seq_len " << start_pos << " " << end_pos << " " << path_min_len << " " << path_max_len);
- return std::make_pair(-1, -1);
- }
- path_min_len = max(path_min_len - int(s_add_len + e_add_len), 0);
- path_max_len = max(path_max_len - int(s_add_len + e_add_len), 0);
- return std::make_pair(path_min_len, path_max_len);
- }
-
-//0 - No, 1 - Yes
- int IsConsistent(Sequence &s, const KmerCluster<Graph> &a,
- const KmerCluster<Graph> &b) {
- EdgeId a_edge = a.edgeId;
- EdgeId b_edge = b.edgeId;
- size_t a_id = g_.int_id(a_edge);
- size_t b_id = g_.int_id(b_edge);
- DEBUG("clusters on " << a_id << " and " << b_id );
- if (abs(a.sorted_positions[a.last_trustable_index].read_position - b.sorted_positions[b.first_trustable_index].read_position) > 5000) {
- DEBUG("...to far5000");
- return 0;
- }
- VertexId start_v = g_.EdgeEnd(a_edge);
- size_t addition = g_.length(a_edge);
- VertexId end_v = g_.EdgeStart(b_edge);
- pair<VertexId, VertexId> vertex_pair = make_pair(start_v, end_v);
- vector<size_t> result;
- DEBUG("seq dist:" << s.size()/3);
- if (distance_cashed.find(vertex_pair) == distance_cashed.end()) {
- omnigraph::DistancesLengthsCallback<Graph> callback(g_);
- ProcessPaths(g_, 0, s.size() / 3, start_v,
- end_v, callback);
- result = callback.distances();
- distance_cashed[vertex_pair] = result;
- } else {
- DEBUG("taking from cashed");
- }
- DEBUG("addition: " << addition << " found " << result.size() << " lengths:" );
- for (size_t i = 0; i < result.size(); i++) {
- DEBUG(result[i]);
- }
- result = distance_cashed[vertex_pair];
- //TODO: Serious optimization possible
- for (size_t i = 0; i < result.size(); i++) {
- for (auto a_iter = a.sorted_positions.begin();
- a_iter != a.sorted_positions.end(); ++a_iter) {
- if (a_iter - a.sorted_positions.begin() > 500 && a.sorted_positions.end() - a_iter >500) continue;
- int cnt = 0;
- for (auto b_iter = b.sorted_positions.begin();
- b_iter != b.sorted_positions.end() && cnt <500; ++b_iter, cnt ++) {
- if (similar(*a_iter, *b_iter,
- (int) (result[i] + addition))) {
- return 1;
- }
- }
- cnt = 0;
- if (b.sorted_positions.size() > 500) {
- for (auto b_iter = b.sorted_positions.end() - 1;
- b_iter != b.sorted_positions.begin() && cnt < 500; --b_iter, cnt ++) {
- if (similar(*a_iter, *b_iter,
- (int) (result[i] + addition))) {
- return 1;
- }
- }
- }
- }
- }
- return 0;
-
- }
-
- string PathToString(const vector<EdgeId>& path) const {
- string res = "";
- for (auto iter = path.begin(); iter != path.end(); iter++) {
- size_t len = g_.length(*iter);
- string tmp = g_.EdgeNucls(*iter).First(len).str();
- res = res + tmp;
- }
- return res;
- }
-
- vector<EdgeId> BestScoredPath(const Sequence &s, VertexId start_v, VertexId end_v,
- int path_min_length, int path_max_length,
- int start_pos, int end_pos, string &s_add,
- string &e_add) {
- DEBUG(" Traversing tangled region. Start and end vertices resp: " << g_.int_id(start_v) <<" " << g_.int_id(end_v));
- omnigraph::PathStorageCallback<Graph> callback(g_);
- ProcessPaths(g_,
- path_min_length, path_max_length,
- start_v, end_v,
- callback);
- vector<vector<EdgeId> > paths = callback.paths();
- DEBUG("taking subseq" << start_pos <<" "<< end_pos <<" " << s.size());
- int s_len = int(s.size());
- string seq_string = s.Subseq(start_pos, min(end_pos + 1, s_len)).str();
- size_t best_path_ind = paths.size();
- size_t best_score = 1000000000;
- DEBUG("need to find best scored path between "<<paths.size()<<" , seq_len " << seq_string.length());
- if (paths.size() == 0)
- return vector<EdgeId>(0);
- for (size_t i = 0; i < paths.size(); i++) {
- string cur_string = s_add + PathToString(paths[i]) + e_add;
- if (paths.size() > 1 && paths.size() < 10) {
- TRACE("candidate path number "<< i << " , len " << cur_string.length());
- TRACE("graph candidate: " << cur_string);
- TRACE("in pacbio read: " << seq_string);
- for (auto j_iter = paths[i].begin(); j_iter != paths[i].end();
- ++j_iter) {
- DEBUG(g_.int_id(*j_iter));
- }
- }
- size_t cur_score = StringDistance(cur_string, seq_string);
- if (paths.size() > 1 && paths.size() < 10) {
- DEBUG("score: "<< cur_score);
- }
- if (cur_score < best_score) {
- best_score = cur_score;
- best_path_ind = i;
- }
- }
- if (best_score == 1000000000)
- return vector<EdgeId>(0);
- if (paths.size() > 1 && paths.size() < 10) {
- DEBUG("best score found! Path " <<best_path_ind <<" score "<< best_score);
- }
- return paths[best_path_ind];
- }
-
- // Short read alignment
- omnigraph::MappingPath<EdgeId> GetShortReadAlignment(const Sequence &s) const {
- ClustersSet mapping_descr = GetOrderClusters(s);
- map<EdgeId, KmerCluster<Graph> > largest_clusters;
-
- //Selecting the biggest cluster for each edge
- for (auto iter = mapping_descr.begin(); iter != mapping_descr.end(); ++iter) {
-
- auto first_cluster = iter->sorted_positions[iter->first_trustable_index];
- auto last_cluster = iter->sorted_positions[iter->last_trustable_index];
- int read_range = last_cluster.read_position - first_cluster.read_position;
- int edge_range = last_cluster.edge_position - first_cluster.edge_position;
- int cluster_size = iter->last_trustable_index - iter->first_trustable_index;
- if (cluster_size > 2 * read_range || edge_range < 0 || 2 * edge_range < read_range || edge_range > 2 * read_range) {
- //skipping cluster
- continue;
- }
-
- auto edge_cluster = largest_clusters.find(iter->edgeId);
- if (edge_cluster != largest_clusters.end()) {
- if (edge_cluster->second.last_trustable_index - edge_cluster->second.first_trustable_index
- < iter->last_trustable_index - iter->first_trustable_index) {
-
- edge_cluster->second = *iter;
- }
- } else {
- largest_clusters.insert(make_pair(iter->edgeId, *iter));
- }
- }
-
- omnigraph::MappingPath<EdgeId> result;
- for (auto iter = largest_clusters.begin(); iter != largest_clusters.end(); ++iter) {
- auto first_cluster = iter->second.sorted_positions[iter->second.first_trustable_index];
- auto last_cluster = iter->second.sorted_positions[iter->second.last_trustable_index];
- omnigraph::MappingRange range(Range(first_cluster.read_position, last_cluster.read_position),
- Range(first_cluster.edge_position, last_cluster.edge_position));
- result.join({iter->second.edgeId, range});
- }
-
- return result;
- }
-
- std::pair<EdgeId, size_t> GetUniqueKmerPos(const runtime_k::RtSeq& kmer) const {
- KeyWithHash kwh = tmp_index.ConstructKWH(kmer);
-
- if (tmp_index.valid(kwh.key())) {
- auto keys = tmp_index.get(kwh);
- if (keys.size() == 1) {
- return make_pair(keys[0].edge_id, keys[0].offset);
- }
- }
- return std::make_pair(EdgeId(0), -1u);
- }
-
-
-};
-
-template<class Graph>
-typename PacBioMappingIndex<Graph>::MappingDescription PacBioMappingIndex<Graph>::Locate(const Sequence &s) const {
- MappingDescription res;
- //WARNING: removed read_count from here to make const methods
- int local_read_count = 0;
- ++local_read_count;
- if (s.size() < pacbio_k)
- return res;
-
- //runtime_k::RtSeq kmer = s.start<runtime_k::RtSeq>(pacbio_k);
- KeyWithHash kwh = tmp_index.ConstructKWH(s.start<runtime_k::RtSeq>(pacbio_k));
-
- for (size_t j = pacbio_k; j < s.size(); ++j) {
- kwh = kwh << s[j];
- if (!tmp_index.valid(kwh.key())) {
-// INFO("not valid kmer");
- continue;
- }
- auto keys = tmp_index.get(kwh);
- TRACE("Valid key, size: "<< keys.size());
-
- for (auto iter = keys.begin(); iter != keys.end(); ++iter) {
-
- int quality = (int) keys.size();
- TRACE("and quality:" << quality);
- if (banned_kmers.find(Sequence(kwh.key())) != banned_kmers.end())
- continue;
- int offset = (int)iter->offset;
- int s_stretched = int ((double)s.size() * 1.2 + 50);
- int edge_len = int(g_.length(iter->edge_id));
- //No alignment in vertex, and further than s+eps bp from edge ends;
- bool correct_alignment = offset > int(debruijn_k - pacbio_k) && offset < edge_len;
- if (ignore_map_to_middle) {
- correct_alignment &= (offset < int(debruijn_k - pacbio_k) + s_stretched || offset > edge_len - s_stretched);
- }
- if (correct_alignment) {
- res[iter->edge_id].push_back(MappingInstance((int) iter->offset, (int) (j - pacbio_k + 1), quality));
- }
- }
- }
-
- for (auto iter = res.begin(); iter != res.end(); ++iter) {
- sort(iter->second.begin(), iter->second.end());
- DEBUG("read count "<< local_read_count);
- DEBUG("edge: " << g_.int_id(iter->first) << "size: " << iter->second.size());
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); j_iter++) {
- DEBUG(j_iter->str());
- }
- }
-
- return res;
-}
-
-}
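FillBannedKmers above blacklists every k-mer that differs from a homopolymer in at most one position, since such seeds anchor unreliably in noisy PacBio reads. The banned set can be generated in isolation as follows (std::set<std::string> replaces the Sequence-based set; the function name is illustrative):

    #include <iostream>
    #include <set>
    #include <string>

    // All k-mers within Hamming distance 1 of a homopolymer.
    std::set<std::string> BannedKmers(size_t k) {
        static const char nucls[] = {'A', 'C', 'G', 'T'};
        std::set<std::string> banned;
        for (char base : nucls) {
            for (char other : nucls) {
                for (size_t pos = 0; pos < k; ++pos) {
                    std::string s(k, base);
                    s[pos] = other;  // other == base keeps the pure homopolymer
                    banned.insert(s);
                }
            }
        }
        return banned;
    }

    int main() {
        auto banned = BannedKmers(4);
        std::cout << banned.size() << "\n";  // 52 banned 4-mers (4 + 4*3*4)
        std::cout << banned.count("AAGA") << " " << banned.count("ACGT") << "\n";  // 1 0
        return 0;
    }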
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp
deleted file mode 100644
index 2d3a0f0..0000000
--- a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp
+++ /dev/null
@@ -1,396 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "pacbio_read_structures.hpp"
-
-#include "ConsensusCore/Poa/PoaConfig.hpp"
-#include "ConsensusCore/Poa/PoaConsensus.hpp"
-
-#include <algorithm>
-#include <fstream>
-
-namespace pacbio {
-template<class Graph>
-class PacbioGapCloser;
-
-template<class Graph>
-class GapStorage {
- friend class PacbioGapCloser<Graph> ;
- typedef typename Graph::EdgeId EdgeId;
-private:
- DECL_LOGGER("PacbioGaps")
- ;
- Graph &g_;
- map<EdgeId, vector<GapDescription<Graph> > > inner_index;
- void HiddenAddGap(const GapDescription<Graph> &p) {
- inner_index[p.start].push_back(p);
- }
- vector<EdgeId> index;
- set<pair<EdgeId, EdgeId> > nonempty_pairs;
- set<pair<EdgeId, EdgeId> > transitively_ignored_pairs;
- set<pair<EdgeId, EdgeId> > symmetrically_ignored_pairs;
-
-public:
- size_t min_gap_quantity;
- size_t long_seq_limit_;
- GapStorage(Graph &g, size_t min_gap_quantity, size_t long_seq_limit)
- : g_(g),
- inner_index(), min_gap_quantity(min_gap_quantity), long_seq_limit_(long_seq_limit){
- }
-
- size_t FillIndex() {
- index.resize(0);
- set<EdgeId> tmp;
- for (auto iter = inner_index.begin(); iter != inner_index.end(); iter++) {
- index.push_back(iter->first);
- }
- return index.size();
- }
-
- EdgeId operator[](size_t i) {
- return index.at(i);
- }
-
- size_t size() const {
- return index.size();
- }
-
- bool IsTransitivelyIgnored(pair<EdgeId, EdgeId> p) {
- return (transitively_ignored_pairs.find(p) != transitively_ignored_pairs.end());
- }
- bool IsSymmetricallyIgnored(pair<EdgeId, EdgeId> p) {
- return (symmetrically_ignored_pairs.find(p) != symmetrically_ignored_pairs.end());
- }
-
- bool IsIgnored(pair<EdgeId, EdgeId> p) {
- return (IsTransitivelyIgnored(p) || IsSymmetricallyIgnored(p));
- }
- void AddGap(const GapDescription<Graph> &p, bool add_rc = false) {
- HiddenAddGap(p);
- if (add_rc) {
- TRACE("Adding conjugate");
- HiddenAddGap(p.conjugate(g_, (int) g_.k() ));
- }
- }
-
- void AddStorage(const GapStorage<Graph> & to_add) {
- const auto& idx = to_add.inner_index;
- for (auto iter = idx.begin(); iter != idx.end(); ++iter)
- inner_index[iter->first].insert(inner_index[iter->first].end(), iter->second.begin(), iter->second.end());
- }
-
- void PostProcess() {
- FillIndex();
-
- for (auto j_iter = index.begin(); j_iter != index.end(); j_iter++) {
- EdgeId e = *j_iter;
- auto cl_start = inner_index[e].begin();
- auto iter = inner_index[e].begin();
- vector<GapDescription<Graph> > padded_gaps;
- while (iter != inner_index[e].end()) {
- auto next_iter = ++iter;
- if (next_iter == inner_index[e].end() || next_iter->end != cl_start->end) {
- size_t len = next_iter - cl_start;
- if (len >= min_gap_quantity) {
- nonempty_pairs.insert(make_pair(cl_start->start, cl_start->end));
- }
- cl_start = next_iter;
- }
- }
- }
-
- set<pair<EdgeId, EdgeId> > used_rc_pairs;
- for (auto iter = nonempty_pairs.begin(); iter != nonempty_pairs.end(); ++iter) {
- if (used_rc_pairs.find(*iter) != used_rc_pairs.end()) {
- DEBUG("skipping pair " << g_.int_id(iter->first) << "," << g_.int_id(iter->second));
- symmetrically_ignored_pairs.insert(make_pair(iter->first, iter->second));
- } else {
- DEBUG("Using pair" << g_.int_id(iter->first) << "," << g_.int_id(iter->second));
- }
-
- for (size_t i = 0; i < index.size(); i++) {
- if (nonempty_pairs.find(make_pair(iter->first, index[i])) != nonempty_pairs.end()
- && nonempty_pairs.find(make_pair(index[i], iter->second)) != nonempty_pairs.end()) {
- DEBUG("pair " << g_.int_id(iter->first) << "," << g_.int_id(iter->second) << " is ignored because of edge between " << g_.int_id(index[i]));
- transitively_ignored_pairs.insert(make_pair(iter->first, iter->second));
- }
- }
- used_rc_pairs.insert(make_pair(g_.conjugate(iter->second), g_.conjugate(iter->first)));
- }
- }
-
- void DumpToFile(const string filename) {
- ofstream filestr(filename);
- for (auto iter = inner_index.begin(); iter != inner_index.end(); ++iter) {
- DEBUG( g_.int_id(iter->first)<< " " <<iter->second.size());
- filestr << g_.int_id(iter->first) << " " << iter->second.size() << endl;
- sort(iter->second.begin(), iter->second.end());
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- filestr << j_iter->str(g_);
- }
- filestr << endl;
- }
- }
-
- void LoadFromFile(const string s) {
- FILE* file = fopen((s).c_str(), "r");
- int res;
- char ss[5000];
- map<int, EdgeId> tmp_map;
- for (auto iter = g_.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- tmp_map[g_.int_id(*iter)] = *iter;
- }
- while (!feof(file)) {
- int first_id, second_id, first_ind, second_ind;
- int size;
- res = fscanf(file, "%d %d\n", &first_id, &size);
- VERIFY(res == 2);
- for (int i = 0; i < size; i++) {
- res = fscanf(file, "%d %d\n", &first_id, &first_ind);
- VERIFY(res == 2);
- res = fscanf(file, "%d %d\n", &second_id, &second_ind);
- VERIFY(res == 2);
- res = fscanf(file, "%s\n", ss);
- VERIFY(res == 1);
- GapDescription<Graph> gap(tmp_map[first_id], tmp_map[second_id], Sequence(ss), first_ind, second_ind);
- this->AddGap(gap);
- }
- }
- }
-
- void PadGapStrings(EdgeId e) {
- sort(inner_index[e].begin(), inner_index[e].end());
- auto cl_start = inner_index[e].begin();
- auto iter = inner_index[e].begin();
- vector<GapDescription<Graph> > padded_gaps;
- while (iter != inner_index[e].end()) {
- auto next_iter = ++iter;
- if (next_iter == inner_index[e].end() || next_iter->end != cl_start->end) {
- int start_min = 1000000000;
- int end_max = 0;
- size_t long_seqs = 0;
- size_t short_seqs = 0;
- bool exclude_long_seqs = false;
- for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
- if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500) {
- DEBUG("ignoring alingment to the middle of edge");
- continue;
- }
- if (j_iter->gap_seq.size() > long_seq_limit_)
- long_seqs++;
- else
- short_seqs++;
-
- if (j_iter->edge_gap_start_position < start_min)
- start_min = j_iter->edge_gap_start_position;
- if (j_iter->edge_gap_end_position > end_max)
- end_max = j_iter->edge_gap_end_position;
- }
-
- if (short_seqs >= min_gap_quantity && short_seqs > long_seqs)
- exclude_long_seqs = true;
-
- for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
- if (g_.length(j_iter->start) - j_iter->edge_gap_start_position > 500 || j_iter->edge_gap_end_position > 500)
- continue;
-
- if (exclude_long_seqs && j_iter->gap_seq.size() > long_seq_limit_)
- continue;
-
- string s = g_.EdgeNucls(j_iter->start).Subseq(start_min, j_iter->edge_gap_start_position).str();
- s += j_iter->gap_seq.str();
- s += g_.EdgeNucls(j_iter->end).Subseq(j_iter->edge_gap_end_position, end_max).str();
- padded_gaps.push_back(GapDescription<Graph>(j_iter->start, j_iter->end, Sequence(s), start_min, end_max));
- }
- cl_start = next_iter;
- }
- }
- inner_index[e] = padded_gaps;
- }
-
- void PadGapStrings() {
- for (auto iter = inner_index.begin(); iter != inner_index.end(); ++iter) {
- DEBUG("Padding gaps for first edge " << g_.int_id(iter->first));
- PadGapStrings(iter->first);
- }
- PostProcess();
- }
-};
-
-template<class Graph>
-class PacbioGapCloser {
- typedef typename Graph::EdgeId EdgeId;
- typedef runtime_k::RtSeq Kmer;
- typedef vector<map<Kmer, int> > KmerStorage;
-private:
- DECL_LOGGER("PacbioGaps")
- ;
- Graph &g_;
- //first edge, second edge, weight, seq
- map<EdgeId, map<EdgeId, pair<size_t, string> > > new_edges_;
- int closed_gaps;
- int not_unique_gaps;
- int chained_gaps;
- bool consensus_gap_closing;
- size_t max_contigs_gap_length_;
-public:
- void CloseGapsInGraph(map<EdgeId, EdgeId> &replacement) {
- for (auto iter = new_edges_.begin(); iter != new_edges_.end(); ++iter) {
- if (iter->second.size() != 1) {
- DEBUG("non-unique gap!!");
- not_unique_gaps ++;
- continue;
- }
- EdgeId first = iter->first;
- EdgeId second = (iter->second.begin()->first);
- if (replacement.find(first) != replacement.end() || replacement.find(second) != replacement.end()) {
- DEBUG("sorry, gap chains are not supported yet");
- chained_gaps++;
- continue;
- }
-
- EdgeId first_conj = g_.conjugate(first);
- EdgeId second_conj = g_.conjugate(second);
- size_t first_id = g_.int_id(first);
- size_t second_id = g_.int_id(second);
- size_t first_id_conj = g_.int_id(g_.conjugate(first));
- size_t second_id_conj = g_.int_id(g_.conjugate(second));
- DEBUG("closing gaps between "<< first_id << " " << second_id);
- size_t len_f = g_.length(first);
- size_t len_s = g_.length(second);
- size_t len_sum = iter->second.begin()->second.second.length();
- double cov = (double)g_.length(first) * g_.coverage(first) + (double)g_.length(second) * g_.coverage(second);
-
- DEBUG("coverage was " << g_.coverage(first) << " " << g_.coverage(second));
-
- EdgeId newEdge = g_.AddEdge(g_.EdgeStart(first), g_.EdgeEnd(second), Sequence(iter->second.begin()->second.second));
- if (cov > UINT_MAX * 0.75 ) cov = UINT_MAX*0.75;
- cov /= (double) g_.length(newEdge);
- TRACE(g_.int_id(newEdge));
- int len_split = int(((double) len_f * (double) len_sum) / ((double)len_s + (double)len_f));
- if (len_split == 0) {
- DEBUG(" zero split length, length are:" << len_f <<" " << len_sum <<" " << len_s);
- len_split = 1;
- }
- g_.DeleteEdge(first);
- g_.DeleteEdge(second);
- g_.coverage_index().SetAvgCoverage(newEdge, cov);
- g_.coverage_index().SetAvgCoverage(g_.conjugate(newEdge), cov);
- size_t next_id = g_.int_id(newEdge);
- DEBUG("and new coverage is " << g_.coverage(newEdge));
- closed_gaps ++;
- size_t next_id_conj = g_.int_id(g_.conjugate(newEdge));
- TRACE(first_id << " " << second_id << " " << next_id << " " << first_id_conj << " " << second_id_conj << " " << next_id_conj << " ");
- replacement[first] = newEdge;
- replacement[second] = newEdge;
- replacement[first_conj] = g_.conjugate(newEdge);
- replacement[second_conj] = g_.conjugate(newEdge);
- }
- INFO("Closed " << closed_gaps << " gaps");
- INFO("Total " << not_unique_gaps << " were not closed due to more than one possible pairing");
- INFO("Total " << chained_gaps << " were skipped because of gap chains");
- //TODO: chains of gaps!
- }
-private:
-
- void ConstructConsensus(EdgeId e, GapStorage<Graph> &storage, map<EdgeId, map<EdgeId, pair<size_t, string> > > & new_edges) {
- auto cl_start = storage.inner_index[e].begin();
- auto iter = storage.inner_index[e].begin();
- size_t cur_len = 0;
- while (iter != storage.inner_index[e].end()) {
- auto next_iter = ++iter;
- cur_len++;
- if (next_iter == storage.inner_index[e].end() || next_iter->end != cl_start->end) {
- if (cur_len >= storage.min_gap_quantity && !storage.IsIgnored(make_pair(cl_start->start, cl_start->end))) {
- vector<string> gap_variants;
-
- for (auto j_iter = cl_start; j_iter != next_iter; j_iter++) {
- string s = j_iter->gap_seq.str();
- transform(s.begin(), s.end(), s.begin(), ::toupper);
- gap_variants.push_back(s);
- }
- if (consensus_gap_closing || (gap_variants.size() > 0 && gap_variants[0].length() < max_contigs_gap_length_)) {
- map <EdgeId, pair<size_t, string>> tmp;
- string tmp_string;
- string s = g_.EdgeNucls(cl_start->start).Subseq(0, cl_start->edge_gap_start_position).str();
- if (consensus_gap_closing) {
- const ConsensusCore::PoaConsensus *pc = ConsensusCore::PoaConsensus::FindConsensus(
- gap_variants,
- ConsensusCore::PoaConfig::GLOBAL_ALIGNMENT);
- tmp_string = pc->Sequence();
- } else {
- tmp_string = gap_variants[0];
- if (gap_variants.size() > 1) {
-
- stringstream ss;
- for (size_t i = 0; i < gap_variants.size(); i++)
- ss << gap_variants[i].length() << " ";
- INFO(gap_variants.size() << " gap closing variant for contigs, lengths: " << ss.str());
- }
- }
-
- DEBUG("consenus for " << g_.int_id(cl_start->start) << " and " << g_.int_id(cl_start->end) <<
- "found: ");
- DEBUG(tmp_string);
- s += tmp_string;
- s += g_.EdgeNucls(cl_start->end).Subseq(cl_start->edge_gap_end_position,
- g_.length(cl_start->end) + g_.k()).str();
- tmp.insert(make_pair(cl_start->end, make_pair(cur_len, s)));
- new_edges[cl_start->start] = tmp;
- } else {
- INFO ("Skipping gap of size " << gap_variants[0].length() << " multiplicity " << gap_variants.size());
- }
- }
- cl_start = next_iter;
- cur_len = 0;
- }
- }
- }
-
-public:
- PacbioGapCloser(Graph &g, bool consensus_gap, size_t max_contigs_gap_length )
- : g_(g), consensus_gap_closing(consensus_gap), max_contigs_gap_length_(max_contigs_gap_length) {
- closed_gaps = 0;
- not_unique_gaps = 0;
- chained_gaps = 0;
- }
-
- void ConstructConsensus(size_t nthreads, GapStorage<Graph> &storage) {
- vector<map<EdgeId, map<EdgeId, pair<size_t, string> > > > new_edges_by_thread;
- new_edges_by_thread.resize(nthreads);
- size_t storage_size = storage.size();
-# pragma omp parallel for shared(storage, new_edges_by_thread) num_threads(nthreads)
- for (size_t i = 0; i < storage_size; i++) {
- EdgeId e = storage[i];
- size_t thread_num = omp_get_thread_num();
- DEBUG("constructing consenus for first edge " << g_.int_id(e) << " in thread " <<thread_num);
- ConstructConsensus(e, storage, new_edges_by_thread[thread_num]);
- }
- for (size_t i = 0; i < nthreads; i++) {
- for (auto iter = new_edges_by_thread[i].begin(); iter != new_edges_by_thread[i].end(); ++iter) {
- new_edges_.insert(*iter);
- }
- }
- }
- void DumpToFile(const string filename) {
- ofstream filestr(filename);
- for (auto iter = new_edges_.begin(); iter != new_edges_.end(); ++iter) {
- if (iter->second.size() > 1) {
- DEBUG("nontrivial gap closing for edge" <<g_.int_id(iter->first));
- }
- for (auto j_iter = iter->second.begin(); j_iter != iter->second.end(); ++j_iter) {
- filestr << ">" << g_.int_id(iter->first) << "_" << iter->second.size() << "_" << g_.int_id(j_iter->first) << "_" << j_iter->second.first << endl;
- filestr << j_iter->second.second << endl;
- }
- }
- }
-
-};
-
-}
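The parallel ConstructConsensus above follows a common OpenMP pattern: each thread fills its own result map, and the per-thread maps are merged serially afterwards, so no locking is needed inside the loop. A minimal self-contained sketch of that pattern (toy data, compile with -fopenmp; not the SPAdes types):

#include <iostream>
#include <map>
#include <string>
#include <vector>
#include <omp.h>

int main() {
    const size_t nthreads = 4;
    std::vector<std::string> items = {"a", "b", "c", "d", "e", "f"};

    // One private result map per thread: no locking needed inside the loop.
    std::vector<std::map<std::string, size_t>> per_thread(nthreads);

#pragma omp parallel for num_threads(nthreads)
    for (size_t i = 0; i < items.size(); ++i) {
        size_t t = omp_get_thread_num();
        per_thread[t][items[i]] = i;   // thread-local write, race-free
    }

    // Serial merge of the per-thread maps into the final result.
    std::map<std::string, size_t> merged;
    for (const auto& m : per_thread)
        merged.insert(m.begin(), m.end());

    std::cout << "merged " << merged.size() << " entries\n";
}

Merging serially after the parallel region trades a little memory for lock-free inserts, which mirrors how the deleted code collects new_edges_by_thread before filling new_edges_.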
diff --git a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp b/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp
deleted file mode 100644
index c2ce186..0000000
--- a/src/modules/assembly_graph/graph_alignment/pacbio/pacbio_read_structures.hpp
+++ /dev/null
@@ -1,320 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "data_structures/indices/perfect_hash_map.hpp"
-#include <algorithm>
-#include <map>
-#include <set>
-
-namespace pacbio {
-template<class T>
-struct pair_iterator_less {
- bool operator ()(pair<size_t, T> const& a, pair<size_t, T> const& b) const {
- return (a.first < b.first);
- }
-};
-
-struct MappingInstance {
- int edge_position;
- int read_position;
- //Quality currently equals multiplicity, so the best quality is 1.
- int quality;
- MappingInstance(int edge_position, int read_position, int quality) :
- edge_position(edge_position), read_position(read_position), quality(quality) {
- }
-
- inline bool IsUnique() const {
- return (quality == 1);
- }
-
- string str() {
- stringstream s;
- s << "E: " << edge_position << " R: " << read_position << " Q: " << quality;
- return s.str();
- }
-
-//Less by EDGE position
- bool operator <(MappingInstance const& b) const {
- if (edge_position < b.edge_position || (edge_position == b.edge_position && read_position < b.read_position))
- return true;
- else
- return false;
- }
-private:
- DECL_LOGGER("MappingInstance")
- ;
-};
-
-//Less by READ position
-struct ReadPositionComparator {
- bool operator ()(MappingInstance const& a, MappingInstance const& b) const {
- return (a.read_position < b.read_position || (a.read_position == b.read_position && a.edge_position < b.edge_position));
- }
-};
-
-template<class Graph>
-struct KmerCluster {
- typedef typename Graph::EdgeId EdgeId;
- int last_trustable_index;
- int first_trustable_index;
- size_t average_read_position;
- size_t average_edge_position;
- EdgeId edgeId;
- vector<MappingInstance> sorted_positions;
- int size;
-
- KmerCluster(EdgeId e, const vector<MappingInstance>& v) {
- last_trustable_index = 0;
- first_trustable_index = 0;
- average_read_position = 0;
- edgeId = e;
- size = (int) v.size();
- sorted_positions = v;
- FillTrustableIndeces();
- }
-
- bool operator <(const KmerCluster & b) const {
- return (average_read_position < b.average_read_position ||(average_read_position == b.average_read_position && edgeId < b.edgeId) ||
- (average_read_position == b.average_read_position && edgeId == b.edgeId && sorted_positions < b.sorted_positions));
- }
-
- bool CanFollow(const KmerCluster &b) const {
- return (b.sorted_positions[b.last_trustable_index].read_position < sorted_positions[first_trustable_index].read_position);
- }
-
- void FillTrustableIndeces() {
- //ignore non-unique kmers for distance determination
- int first_unique_ind = 0;
- while (first_unique_ind != size - 1 && !(sorted_positions[first_unique_ind].IsUnique())) {
- first_unique_ind += 1;
- }
- int last_unique_ind = size - 1;
- while (last_unique_ind != 0 && !(sorted_positions[last_unique_ind].IsUnique())) {
- last_unique_ind -= 1;
- }
- last_trustable_index = last_unique_ind;
- first_trustable_index = first_unique_ind;
- double tmp_read_position = 0, tmp_edge_position = 0;
- vector<int> diffs;
- for (auto mp : sorted_positions) {
- tmp_read_position += mp.read_position;
- tmp_edge_position += mp.edge_position;
- diffs.push_back(mp.read_position - mp.edge_position);
- }
- sort(diffs.begin(), diffs.end());
- int median_diff = diffs[size/2];
-
- tmp_read_position /= size;
- tmp_edge_position /= size;
- average_read_position = (size_t)trunc(tmp_read_position);
- average_edge_position = (size_t)trunc(tmp_edge_position);
-
- if (size > 10) {
- int max_debug_size = 10;
- vector<int> distances(max_debug_size);
- for (int df: diffs) {
- int ind = abs(df - median_diff)/ 50;
- if (ind > max_debug_size - 1) ind = max_debug_size - 1;
- distances [ind] ++;
- }
- if (size > 100 || distances[0] * 5 < size * 4) {
- stringstream s;
-
- for (int d: distances) {
- s << d << " ";
- }
-// INFO(s.str());
-
- }
- }
- }
-
- string str(const Graph &g) const{
- stringstream s;
- s << "Edge: " << g.int_id(edgeId) << " on edge: " << sorted_positions[first_trustable_index].edge_position<< " - " << sorted_positions[last_trustable_index].edge_position<< ";on read: " << sorted_positions[first_trustable_index].read_position<< " - " << sorted_positions[last_trustable_index].read_position<< ";size "<< size;
- return s.str();
- }
-private:
- DECL_LOGGER("KmerCluster")
- ;
-};
-
-template<class Graph>
-struct GapDescription {
- typedef typename Graph::EdgeId EdgeId;
- typename Graph::EdgeId start, end;
- Sequence gap_seq;
- int edge_gap_start_position, edge_gap_end_position;
-
-
- GapDescription(EdgeId start_e, EdgeId end_e, const Sequence &gap, int gap_start, int gap_end) :
- start(start_e), end(end_e), gap_seq(gap.str()), edge_gap_start_position(gap_start), edge_gap_end_position(gap_end) {
- }
-
- GapDescription(const KmerCluster<Graph> &a, const KmerCluster<Graph> & b, Sequence read, int pacbio_k) {
- edge_gap_start_position = a.sorted_positions[a.last_trustable_index].edge_position;
- edge_gap_end_position = b.sorted_positions[b.first_trustable_index].edge_position + pacbio_k - 1;
- start = a.edgeId;
- end = b.edgeId;
- DEBUG(read.str());
- gap_seq = read.Subseq(a.sorted_positions[a.last_trustable_index].read_position, b.sorted_positions[b.first_trustable_index].read_position + pacbio_k - 1);
- DEBUG(gap_seq.str());
- DEBUG("gap added");
- }
-
- GapDescription<Graph> conjugate(Graph &g_, int shift) const {
- GapDescription<Graph> res(
- g_.conjugate(end), g_.conjugate(start), (!gap_seq),
- (int) g_.length(end) + shift - edge_gap_end_position,
- (int) g_.length(start) + shift - edge_gap_start_position);
- DEBUG("conjugate created" << res.str(g_));
- return res;
- }
-
- string str(Graph &g_) const {
- stringstream s;
- s << g_.int_id(start) << " " << edge_gap_start_position <<endl << g_.int_id(end) << " " << edge_gap_end_position << endl << gap_seq.str()<< endl;
- return s.str();
- }
-
- bool operator <(const GapDescription & b) const {
- return (start < b.start || (start == b.start && end < b.end) ||
- (start == b.start && end == b.end && edge_gap_start_position < b.edge_gap_start_position));
- }
-
-private:
- DECL_LOGGER("PacIndex")
- ;
-};
-
-template<class Graph>
-struct OneReadMapping {
- typedef typename Graph::EdgeId EdgeId;
- vector<vector<EdgeId> > main_storage;
- vector<GapDescription<Graph> > gaps;
- vector<size_t> real_length;
-//Total number of seeds used, summed over all subreads.
- size_t seed_num;
- OneReadMapping(vector<vector<EdgeId> > &paths_description, vector<GapDescription<Graph> > &gaps_description, vector<size_t> real_length, size_t seed_num) :
- main_storage(paths_description), gaps(gaps_description), real_length(real_length), seed_num(seed_num) {
- }
-
-};
-
-
-struct StatsCounter{
-
- map<size_t,size_t> path_len_in_edges;
- vector<size_t> subreads_length;
- size_t total_len ;
- size_t reads_with_conjugate;
- size_t subreads_count;
- map<size_t, size_t> seeds_percentage;
- StatsCounter() {
- total_len = 0;
- reads_with_conjugate = 0;
- }
-
- void AddStorage(StatsCounter &other) {
- total_len += other.total_len;
- reads_with_conjugate += other.reads_with_conjugate;
- for (auto iter = other.subreads_length.begin(); iter != other.subreads_length.end(); ++iter) {
- subreads_length.push_back(*iter);
- }
-
- for (auto iter = other.path_len_in_edges.begin(); iter != other.path_len_in_edges.end(); ++iter){
- auto j_iter = iter;
- if (( j_iter = path_len_in_edges.find(iter->first)) == path_len_in_edges.end()){ //compare with this map's own end()
- path_len_in_edges.insert(make_pair(iter->first, iter->second));
- } else {
- path_len_in_edges[j_iter->first] += iter->second;
- }
- }
- for (auto iter = other.seeds_percentage.begin(); iter != other.seeds_percentage.end(); ++iter){
- auto j_iter = iter;
- if (( j_iter = seeds_percentage.find(iter->first)) == seeds_percentage.end()){ //compare with this map's own end()
- seeds_percentage.insert(make_pair(iter->first, iter->second));
- } else {
- seeds_percentage[j_iter->first] += iter->second;
- }
- }
- }
-
- void report(){
- size_t total = 0;
- for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
- total += iter->second;
- }
- size_t cur = 0;
- size_t percentage = 0;
- for (auto iter = seeds_percentage.begin(); iter != seeds_percentage.end(); ++iter){
- cur += iter->second;
- percentage = iter->first;
- if (cur * 2 > total) break;
- }
- INFO("Median fraction of present seeds in maximal alignmnent among reads aligned to the graph: " << double(percentage) * 0.001);
- }
-private:
- DECL_LOGGER("StatsCounter");
-
-};
-
-inline int StringDistance(string &a, string &b) {
- int a_len = (int) a.length();
- int b_len = (int) b.length();
- int d = min(a_len / 3, b_len / 3);
- d = max(d, 10);
- DEBUG(a_len << " " << b_len << " " << d);
- vector<vector<int> > table(a_len);
- //int d =
- for (int i = 0; i < a_len; i++) {
- table[i].resize(b_len);
- int low = max(max(0, i - d - 1), i + b_len - a_len - d - 1);
- int high = min(min(b_len, i + d + 1), i + a_len - b_len + d + 1);
- TRACE(low << " " <<high);
- for (int j = low; j < high; j++)
- table[i][j] = 1000000;
- }
- table[a_len - 1][b_len - 1] = 1000000;
- table[0][0] = 0;
-//free deletions on begin
-// for(int j = 0; j < b_len; j++)
-// table[0][j] = 0;
-
- for (int i = 0; i < a_len; i++) {
- int low = max(max(0, i - d), i + b_len - a_len - d);
- int high = min(min(b_len, i + d), i + a_len - b_len + d);
-
- TRACE(low << " " <<high);
- for (int j = low; j < high; j++) {
-
- if (i > 0)
- table[i][j] = min(table[i][j], table[i - 1][j] + 1);
- if (j > 0)
- table[i][j] = min(table[i][j], table[i][j - 1] + 1);
- if (i > 0 && j > 0) {
- int add = 1;
- if (a[i] == b[j])
- add = 0;
- table[i][j] = min(table[i][j], table[i - 1][j - 1] + add);
- }
- }
- }
- //return table[a_len - 1][b_len - 1];
-//free deletions on end
- int res = table[a_len - 1][b_len - 1];
- DEBUG(res);
-// for(int j = 0; j < b_len; j++){
-// res = min(table[a_len - 1][j], res);
-// }
- return res;
-}
-
-
-}
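StringDistance in the deleted header is a banded edit distance: only DP cells within roughly max(10, len/3) of the main diagonal are filled, which keeps the cost near-linear for similar strings. A simplified stand-alone sketch of the same banded-DP idea (plain Levenshtein costs, hypothetical function name):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Banded Levenshtein distance: cells more than `band` away from the main
// diagonal are treated as unreachable (cost INF).
int BandedEditDistance(const std::string& a, const std::string& b, int band) {
    const int INF = 1000000;
    int n = (int)a.size(), m = (int)b.size();
    std::vector<std::vector<int>> dp(n + 1, std::vector<int>(m + 1, INF));
    for (int j = 0; j <= std::min(m, band); ++j) dp[0][j] = j;
    for (int i = 0; i <= std::min(n, band); ++i) dp[i][0] = i;
    for (int i = 1; i <= n; ++i) {
        int lo = std::max(1, i - band), hi = std::min(m, i + band);
        for (int j = lo; j <= hi; ++j) {
            int sub = dp[i - 1][j - 1] + (a[i - 1] == b[j - 1] ? 0 : 1);
            int del = dp[i - 1][j] + 1;   // delete from a
            int ins = dp[i][j - 1] + 1;   // insert into a
            dp[i][j] = std::min({sub, del, ins});
        }
    }
    return dp[n][m];
}

int main() {
    std::cout << BandedEditDistance("ACGTACGT", "ACGAACGT", 10) << "\n";  // prints 1
}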
diff --git a/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp b/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp
deleted file mode 100644
index 1334ced..0000000
--- a/src/modules/assembly_graph/graph_alignment/sequence_mapper.hpp
+++ /dev/null
@@ -1,387 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "data_structures/sequence/sequence_tools.hpp"
-#include "assembly_graph/paths/path_processor.hpp"
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
-
-#include "data_structures/sequence/runtime_k.hpp"
-#include "edge_index.hpp"
-#include "kmer_mapper.hpp"
-
-#include <cstdlib>
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
-
-namespace debruijn_graph {
-using omnigraph::MappingPath;
-using omnigraph::Path;
-using omnigraph::MappingRange;
-using omnigraph::Range;
-
-template<class Graph>
-MappingPath<typename Graph::EdgeId> ConjugateMapping(const Graph& g,
- const MappingPath<typename Graph::EdgeId>& mp,
- size_t sequence_length) {
- MappingPath<typename Graph::EdgeId> answer;
- for (size_t i = mp.size(); i > 0; --i) {
- auto p = mp[i-1];
- auto e = p.first;
- MappingRange mr = p.second;
- answer.push_back(g.conjugate(e),
- MappingRange(mr.initial_range.Invert(sequence_length - g.k()),
- mr.mapped_range.Invert(g.length(e))));
- }
- return answer;
-}
-
-template<class Graph>
-class SequenceMapper {
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef runtime_k::RtSeq Kmer;
-
-protected:
- const Graph& g_;
-
-public:
- SequenceMapper(const Graph& g): g_(g) {
-
- }
-
- virtual ~SequenceMapper() {
-
- }
-
- virtual MappingPath<EdgeId> MapSequence(const Sequence &sequence) const = 0;
-
-
- MappingPath<EdgeId> MapRead(const io::SingleRead &read) const {
-// VERIFY(read.IsValid());
- DEBUG(read.name() << " is mapping");
- string s = read.GetSequenceString();
- size_t l = 0, r = 0;
- MappingPath<EdgeId> result;
- for(size_t i = 0; i < s.size(); i++) {
- if (read.GetSequenceString()[i] == 'N') {
- if (r > l) {
- result.join(MapSequence(Sequence(s.substr(l, r - l))), int(l));
- }
- r = i + 1;
- l = i + 1;
- } else {
- r++;
- }
- }
- if (r > l) {
- result.join(MapSequence(Sequence(s.substr(l, r - l))), int(l));
- }
- DEBUG(read.name() << " is mapped");
- DEBUG("Number of edges is " << result.size());
-
- return result;
- }
-
- virtual size_t KmerSize() const = 0;
-};
-
-template<class Graph>
-class MappingPathFixer {
-public:
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- MappingPathFixer(const Graph& graph)
- : g_(graph) {
- }
-
- bool CheckContiguous(const vector<typename Graph::EdgeId>& path) const {
- for (size_t i = 1; i < path.size(); ++i) {
- if (g_.EdgeEnd(path[i - 1]) != g_.EdgeStart(path[i]))
- return false;
- }
- return true;
- }
-
- Path<EdgeId> TryFixPath(const Path<EdgeId>& path, size_t length_bound = 70) const {
- return Path<EdgeId>(TryFixPath(path.sequence(), length_bound), path.start_pos(), path.end_pos());
- }
-
- vector<EdgeId> TryFixPath(const vector<EdgeId>& edges, size_t length_bound = 70) const {
- vector<EdgeId> answer;
- if (edges.empty()) {
- // WARN("Mapping path was empty");
- return vector<EdgeId>();
- }
- answer.push_back(edges[0]);
- for (size_t i = 1; i < edges.size(); ++i) {
- if (g_.EdgeEnd(edges[i - 1]) != g_.EdgeStart(edges[i])) {
- vector<EdgeId> closure = TryCloseGap(g_.EdgeEnd(edges[i - 1]),
- g_.EdgeStart(edges[i]),
- length_bound);
- answer.insert(answer.end(), closure.begin(), closure.end());
- }
- answer.push_back(edges[i]);
- }
- return answer;
- }
-
- vector<EdgeId> DeleteSameEdges(const vector<EdgeId>& path) const {
- vector<EdgeId> result;
- if (path.empty()) {
- return result;
- }
- result.push_back(path[0]);
- for (size_t i = 1; i < path.size(); ++i) {
- if (path[i] != result[result.size() - 1]) {
- result.push_back(path[i]);
- }
- }
- return result;
- }
-
-private:
- vector<EdgeId> TryCloseGap(VertexId v1, VertexId v2, size_t length_bound) const {
- if (v1 == v2)
- return vector<EdgeId>();
- TRACE("Trying to close gap between v1=" << g_.int_id(v1) << " and v2=" << g_.int_id(v2));
- omnigraph::PathStorageCallback<Graph> path_store(g_);
-
- TRACE("Path storage callback created");
- //todo reduce value after investigation
- omnigraph::ProcessPaths(g_, 0, length_bound, v1, v2, path_store);
-
- TRACE("Paths processed");
- if (path_store.size() == 0) {
- TRACE("Failed to find closing path");
- // TRACE("Failed to close gap between v1=" << graph_.int_id(v1)
- // << " (conjugate "
- // << graph_.int_id(g_.conjugate(v1))
- // << ") and v2=" << g_.int_id(v2)
- // << " (conjugate "
- // << g_.int_id(g_.conjugate(v2)) << ")");
- // return boost::none;
- return vector<EdgeId>();
- } else if (path_store.size() == 1) {
- TRACE("Unique closing path found");
- } else {
- TRACE("Several closing paths found, first chosen");
- }
- TRACE("Taking answer ");
- vector<EdgeId> answer = path_store.paths().front();
- TRACE("Gap closed");
- TRACE( "Cumulative closure length is " << CumulativeLength(g_, answer));
- return answer;
- }
- const Graph& g_;
-};
-
-template<class Graph>
-class ReadPathFinder {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& g_;
- typedef MappingPathFixer<Graph> GraphMappingPathFixer;
- const GraphMappingPathFixer path_fixer_;
-public:
- ReadPathFinder (const Graph& g) :
- g_(g), path_fixer_(g)
- { }
-
- vector<EdgeId> FindReadPath(const MappingPath<EdgeId>& mapping_path) const {
- if (!IsMappingPathValid(mapping_path)) {
- TRACE("read unmapped");
- return vector<EdgeId>();
- }
- vector<EdgeId> corrected_path = path_fixer_.DeleteSameEdges(
- mapping_path.simple_path());
- PrintPathInfo(corrected_path);
- if(corrected_path.size() != mapping_path.simple_path().size()) {
- DEBUG("Some edges were deleted");
- }
- vector<EdgeId> fixed_path = path_fixer_.TryFixPath(corrected_path);
- if (!path_fixer_.CheckContiguous(fixed_path)) {
- TRACE("read unmapped");
- std::stringstream debug_stream;
- for (size_t i = 0; i < fixed_path.size(); ++i) {
- debug_stream << g_.int_id(fixed_path[i]) << " ";
- }
- TRACE(debug_stream.str());
- return vector<EdgeId>();
- } else {
- DEBUG("Path fix works");
- }
- return fixed_path;
- }
-
-
-private:
-
- bool IsTip(VertexId v) const {
- return g_.IncomingEdgeCount(v) + g_.OutgoingEdgeCount(v) == 1;
- }
-
- bool IsMappingPathValid(const MappingPath<EdgeId>& path) const {
- return path.size() != 0;
- }
-
- void PrintPathInfo(vector<EdgeId>& corrected_path) const {
- for(size_t i = 0; i < corrected_path.size(); ++i) {
- DEBUG(i + 1 << "-th edge is " << corrected_path[i].int_id());
- }
- }
-};
-
-template<class Graph, class Index>
-class NewExtendedSequenceMapper: public SequenceMapper<Graph> {
- using SequenceMapper<Graph>::g_;
-
- public:
- typedef std::vector<MappingRange> RangeMappings;
-
- private:
- const Index& index_;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef typename Index::KMer Kmer;
- typedef KmerMapper<Graph> KmerSubs;
- const KmerSubs& kmer_mapper_;
- size_t k_;
- bool optimization_on_;
-
- bool FindKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
- RangeMappings& range_mappings) const {
- std::pair<EdgeId, size_t> position = index_.get(kmer);
- if (position.second == -1u)
- return false;
-
- if (passed.empty() || passed.back() != position.first ||
- kmer_pos != range_mappings.back().initial_range.end_pos ||
- position.second + 1 < range_mappings.back().mapped_range.end_pos) {
- passed.push_back(position.first);
-
- range_mappings.push_back(MappingRange(Range(kmer_pos, kmer_pos + 1),
- Range(position.second, position.second + 1)));
- } else {
- range_mappings.back().initial_range.end_pos = kmer_pos + 1;
- range_mappings.back().mapped_range.end_pos = position.second + 1;
- }
-
- return true;
- }
-
- bool TryThread(const Kmer& kmer, size_t kmer_pos, std::vector<EdgeId> &passed,
- RangeMappings& range_mappings) const {
- EdgeId last_edge = passed.back();
- size_t end_pos = range_mappings.back().mapped_range.end_pos;
- if (end_pos < g_.length(last_edge)) {
- if (g_.EdgeNucls(last_edge)[end_pos + k_ - 1] == kmer[k_ - 1]) {
- range_mappings.back().initial_range.end_pos++;
- range_mappings.back().mapped_range.end_pos++;
- return true;
- }
- } else {
- VertexId v = g_.EdgeEnd(last_edge);
-
- if(!optimization_on_)
- if(g_.OutgoingEdgeCount(v) > 1)
- return false;
-
- for (auto I = g_.out_begin(v), E = g_.out_end(v); I != E; ++I) {
- EdgeId edge = *I;
- if (g_.EdgeNucls(edge)[k_ - 1] == kmer[k_ - 1]) {
- passed.push_back(edge);
- range_mappings.push_back(
- MappingRange(Range(kmer_pos, kmer_pos + 1),
- Range(0, 1)));
- return true;
- }
- }
- }
- return false;
- }
-
- bool ProcessKmer(const Kmer &kmer, size_t kmer_pos, std::vector<EdgeId> &passed_edges,
- RangeMappings& range_mapping, bool try_thread) const {
- if (try_thread) {
- if (!TryThread(kmer, kmer_pos, passed_edges, range_mapping)) {
- FindKmer(kmer_mapper_.Substitute(kmer), kmer_pos, passed_edges, range_mapping);
- return false;
- }
-
- return true;
- }
-
- if (kmer_mapper_.CanSubstitute(kmer)) {
- FindKmer(kmer_mapper_.Substitute(kmer), kmer_pos, passed_edges, range_mapping);
- return false;
- }
-
- return FindKmer(kmer, kmer_pos, passed_edges, range_mapping);
- }
-
- public:
- NewExtendedSequenceMapper(const Graph& g,
- const Index& index,
- const KmerSubs& kmer_mapper,
- bool optimization_on = true) :
- SequenceMapper<Graph>(g), index_(index), kmer_mapper_(kmer_mapper), k_(g.k()+1),
- optimization_on_(optimization_on) { }
-
- ~NewExtendedSequenceMapper() {
- // TRACE("In destructor of sequence mapper");
- // TRACE(mapped_ << " sequences were mapped");
- // TRACE(unmapped_ << " sequences couldn't be mapped");
- }
-
- MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
- std::vector<EdgeId> passed_edges;
- RangeMappings range_mapping;
-
- if (sequence.size() < k_) {
- return MappingPath<EdgeId>();
- }
-
- Kmer kmer = sequence.start<Kmer>(k_);
- //kmer >>= 0;
- bool try_thread = false;
- try_thread = ProcessKmer(kmer, 0, passed_edges,
- range_mapping, try_thread);
- for (size_t i = k_; i < sequence.size(); ++i) {
- kmer <<= sequence[i];
- try_thread = ProcessKmer(kmer, i - k_ + 1, passed_edges,
- range_mapping, try_thread);
- }
-
- // if (passed_edges.empty()) {
- //// TRACE("Sequence " << sequence << "couldn't be mapped");
- // unmapped_++;
- // //todo maybe check path consistency?
- // } else {
- // mapped_++;
- // }
-
- return MappingPath<EdgeId>(passed_edges, range_mapping);
- }
-
- size_t KmerSize() const {
- return k_;
- }
-
- DECL_LOGGER("NewExtendedSequenceMapper");
-};
-
-
-template<class gp_t>
-std::shared_ptr<NewExtendedSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t> > MapperInstance(const gp_t& gp) {
- return std::make_shared<NewExtendedSequenceMapper<typename gp_t::graph_t, typename gp_t::index_t> >(gp.g, gp.index, gp.kmer_mapper);
-}
-
-}
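MapRead above handles ambiguous bases by mapping only the N-free stretches of a read and joining the resulting sub-paths at their original offsets. A small sketch of just the splitting step (hypothetical helper, not the SPAdes API):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Return the (offset, fragment) pairs of maximal N-free stretches of a read.
std::vector<std::pair<size_t, std::string>> SplitAtNs(const std::string& read) {
    std::vector<std::pair<size_t, std::string>> fragments;
    size_t l = 0;
    for (size_t i = 0; i <= read.size(); ++i) {
        if (i == read.size() || read[i] == 'N') {
            if (i > l)                                  // non-empty stretch
                fragments.emplace_back(l, read.substr(l, i - l));
            l = i + 1;                                  // restart after the 'N'
        }
    }
    return fragments;
}

int main() {
    for (const auto& f : SplitAtNs("ACGTNNGGA"))
        std::cout << "offset " << f.first << ": " << f.second << "\n";
    // offset 0: ACGT
    // offset 6: GGA
}

Each fragment would then be mapped separately and joined back at its offset, which is what result.join(MapSequence(...), int(l)) does in the deleted code.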
diff --git a/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp b/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp
deleted file mode 100644
index d5af6f9..0000000
--- a/src/modules/assembly_graph/graph_alignment/sequence_mapper_notifier.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SEQUENCE_MAPPER_NOTIFIER_HPP_
-#define SEQUENCE_MAPPER_NOTIFIER_HPP_
-
-#include "dev_support/memory_limit.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-#include "short_read_mapper.hpp"
-#include "io/reads/paired_read.hpp"
-#include "pipeline/graph_pack.hpp"
-
-#include <vector>
-#include <cstdlib>
-
-namespace debruijn_graph {
-//todo think if we still need all this
-class SequenceMapperListener {
-public:
- virtual void StartProcessLibrary(size_t threads_count) = 0;
- virtual void StopProcessLibrary() = 0;
-
- //TODO: think about read hierarchy
- virtual void ProcessPairedRead(size_t thread_index, const io::PairedRead& pr, const MappingPath<EdgeId>& read1, const MappingPath<EdgeId>& read2) = 0;
- virtual void ProcessPairedRead(size_t thread_index, const io::PairedReadSeq& pr, const MappingPath<EdgeId>& read1, const MappingPath<EdgeId>& read2) = 0;
- virtual void ProcessSingleRead(size_t thread_index, const io::SingleRead& r, const MappingPath<EdgeId>& read) = 0;
- virtual void ProcessSingleRead(size_t thread_index, const io::SingleReadSeq& r, const MappingPath<EdgeId>& read) = 0;
-
- virtual void MergeBuffer(size_t thread_index) = 0;
- virtual ~SequenceMapperListener() {}
-};
-
-class SequenceMapperNotifier {
- static const size_t BUFFER_SIZE = 200000;
-public:
- typedef SequenceMapper<conj_graph_pack::graph_t> SequenceMapperT;
-
- SequenceMapperNotifier(const conj_graph_pack& gp)
- : gp_(gp) { }
-
- void Subscribe(size_t lib_index, SequenceMapperListener* listener) {
- while ((int)lib_index >= (int)listeners_.size() - 1) {
- std::vector<SequenceMapperListener*> vect;
- listeners_.push_back(vect);
- }
- listeners_[lib_index].push_back(listener);
- }
-
- template<class ReadType>
- void ProcessLibrary(io::ReadStreamList<ReadType>& streams,
- size_t lib_index, const SequenceMapperT& mapper, size_t threads_count = 0) {
- if (threads_count == 0)
- threads_count = streams.size();
-
- streams.reset();
- NotifyStartProcessLibrary(lib_index, threads_count);
-
- size_t counter = 0, n = 15;
- size_t fmem = get_free_memory();
-
- #pragma omp parallel for num_threads(threads_count) shared(counter)
- for (size_t ithread = 0; ithread < threads_count; ++ithread) {
- size_t size = 0;
- ReadType r;
- auto& stream = streams[ithread];
- stream.reset();
- while (!stream.eof()) {
- if (size == BUFFER_SIZE ||
- // Stop filling the buffer once available memory drops
- // below 40% of the initially free memory.
- (10 * get_free_memory() / 4 < fmem && size > 10000)) {
- #pragma omp critical
- {
- counter += size;
- if (counter >> n) {
- INFO("Processed " << counter << " reads");
- n += 1;
- }
- size = 0;
- NotifyMergeBuffer(lib_index, ithread);
- }
- }
- stream >> r;
- ++size;
- NotifyProcessRead(r, mapper, lib_index, ithread);
- }
-# pragma omp atomic
- counter += size;
- }
- INFO("Total " << counter << " reads processed");
- NotifyStopProcessLibrary(lib_index);
- }
-
-private:
- template<class ReadType>
- void NotifyProcessRead(const ReadType& r, const SequenceMapperT& mapper, size_t ilib, size_t ithread) const;
-
- void NotifyStartProcessLibrary(size_t ilib, size_t thread_count) const {
- for (const auto& listener : listeners_[ilib])
- listener->StartProcessLibrary(thread_count);
- }
-
- void NotifyStopProcessLibrary(size_t ilib) const {
- for (const auto& listener : listeners_[ilib])
- listener->StopProcessLibrary();
- }
-
- void NotifyMergeBuffer(size_t ilib, size_t ithread) const {
- for (const auto& listener : listeners_[ilib])
- listener->MergeBuffer(ithread);
- }
- const conj_graph_pack& gp_;
-
- std::vector<std::vector<SequenceMapperListener*> > listeners_; //outer vector is indexed by library; one listener list per library
-};
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedReadSeq& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
-
- const Sequence& read1 = r.first().sequence();
- const Sequence& read2 = r.second().sequence();
- MappingPath<EdgeId> path1 = mapper.MapSequence(read1);
- MappingPath<EdgeId> path2 = mapper.MapSequence(read2);
- for (const auto& listener : listeners_[ilib]) {
- TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
- listener->ProcessPairedRead(ithread, r, path1, path2);
- listener->ProcessSingleRead(ithread, r.first(), path1);
- listener->ProcessSingleRead(ithread, r.second(), path2);
- }
-}
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::PairedRead& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
- MappingPath<EdgeId> path1 = mapper.MapRead(r.first());
- MappingPath<EdgeId> path2 = mapper.MapRead(r.second());
- for (const auto& listener : listeners_[ilib]) {
- TRACE("Dist: " << r.second().size() << " - " << r.insert_size() << " = " << r.second().size() - r.insert_size());
- listener->ProcessPairedRead(ithread, r, path1, path2);
- listener->ProcessSingleRead(ithread, r.first(), path1);
- listener->ProcessSingleRead(ithread, r.second(), path2);
- }
-}
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleReadSeq& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
- const Sequence& read = r.sequence();
- MappingPath<EdgeId> path = mapper.MapSequence(read);
- for (const auto& listener : listeners_[ilib])
- listener->ProcessSingleRead(ithread, r, path);
-}
-
-template<>
-inline void SequenceMapperNotifier::NotifyProcessRead(const io::SingleRead& r,
- const SequenceMapperT& mapper,
- size_t ilib,
- size_t ithread) const {
- MappingPath<EdgeId> path = mapper.MapRead(r);
- for (const auto& listener : listeners_[ilib])
- listener->ProcessSingleRead(ithread, r, path);
-}
-
-} /*debruijn_graph*/
-
-
-#endif /* SEQUENCE_MAPPER_NOTIFIER_HPP_ */
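SequenceMapperNotifier is a plain observer: listeners subscribe per library index and are notified for every read mapped from that library. A minimal sketch of the subscription/notification flow (toy listener interface, not the real one):

#include <iostream>
#include <string>
#include <vector>

// Toy listener interface: the real one receives MappingPath objects.
struct Listener {
    virtual void ProcessSingleRead(const std::string& read) = 0;
    virtual ~Listener() = default;
};

struct CountingListener : Listener {
    size_t count = 0;
    void ProcessSingleRead(const std::string&) override { ++count; }
};

class Notifier {
    // Outer vector is indexed by library; each library has its own listeners.
    std::vector<std::vector<Listener*>> listeners_;
public:
    void Subscribe(size_t lib, Listener* l) {
        if (lib >= listeners_.size()) listeners_.resize(lib + 1);
        listeners_[lib].push_back(l);
    }
    void NotifyRead(size_t lib, const std::string& read) const {
        for (Listener* l : listeners_[lib]) l->ProcessSingleRead(read);
    }
};

int main() {
    CountingListener counter;
    Notifier notifier;
    notifier.Subscribe(0, &counter);
    notifier.NotifyRead(0, "ACGT");
    notifier.NotifyRead(0, "GGCC");
    std::cout << "library 0 processed " << counter.count << " reads\n";  // 2
}

Keeping one listener list per library lets the same listener types (paired-info fillers, coverage counters, and so on) be attached independently to different read libraries.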
diff --git a/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp b/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp
deleted file mode 100644
index b17559a..0000000
--- a/src/modules/assembly_graph/graph_alignment/short_read_mapper.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * short_read_mapper.hpp
- *
- * Created on: Dec 4, 2013
- * Author: andrey
- */
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-#include "assembly_graph/graph_alignment/pacbio/pac_index.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class SensitiveReadMapper: public SequenceMapper<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- using SequenceMapper<Graph>::g_;
-private:
-
- size_t small_k_;
-
- static map<size_t, pacbio::PacBioMappingIndex<Graph>* > indices_;
- static size_t active_mappers_;
-
- pacbio::PacBioMappingIndex<Graph>* index_;
-
-public:
-
- SensitiveReadMapper(const Graph& g, size_t k, size_t graph_k) :
- SequenceMapper<Graph>(g), small_k_(k)
- {
- if (indices_.find(small_k_) == indices_.end()) {
- indices_.insert(make_pair(small_k_,
- new pacbio::PacBioMappingIndex<Graph>(g, small_k_, graph_k, false, cfg::get().output_dir, cfg::get().pb)));
- }
- index_ = indices_[small_k_];
- ++active_mappers_;
- }
-
- MappingPath<EdgeId> MapSequence(const Sequence &sequence) const {
- return index_->GetShortReadAlignment(sequence);
- }
-
- size_t KmerSize() const {
- return small_k_;
- }
-
- ~SensitiveReadMapper() {
- --active_mappers_;
- }
-
- static void EraseIndices() {
- if (active_mappers_ > 0) {
- WARN("There are still active mappers");
- }
- for (auto iter = indices_.begin(); iter != indices_.end(); ++iter) {
- delete iter->second;
- }
- indices_.clear();
- }
-
-};
-
-template<class Graph>
-map<size_t, pacbio::PacBioMappingIndex<Graph>* > SensitiveReadMapper<Graph>::indices_;
-
-template<class Graph>
-size_t SensitiveReadMapper<Graph>::active_mappers_ = 0;
-
-
-template<class graph_pack, class SequencingLib>
-std::shared_ptr<SequenceMapper<typename graph_pack::graph_t>> ChooseProperMapper(const graph_pack& gp, const SequencingLib& library) {
- if (library.type() == io::LibraryType::MatePairs) {
- INFO("Mapping mate-pair library, selecting sensitive read mapper with k=" << cfg::get().sensitive_map.k);
- return std::make_shared<SensitiveReadMapper<typename graph_pack::graph_t>>(gp.g, cfg::get().sensitive_map.k, gp.k_value);
- }
-
- size_t read_length = library.data().read_length;
- if (read_length < gp.k_value && library.type() == io::LibraryType::PairedEnd) {
- INFO("Read length = " << read_length << ", selecting short read mapper");
- return std::make_shared<SensitiveReadMapper<typename graph_pack::graph_t>>(gp.g, read_length/ 3, gp.k_value);
- }
-
- INFO("Selecting usual mapper");
- return MapperInstance(gp);
-}
-
-}
-
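ChooseProperMapper selects a sensitive small-k mapper for mate-pair libraries and for paired-end reads shorter than the graph k, and falls back to the usual mapper otherwise. A reduced sketch of that selection logic (toy mapper classes; the sensitive k value is an assumed placeholder for the config setting):

#include <iostream>
#include <memory>

enum class LibraryType { PairedEnd, MatePairs, Other };

struct Mapper { virtual const char* name() const = 0; virtual ~Mapper() = default; };
struct UsualMapper : Mapper { const char* name() const override { return "usual"; } };
struct SensitiveMapper : Mapper {
    size_t k;
    explicit SensitiveMapper(size_t k) : k(k) {}
    const char* name() const override { return "sensitive"; }
};

// Mirror of the deleted selection logic, with a fixed sensitive k for mate pairs.
std::shared_ptr<Mapper> ChooseMapper(LibraryType type, size_t read_length, size_t graph_k) {
    const size_t sensitive_k = 19;                       // assumed stand-in for the config value
    if (type == LibraryType::MatePairs)
        return std::make_shared<SensitiveMapper>(sensitive_k);
    if (type == LibraryType::PairedEnd && read_length < graph_k)
        return std::make_shared<SensitiveMapper>(read_length / 3);
    return std::make_shared<UsualMapper>();
}

int main() {
    std::cout << ChooseMapper(LibraryType::PairedEnd, 36, 55)->name() << "\n";  // sensitive
    std::cout << ChooseMapper(LibraryType::PairedEnd, 100, 55)->name() << "\n"; // usual
}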
diff --git a/src/modules/assembly_graph/graph_core/action_handlers.hpp b/src/modules/assembly_graph/graph_core/action_handlers.hpp
deleted file mode 100644
index 55d015d..0000000
--- a/src/modules/assembly_graph/graph_core/action_handlers.hpp
+++ /dev/null
@@ -1,347 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __OMNI_ACTION_HANDLERS_HPP__
-#define __OMNI_ACTION_HANDLERS_HPP__
-
-#include "dev_support/verify.hpp"
-#include "dev_support/logger/logger.hpp"
-
-#include <boost/noncopyable.hpp>
-#include <string>
-#include <vector>
-
-namespace omnigraph {
-
-using std::vector;
-
-/**
-* ActionHandler is the base listener class for graph events. Every structure or information storage
-* that is meant to stay synchronized with the graph should use it. To make a handler listen
-* to graph events, add it to the graph's listeners.
-* Normally the structure itself extends ActionHandler and overrides several handling methods. In its
-* constructor it adds itself to the graph's handler list, and in its destructor it removes itself from that list.
-* All events are divided into two levels: low-level events and high-level events.
-* Low-level events are addition/deletion of vertices/edges. They should be triggered only after the
-* corresponding high-level events, once all data has been transferred and the graph structure is consistent.
-* High-level events should be used to keep external data synchronized with the graph and to keep internal
-* data consistent. The current high-level events are merge, glue and split; the list may be extended in the future.
-*/
-template<typename VertexId, typename EdgeId>
-class ActionHandler : private boost::noncopyable {
- const std::string handler_name_;
-private:
- bool attached_;
-public:
- /**
- * Create an action handler with the given name. The name indicates what type of handler it is.
- */
- ActionHandler(const std::string &name)
- : handler_name_(name), attached_(true) {
- }
-
- virtual ~ActionHandler() {
- TRACE("~ActionHandler " << handler_name_);
- }
-
- /**
- * Method returns name of this handler
- */
- const std::string &name() const {
- return handler_name_;
- }
-
- /**
- * Low level event which is triggered when vertex is added to graph.
- * @param v new vertex
- */
- virtual void HandleAdd(VertexId /*v*/) { }
-
- /**
- * Low level event which is triggered when edge is added to graph.
- * @param e new edge
- */
- virtual void HandleAdd(EdgeId /*e*/) { }
-
- /**
- * Low level event which is triggered when vertex is deleted from graph.
- * @param v vertex to delete
- */
- virtual void HandleDelete(VertexId /*v*/) { }
-
- /**
- * Low level event which is triggered when edge is deleted from graph.
- * @param e edge to delete
- */
- virtual void HandleDelete(EdgeId /*e*/) { }
-
- /**
- * High-level event triggered when a merge operation is performed on the graph, i.e. when a
- * path of edges whose inner vertices all have exactly one incoming and one outgoing edge is
- * replaced with a single edge. Since this is a high-level operation, neither the creation event of the
- * new edge nor the deletion events of the old edges have been triggered yet at this point.
- * @param old_edges path of edges to be replaced with a single edge
- * @param new_edge new edge added as a replacement for the path
- */
- virtual void HandleMerge(const vector<EdgeId> & /*old_edges*/, EdgeId /*new_edge*/) { }
-
- /**
- * High-level event triggered when a glue operation is performed on the graph, i.e. when one
- * edge is completely replaced with another edge. This operation is widely used in bulge removal,
- * when an alternative path is glued to the main path. Since this is a high-level operation, the
- * deletion event of the old edge has not been triggered yet at this point.
- * @param new_edge result of the glue
- * @param edge1 edge to be glued to edge2
- * @param edge2 edge that edge1 is glued with
- */
- virtual void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) { }
-
- /**
- * High-level event triggered when a split operation is performed on the graph, i.e. when an
- * edge is split into several shorter edges. Split is the reverse of the merge operation.
- * Since this is a high-level operation, neither the deletion event of the old edge nor the creation
- * events of the new edges have been triggered yet at this point.
- * @param old_edge edge to be split
- * @param new_edge_1, new_edge_2 edges resulting from the split
- */
- virtual void HandleSplit(EdgeId /*old_edge*/, EdgeId /*new_edge_1*/,
- EdgeId /*new_edge_2*/) { }
-
- /**
- * Every thread safe descendant should override this method for correct concurrent graph processing.
- */
- virtual bool IsThreadSafe() const {
- return false;
- }
-
- bool IsAttached() const {
- return attached_;
- }
-
- void Attach() {
- VERIFY(!attached_);
- attached_ = true;
- }
-
- void Detach() {
- VERIFY(attached_);
- attached_ = false;
- }
-};
-
-template<class Graph>
-class GraphActionHandler : public ActionHandler<typename Graph::VertexId,
- typename Graph::EdgeId> {
- typedef ActionHandler<typename Graph::VertexId, typename Graph::EdgeId> base;
-
- const Graph &g_;
-
-protected:
- const Graph &g() const {
- return g_;
- }
-
-public:
- GraphActionHandler(const Graph &g, const std::string &name)
- : base(name),
- g_(g) {
- TRACE("Adding new action handler: " << this->name());
- g_.AddActionHandler(this);
- }
-
- GraphActionHandler(const GraphActionHandler<Graph> &other)
- : base(other.name()),
- g_(other.g_) {
- TRACE("Adding new action handler: " << this->name());
- g_.AddActionHandler(this);
- }
-
- virtual ~GraphActionHandler() {
- TRACE("Removing action handler: " << this->name());
- if (this->IsAttached())
- this->Detach();
- g_.RemoveActionHandler(this);
- }
-};
-
-/**
-* The HandlerApplier structure was introduced to support various types of graphs and to make the handler
-* machinery more flexible. If a particular graph implementation requires a special handler-triggering scheme,
-* one can store an extension of HandlerApplier in the graph and call HandlerApplier methods instead
-* of GraphHandler methods.
-* HandlerApplier contains one method per graph event, defining exactly how that event
-* should be triggered.
-*/
-template<typename VertexId, typename EdgeId>
-class HandlerApplier {
- typedef ActionHandler<VertexId, EdgeId> Handler;
-public:
-
- virtual void
- ApplyAdd(Handler &handler, VertexId v) const = 0;
-
- virtual void
- ApplyAdd(Handler &handler, EdgeId e) const = 0;
-
- virtual void
- ApplyDelete(Handler &handler, VertexId v) const = 0;
-
- virtual void
- ApplyDelete(Handler &handler, EdgeId e) const = 0;
-
- virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
- EdgeId new_edge) const = 0;
-
- virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
- EdgeId edge2) const = 0;
-
- virtual void ApplySplit(Handler &handler, EdgeId old_edge,
- EdgeId new_edge_1, EdgeId new_edge2) const = 0;
-
- virtual ~HandlerApplier() {
- }
-};
-
-/**
-* SimpleHandlerApplier is a simple implementation of HandlerApplier with no special filtering.
-*/
-template<class Graph>
-class SimpleHandlerApplier : public HandlerApplier<typename Graph::VertexId,
- typename Graph::EdgeId> {
-public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef ActionHandler<VertexId, EdgeId> Handler;
-
- virtual void ApplyAdd(Handler &handler, VertexId v) const {
- handler.HandleAdd(v);
- }
-
- virtual void ApplyAdd(Handler &handler, EdgeId e) const {
- handler.HandleAdd(e);
- }
-
- virtual void ApplyDelete(Handler &handler, VertexId v) const {
- handler.HandleDelete(v);
- }
-
- virtual void ApplyDelete(Handler &handler, EdgeId e) const {
- handler.HandleDelete(e);
- }
-
- virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
- EdgeId new_edge) const {
- handler.HandleMerge(old_edges, new_edge);
- }
-
- virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
- EdgeId edge2) const {
- handler.HandleGlue(new_edge, edge1, edge2);
- }
-
- virtual void ApplySplit(Handler &handler, EdgeId old_edge, EdgeId new_edge1,
- EdgeId new_edge2) const {
- handler.HandleSplit(old_edge, new_edge1, new_edge2);
- }
-
-};
-
-/**
-* PairedHandlerApplier is an implementation of HandlerApplier for graphs that synchronize actions on
-* vertices/edges with their reverse-complement counterparts. Although the corresponding
-* method is called only once, the event is triggered twice: once for the original parameters and
-* once for their reverse-complement counterparts. Assertions guard against degenerate cases.
-*/
-template<class Graph>
-class PairedHandlerApplier : public HandlerApplier<typename Graph::VertexId,
- typename Graph::EdgeId> {
-private:
- Graph &graph_;
-public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef ActionHandler<VertexId, EdgeId> Handler;
-
- PairedHandlerApplier(Graph &graph)
- : graph_(graph) {
- }
-
- virtual void ApplyAdd(Handler &handler, VertexId v) const {
- VertexId rcv = graph_.conjugate(v);
- handler.HandleAdd(v);
- if (v != rcv) {
- handler.HandleAdd(rcv);
- }
- }
-
- virtual void ApplyAdd(Handler &handler, EdgeId e) const {
- EdgeId rce = graph_.conjugate(e);
- handler.HandleAdd(e);
- if (e != rce) {
- handler.HandleAdd(rce);
- }
- }
-
- virtual void ApplyDelete(Handler &handler, VertexId v) const {
- VertexId rcv = graph_.conjugate(v);
- handler.HandleDelete(v);
- if (v != rcv) {
- handler.HandleDelete(rcv);
- }
- }
-
- virtual void ApplyDelete(Handler &handler, EdgeId e) const {
- EdgeId rce = graph_.conjugate(e);
- handler.HandleDelete(e);
- if (e != rce) {
- handler.HandleDelete(rce);
- }
- }
-
- virtual void ApplyMerge(Handler &handler, vector<EdgeId> old_edges,
- EdgeId new_edge) const {
- EdgeId rce = graph_.conjugate(new_edge);
- handler.HandleMerge(old_edges, new_edge);
- if (new_edge != rce) {
- vector<EdgeId> rc_old_edges;
- for (int i = (int) old_edges.size() - 1; i >= 0; i--) {
- rc_old_edges.push_back(graph_.conjugate(old_edges[i]));
- }
- handler.HandleMerge(rc_old_edges, rce);
- }
- }
-
- virtual void ApplyGlue(Handler &handler, EdgeId new_edge, EdgeId edge1,
- EdgeId edge2) const {
- EdgeId rc_edge1 = graph_.conjugate(edge1);
- EdgeId rc_edge2 = graph_.conjugate(edge2);
- VERIFY(edge1 != edge2);
- VERIFY(edge2 != rc_edge2);
- handler.HandleGlue(new_edge, edge1, edge2);
- if (edge1 != rc_edge1) {
- handler.HandleGlue(graph_.conjugate(new_edge), rc_edge1, rc_edge2);
- }
- }
-
- virtual void ApplySplit(Handler &handler, EdgeId old_edge,
- EdgeId new_edge_1, EdgeId new_edge2) const {
- EdgeId rce = graph_.conjugate(old_edge);
- //VERIFY(old_edge != rce);
- handler.HandleSplit(old_edge, new_edge_1, new_edge2);
- if (old_edge != rce) {
- handler.HandleSplit(rce, graph_.conjugate(new_edge2),
- graph_.conjugate(new_edge_1));
- }
- }
-
-private:
- DECL_LOGGER("PairedHandlerApplier")
-};
-
-};
-
-#endif
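The key rule in PairedHandlerApplier is to fire every event twice, once for an element and once for its reverse complement, but only when the two differ, so self-conjugate elements are not notified twice. A toy sketch of that dispatch rule (plain integers stand in for edge ids, with conjugate(e) = -e):

#include <functional>
#include <iostream>

// Toy edge ids: the conjugate of edge e is -e; edge 0 is its own conjugate.
int Conjugate(int e) { return -e; }

// Apply `handle` to the edge and, if distinct, to its conjugate as well.
void ApplyPaired(int e, const std::function<void(int)>& handle) {
    int rc = Conjugate(e);
    handle(e);
    if (e != rc)          // skip the second call for self-conjugate elements
        handle(rc);
}

int main() {
    auto print = [](int e) { std::cout << "handled edge " << e << "\n"; };
    ApplyPaired(5, print);   // handles 5 and -5
    ApplyPaired(0, print);   // handles 0 only once
}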
diff --git a/src/modules/assembly_graph/graph_core/basic_graph_stats.hpp b/src/modules/assembly_graph/graph_core/basic_graph_stats.hpp
deleted file mode 100644
index 52701ac..0000000
--- a/src/modules/assembly_graph/graph_core/basic_graph_stats.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-namespace omnigraph {
-
-template<class Graph>
-class AvgCovereageCounter {
-private:
- const Graph &graph_;
- const size_t min_length_;
-public:
- AvgCovereageCounter(const Graph &graph, size_t min_length = 0) :
- graph_(graph), min_length_(min_length) {
- }
-
- double Count() const {
- double cov = 0;
- size_t length = 0;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- if (graph_.length(*it) >= min_length_) {
- cov += graph_.coverage(*it) * (double) graph_.length(*it);
- length += graph_.length(*it);
- }
- }
- if (length == 0)
- return 0.;
- return cov / (double) length;
- }
-};
-
-template<class Graph>
-size_t CumulativeLength(const Graph& g,
- const std::vector<typename Graph::EdgeId>& path) {
- size_t s = 0;
- for (auto it = path.begin(); it != path.end(); ++it)
- s += g.length(*it);
-
- return s;
-}
-
-template<class Graph>
-double AvgCoverage(const Graph& g,
- const std::vector<typename Graph::EdgeId>& path) {
- double unnormalized_coverage = 0;
- size_t path_length = 0;
- for (auto edge : path) {
- size_t length = g.length(edge);
- path_length += length;
- unnormalized_coverage += g.coverage(edge) * (double) length;
- }
- return unnormalized_coverage / (double) path_length;
-}
-}
\ No newline at end of file
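Both coverage helpers above compute a length-weighted mean: every edge contributes coverage * length to the numerator and length to the denominator. A self-contained sketch over plain (coverage, length) pairs:

#include <iostream>
#include <utility>
#include <vector>

// Length-weighted average coverage over (coverage, length) pairs.
double AvgCoverage(const std::vector<std::pair<double, size_t>>& edges) {
    double weighted = 0.0;
    size_t total_length = 0;
    for (const auto& e : edges) {
        weighted += e.first * (double)e.second;
        total_length += e.second;
    }
    return total_length == 0 ? 0.0 : weighted / (double)total_length;
}

int main() {
    // Two edges: coverage 10 over 1000 bp and coverage 30 over 3000 bp.
    std::cout << AvgCoverage({{10.0, 1000}, {30.0, 3000}}) << "\n";  // 25
}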
diff --git a/src/modules/assembly_graph/graph_core/construction_helper.hpp b/src/modules/assembly_graph/graph_core/construction_helper.hpp
deleted file mode 100644
index f9c5514..0000000
--- a/src/modules/assembly_graph/graph_core/construction_helper.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-//#include "graph_core.hpp"
-#include "observable_graph.hpp"
-
-namespace omnigraph {
-
-template<class DataMaster>
-class ConstructionHelper {
- //typedef GraphCore<DataMaster> Graph;
- typedef ObservableGraph<DataMaster> Graph;
- typedef typename Graph::DataMasterT DataMasterT;
- typedef typename Graph::VertexData VertexData;
- typedef typename Graph::EdgeData EdgeData;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::VertexIt VertexIt;
- typedef typename Graph::edge_const_iterator edge_const_iterator;
-
- Graph &graph_;
-
-public:
-
- ConstructionHelper(Graph &graph) : graph_(graph) {
- }
-
- Graph &graph() {
- return graph_;
- }
-
- EdgeId AddEdge(const EdgeData &data) {
- return AddEdge(data, graph_.GetGraphIdDistributor());
- }
-
- EdgeId AddEdge(const EdgeData &data, restricted::IdDistributor &id_distributor) {
- return graph_.AddEdge(data, id_distributor);
- }
-
- void LinkIncomingEdge(VertexId v, EdgeId e) {
- VERIFY(graph_.EdgeEnd(e) == VertexId(0));
- graph_.conjugate(v)->AddOutgoingEdge(graph_.conjugate(e));
- e->SetEndVertex(v);
- }
-
- void LinkOutgoingEdge(VertexId v, EdgeId e) {
- VERIFY(graph_.EdgeEnd(graph_.conjugate(e)) == VertexId(0));
- v->AddOutgoingEdge(e);
- graph_.conjugate(e)->SetEndVertex(graph_.conjugate(v));
- }
-
- void DeleteLink(VertexId v, EdgeId e) {
- v->RemoveOutgoingEdge(e);
- }
-
- void DeleteUnlinkedEdge(EdgeId e) {
- EdgeId rc = graph_.conjugate(e);
- if (e != rc) {
- delete rc.get();
- }
- delete e.get();
- }
-
- VertexId CreateVertex(const VertexData &data) {
- return CreateVertex(data, graph_.GetGraphIdDistributor());
- }
-
- VertexId CreateVertex(const VertexData &data, restricted::IdDistributor &id_distributor) {
- return graph_.CreateVertex(data, id_distributor);
- }
-
- template<class Iter>
- void AddVerticesToGraph(Iter begin, Iter end) {
- for(; begin != end; ++begin) {
- graph_.AddVertexToGraph(*begin);
- }
- }
-};
-
-}
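ConstructionHelper keeps the two strands consistent: linking an outgoing edge e to a vertex v also sets the end of conjugate(e) to conjugate(v). A toy model of that symmetric linking (simplified structs, not the real graph core):

#include <cassert>
#include <iostream>
#include <vector>

// Minimal model: every edge stores its conjugate twin and its end vertex.
struct Vertex { int id; std::vector<struct Edge*> outgoing; };
struct Edge   { int id; Edge* conj; Vertex* end; };

// Attaching e as an outgoing edge of v also fixes the end of its conjugate:
// conj(e) must terminate at conj(v) to keep the two strands consistent.
void LinkOutgoingEdge(Vertex* v, Vertex* v_conj, Edge* e) {
    assert(e->conj->end == nullptr);   // edge is not linked yet
    v->outgoing.push_back(e);
    e->conj->end = v_conj;
}

int main() {
    Vertex v{1, {}}, v_conj{-1, {}};
    Edge e{10, nullptr, nullptr}, e_rc{-10, nullptr, nullptr};
    e.conj = &e_rc; e_rc.conj = &e;
    LinkOutgoingEdge(&v, &v_conj, &e);
    std::cout << "conjugate edge now ends at vertex " << e_rc.end->id << "\n";  // -1
}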
diff --git a/src/modules/assembly_graph/graph_core/coverage.hpp b/src/modules/assembly_graph/graph_core/coverage.hpp
deleted file mode 100644
index 4f243eb..0000000
--- a/src/modules/assembly_graph/graph_core/coverage.hpp
+++ /dev/null
@@ -1,343 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * coverage.hpp
- *
- * Created on: Jun 21, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include "dev_support/logger/logger.hpp"
-#include <iostream>
-#include <vector>
-#include <algorithm>
-#include "math/xmath.h"
-#include "action_handlers.hpp"
-namespace omnigraph {
-
-using std::vector;
-//todo save/load absolute coverage
-template<class Graph>
-class CoverageIndex : public GraphActionHandler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- //typedef unordered_map<EdgeId, int> map_type;
-
- Graph& g_;
-// map_type storage_;
-
-// size_t KPlusOneMerCoverage(EdgeId edge) const {
-// return (size_t) math::round(coverage(edge) * (double) this->g().length(edge));
-// }
-
-// template<class ReadThreader>
-// Path<EdgeId> ProcessSequence(const ReadThreader& threader,
-// const Sequence& sequence) const {
-// return threader.MapSequence(sequence);
-// }
-
-// void AddPathsToGraph(const Path<EdgeId>& path) {
-//
-// if (path.sequence().size() == 0)
-// return;
-//
-// const vector<EdgeId>& edges_list = path.sequence();
-//
-// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
-// IncCoverage(*it, this->g().length(*it));
-// }
-// IncCoverage(edges_list[0], -int(path.start_pos()));
-// EdgeId last = edges_list[edges_list.size() - 1];
-// IncCoverage(last, int(path.end_pos()) - int(this->g().length(last)));
-// }
-
-// void IncCoverageInMap(EdgeId edge, int toAdd, map_type& map) {
-// //VERIFY(toAdd >= 0);
-// map[edge] += toAdd;
-// VERIFY(map[edge] >= 0);
-// }
-//
-// void AddPathsToMap(const Path<EdgeId>& path, map_type& map) {
-//
-// if (path.sequence().size() == 0)
-// return;
-//
-// const vector<EdgeId>& edges_list = path.sequence();
-//
-// for (auto it = edges_list.cbegin(); it != edges_list.cend(); ++it) {
-// IncCoverageInMap(*it, this->g().length(*it), map);
-// }
-// IncCoverageInMap(edges_list[0], -int(path.start_pos()), map);
-// EdgeId last = edges_list[edges_list.size() - 1];
-// IncCoverageInMap(last,
-// int(path.end_pos()) - int(this->g().length(last)),
-// map);
-// }
-
- public:
- CoverageIndex(Graph &g)
- : GraphActionHandler<Graph>(g, "CoverageIndex"), g_(g) {
- }
-
- virtual ~CoverageIndex() {
- }
-
- /**
- * In NON averaged units
- */
- void SetRawCoverage(EdgeId e, unsigned cov) {
- g_.data(e).set_raw_coverage(cov);
- }
-
- void IncRawCoverage(EdgeId e, unsigned count) {
- g_.data(e).inc_raw_coverage((int)count);
- }
-
- void SetAvgCoverage(EdgeId e, double cov) {
- g_.data(e).set_raw_coverage((int) math::round(cov * (double) this->g().length(e)));
- }
-
- /**
- * Returns average coverage of the edge
- */
- double coverage(EdgeId edge) const {
- return (double) RawCoverage(edge) / (double) this->g().length(edge);
- }
-
- unsigned RawCoverage(EdgeId edge) const {
- return g_.data(edge).raw_coverage();
- }
-// /**
-// * Returns average coverage of the edge
-// */
-// double operator[](EdgeId e) const {
-// return coverage(e);
-// }
-
-// /**
-// * Method increases coverage value
-// */
-// void IncCoverage(EdgeId edge, int to_add) {
-// edge->IncCoverage(to_add);
-// VERIFY(edge->GetRawCoverage() >= 0);
-// }
-//
-// /**
-// * Method increases coverage value by 1
-// */
-// void IncCoverage(EdgeId edge) {
-// IncCoverage(edge, 1);
-// }
-
-// template<class ReadThreader, class Read>
-// void Fill(io::IReader<Read>& stream, const ReadThreader& threader) {
-//
-// INFO("Processing reads (takes a while)");
-// size_t counter = 0;
-// stream.reset();
-//
-// while (!stream.eof()) {
-// Read r;
-// stream >> r;
-// Path<EdgeId> path = ProcessSequence(threader, r.sequence());
-// AddPathsToGraph(path);
-//
-// VERBOSE_POWER(++counter, " reads processed");
-// }
-//
-// INFO("DeBruijn graph coverage counted, reads used: " << counter);
-// }
-//
-// template<class ReadThreader, class Read>
-// void FillParallel(io::ReadStreamVector<io::IReader<Read> >& streams,
-// const ReadThreader& threader, size_t buffer_size) {
-//
-// INFO("Processing reads (takes a while)");
-// perf_counter pc;
-// size_t counter = 0;
-//
-// size_t nthreads = streams.size();
-// size_t buf_size = buffer_size
-// / (nthreads * (sizeof(Path<EdgeId> ) + 32));
-//
-//#pragma omp parallel num_threads(nthreads)
-// {
-//#pragma omp for reduction(+ : counter)
-// for (size_t i = 0; i < nthreads; ++i) {
-//
-// Read r;
-// io::IReader<Read>& stream = streams[i];
-// stream.reset();
-// std::vector<Path<EdgeId> > buffer(buf_size);
-//
-// size_t j = 0;
-// while (!stream.eof()) {
-// stream >> r;
-// ++counter;
-// buffer[j++] = ProcessSequence(threader, r.sequence());
-//
-// if (j == buf_size) {
-// j = 0;
-//
-//#pragma omp critical
-// {
-// for (size_t l = 0; l < buf_size; ++l) {
-// AddPathsToGraph(buffer[l]);
-// }
-// }
-// }
-// }
-//
-//#pragma omp critical
-// {
-// for (size_t l = 0; l < j; ++l) {
-// AddPathsToGraph(buffer[l]);
-// }
-// }
-// }
-//
-// }
-//
-// INFO("DeBruijn graph coverage counted, reads used: " << counter);
-//
-// INFO("Elapsed time: " << pc.time_ms());
-// }
-//
-// template<class ReadThreader, class Read>
-// void FillFastParallel(
-// io::ReadStreamVector<io::IReader<Read> >& streams,
-// const ReadThreader& threader) {
-//
-// INFO("Processing reads (takes a while)");
-// perf_counter pc;
-// size_t counter = 0;
-//
-// size_t nthreads = streams.size();
-////
-// std::vector<map_type*> maps(nthreads);
-//// maps[0] = &storage_;
-//
-// for (size_t i = 0; i < nthreads; ++i) {
-// maps[i] = new map_type();
-// }
-//
-//#pragma omp parallel num_threads(nthreads)
-// {
-//#pragma omp for reduction(+ : counter)
-// for (size_t i = 0; i < nthreads; ++i) {
-//
-// Read r;
-// io::IReader<Read>& stream = streams[i];
-// stream.reset();
-// Path<EdgeId> path;
-//
-// while (!stream.eof()) {
-// stream >> r;
-// ++counter;
-// path = ProcessSequence(threader, r.sequence());
-//
-// AddPathsToMap(path, *maps[i]);
-// }
-// }
-// }
-//
-// INFO("Merging maps");
-// for (size_t i = 0; i < nthreads; ++i) {
-// for (auto it = maps[i]->begin(); it != maps[i]->end(); ++it) {
-// it->first->IncCoverage(it->second);
-// }
-// delete maps[i];
-// }
-//
-// INFO("DeBruijn graph coverage counted, reads used: " << counter);
-//
-// INFO("Elapsed time: " << pc.time_ms());
-// }
-
-// template<class Index>
-// void FillFromIndex(Index& index) {
-// for (auto I = index.value_cbegin(), E = index.value_cend();
-// I != E; ++I) {
-// const auto& edge_info = *I;
-// VERIFY(edge_info.offset != -1u);
-// VERIFY(edge_info.edge_id.get() != NULL);
-// IncRawCoverage(edge_info.edge_id, edge_info.count);
-// }
-//
-// DEBUG("Coverage counted");
-// }
-
- virtual void HandleDelete(EdgeId edge) {
- SetRawCoverage(edge, 0);
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
- unsigned coverage = 0;
- for (auto it = old_edges.begin(); it != old_edges.end(); ++it) {
- coverage += RawCoverage(*it);
- }
- SetRawCoverage(new_edge, coverage);
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1, EdgeId new_edge2) {
-// size_t length1 = this->g().length(newEdge1);
-// size_t length = this->g().length(oldEdge);
-// size_t coverage = KPlusOneMerCoverage(oldEdge);
-// size_t coverage1 = coverage * length1 / length;
-// if (coverage1 == 0)
-// coverage1 = 1;
-// size_t coverage2 = coverage - coverage1;
-// if (coverage2 == 0)
-// coverage2 = 1;
-// SetCoverage(newEdge1, coverage1);
-// SetCoverage(newEdge2, coverage2);
- double avg_cov = coverage(old_edge);
- if (old_edge == g_.conjugate(old_edge)) {
- int raw1 = std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1)));
- SetRawCoverage(new_edge1, raw1);
- SetRawCoverage(g_.conjugate(new_edge1), raw1);
- SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
- } else {
- SetRawCoverage(new_edge1, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge1))));
- SetRawCoverage(new_edge2, std::max(1, (int) math::round(avg_cov * (double) this->g().length(new_edge2))));
- }
- }
-
- void Save(EdgeId e, std::ostream& out) const {
- out << fmt::format("{:.6f}", coverage(e));
- }
-
- void Load(EdgeId e, std::istream& in) {
- double cov;
- in >> cov;
- SetAvgCoverage(e, cov);
- }
-
- /*
- * Is thread safe if different threads process different edges.
- */
- bool IsThreadSafe() const {
- return true;
- }
-};
-
-//todo discuss with Anton
-template<class Graph>
-class AbstractFlankingCoverage {
-public:
- virtual double GetInCov(typename Graph::EdgeId edge) const = 0;
- virtual double GetOutCov(typename Graph::EdgeId edge) const = 0;
-};
-
-}
diff --git a/src/modules/assembly_graph/graph_core/debruijn_data.hpp b/src/modules/assembly_graph/graph_core/debruijn_data.hpp
deleted file mode 100644
index c775165..0000000
--- a/src/modules/assembly_graph/graph_core/debruijn_data.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <set>
-#include <cstring>
-#include "dev_support/verify.hpp"
-#include "dev_support/logger/logger.hpp"
-#include "data_structures/sequence/sequence_tools.hpp"
-#include "dev_support/standard_base.hpp"
-
-namespace debruijn_graph {
-class DeBruijnDataMaster;
-
-class DeBruijnVertexData {
-    friend class DeBruijnDataMaster;
-public:
- DeBruijnVertexData() {
-
- }
-};
-
-class CoverageData {
- private:
- unsigned coverage_;
-
- public:
- CoverageData()
- : coverage_(0) {
- }
-
- void inc_coverage(int value) {
- VERIFY(value >= 0 || coverage_ > unsigned(-value));
- coverage_ += value;
- }
-
- void set_coverage(unsigned coverage) {
- coverage_ = coverage;
- }
-
- //not length normalized
- unsigned coverage() const {
- return coverage_;
- }
-};
-
-class DeBruijnEdgeData {
-    friend class DeBruijnDataMaster;
- CoverageData coverage_;
- CoverageData flanking_cov_;
- Sequence nucls_;
-public:
-
- DeBruijnEdgeData(const Sequence &nucls) :
- nucls_(nucls) {
- }
-
- const Sequence& nucls() const {
- return nucls_;
- }
-
- void inc_raw_coverage(int value) {
- coverage_.inc_coverage(value);
- }
-
- void set_raw_coverage(unsigned coverage) {
- coverage_.set_coverage(coverage);
- }
-
- unsigned raw_coverage() const {
- return coverage_.coverage();
- }
-
- void inc_flanking_coverage(int value) {
- flanking_cov_.inc_coverage(value);
- }
-
- void set_flanking_coverage(unsigned flanking_coverage) {
- flanking_cov_.set_coverage(flanking_coverage);
- }
-
- //not length normalized
- unsigned flanking_coverage() const {
- return flanking_cov_.coverage();
- }
-
- size_t size() const {
- return nucls_.size();
- }
-};
-
-class DeBruijnDataMaster {
-private:
- const size_t k_;
-
-public:
- typedef DeBruijnVertexData VertexData;
- typedef DeBruijnEdgeData EdgeData;
-
- DeBruijnDataMaster(size_t k) :
- k_(k) {
- }
-
- const EdgeData MergeData(const std::vector<const EdgeData*>& to_merge, bool safe_merging = true) const;
-
- std::pair<VertexData, std::pair<EdgeData, EdgeData>> SplitData(const EdgeData& edge, size_t position, bool is_self_conj = false) const;
-
- EdgeData GlueData(const EdgeData&, const EdgeData& data2) const;
-
- bool isSelfConjugate(const EdgeData &data) const {
- return data.nucls() == !(data.nucls());
- }
-
- EdgeData conjugate(const EdgeData &data) const {
- return EdgeData(!(data.nucls()));
- }
-
- VertexData conjugate(const VertexData & /*data*/) const {
- return VertexData();
- }
-
- size_t length(const EdgeData& data) const {
- return data.nucls().size() - k_;
- }
-
- size_t length(const VertexData& ) const {
- return k_;
- }
-
- size_t k() const {
- return k_;
- }
-
-};
-
-//typedef DeBruijnVertexData VertexData;
-//typedef DeBruijnEdgeData EdgeData;
-//typedef DeBruijnDataMaster DataMaster;
-
-inline const DeBruijnEdgeData DeBruijnDataMaster::MergeData(const std::vector<const DeBruijnEdgeData*>& to_merge, bool safe_merging) const {
- std::vector<Sequence> ss;
- ss.reserve(to_merge.size());
- for (auto it = to_merge.begin(); it != to_merge.end(); ++it) {
- ss.push_back((*it)->nucls());
- }
- return EdgeData(MergeOverlappingSequences(ss, k_, safe_merging));
-}
-
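-// SplitData cuts the edge sequence at k-mer position `position`: the first part keeps
-// nucls[0, position + k), the second keeps nucls[position, end). For a self-conjugate
-// edge, `end` is shortened by `position` so that the second part is itself self-conjugate.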
-inline std::pair<DeBruijnVertexData, std::pair<DeBruijnEdgeData, DeBruijnEdgeData>> DeBruijnDataMaster::SplitData(const EdgeData& edge,
- size_t position,
- bool is_self_conj) const {
- const Sequence& nucls = edge.nucls();
- size_t end = nucls.size();
- if (is_self_conj) {
- VERIFY(position < end);
- end -= position;
- }
- return std::make_pair(VertexData(), std::make_pair(EdgeData(edge.nucls().Subseq(0, position + k_)), EdgeData(nucls.Subseq(position, end))));
-}
-
-inline DeBruijnEdgeData DeBruijnDataMaster::GlueData(const DeBruijnEdgeData&, const DeBruijnEdgeData& data2) const {
- return data2;
-}
-
-}
diff --git a/src/modules/assembly_graph/graph_core/graph_core.hpp b/src/modules/assembly_graph/graph_core/graph_core.hpp
deleted file mode 100644
index d45efb4..0000000
--- a/src/modules/assembly_graph/graph_core/graph_core.hpp
+++ /dev/null
@@ -1,620 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <set>
-#include "dev_support/verify.hpp"
-#include "dev_support/logger/logger.hpp"
-#include "order_and_law.hpp"
-#include <boost/iterator/iterator_facade.hpp>
-#include "dev_support/simple_tools.hpp"
-
-namespace omnigraph {
-
-using std::vector;
-template<class DataMaster>
-class GraphCore;
-
-template<class DataMaster>
-class ConstructionHelper;
-
-template<class T>
-class PairedElementManipulationHelper;
-
-template<class DataMaster>
-class PairedVertex;
-
-template<class DataMaster>
-class PairedEdge;
-
-template<class DataMaster>
-class PairedEdge {
- private:
- typedef typename DataMaster::EdgeData EdgeData;
- typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
- typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
- friend class GraphCore<DataMaster>;
- friend class ConstructionHelper<DataMaster>;
- friend class PairedElementManipulationHelper<EdgeId>;
- //todo unfriend
- friend class PairedVertex<DataMaster>;
- VertexId end_;
- EdgeData data_;
- EdgeId conjugate_;
-
- PairedEdge(VertexId end, const EdgeData &data)
- : end_(end),
- data_(data) {
- }
-
- EdgeData &data() {
- return data_;
- }
-
- void set_data(const EdgeData &data) {
- data_ = data;
- }
-
- VertexId end() const {
- return end_;
- }
-
- VertexId start() const {
- return conjugate_->end()->conjugate();
- }
-
- void set_conjugate(EdgeId conjugate) {
- conjugate_ = conjugate;
- }
-
- void SetEndVertex(VertexId end) {
- end_ = end;
- }
-
-public:
- EdgeId conjugate() const {
- return conjugate_;
- }
-
- size_t length(size_t k) const {
- return data_.size() - k;
- }
-};
-
-template<class DataMaster>
-class PairedVertex {
-private:
- typedef typename DataMaster::VertexData VertexData;
- typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
- typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
- typedef typename std::vector<EdgeId>::const_iterator edge_raw_iterator;
-
- class conjugate_iterator : public boost::iterator_facade<conjugate_iterator,
- EdgeId, boost::forward_traversal_tag, EdgeId> {
- public:
- explicit conjugate_iterator(edge_raw_iterator it,
- bool conjugate = false)
- : it_(it),
- conjugate_(conjugate) {
- }
-
- //todo do we need it?
- conjugate_iterator()
- : conjugate_(false) {
- }
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() {
- it_++;
- }
-
- bool equal(const conjugate_iterator &other) const {
- return other.it_ == it_ && other.conjugate_ == conjugate_;
- }
-
- EdgeId dereference() const {
- return (conjugate_ ? (*it_)->conjugate() : *it_);
- }
-
- edge_raw_iterator it_;
- bool conjugate_;
- };
-
-public:
- typedef conjugate_iterator edge_const_iterator;
-
-private:
- friend class GraphCore<DataMaster>;
- friend class ConstructionHelper<DataMaster>;
- friend class PairedEdge<DataMaster>;
- friend class PairedElementManipulationHelper<VertexId>;
- friend class conjugate_iterator;
-
- std::vector<EdgeId> outgoing_edges_;
-
- VertexId conjugate_;
-
- VertexData data_;
-
- bool IsMinimal() const {
- return conjugate_->conjugate_ <= conjugate_;
- }
-
- VertexId conjugate() const {
- return conjugate_;
- }
-
- void set_conjugate(VertexId conjugate) {
- conjugate_ = conjugate;
- }
-
- size_t OutgoingEdgeCount() const {
- return outgoing_edges_.size();
- }
-
- edge_const_iterator out_begin() const {
- return edge_const_iterator(outgoing_edges_.cbegin(), false);
- }
-
- edge_const_iterator out_end() const {
- return edge_const_iterator(outgoing_edges_.cend(), false);
- }
-
- size_t IncomingEdgeCount() const {
- return conjugate_->OutgoingEdgeCount();
- }
-
- size_t IncomingEdgesCount() const {
- return conjugate_->OutgoingEdgeCount();
- }
-
- edge_const_iterator in_begin() const {
- return edge_const_iterator(conjugate_->outgoing_edges_.cbegin(), true);
- }
-
- edge_const_iterator in_end() const {
- return edge_const_iterator(conjugate_->outgoing_edges_.cend(), true);
- }
-
- PairedVertex(VertexData data)
- : data_(data) {
- }
-
- VertexData &data() {
- return data_;
- }
-
- void set_data(VertexData data) {
- data_ = data;
- }
-
- const std::vector<EdgeId> OutgoingEdgesTo(VertexId v) const {
- vector<EdgeId> result;
- for (auto it = outgoing_edges_.begin(); it != outgoing_edges_.end(); ++it) {
- if ((*it)->end() == v) {
- result.push_back(*it);
- }
- }
- return result;
- }
-
- void AddOutgoingEdge(EdgeId e) {
- outgoing_edges_.insert(std::upper_bound(outgoing_edges_.begin(), outgoing_edges_.end(), e), e);
- //outgoing_edges_.push_back(e);
- }
-
- bool RemoveOutgoingEdge(const EdgeId e) {
- auto it = std::find(outgoing_edges_.begin(), outgoing_edges_.end(), e);
- if (it == outgoing_edges_.end())
- return false;
-
- outgoing_edges_.erase(it);
- return true;
- }
-
- ~PairedVertex() {
- VERIFY(outgoing_edges_.size() == 0);
- }
-};
-
-template<class DataMaster>
-class GraphCore: private boost::noncopyable {
-public:
- typedef DataMaster DataMasterT;
- typedef typename DataMasterT::VertexData VertexData;
- typedef typename DataMasterT::EdgeData EdgeData;
- typedef restricted::pure_pointer<PairedEdge<DataMaster>> EdgeId;
- typedef restricted::pure_pointer<PairedVertex<DataMaster>> VertexId;
- typedef typename std::set<VertexId>::const_iterator VertexIt;
- typedef typename PairedVertex<DataMaster>::edge_const_iterator edge_const_iterator;
-
-private:
- restricted::LocalIdDistributor id_distributor_;
- DataMaster master_;
- std::set<VertexId> vertices_;
-
- friend class ConstructionHelper<DataMaster>;
-public:
- VertexIt begin() const {
- return vertices_.begin();
- }
-
- VertexIt end() const {
- return vertices_.end();
- }
-
- const std::set<VertexId>& vertices() const {
- return vertices_;
- }
-
- size_t size() const {
- return vertices_.size();
- }
-
- edge_const_iterator out_begin(VertexId v) const {
- return v->out_begin();
- }
-
- edge_const_iterator out_end(VertexId v) const {
- return v->out_end();
- }
-
- edge_const_iterator in_begin(VertexId v) const {
- return v->in_begin();
- }
-
- edge_const_iterator in_end(VertexId v) const {
- return v->in_end();
- }
-
-private:
- void DeleteVertexFromGraph(VertexId vertex) {
- this->vertices_.erase(vertex);
- this->vertices_.erase(conjugate(vertex));
- }
-
- void DestroyVertex(VertexId vertex) {
- VertexId conjugate = vertex->conjugate();
- delete vertex.get();
- delete conjugate.get();
- }
-
- bool AdditionalCompressCondition(VertexId v) const {
- return !(EdgeEnd(GetUniqueOutgoingEdge(v)) == conjugate(v) && EdgeStart(GetUniqueIncomingEdge(v)) == conjugate(v));
- }
-
-protected:
-
- VertexId CreateVertex(const VertexData& data1, const VertexData& data2, restricted::IdDistributor& id_distributor) {
- VertexId vertex1(new PairedVertex<DataMaster>(data1), id_distributor);
- VertexId vertex2(new PairedVertex<DataMaster>(data2), id_distributor);
- vertex1->set_conjugate(vertex2);
- vertex2->set_conjugate(vertex1);
- return vertex1;
- }
-
- VertexId CreateVertex(const VertexData &data, restricted::IdDistributor &id_distributor) {
- return CreateVertex(data, master_.conjugate(data), id_distributor);
- }
-
- VertexId CreateVertex(const VertexData &data) {
- return CreateVertex(data, id_distributor_);
- }
-
- void AddVertexToGraph(VertexId vertex) {
- vertices_.insert(vertex);
- vertices_.insert(conjugate(vertex));
- }
-
- VertexId HiddenAddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
- VertexId vertex = CreateVertex(data, id_distributor);
- AddVertexToGraph(vertex);
- return vertex;
- }
-
- VertexId HiddenAddVertex(const VertexData& data) {
- return HiddenAddVertex(data, id_distributor_);
- }
-
- void HiddenDeleteVertex(VertexId vertex) {
- DeleteVertexFromGraph(vertex);
- DestroyVertex(vertex);
- }
-
- /////////////////////////low-level ops (move to helper?!)
-
- ////what with this method?
- EdgeId AddSingleEdge(VertexId v1, VertexId v2, const EdgeData &data,
- restricted::IdDistributor &idDistributor) {
- EdgeId newEdge(new PairedEdge<DataMaster>(v2, data), idDistributor);
- if (v1 != VertexId(0))
- v1->AddOutgoingEdge(newEdge);
- return newEdge;
- }
-
- EdgeId HiddenAddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId result = AddSingleEdge(VertexId(0), VertexId(0), data, id_distributor);
- if (this->master().isSelfConjugate(data)) {
- result->set_conjugate(result);
- return result;
- }
- EdgeId rcEdge = AddSingleEdge(VertexId(0), VertexId(0), this->master().conjugate(data), id_distributor);
- result->set_conjugate(rcEdge);
- rcEdge->set_conjugate(result);
- return result;
- }
-
- EdgeId HiddenAddEdge(const EdgeData &data) {
- return HiddenAddEdge(data, id_distributor_);
- }
-
- EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
- // todo was suppressed for concurrent execution reasons (see concurrent_graph_component.hpp)
- // VERIFY(this->vertices_.find(v1) != this->vertices_.end() && this->vertices_.find(v2) != this->vertices_.end());
- EdgeId result = AddSingleEdge(v1, v2, data, id_distributor);
- if (this->master().isSelfConjugate(data) && (v1 == conjugate(v2))) {
- // todo why was it removed???
-            // Because of some split issues: when a self-conjugate edge is split, armageddon happens
-            // VERIFY(v1 == conjugate(v2));
- result->set_conjugate(result);
- return result;
- }
- EdgeId rcEdge = AddSingleEdge(v2->conjugate(), v1->conjugate(), this->master().conjugate(data), id_distributor);
- result->set_conjugate(rcEdge);
- rcEdge->set_conjugate(result);
- return result;
- }
-
- EdgeId HiddenAddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
- return HiddenAddEdge(v1, v2, data, id_distributor_);
- }
-
- void HiddenDeleteEdge(EdgeId edge) {
- DEBUG("Hidden delete edge " << edge.int_id());
- EdgeId rcEdge = conjugate(edge);
- VertexId rcStart = conjugate(edge->end());
- VertexId start = conjugate(rcEdge->end());
- start->RemoveOutgoingEdge(edge);
- rcStart->RemoveOutgoingEdge(rcEdge);
- if (edge != rcEdge) {
- delete rcEdge.get();
- }
- delete edge.get();
- }
-
- void HiddenDeletePath(const std::vector<EdgeId>& edgesToDelete, const std::vector<VertexId>& verticesToDelete) {
- for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
- HiddenDeleteEdge(*it);
- for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
- HiddenDeleteVertex(*it);
- }
-
-public:
-
- GraphCore(const DataMaster& master) : master_(master) {
- }
-
- virtual ~GraphCore() {
- VERIFY(size() == 0);
- }
-
- class IteratorContainer {
- public:
- typedef edge_const_iterator const_iterator;
- private:
- const_iterator begin_;
- const_iterator end_;
- public:
- IteratorContainer(const_iterator begin, const_iterator end) :
- begin_(begin), end_(end) {
-
- }
-
- const_iterator begin() const {
- return begin_;
- }
-
- const_iterator end() const {
- return end_;
- }
- };
-
- restricted::LocalIdDistributor &GetGraphIdDistributor() {
- return id_distributor_;
- }
-
- const restricted::LocalIdDistributor &GetGraphIdDistributor() const {
- return id_distributor_;
- }
-
- size_t int_id(EdgeId edge) const {
- return edge.int_id();
- }
-
- size_t int_id(VertexId vertex) const {
- return vertex.int_id();
- }
-
- const DataMaster& master() const {
- return master_;
- }
-
- const EdgeData& data(EdgeId edge) const {
- return edge->data();
- }
-
- const VertexData& data(VertexId v) const {
- return v->data();
- }
-
- EdgeData& data(EdgeId edge) {
- return edge->data();
- }
-
- VertexData& data(VertexId v) {
- return v->data();
- }
-
- size_t OutgoingEdgeCount(VertexId v) const {
- return v->OutgoingEdgeCount();
- }
-
- IteratorContainer OutgoingEdges(VertexId v) const {
- //INFO("Outgoing");
- return IteratorContainer(out_begin(v), out_end(v));
- }
-
- size_t IncomingEdgeCount(VertexId v) const {
- return v->IncomingEdgeCount();
- }
-
- IteratorContainer IncomingEdges(VertexId v) const {
- return IteratorContainer(in_begin(v), in_end(v));
- }
-
- std::vector<EdgeId> GetEdgesBetween(VertexId v, VertexId u) const {
- return v->OutgoingEdgesTo(u);
- }
-
- bool RelatedVertices(VertexId v1, VertexId v2) const {
- return v1 == v2 || v1 == conjugate(v2);
- }
-
- ////////////////////////edge information
- VertexId EdgeStart(EdgeId edge) const {
- return edge->start();
- }
-
- VertexId EdgeEnd(EdgeId edge) const {
- //INFO("Edge end");
- return edge->end();
- }
-
- VertexId conjugate(VertexId v) const {
- return v->conjugate();
- }
-
- EdgeId conjugate(EdgeId edge) const {
- return edge->conjugate();
- }
-
- size_t length(const EdgeId edge) const {
- return master_.length(data(edge));
- }
-
- size_t length(const VertexId v) const {
- return master_.length(data(v));
- }
-
- //////////////////////shortcut methods
-
- std::vector<EdgeId> IncidentEdges(VertexId v) const {
- vector<EdgeId> answer;
- push_back_all(answer, IncomingEdges(v));
- push_back_all(answer, OutgoingEdges(v));
- return answer;
- }
-
- EdgeId GetUniqueOutgoingEdge(VertexId v) const {
- VERIFY(CheckUniqueOutgoingEdge(v));
- return *out_begin(v);
- }
-
- bool CheckUniqueIncomingEdge(VertexId v) const {
- return IncomingEdgeCount(v) == 1;
- }
-
- EdgeId GetUniqueIncomingEdge(VertexId v) const {
- VERIFY(CheckUniqueIncomingEdge(v));
- return *in_begin(v);
- }
-
- bool CheckUniqueOutgoingEdge(VertexId v) const {
- return OutgoingEdgeCount(v) == 1;
- }
-
- bool IsDeadEnd(VertexId v) const {
- return OutgoingEdgeCount(v) == 0;
- }
-
- bool IsDeadStart(VertexId v) const {
- return IncomingEdgeCount(v) == 0;
- }
-
- bool CanCompressVertex(VertexId v) const {
- // TRACE("Compress vertex check: ");
- // TRACE("Outgoing check: " << (OutgoingEdgeCount(v) == 1));
- // TRACE("Outgoing check: " << (CheckUniqueOutgoingEdge(v)));
- // TRACE("Incoming check: " << (IncomingEdgeCount(v) == 1));
- // TRACE("Incoming check: " << (CheckUniqueIncomingEdge(v) == 1));
- // if((OutgoingEdgeCount(v) == 1) && (IncomingEdgeCount(v) == 1)) {
- // TRACE("Loop check: " << (GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v)));
- // TRACE("Additional check: " << AdditionalCompressCondition(v));
- // }
- return OutgoingEdgeCount(v) == 1 && IncomingEdgeCount(v) == 1 &&
- GetUniqueOutgoingEdge(v) != GetUniqueIncomingEdge(v) &&
- AdditionalCompressCondition(v);
- }
-
- //////////////////////printing
- std::string str(const EdgeId e) const {
-// return master_.str(data(edge));
- std::stringstream ss;
- ss << int_id(e) << " (" << length(e) << ")";
- return ss.str();
- }
-
- std::string str(const VertexId v) const {
-// return master_.str(data(v));
- return ToString(int_id(v));
- }
-
- std::string detailed_str(const VertexId v) const {
- std::stringstream ss;
- ss << str(v) << ";";
- ss << "Incoming edges" << str(IncomingEdges(v)) << "; ";
- ss << "Outgoing edges" << str(OutgoingEdges(v)) << ";";
- return ss.str();
- }
-
- std::string detailed_str(const std::vector<EdgeId>& path) const {
- std::stringstream ss;
- ss << "Path: ";
- ss << "Vertex " << detailed_str(EdgeStart(path[0])) << " | ";
- for (auto it = path.begin(); it != path.end(); ++it) {
- EdgeId e = *it;
- ss << "Edge " << str(e) << " | ";
- ss << "Vertex " << detailed_str(EdgeEnd(e)) << " | ";
- }
- return ss.str();
- }
-
- template<class Container>
- std::string str(const Container& container) const {
- return str(container.begin(), container.end());
- }
-
- template<class It>
- std::string str(It begin, It end) const {
- std::stringstream ss;
- std::string delim = "";
- for (auto it = begin; it != end; ++it) {
- ss << delim << str(*it);
- delim = ", ";
- }
- return ss.str();
- }
-
-private:
- DECL_LOGGER("GraphCore");
-};
-
-}
diff --git a/src/modules/assembly_graph/graph_core/graph_iterators.hpp b/src/modules/assembly_graph/graph_core/graph_iterators.hpp
deleted file mode 100644
index 9879885..0000000
--- a/src/modules/assembly_graph/graph_core/graph_iterators.hpp
+++ /dev/null
@@ -1,408 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "utils/adt/queue_iterator.hpp"
-#include "math/pred.hpp"
-#include "action_handlers.hpp"
-#include "dev_support/simple_tools.hpp"
-#include <boost/iterator/iterator_facade.hpp>
-
-namespace omnigraph {
-
-/**
- * SmartIterator iterates over a collection whose contents may change while the iteration
- * is in progress. Acting as a GraphActionHandler, it updates the collection in response to
- * graph changes. The order of iteration can be customized by specifying a Comparator.
- */
-template<class Graph, typename ElementId, typename Comparator = std::less<ElementId>>
-class SmartIterator : public GraphActionHandler<Graph> {
- typedef GraphActionHandler<Graph> base;
- DynamicQueueIterator<ElementId, Comparator> inner_it_;
- bool add_new_;
- bool canonical_only_;
- //todo think of checking it in HandleAdd
- pred::TypedPredicate<ElementId> add_condition_;
-
-protected:
-
- void push(const ElementId& el) {
- if ((!canonical_only_ || el <= this->g().conjugate(el)) &&
- add_condition_(el)) {
- inner_it_.push(el);
- }
- }
-
- template<typename InputIterator>
- void insert(InputIterator begin, InputIterator end) {
- for (auto it = begin; it != end; ++it) {
- push(*it);
- }
- }
-
- void erase(const ElementId& el) {
- if (!canonical_only_ || el <= this->g().conjugate(el)) {
- inner_it_.erase(el);
- }
- }
-
- void clear() {
- inner_it_.clear();
- }
-
- SmartIterator(const Graph &g, const std::string &name, bool add_new,
- const Comparator& comparator, bool canonical_only,
- pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
- : base(g, name),
- inner_it_(comparator),
- add_new_(add_new),
- canonical_only_(canonical_only),
- add_condition_(add_condition) {
- }
-
-public:
-
- bool canonical_only() const {
- return canonical_only_;
- }
-
- bool IsEnd() const {
- return inner_it_.IsEnd();
- }
-
- size_t size() const {
- return inner_it_.size();
- }
-
- ElementId operator*() {
- return *inner_it_;
- }
-
- void operator++() {
- ++inner_it_;
- }
-
- void HandleAdd(ElementId v) override {
- if (add_new_)
- push(v);
- }
-
- void HandleDelete(ElementId v) override {
- erase(v);
- }
-
- //use carefully!
- void ReleaseCurrent() {
- inner_it_.ReleaseCurrent();
- }
-
-};
-
-/**
- * SmartSetIterator iterates over a fixed set of elements supplied by the caller (via the
- * range constructor or insert/push). Like its base SmartIterator it acts as a GraphActionHandler:
- * elements deleted from the graph are dropped from the queue, while newly added graph elements
- * are only picked up if add_new is set. An illustrative usage sketch follows the class.
- */
-template<class Graph, typename ElementId,
- typename Comparator = std::less<ElementId>>
-class SmartSetIterator : public SmartIterator<Graph, ElementId, Comparator> {
- typedef SmartIterator<Graph, ElementId, Comparator> base;
-
-public:
- SmartSetIterator(const Graph &g,
- bool add_new = false,
- const Comparator& comparator = Comparator(),
- bool canonical_only = false,
- pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
- : base(g, "SmartSet " + ToString(this), add_new, comparator, canonical_only, add_condition) {
- }
-
- template<class Iterator>
- SmartSetIterator(const Graph &g, Iterator begin, Iterator end,
- bool add_new = false,
- const Comparator& comparator = Comparator(),
- bool canonical_only = false,
- pred::TypedPredicate<ElementId> add_condition = pred::AlwaysTrue<ElementId>())
- : SmartSetIterator(g, add_new, comparator, canonical_only, add_condition) {
- insert(begin, end);
- }
-
- template<typename InputIterator>
- void insert(InputIterator begin, InputIterator end) {
- base::insert(begin, end);
- }
-
- void push(const ElementId& el) {
- base::push(el);
- }
-
- void clear() {
- base::clear();
- }
-};
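-
-// Illustrative usage sketch (editorial addition, not part of the original header): process a
-// fixed batch of edges while the graph is being modified elsewhere. Edges deleted from the
-// graph are dropped from the queue automatically, since the iterator is also an action
-// handler. The helper name and the iterator-pair interface below are hypothetical; `Graph`
-// stands for any graph type compatible with this header.
-template<class Graph, class EdgeIt>
-void ProcessEdgeBatch(const Graph& g, EdgeIt batch_begin, EdgeIt batch_end) {
-    SmartSetIterator<Graph, typename Graph::EdgeId> it(g, batch_begin, batch_end);
-    for (; !it.IsEnd(); ++it) {
-        typename Graph::EdgeId e = *it;
-        (void) e; // ...inspect or simplify around e; deletions elsewhere in the graph are safe...
-    }
-}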
-
-/**
- * SmartVertexIterator iterates through the vertices of a graph. It listens to AddVertex/DeleteVertex
- * graph events and edits the set of vertices to iterate through accordingly. Note: high-level event
- * handlers are triggered before low-level event handlers such as HandleAdd/HandleDelete. Thus, if the
- * Comparator uses a structure that is also updated by handlers, make sure all information is updated
- * in the high-level event handlers. An illustrative usage sketch follows the class.
- */
-template<class Graph, typename Comparator = std::less<typename Graph::VertexId> >
-class SmartVertexIterator : public SmartIterator<Graph,
- typename Graph::VertexId, Comparator> {
- public:
- typedef typename Graph::VertexId VertexId;
-
- static size_t get_id() {
- static size_t id = 0;
- return id++;
- }
-
- public:
- SmartVertexIterator(const Graph &g, const Comparator& comparator =
- Comparator(), bool canonical_only = false)
- : SmartIterator<Graph, VertexId, Comparator>(
- g, "SmartVertexIterator " + ToString(get_id()), true,
- comparator, canonical_only) {
- this->insert(g.begin(), g.end());
- }
-
-};
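-
-// Illustrative usage sketch (editorial addition, not part of the original header): iterate over
-// all vertices while the graph is being simplified. Newly added vertices are picked up (the
-// constructor above passes add_new = true) and deleted vertices are skipped, so the loop stays
-// valid across graph modifications performed elsewhere. The helper name is hypothetical.
-template<class Graph>
-void VisitAllVertices(const Graph& g) {
-    for (SmartVertexIterator<Graph> it(g); !it.IsEnd(); ++it) {
-        typename Graph::VertexId v = *it;
-        (void) v; // ...per-vertex work; graph changes made elsewhere during the loop are handled...
-    }
-}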
-
-//todo return verifies when they can be switched off
-template<class Graph>
-class GraphEdgeIterator : public boost::iterator_facade<GraphEdgeIterator<Graph>
- , typename Graph::EdgeId, boost::forward_traversal_tag
- , typename Graph::EdgeId> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexIt const_vertex_iterator;
- typedef typename Graph::edge_const_iterator const_edge_iterator;
-
- const Graph& g_;
- const_vertex_iterator v_it_;
- const_edge_iterator e_it_;
- bool canonical_only_;
-
-public:
-
- GraphEdgeIterator(const Graph& g, const_vertex_iterator v_it, bool canonical_only = false)
- : g_(g),
- v_it_(v_it),
- canonical_only_(canonical_only) {
- if (v_it_ != g_.end()) {
- e_it_ = g_.out_begin(*v_it_);
- Skip();
- }
- }
-
-private:
-
- bool Canonical(EdgeId e) const {
- return e <= g_.conjugate(e);
- }
-
- friend class boost::iterator_core_access;
-
- void Skip() {
- //VERIFY(v_it_ != g_.end());
- while (true) {
- if (e_it_ == g_.out_end(*v_it_)) {
- v_it_++;
- if (v_it_ == g_.end())
- return;
- e_it_ = g_.out_begin(*v_it_);
- } else {
- if (!canonical_only_ || Canonical(*e_it_))
- return;
- else
- e_it_++;
- }
- }
- }
-
- void increment() {
- if (v_it_ == g_.end())
- return;
- e_it_++;
- Skip();
- }
-
- bool equal(const GraphEdgeIterator &other) const {
- if (other.v_it_ != v_it_)
- return false;
- if (v_it_ != g_.end() && other.e_it_ != e_it_)
- return false;
- if (other.canonical_only_ != canonical_only_)
- return false;
- return true;
- }
-
- EdgeId dereference() const {
- //VERIFY(v_it_ != g_.end());
- return *e_it_;
- }
-
-};
-
-template<class Graph>
-class ConstEdgeIterator {
- typedef typename Graph::EdgeId EdgeId;
- GraphEdgeIterator<Graph> begin_, end_;
-
- public:
- ConstEdgeIterator(const Graph &g, bool canonical_only = false)
- : begin_(g, g.begin(), canonical_only), end_(g, g.end(), canonical_only) {
- }
-
- bool IsEnd() const {
- return begin_ == end_;
- }
-
- EdgeId operator*() const {
- return *begin_;
- }
-
- const ConstEdgeIterator& operator++() {
- begin_++;
- return *this;
- }
-};
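-
-// Illustrative usage sketch (editorial addition, not part of the original header): count edges
-// with the plain, non-handler iterator; passing canonical_only = true visits only one edge of
-// every conjugate pair. The helper name is hypothetical.
-template<class Graph>
-size_t CountCanonicalEdges(const Graph& g) {
-    size_t count = 0;
-    for (ConstEdgeIterator<Graph> it(g, /*canonical_only*/ true); !it.IsEnd(); ++it)
-        ++count;
-    return count;
-}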
-
-/**
- * SmartEdgeIterator iterates through the edges of a graph. It listens to AddEdge/DeleteEdge
- * graph events and edits the set of edges to iterate through accordingly. Note: high-level event
- * handlers are triggered before low-level event handlers such as HandleAdd/HandleDelete. Thus, if the
- * Comparator uses a structure that is also updated by handlers, make sure all information is updated
- * in the high-level event handlers. An illustrative usage sketch follows the class.
- */
-template<class Graph, typename Comparator = std::less<typename Graph::EdgeId> >
-class SmartEdgeIterator : public SmartIterator<Graph, typename Graph::EdgeId, Comparator> {
- typedef GraphEdgeIterator<Graph> EdgeIt;
- public:
- typedef typename Graph::EdgeId EdgeId;
-
- static size_t get_id() {
- static size_t id = 0;
- return id++;
- }
-
- public:
- SmartEdgeIterator(const Graph &g, Comparator comparator = Comparator(),
- bool canonical_only = false)
- : SmartIterator<Graph, EdgeId, Comparator>(
- g, "SmartEdgeIterator " + ToString(get_id()), true,
- comparator, canonical_only) {
- this->insert(EdgeIt(g, g.begin()), EdgeIt(g, g.end()));
-
-// for (auto it = graph.begin(); it != graph.end(); ++it) {
-// //todo: this solution doesn't work with parallel simplification
-// this->insert(graph.out_begin(*it), graph.out_end(*it));
-// //this does
-// //auto out = graph.OutgoingEdges(*it);
-// //this->base::insert(out.begin(), out.end());
-// }
- }
-};
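-
-// Illustrative usage sketch (editorial addition, not part of the original header): unlike
-// ConstEdgeIterator above, this iterator tolerates graph modification during the loop, which
-// is the typical pattern in simplification procedures. `action` is any caller-supplied callable;
-// it may delete edges of the graph through its own non-const handle, and deleted edges are
-// dropped from the iteration queue by the handler callbacks. The helper name is hypothetical.
-template<class Graph, class Action>
-void ForEachEdgeSmart(const Graph& g, Action action) {
-    for (SmartEdgeIterator<Graph> it(g); !it.IsEnd(); ++it) {
-        action(*it);
-    }
-}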
-
-//todo move out
-template<class Graph, class ElementId>
-class IterationHelper {
-};
-
-template<class Graph>
-class IterationHelper<Graph, typename Graph::VertexId> {
- const Graph& g_;
-public:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::VertexIt const_vertex_iterator;
-
- IterationHelper(const Graph& g)
- : g_(g) {
- }
-
- const_vertex_iterator begin() const {
- return g_.begin();
- }
-
- const_vertex_iterator end() const {
- return g_.end();
- }
-
- std::vector<const_vertex_iterator> Chunks(size_t chunk_cnt) const {
- VERIFY(chunk_cnt > 0);
- if (chunk_cnt == 1) {
- return {begin(), end()};
- }
-
- //trying to split vertices into equal chunks, leftovers put into first chunk
- vector<const_vertex_iterator> answer;
- size_t vertex_cnt = g_.size();
- size_t chunk_size = vertex_cnt / chunk_cnt;
- auto it = g_.begin();
- answer.push_back(it);
- for (size_t i = 0; i + chunk_cnt * chunk_size < vertex_cnt; ++i) {
- it++;
- }
- if (chunk_size > 0) {
- size_t i = 0;
- do {
- ++it;
- if (++i % chunk_size == 0)
- answer.push_back(it);
- } while (it != g_.end());
-
- VERIFY(i == chunk_cnt * chunk_size);
- } else {
- VERIFY(it == g_.end());
- answer.push_back(it);
- }
- VERIFY(answer.back() == g_.end());
- return answer;
- }
-
-};
-
-//todo move out
-template<class Graph>
-class IterationHelper<Graph, typename Graph::EdgeId> {
- typedef typename Graph::VertexId VertexId;
-
- const Graph& g_;
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef GraphEdgeIterator<Graph> const_edge_iterator;
-
- IterationHelper(const Graph& g)
- : g_(g) {
- }
-
- const_edge_iterator begin() const {
- return const_edge_iterator(g_, g_.begin());
- }
-
- const_edge_iterator end() const {
- return const_edge_iterator(g_, g_.end());
- }
-
- std::vector<omnigraph::GraphEdgeIterator<Graph>> Chunks(size_t chunk_cnt) const {
- if (chunk_cnt == 1) {
- return {begin(), end()};
- }
-
- vector<omnigraph::GraphEdgeIterator<Graph>> answer;
-
- for (auto v_it : IterationHelper<Graph, VertexId>(g_).Chunks(chunk_cnt)) {
- answer.push_back(omnigraph::GraphEdgeIterator<Graph>(g_, v_it));
- }
- return answer;
- }
-};
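-
-// Illustrative usage sketch (editorial addition, not part of the original header): the iterators
-// returned by Chunks() are chunk boundaries, so chunk i is the half-open range
-// [bounds[i], bounds[i + 1]) -- a convenient unit of work for a parallel loop over the edges.
-// The helper name is hypothetical.
-template<class Graph>
-void ForEachEdgeChunked(const Graph& g, size_t chunk_cnt) {
-    IterationHelper<Graph, typename Graph::EdgeId> helper(g);
-    auto bounds = helper.Chunks(chunk_cnt);
-    for (size_t i = 0; i + 1 < bounds.size(); ++i) {
-        for (auto it = bounds[i]; it != bounds[i + 1]; ++it) {
-            typename Graph::EdgeId e = *it;
-            (void) e; // ...process e; distinct chunks can be dispatched to different threads...
-        }
-    }
-}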
-
-}
diff --git a/src/modules/assembly_graph/graph_core/observable_graph.hpp b/src/modules/assembly_graph/graph_core/observable_graph.hpp
deleted file mode 100644
index 0286cc5..0000000
--- a/src/modules/assembly_graph/graph_core/observable_graph.hpp
+++ /dev/null
@@ -1,499 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <vector>
-#include <set>
-#include <cstring>
-#include "dev_support/logger/logger.hpp"
-#include "graph_core.hpp"
-#include "graph_iterators.hpp"
-
-namespace omnigraph {
-
-using std::vector;
-using std::set;
-template<class DataMaster>
-class ObservableGraph: public GraphCore<DataMaster> {
-public:
- typedef GraphCore<DataMaster> base;
- typedef typename base::DataMasterT DataMasterT;
- typedef typename base::VertexData VertexData;
- typedef typename base::EdgeData EdgeData;
- typedef typename base::EdgeId EdgeId;
- typedef typename base::VertexId VertexId;
- typedef typename base::VertexIt VertexIt;
- typedef typename base::edge_const_iterator edge_const_iterator;
-
- typedef HandlerApplier<VertexId, EdgeId> Applier;
- typedef SmartVertexIterator<ObservableGraph> SmartVertexIt;
- typedef SmartEdgeIterator<ObservableGraph> SmartEdgeIt;
- typedef ConstEdgeIterator<ObservableGraph> ConstEdgeIt;
- typedef ActionHandler<VertexId, EdgeId> Handler;
-
-private:
- //todo switch to smart iterators
- mutable std::vector<Handler*> action_handler_list_;
- const HandlerApplier<VertexId, EdgeId> *applier_;
-
-public:
-//todo move to graph core
- typedef ConstructionHelper<DataMaster> HelperT;
-
- HelperT GetConstructionHelper() {
-// TODO: fix everything and restore this check
-// VERIFY(this->VerifyAllDetached());
- return HelperT(*this);
- }
-
- const Applier& GetHandlerApplier() const {
- return *applier_;
- }
-
- void AddActionHandler(Handler* action_handler) const;
-
- bool RemoveActionHandler(const Handler* action_handler) const;
-
- bool AllHandlersThreadSafe() const;
-
- // TODO: for debug. remove.
- void PrintHandlersNames() const;
-
- //todo make Fire* protected once again with helper friend class
- void FireAddVertex(VertexId v) const;
-
- void FireAddEdge(EdgeId e) const;
-
- void FireDeleteVertex(VertexId v) const;
-
- void FireDeleteEdge(EdgeId e) const;
-
- void FireMerge(std::vector<EdgeId> old_edges, EdgeId new_edge) const;
-
- void FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const;
-
- void FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const;
-
- bool VerifyAllDetached();
-
- //smart iterators
- template<typename Comparator>
- SmartVertexIterator<ObservableGraph, Comparator> SmartVertexBegin(
- const Comparator& comparator, bool canonical_only = false) const {
- return SmartVertexIterator<ObservableGraph, Comparator>(*this,
- comparator, canonical_only);
- }
-
- SmartVertexIterator<ObservableGraph> SmartVertexBegin(bool canonical_only = false) const {
- return SmartVertexIterator<ObservableGraph>(*this, std::less<VertexId>(), canonical_only);
- }
-
- template<typename Comparator>
- SmartEdgeIterator<ObservableGraph, Comparator> SmartEdgeBegin(
- const Comparator& comparator, bool canonical_only = false) const {
- return SmartEdgeIterator<ObservableGraph, Comparator>(*this, comparator, canonical_only);
- }
-
- SmartEdgeIterator<ObservableGraph> SmartEdgeBegin(bool canonical_only = false) const {
- return SmartEdgeIterator<ObservableGraph>(*this, std::less<EdgeId>(), canonical_only);
- }
-
- ConstEdgeIterator<ObservableGraph> ConstEdgeBegin(bool canonical_only = false) const {
- return ConstEdgeIterator<ObservableGraph>(*this, canonical_only);
- }
-
- void FireDeletePath(const std::vector<EdgeId>& edges_to_delete, const std::vector<VertexId>& vertices_to_delete) const;
-
- ObservableGraph(const DataMaster& master) :
- base(master), applier_(new PairedHandlerApplier<ObservableGraph>(*this)) {
- }
-
- virtual ~ObservableGraph();
-
- /////////////////////////graph operations
- //adding/removing vertices and edges
- VertexId AddVertex(const VertexData& data) {
- return AddVertex(data, GetGraphIdDistributor());
- }
-
- VertexId AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor);
-
- void DeleteVertex(VertexId v);
-
- void ForceDeleteVertex(VertexId v);
-
- using base::GetGraphIdDistributor;
- using base::conjugate;
-
- EdgeId AddEdge(const EdgeData &data) {
- return AddEdge(data, GetGraphIdDistributor());
- }
-
- EdgeId AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor);
-
- EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData &data) {
- return AddEdge(v1, v2, data, GetGraphIdDistributor());
- }
-
- EdgeId AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor);
-
- void DeleteEdge(EdgeId e);
-
- void DeleteAllOutgoing(VertexId v);
-
- void DeleteAllIncoming(VertexId v);
-
- void CompressVertex(VertexId v);
-
- EdgeId UnsafeCompressVertex(VertexId v);
-
- std::vector<EdgeId> EdgesToDelete(const std::vector<EdgeId>& path) const;
-
- std::vector<VertexId> VerticesToDelete(const std::vector<EdgeId>& path) const;
-
- std::vector<EdgeId> CorrectMergePath(const std::vector<EdgeId>& path) const;
-
- EdgeId MergePath(const std::vector<EdgeId>& path, bool safe_merging = true);
-
- std::pair<EdgeId, EdgeId> SplitEdge(EdgeId edge, size_t position);
-
- EdgeId GlueEdges(EdgeId edge1, EdgeId edge2);
-
-private:
- DECL_LOGGER("ObservableGraph")
-};
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::VertexId ObservableGraph<DataMaster>::AddVertex(const VertexData& data, restricted::IdDistributor& id_distributor) {
- VertexId v = base::HiddenAddVertex(data, id_distributor);
- FireAddVertex(v);
- return v;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteVertex(VertexId v) {
- VERIFY(base::IsDeadEnd(v) && base::IsDeadStart(v));
- VERIFY(v != VertexId(NULL));
- FireDeleteVertex(v);
- base::HiddenDeleteVertex(v);
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::ForceDeleteVertex(VertexId v) {
- DeleteAllOutgoing(v);
- DeleteAllIncoming(v);
- DeleteVertex(v);
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(VertexId v1, VertexId v2, const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId e = base::HiddenAddEdge(v1, v2, data, id_distributor);
- FireAddEdge(e);
- return e;
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::AddEdge(const EdgeData& data, restricted::IdDistributor& id_distributor) {
- EdgeId e = base::HiddenAddEdge(data, id_distributor);
- FireAddEdge(e);
- return e;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteEdge(EdgeId e) {
- FireDeleteEdge(e);
- base::HiddenDeleteEdge(e);
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteAllOutgoing(VertexId v) {
- while (base::OutgoingEdgeCount(v) > 0) {
- EdgeId edge = *base::out_begin(v);
- DeleteEdge(edge);
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::DeleteAllIncoming(VertexId v) {
- while (base::IncomingEdgeCount(v) > 0) {
- EdgeId edge = *base::in_begin(v);
- DeleteEdge(edge);
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::CompressVertex(VertexId v) {
- //VERIFY(CanCompressVertex(v));
- if (base::CanCompressVertex(v)) {
- UnsafeCompressVertex(v);
- } else {
- TRACE("Vertex " << base::str(v) << " can't be compressed");
- }
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::UnsafeCompressVertex(VertexId v) {
- VERIFY(base::CanCompressVertex(v));
- std::vector<EdgeId> edges_to_merge;
- edges_to_merge.push_back(base::GetUniqueIncomingEdge(v));
- edges_to_merge.push_back(base::GetUniqueOutgoingEdge(v));
- return MergePath(edges_to_merge);
-}
-
-template<class DataMaster>
-std::vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::EdgesToDelete(const std::vector<EdgeId>& path) const {
- std::set<EdgeId> edgesToDelete;
- edgesToDelete.insert(path[0]);
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- if (edgesToDelete.find(base::conjugate(e)) == edgesToDelete.end())
- edgesToDelete.insert(e);
- }
- return std::vector<EdgeId>(edgesToDelete.begin(), edgesToDelete.end());
-}
-
-template<class DataMaster>
-vector<typename ObservableGraph<DataMaster>::VertexId> ObservableGraph<DataMaster>::VerticesToDelete(const vector<EdgeId>& path) const {
- std::set<VertexId> verticesToDelete;
- for (size_t i = 0; i + 1 < path.size(); i++) {
- EdgeId e = path[i + 1];
- VertexId v = base::EdgeStart(e);
- if (verticesToDelete.find(base::conjugate(v)) == verticesToDelete.end())
- verticesToDelete.insert(v);
- }
- return vector<VertexId>(verticesToDelete.begin(), verticesToDelete.end());
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::AddActionHandler(Handler* action_handler) const {
-#pragma omp critical(action_handler_list_modification)
- {
- TRACE("Action handler " << action_handler->name() << " added");
- if (find(action_handler_list_.begin(), action_handler_list_.end(), action_handler) != action_handler_list_.end()) {
- VERIFY_MSG(false, "Action handler " << action_handler->name() << " has already been added");
- } else {
- action_handler_list_.push_back(action_handler);
- }
- }
-}
-
-template<class DataMaster>
-bool ObservableGraph<DataMaster>::RemoveActionHandler(const Handler* action_handler) const {
- bool result = false;
-#pragma omp critical(action_handler_list_modification)
- {
- auto it = std::find(action_handler_list_.begin(), action_handler_list_.end(), action_handler);
- if (it != action_handler_list_.end()) {
- action_handler_list_.erase(it);
- TRACE("Action handler " << action_handler->name() << " removed");
- result = true;
- } else {
- TRACE("Action handler " << action_handler->name() << " wasn't found among graph action handlers");
- }
- }
- return result;
-}
-
-template<class DataMaster>
-bool ObservableGraph<DataMaster>::AllHandlersThreadSafe() const {
- for (Handler* handler : action_handler_list_) {
- if (handler->IsAttached() && !handler->IsThreadSafe()) {
- return false;
- }
- }
- return true;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::PrintHandlersNames() const {
- for (Handler* handler : action_handler_list_) {
- std::cout << handler->name() << " attached=" << handler->IsAttached() << std::endl;
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireAddVertex(VertexId v) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- TRACE("FireAddVertex to handler " << handler_ptr->name());
- applier_->ApplyAdd(*handler_ptr, v);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireAddEdge(EdgeId e) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- TRACE("FireAddEdge to handler " << handler_ptr->name());
- applier_->ApplyAdd(*handler_ptr, e);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireDeleteVertex(VertexId v) const {
- for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
- if ((*it)->IsAttached()) {
- applier_->ApplyDelete(**it, v);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireDeleteEdge(EdgeId e) const {
- for (auto it = action_handler_list_.rbegin(); it != action_handler_list_.rend(); ++it) {
- if ((*it)->IsAttached()) {
- applier_->ApplyDelete(**it, e);
- }
- };
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireMerge(vector<EdgeId> old_edges, EdgeId new_edge) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- applier_->ApplyMerge(*handler_ptr, old_edges, new_edge);
- }
- }
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- applier_->ApplyGlue(*handler_ptr, new_edge, edge1, edge2);
- }
- };
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireSplit(EdgeId edge, EdgeId new_edge1, EdgeId new_edge2) const {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- applier_->ApplySplit(*handler_ptr, edge, new_edge1, new_edge2);
- }
- }
-}
-
-template<class DataMaster>
-bool ObservableGraph<DataMaster>::VerifyAllDetached() {
- for (Handler* handler_ptr : action_handler_list_) {
- if (handler_ptr->IsAttached()) {
- return false;
- }
- }
- return true;
-}
-
-template<class DataMaster>
-void ObservableGraph<DataMaster>::FireDeletePath(const vector<EdgeId>& edgesToDelete, const vector<VertexId>& verticesToDelete) const {
- for (auto it = edgesToDelete.begin(); it != edgesToDelete.end(); ++it)
- FireDeleteEdge(*it);
- for (auto it = verticesToDelete.begin(); it != verticesToDelete.end(); ++it)
- FireDeleteVertex(*it);
-}
-
-template<class DataMaster>
-ObservableGraph<DataMaster>::~ObservableGraph() {
- while (base::size() > 0) {
- ForceDeleteVertex(*base::begin());
- }
-}
-
-template<class DataMaster>
-vector<typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::CorrectMergePath(const vector<EdgeId>& path) const {
- for (size_t i = 0; i < path.size(); i++) {
- if (path[i] == base::conjugate(path[i])) {
- vector<EdgeId> result;
- if (i < path.size() - 1 - i) {
- for (size_t j = 0; j < path.size(); j++)
- result.push_back(base::conjugate(path[path.size() - 1 - j]));
- i = path.size() - 1 - i;
- } else {
- result = path;
- }
- size_t size = 2 * i + 1;
- for (size_t j = result.size(); j < size; j++) {
- result.push_back(base::conjugate(result[size - 1 - j]));
- }
- return result;
- }
- }
- return path;
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::MergePath(const vector<EdgeId>& path, bool safe_merging) {
- VERIFY(!path.empty());
- for (size_t i = 0; i < path.size(); i++)
- for (size_t j = i + 1; j < path.size(); j++) {
- VERIFY(path[i] != path[j]);
- }
- if (path.size() == 1) {
- TRACE(
- "Path of single edge " << base::str(*(path.begin())) << ". Nothing to merge.");
- };
-    //    cerr << "Merging " << PrintDetailedPath(path) << endl;
- // cerr << "Conjugate " << PrintConjugatePath(path) << endl;
- vector<EdgeId> corrected_path = CorrectMergePath(path);
- VertexId v1 = base::EdgeStart(corrected_path[0]);
- VertexId v2 = base::EdgeEnd(corrected_path[corrected_path.size() - 1]);
- vector<const EdgeData*> to_merge;
- for (auto it = corrected_path.begin(); it != corrected_path.end(); ++it) {
- to_merge.push_back(&(base::data(*it)));
- }
- EdgeId new_edge = base::HiddenAddEdge(v1, v2, base::master().MergeData(to_merge, safe_merging));
- FireMerge(corrected_path, new_edge);
- vector<EdgeId> edges_to_delete = EdgesToDelete(corrected_path);
- vector<VertexId> vertices_to_delete = VerticesToDelete(corrected_path);
- FireDeletePath(edges_to_delete, vertices_to_delete);
- FireAddEdge(new_edge);
- base::HiddenDeletePath(edges_to_delete, vertices_to_delete);
- return new_edge;
-}
-
-template<class DataMaster>
-std::pair<typename ObservableGraph<DataMaster>::EdgeId, typename ObservableGraph<DataMaster>::EdgeId> ObservableGraph<DataMaster>::SplitEdge(EdgeId edge, size_t position) {
- bool sc_flag = (edge == conjugate(edge));
- VERIFY_MSG(position > 0 && position < (sc_flag ? base::length(edge) / 2 + 1 : base::length(edge)),
- "Edge length is " << base::length(edge) << " but split pos was " << position);
- std::pair<VertexData, std::pair<EdgeData, EdgeData> > newData = base::master().SplitData(base::data(edge), position, sc_flag);
- VertexId splitVertex = base::HiddenAddVertex(newData.first);
- EdgeId new_edge1 = base::HiddenAddEdge(base::EdgeStart(edge), splitVertex, newData.second.first);
- EdgeId new_edge2 = base::HiddenAddEdge(splitVertex, sc_flag ? conjugate(splitVertex) : base::EdgeEnd(edge), newData.second.second);
-    VERIFY(!sc_flag || new_edge2 == conjugate(new_edge2));
- FireSplit(edge, new_edge1, new_edge2);
- FireDeleteEdge(edge);
- FireAddVertex(splitVertex);
- FireAddEdge(new_edge1);
- FireAddEdge(new_edge2);
- base::HiddenDeleteEdge(edge);
- return make_pair(new_edge1, new_edge2);
-}
-
-template<class DataMaster>
-typename ObservableGraph<DataMaster>::EdgeId ObservableGraph<DataMaster>::GlueEdges(EdgeId edge1, EdgeId edge2) {
- EdgeId new_edge = base::HiddenAddEdge(base::EdgeStart(edge2), base::EdgeEnd(edge2), base::master().GlueData(base::data(edge1), base::data(edge2)));
- FireGlue(new_edge, edge1, edge2);
- FireDeleteEdge(edge1);
- FireDeleteEdge(edge2);
- FireAddEdge(new_edge);
- VertexId start = base::EdgeStart(edge1);
- VertexId end = base::EdgeEnd(edge1);
- base::HiddenDeleteEdge(edge1);
- base::HiddenDeleteEdge(edge2);
- if (base::IsDeadStart(start) && base::IsDeadEnd(start)) {
- DeleteVertex(start);
- }
- if (base::IsDeadStart(end) && base::IsDeadEnd(end)) {
- DeleteVertex(end);
- }
- return new_edge;
-}
-}
diff --git a/src/modules/assembly_graph/graph_core/order_and_law.hpp b/src/modules/assembly_graph/graph_core/order_and_law.hpp
deleted file mode 100644
index 20ad96d..0000000
--- a/src/modules/assembly_graph/graph_core/order_and_law.hpp
+++ /dev/null
@@ -1,644 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <boost/utility.hpp>
-
-#include <ostream>
-#include <unordered_set>
-#include <unordered_map>
-#include "dev_support/stacktrace.hpp"
-#include <algorithm>
-#include <map>
-#include "dev_support/openmp_wrapper.h"
-#include "folly/PackedSyncPtr.h"
-
-
-namespace restricted {
-
-//todo discuss with Anton
-static const uint16_t MAX_THREAD_CNT = 128;
-
-class IdDistributor {
-public:
- virtual size_t GetId() = 0;
-
- virtual ~IdDistributor() {
- }
-};
-
-template<class Iter>
-class ListIdDistributor : public IdDistributor {
- friend class IdSegmentStorage;
-
-private:
- Iter left_;
- Iter right_;
- size_t shift_;
- size_t max_;
-
- ListIdDistributor(Iter left, Iter right, size_t shift = 0, size_t max = size_t(-1)) : left_(left),
- right_(right),
- shift_(shift), max_(max) {
- }
-
-public:
- bool valid() {
- return left_ < right_;
- }
-
- size_t GetId() {
- size_t result = *(left_);
- VERIFY(result < max_);
- ++left_;
- return shift_ + result;
- }
-};
-
-class SegmentIterator {
-private:
- size_t value_;
-public:
- SegmentIterator(size_t value) : value_(value) {
- }
-
- size_t operator*() const {
- return value_;
- }
-
- void operator++() {
- value_++;
- }
-
- void operator++(int) {
- ++value_;
- }
-
- bool operator==(const SegmentIterator &that) const {
- return value_ == that.value_;
- }
-
- bool operator!=(const SegmentIterator &that) const {
- return value_ != that.value_;
- }
-};
-
-class IdSegmentStorage {
- friend class LocalIdDistributor;
-
-public:
- ListIdDistributor<SegmentIterator> GetSegmentIdDistributor(size_t left, size_t right) {
- VERIFY(left < right);
- VERIFY(right <= size_);
- return ListIdDistributor<SegmentIterator>(SegmentIterator(left), SegmentIterator(right), min_value_, size_);
- }
-
- template<class Iter>
- ListIdDistributor<Iter> GetSegmentIdDistributor(Iter left, Iter right) {
- VERIFY(left < right);
- return ListIdDistributor<Iter>(left, right, min_value_, size_);
- }
-
- IdSegmentStorage() : min_value_(0), size_(0) { }
-
-private:
- IdSegmentStorage(size_t min_value, size_t size) : min_value_(min_value), size_(size) { }
-
- size_t min_value_;
- size_t size_;
-};
-
-// Id distributor for pure_pointer.
-class LocalIdDistributor : public IdDistributor, boost::noncopyable {
- friend class PeriodicIdDistributor;
-
- static const size_t INITIAL_MAX_INT_ID = 2;
-public:
- size_t GetId() {
- return max_int_id_++;
- }
-
- IdSegmentStorage Reserve(size_t size) {
- max_int_id_ += size;
- return IdSegmentStorage(max_int_id_ - size, size);
- }
-
- IdSegmentStorage ReserveUpTo(size_t max) {
- VERIFY(max_int_id_ == INITIAL_MAX_INT_ID);
- max_int_id_ = max;
- return IdSegmentStorage(0, max);
- }
-
-// static GlobalIdDistributor &GetInstance() {
-// static GlobalIdDistributor instance(INITIAL_MAX_INT_ID);
-// return instance;
-// }
-
- size_t GetMax() const {
- return max_int_id_;
- }
-
- LocalIdDistributor(size_t min_id_value = INITIAL_MAX_INT_ID) : max_int_id_(min_id_value) { }
-
-private:
- size_t max_int_id_;
-};
-
-/* Id distributor used by concurrent algorithms.
-* Each thread uses its own PeriodicIdDistributor with a period equal to the number of threads.
-* After a thread's job is done, a Synchronize call is required to raise the id counter
-* in the underlying LocalIdDistributor. An illustrative usage sketch follows the class.
-*/
-class PeriodicIdDistributor : public IdDistributor {
-
-public:
- PeriodicIdDistributor(LocalIdDistributor &id_distributor, size_t first_id, size_t period)
- : id_distributor_(id_distributor), cur_id_(first_id), period_(period) {
- }
-
- virtual size_t GetId() {
- size_t id = cur_id_;
- cur_id_ += period_;
-
- return id;
- }
-
- void Synchronize() const {
- size_t &global_max_id = id_distributor_.max_int_id_;
- global_max_id = std::max(cur_id_, global_max_id);
- }
-
-private:
- LocalIdDistributor &id_distributor_;
- size_t cur_id_;
- size_t period_;
-};
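-
-// Illustrative usage sketch (editorial addition, not part of the original header): a worker
-// thread draws ids first_id, first_id + nthreads, first_id + 2*nthreads, ..., so the id streams
-// of different threads never collide; Synchronize() then pushes the high-water mark back into
-// the shared LocalIdDistributor. The choice of first_id below is only an example -- in real use
-// the starting ids would be reserved up front, before the threads start.
-inline void PeriodicIdExample(LocalIdDistributor& shared, size_t thread_id, size_t nthreads) {
-    PeriodicIdDistributor local(shared, shared.GetMax() + thread_id, nthreads);
-    size_t id1 = local.GetId();  // first id of this thread's stream
-    size_t id2 = local.GetId();  // advances by nthreads
-    (void) id1; (void) id2;
-    local.Synchronize();         // make the shared distributor aware of the ids handed out
-}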
-
-template<class PurePtrT>
-class PurePtrLock;
-
-template<class PurePtrT>
-class PurePtrMarker;
-
-//todo maybe make it extend folly::PackedSyncPtr<T>?
-template<class T>
-struct pure_pointer {
- typedef T type;
- typedef T *pointer_type;
-
- explicit pure_pointer()
- : int_id_(0) {
- ptr_.init(pointer_type(0), MAX_THREAD_CNT);
- }
-
- explicit pure_pointer(T *ptr)
- : int_id_(size_t(ptr)) {
- ptr_.init(ptr, MAX_THREAD_CNT);
- VERIFY(int_id_ < 2);
- }
-
- explicit pure_pointer(T *ptr, IdDistributor &idDistributor)
- : int_id_(generate_id(ptr, idDistributor)) {
- ptr_.init(ptr, MAX_THREAD_CNT);
- }
-
-// lock_pointer_type& get_lockable() {
-// return ptr_;
-// }
-
- T *get() const {
- return ptr_.get();
- }
-
- T &operator*() const {
- return *ptr_;
- }
-
- T *operator->() const {
- return ptr_.get();
- }
-
- bool operator==(const pure_pointer &rhs) const {
- if (int_id_ == rhs.int_id_) {
- VERIFY(ptr_.get() == rhs.ptr_.get());
- return true;
- }
- return false;
- }
-
- bool operator!=(const pure_pointer &rhs) const {
- return !operator==(rhs);
- }
-
- bool operator<(const pure_pointer &rhs) const {
- return this->int_id_ < rhs.int_id_;
- }
-
- bool operator<=(const pure_pointer &rhs) const {
- return *this < rhs || *this == rhs;
- }
-
- size_t hash() const {
- return this->int_id_;
- }
-
- size_t int_id() const {
- return int_id_;
- }
-
-private:
- friend class PurePtrLock<pure_pointer<T>>;
-
- friend class PurePtrMarker<pure_pointer<T>>;
-
- typedef folly::PackedSyncPtr<T> lock_pointer_type;
-
- static size_t generate_id(T *ptr, IdDistributor &idDistributor) {
- if (ptr == 0 || ptr == (T *) 1 || ptr == (T *) (-1)) {
- return size_t(ptr);
- }
-
- return idDistributor.GetId();
- }
-
- lock_pointer_type ptr_;
-
- size_t int_id_;
-};
-
-template<class LockT>
-class ReEnteringLock {
- LockT &lock_;
- bool reentered_;
-
- uint16_t locking_thread() const {
- //don't need barrier here (as folly documentation says)
- return lock_.extra();
- }
-
- uint16_t current_thread() const {
- return uint16_t(omp_get_thread_num());
- }
-
- void Lock() {
- lock_.lock();
- lock_.setExtra(current_thread());
- }
-
- void Unlock() {
- lock_.setExtra(MAX_THREAD_CNT);
- lock_.unlock();
- }
-
-public:
- ReEnteringLock(LockT &lock) :
- lock_(lock),
- reentered_(false) {
- if (locking_thread() == current_thread()) {
- reentered_ = true;
- } else {
- Lock();
- }
- }
-
- ~ReEnteringLock() {
- if (!reentered_) {
- Unlock();
- }
- }
-};
-
-/**
-* Lock that uses a pure pointer as its target.
-* Be careful NOT to pass a COPY of the pure pointer you want to use as the locked object!
-* An illustrative usage sketch follows the class.
-*/
-template<class PurePtrT>
-class PurePtrLock {
- ReEnteringLock<typename PurePtrT::lock_pointer_type> inner_lock_;
-
-public:
- PurePtrLock(PurePtrT &pure_ptr) :
- inner_lock_(pure_ptr.ptr_) {
- }
-
-};
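-
-// Illustrative usage sketch (editorial addition, not part of the original header): lock a
-// vertex/edge id for the duration of a scope. The argument must be the original pure_pointer
-// (by reference), not a copy -- each copy carries its own PackedSyncPtr, so locking a copy
-// locks a different object. The helper name is hypothetical.
-template<class T>
-void LockedUpdate(pure_pointer<T>& id) {
-    PurePtrLock<pure_pointer<T>> lock(id);  // acquired here, released at end of scope
-    // ...modify *id while no other thread holds this lock...
-}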
-
-/**
-* A way to "mark" a pure pointer without using additional memory.
-* Marking/unmarking operations are atomic.
-* Be careful NOT to pass a COPY of the pure pointer you want to mark!
-* Do not use together with PurePtrLock: both store their data in the same extra bits.
-* An illustrative usage sketch follows the class.
-*/
-template<class PurePtrT>
-class PurePtrMarker {
- typedef typename PurePtrT::lock_pointer_type LockWithData;
-
- void ChangeMark(PurePtrT &pure_ptr, uint16_t new_mark) const {
- LockWithData &lock_with_data = pure_ptr.ptr_;
- lock_with_data.lock();
- lock_with_data.setExtra(new_mark);
- lock_with_data.unlock();
- }
-
-public:
-
- void mark(PurePtrT &pure_ptr) const {
- ChangeMark(pure_ptr, 0);
- }
-
- void unmark(PurePtrT &pure_ptr) const {
- ChangeMark(pure_ptr, MAX_THREAD_CNT);
- }
-
- bool is_marked(const PurePtrT &pure_ptr) const {
- uint16_t curr_mark = pure_ptr.ptr_.extra();
- VERIFY(curr_mark == 0 || curr_mark == MAX_THREAD_CNT);
- return curr_mark == 0;
- }
-
-};
-
-//template<class T>
-//struct Comparator
-//{
-// typedef pure_pointer<T> pointer_type_t;
-//
-// bool operator()(pointer_type_t const& a, pointer_type_t const& b) const {
-// return a.get() < b.get();
-// }
-//};
-
-template<class T>
-struct Hash {
- typedef pure_pointer<T> pointer_type_t;
- std::hash<T *> inner_hash_;
-
- size_t operator()(pointer_type_t const &a) const {
- return inner_hash_(a.get());
- }
-};
-
-template<class It>
-struct iterator_wrapper {
- typedef typename It::value_type value_type;
- typedef typename It::difference_type difference_type;
- typedef typename It::reference reference;
- typedef typename It::pointer pointer;
-
- explicit iterator_wrapper(It it) : it_(it) { }
-
- reference operator*() const { return it_.operator*(); }
-
- pointer operator->() const { return it_.operator->(); }
-
- bool operator==(const iterator_wrapper &rhs) const { return it_ == rhs.it_; }
-
- bool operator!=(const iterator_wrapper &rhs) const { return it_ != rhs.it_; }
-
-private:
- It it_;
-};
-
-template<class T>
-struct set {
- typedef Hash<typename T::type> hash_t;
- typedef std::unordered_set<T, hash_t> base_set_t;
- typedef typename base_set_t::value_type value_type;
-
- typedef iterator_wrapper<typename base_set_t::iterator> iterator;
- typedef iterator_wrapper<typename base_set_t::const_iterator> const_iterator;
-
-public:
- set() : base_set_(10, hash_t()) {
- }
-
- template<class It>
- set(It begin, It end) : base_set_(begin, end, 10, hash_t()) {
- }
-
- const_iterator begin() const { return const_iterator(base_set_.begin()); }
-
- const_iterator end() const { return const_iterator(base_set_.end()); }
-
- iterator begin() { return iterator(base_set_.begin()); }
-
- iterator end() { return iterator(base_set_.end()); }
-
- const_iterator find(const T &key) const { return const_iterator(base_set_.find(key)); }
-
- iterator find(const T &key) { return iterator(base_set_.find(key)); }
-
- size_t count(T const &item) const { return base_set_.count(item); }
-
- std::pair<iterator, bool> insert(value_type const &item) {
- const std::pair<iterator, bool> &ret = base_set_.insert(item);
- return make_pair(iterator(ret.first), ret.second);
- }
-
- template<class It>
- void insert(It first, It last) { base_set_.insert(first, last); }
-
- size_t erase(const T &x) { return base_set_.erase(x); }
-
- void clear() { base_set_.clear(); }
-
- size_t size() const { return base_set_.size(); }
-
- bool operator==(const set &rhs) const {
- if (this->size() != rhs.size())
- return false;
-
- for (auto i = base_set_.begin(), j = rhs.base_set_.begin();
- i != base_set_.end() && j != rhs.base_set_.end();
- ++i, ++j) {
- if (*i != *j)
- return false;
- }
-
- return true;
- }
-
- bool operator!=(const set &rhs) const {
- return !(*this == rhs);
- }
-
- template<class Comparator>
- void Copy(std::set<T, Comparator> &container) const {
- container.insert(base_set_.begin(), base_set_.end());
- }
-
-private:
- base_set_t base_set_;
-};
-
-
-template<class Key, class Value>
-struct map {
- typedef Hash<typename Key::type> hash_t;
- typedef std::unordered_map<Key, Value, hash_t> base_map_t;
- typedef typename base_map_t::value_type value_type;
-
- typedef iterator_wrapper<typename base_map_t::iterator> iterator;
- typedef iterator_wrapper<typename base_map_t::const_iterator> const_iterator;
-
-public:
- map()
- : base_map_(10, hash_t()) {
- }
-
- template<class It>
- map(It begin, It end)
- : base_map_(begin, end, 10, hash_t()) {
- }
-
- const_iterator begin() const { return const_iterator(base_map_.begin()); }
-
- const_iterator end() const { return const_iterator(base_map_.end()); }
-
- iterator begin() { return iterator(base_map_.begin()); }
-
- iterator end() { return iterator(base_map_.end()); }
-
- const_iterator find(const Key &key) const {
- return const_iterator(base_map_.find(key));
- }
-
- iterator find(const Key &key) { return iterator(base_map_.find(key)); }
-
- size_t count(Key const &item) const { return base_map_.count(item); }
-
- Value &operator[](Key const &x) { return base_map_[x]; }
-
- std::pair<iterator, bool> insert(value_type const &value) {
- std::pair<iterator, bool> ret = base_map_.insert(value);
- return make_pair(iterator(ret.first), ret.second);
- }
-
- template<class It>
- void insert(It first, It last) { base_map_.insert(first, last); }
-
- size_t erase(Key const &x) { return base_map_.erase(x); }
-
- void clear() { base_map_.clear(); }
-
- size_t size() const { return base_map_.size(); }
-
- bool operator==(const map &rhs) const {
- if (size() != rhs.size())
- return false;
-
- for (auto i = base_map_.begin(), j = rhs.base_map_.begin();
- i != base_map_.end() && j != rhs.base_map_.end();
- ++i, ++j) {
- if (*i != *j)
- return false;
- }
-
- return true;
- }
-
- bool operator!=(const map &rhs) const {
- return !(*this == rhs);
- }
-
- template<class Comparator>
- void Copy(std::map<Key, Value, Comparator> &container) const {
- container.insert(base_map_.begin(), base_map_.end());
- }
-
-private:
- base_map_t base_map_;
-};
-
-template<class T>
-std::ostream &operator<<(std::ostream &stream, const pure_pointer<T> &pointer) {
- stream << pointer.int_id();
- return stream;
-}
-
-} // namespace restricted
-
-namespace std {
-template<class T>
-struct hash<restricted::pure_pointer<T>> {
- size_t operator()(const restricted::pure_pointer<T> &pointer) const {
- return pointer.hash();
- }
-};
-}
-
-template<class T, class Comparator>
-class PairComparator {
-private:
- Comparator comparator_;
-public:
- PairComparator(Comparator comparator) : comparator_(comparator) {
- }
-
- bool operator()(std::pair<T, T> a, std::pair<T, T> b) const {
- return a.first == b.first ? comparator_(a.second, b.second) : comparator_(a.first, b.first);
- }
-};
-
-//
-//template<typename T, class Comparator>
-//class MixedComparator {
-//private:
-// Comparator c1_;
-// Comparator c2_;
-//public:
-// MixedComparator(const Comparator &c1, const Comparator &c2) : c1_(c1), c2_(c2) {
-// }
-//
-// bool operator()(const T &a, const T &b) const {
-// if(c1_.IsAFAKE(a) || c1_.IsAFAKE(b)) {
-// if(c1_.IsAFAKEMin(a))
-// return !c1_.IsAFAKEMin(b);
-// if(c1_.IsAFAKEMax(b))
-// return c1_.IsAFAKEMax(a);
-// return false;
-// }
-// if(c1_.IsValidId(a) && c1_.IsValidId(b))
-// return c1_(a, b);
-// if(c1_.IsValidId(a))
-// return true;
-// if(c1_.IsValidId(b))
-// return false;
-// if(c2_.IsValidId(a) && c2_.IsValidId(b)) {
-// return c2_(a, b);
-// }
-// VERIFY(false);
-// return false;
-// }
-//
-// bool IsValidId(T element) {
-// return c1_.IsValid(element) || c2_.IsValid(element);
-// }
-//};
-
-template<class Container, class Comparator>
-class ContainerComparator {
-private:
- Comparator comparator_;
-public:
- ContainerComparator(const Comparator &comparator) : comparator_(comparator) {
- }
-
- bool operator()(const Container &a, const Container &b) const {
- for (auto ita = a.begin(), itb = b.begin(); ita != a.end() && itb != b.end(); ++ita, ++itb) {
- if (*ita != *itb)
- return comparator_(*ita, *itb);
- }
- if (a.size() < b.size()) {
- return true;
- }
- return false;
- }
-
-};
-
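For reference, ContainerComparator above orders two containers by their first differing element and breaks ties by size. Below is a minimal standalone sketch of the same idea, independent of the SPAdes headers; LexContainerComparator and the test data are illustrative names only, not part of the codebase.

    #include <cassert>
    #include <functional>
    #include <vector>

    // Lexicographic ordering of containers: the first differing element decides
    // (via Comparator); a strict prefix compares as smaller.
    template<class Container, class Comparator>
    class LexContainerComparator {
        Comparator comparator_;
    public:
        explicit LexContainerComparator(const Comparator &comparator = Comparator())
                : comparator_(comparator) {}

        bool operator()(const Container &a, const Container &b) const {
            auto ita = a.begin();
            auto itb = b.begin();
            for (; ita != a.end() && itb != b.end(); ++ita, ++itb) {
                if (*ita != *itb)
                    return comparator_(*ita, *itb);
            }
            return a.size() < b.size();
        }
    };

    int main() {
        LexContainerComparator<std::vector<int>, std::less<int>> cmp;
        std::vector<int> a{1, 2, 3}, b{1, 3}, c{1, 2};
        assert(cmp(a, b));   // 2 < 3 at the first differing position
        assert(cmp(c, a));   // strict prefix compares as smaller
        assert(!cmp(a, a));  // equal containers are not "less"
        return 0;
    }
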
diff --git a/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp b/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp
deleted file mode 100644
index f0b72a0..0000000
--- a/src/modules/assembly_graph/graph_support/basic_edge_conditions.hpp
+++ /dev/null
@@ -1,273 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/func.hpp"
-#include "math/pred.hpp"
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
-#include "assembly_graph/graph_core/directions.hpp"
-#include "assembly_graph/paths/path_finders.hpp"
-
-namespace omnigraph {
-
-using namespace func;
-
-template<class Graph>
-class EdgeCondition : public Predicate<typename Graph::EdgeId> {
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph &g_;
-protected:
-
- EdgeCondition(const Graph &g)
- : g_(g) {
- }
-
- const Graph &g() const {
- return g_;
- }
-
-};
-
-template<class Graph>
-class IsolatedEdgeCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- bool IsTerminalVertex(VertexId v) const {
- return this->g().IncomingEdgeCount(v) + this->g().OutgoingEdgeCount(v) == 1;
- }
-
-public:
- IsolatedEdgeCondition(const Graph &g) : base(g) {
- }
-
- bool Check(EdgeId e) const {
- return IsTerminalVertex(this->g().EdgeStart(e)) && IsTerminalVertex(this->g().EdgeEnd(e));
- }
-
-};
-
-template<class Graph>
-inline bool HasAlternatives(const Graph &g, typename Graph::EdgeId e) {
- return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1
- && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
-}
-
-
-template<class Graph>
-class AlternativesPresenceCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
-public:
-
- AlternativesPresenceCondition(const Graph &g)
- : base(g) {
-
- }
-
- bool Check(EdgeId e) const {
- return HasAlternatives(this->g(), e);
- }
-
-};
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> AddAlternativesPresenceCondition(const Graph &g,
- pred::TypedPredicate<typename Graph::EdgeId> condition) {
- return pred::And(AlternativesPresenceCondition<Graph>(g), condition);
-}
-
-
-template<class Graph>
-class CoverageUpperBound : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef EdgeCondition<Graph> base;
- const double max_coverage_;
-
-public:
-
- CoverageUpperBound(const Graph &g, double max_coverage)
- : base(g),
- max_coverage_(max_coverage) {
- }
-
- bool Check(EdgeId e) const {
- return math::le(this->g().coverage(e), max_coverage_);
- }
-
-};
-
-template<class Graph>
-class LengthUpperBound : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef EdgeCondition<Graph> base;
-
- const size_t max_length_;
-
-public:
-
- LengthUpperBound(const Graph &g, size_t max_length)
- : base(g),
- max_length_(max_length) {
- }
-
- bool Check(EdgeId e) const {
- return this->g().length(e) <= max_length_;
- }
-
-};
-
-template<class Graph, class PathFinder>
-class PathLengthLowerBound : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- PathFinder path_finder_;
- size_t min_length_;
-
- ForwardDirection<Graph> forward_;
- BackwardDirection<Graph> backward_;
-
- size_t CumulativePathLength(EdgeId e, const AbstractDirection<Graph> &direction) const {
- return CumulativeLength(this->g(), path_finder_(e, direction));
- }
-
-public:
- PathLengthLowerBound(const Graph &g, const PathFinder &path_finder,
- size_t min_length)
- : base(g),
- path_finder_(path_finder),
- min_length_(min_length),
- forward_(g),
- backward_(g) {
-
- }
-
- bool Check(EdgeId e) const {
- size_t forward = CumulativePathLength(e, forward_);
- size_t backward = CumulativePathLength(e, backward_);
- //check that the path was trivial in one of the two directions
- VERIFY(forward == this->g().length(e) || backward == this->g().length(e));
- return std::max(forward, backward) >= min_length_;
- }
-};
-
-template<class Graph, class PathFinder>
-PathLengthLowerBound<Graph, PathFinder>
-MakePathLengthLowerBound(const Graph &g, const PathFinder &path_finder, size_t min_length) {
- return PathLengthLowerBound<Graph, PathFinder>(g, path_finder, min_length);
-}
-
-template<class Graph>
-class UniquenessPlausabilityCondition : public EdgeCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef EdgeCondition<Graph> base;
-
- virtual bool CheckUniqueness(EdgeId e, bool forward) const = 0;
-
- virtual bool CheckPlausibility(EdgeId e, bool forward) const = 0;
-
- bool SingleUnique(const vector<EdgeId> &edges, bool forward) const {
- return edges.size() == 1 && CheckUniqueness(*edges.begin(), forward);
- }
-
- bool ExistPlausible(EdgeId init_e, const vector<EdgeId> &edges,
- bool forward) const {
- for (EdgeId e : edges) {
- if (e == init_e)
- continue;
- if (CheckPlausibility(e, forward)) {
- return true;
- }
- }
- return false;
- }
-
- bool Check(EdgeId e, const AbstractDirection<Graph> &direction) const {
- return SingleUnique(direction.IncomingEdges(direction.EdgeStart(e)),
- !direction.IsForward())
- && ExistPlausible(
- e, direction.OutgoingEdges(direction.EdgeStart(e)),
- direction.IsForward());
- }
-
-public:
-
- UniquenessPlausabilityCondition(const Graph &g)
- : base(g) {
-
- }
-
- bool Check(EdgeId e) const {
- return Check(e, ForwardDirection<Graph>(this->g()))
- || Check(e, BackwardDirection<Graph>(this->g()));
- }
-
-};
-
-template<class Graph>
-class PredicateUniquenessPlausabilityCondition :
- public UniquenessPlausabilityCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pred::TypedPredicate<EdgeId> EdgePredicate;
- typedef UniquenessPlausabilityCondition<Graph> base;
-
- EdgePredicate uniqueness_condition_;
- EdgePredicate plausiblity_condition_;
-
- bool CheckUniqueness(EdgeId e, bool) const {
- return uniqueness_condition_(e);
- }
-
- bool CheckPlausibility(EdgeId e, bool) const {
- return plausiblity_condition_(e);
- }
-
-public:
-
- PredicateUniquenessPlausabilityCondition(
- const Graph &g, EdgePredicate uniqueness_condition,
- EdgePredicate plausiblity_condition)
- : base(g),
- uniqueness_condition_(uniqueness_condition),
- plausiblity_condition_(plausiblity_condition) {
- }
-
-};
-
-template<class Graph>
-class DefaultUniquenessPlausabilityCondition :
- public PredicateUniquenessPlausabilityCondition<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pred::TypedPredicate<EdgeId> EdgePredicate;
- typedef PredicateUniquenessPlausabilityCondition<Graph> base;
-
-public:
-
- DefaultUniquenessPlausabilityCondition(const Graph &g,
- size_t uniqueness_length,
- size_t plausibility_length)
- : base(g,
- MakePathLengthLowerBound(g,
- UniquePathFinder<Graph>(g), uniqueness_length),
- MakePathLengthLowerBound(g,
- PlausiblePathFinder<Graph>(g, 2 * plausibility_length),
- plausibility_length)) {
- }
-
-};
-
-}
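The edge conditions above are designed to be composed: AddAlternativesPresenceCondition, for example, wraps an arbitrary condition in a conjunction with AlternativesPresenceCondition via pred::And. A self-contained sketch of that composition pattern follows, with std::function standing in for pred::TypedPredicate and a toy edge record standing in for Graph::EdgeId; the names used here are illustrative, not SPAdes APIs.

    #include <cassert>
    #include <functional>

    // Stand-in for pred::TypedPredicate<EdgeId>.
    template<class T>
    using TypedPredicate = std::function<bool(T)>;

    // AND-composition of two predicates, in the spirit of pred::And.
    template<class T>
    TypedPredicate<T> And(TypedPredicate<T> p1, TypedPredicate<T> p2) {
        return [p1, p2](T t) { return p1(t) && p2(t); };
    }

    // Toy edge record replacing Graph::EdgeId plus graph lookups.
    struct ToyEdge {
        size_t length;
        double coverage;
        bool has_alternatives;
    };

    int main() {
        TypedPredicate<ToyEdge> length_bound =
            [](ToyEdge e) { return e.length <= 100; };
        TypedPredicate<ToyEdge> coverage_bound =
            [](ToyEdge e) { return e.coverage <= 10.0; };
        TypedPredicate<ToyEdge> alternatives =
            [](ToyEdge e) { return e.has_alternatives; };

        // Mirrors AddAlternativesPresenceCondition(g, condition): an edge is a
        // removal candidate only if it is short, weak and bypassable.
        TypedPredicate<ToyEdge> removal_condition =
            And<ToyEdge>(alternatives, And<ToyEdge>(length_bound, coverage_bound));

        assert(removal_condition({50, 5.0, true}));
        assert(!removal_condition({50, 5.0, false}));  // no alternative path
        assert(!removal_condition({500, 5.0, true}));  // too long
        return 0;
    }
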
diff --git a/src/modules/assembly_graph/graph_support/basic_vertex_conditions.hpp b/src/modules/assembly_graph/graph_support/basic_vertex_conditions.hpp
deleted file mode 100644
index 2d9e05e..0000000
--- a/src/modules/assembly_graph/graph_support/basic_vertex_conditions.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-#include "math/pred.hpp"
-#include "dev_support/func.hpp"
-
-namespace omnigraph {
-using func::Predicate;
-
-template<class Graph>
-class VertexCondition : public Predicate<typename Graph::VertexId> {
- typedef typename Graph::VertexId VertexId;
- const Graph &g_;
-protected:
-
- VertexCondition(const Graph &g)
- : g_(g) {
- }
-
- const Graph &g() const {
- return g_;
- }
-
-};
-
-template<class Graph>
-class CompressCondition : public VertexCondition<Graph> {
- typedef typename Graph::VertexId VertexId;
-
-public:
- CompressCondition(const Graph &g) :
- VertexCondition<Graph>(g) {
- }
-
- bool Check(VertexId v) const override {
- return this->g().CanCompressVertex(v);
- }
-};
-
-template<class Graph>
-class IsolatedVertexCondition : public VertexCondition<Graph> {
- typedef typename Graph::VertexId VertexId;
-
-public:
- IsolatedVertexCondition(const Graph& g) :
- VertexCondition<Graph>(g) {
- }
-
- bool Check(VertexId v) const override {
- return this->g().IsDeadStart(v) && this->g().IsDeadEnd(v);
- }
-};
-
-}
\ No newline at end of file
diff --git a/src/modules/assembly_graph/graph_support/contig_output.hpp b/src/modules/assembly_graph/graph_support/contig_output.hpp
deleted file mode 100644
index 26e9dda..0000000
--- a/src/modules/assembly_graph/graph_support/contig_output.hpp
+++ /dev/null
@@ -1,425 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "assembly_graph/stats/picture_dump.hpp"
-#include <io/reads_io/osequencestream.hpp>
-#include "assembly_graph/components/connected_component.hpp"
-#include "assembly_graph/stats/statistics.hpp"
-#include "assembly_graph/paths/path_finders.hpp"
-#include "assembly_graph/paths/path_utils.hpp"
-
-namespace debruijn_graph {
-
-//This class corrects mismatches, masks repeat differences, or applies similar adjustments to the sequence of an edge
-template<class Graph>
-class ContigCorrector {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-protected:
- const Graph &graph() const {
- return graph_;
- }
-
-public:
- ContigCorrector(const Graph &graph) : graph_(graph) {
- }
-
- virtual string correct(EdgeId e) = 0;
-
- virtual ~ContigCorrector() {
- }
-};
-
-template<class Graph>
-class DefaultContigCorrector : public ContigCorrector<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-public:
- DefaultContigCorrector(const Graph &graph) : ContigCorrector<Graph>(graph) {
- }
-
- string correct(EdgeId e) {
- return this->graph().EdgeNucls(e).str();
- }
-};
-
-//This class uses corrected sequences to construct a contig (return it as is, find a unipath, or trim the contig)
-template<class Graph>
-class ContigConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- ContigCorrector<Graph> &corrector_;
-protected:
- string correct(EdgeId e) {
- return corrector_.correct(e);
- }
-
- const Graph &graph() const {
- return graph_;
- }
-
-public:
-
- ContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : graph_(graph), corrector_(corrector) {
- }
-
- virtual pair<string, double> construct(EdgeId e) = 0;
-
- virtual ~ContigConstructor(){
- }
-};
-
-template<class Graph>
-class DefaultContigConstructor : public ContigConstructor<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-public:
-
- DefaultContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
- }
-
- pair<string, double> construct(EdgeId e) {
- return make_pair(this->correct(e), this->graph().coverage(e));
- }
-};
-
-template<class Graph>
-vector<typename Graph::EdgeId> Unipath(const Graph& g, typename Graph::EdgeId e) {
- omnigraph::UniquePathFinder<Graph> unipath_finder(g);
- vector<typename Graph::EdgeId> answer = unipath_finder.UniquePathBackward(e);
- const vector<typename Graph::EdgeId>& forward = unipath_finder.UniquePathForward(e);
- for (size_t i = 1; i < forward.size(); ++i) {
- answer.push_back(forward[i]);
- }
- return answer;
-}
-
-template<class Graph>
-class UnipathConstructor : public ContigConstructor<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-
-
-
- string MergeOverlappingSequences(std::vector<string>& ss, size_t overlap) {
- if (ss.empty()) {
- return "";
- }
- stringstream result;
- result << ss.front().substr(0, overlap);
-// prev_end = ss.front().substr(0, overlap);
- for (auto it = ss.begin(); it != ss.end(); ++it) {
-// VERIFY(prev_end == it->substr(0, overlap));
- result << it->substr(overlap);
-// prev_end = it->substr(it->size() - overlap);
- }
- return result.str();
- }
-
-
- string MergeSequences(const Graph& g,
- const vector<typename Graph::EdgeId>& continuous_path) {
- vector<string> path_sequences;
- for (size_t i = 0; i < continuous_path.size(); ++i) {
- if(i > 0)
- VERIFY(
- g.EdgeEnd(continuous_path[i - 1])
- == g.EdgeStart(continuous_path[i]));
- path_sequences.push_back(this->correct(continuous_path[i]));
- }
- return MergeOverlappingSequences(path_sequences, g.k());
- }
-
-public:
-
- UnipathConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
- }
-
- pair<string, double> construct(EdgeId e) {
- vector<EdgeId> unipath = Unipath(this->graph(), e);
- return make_pair(MergeSequences(this->graph(), unipath), stats::AvgCoverage(this->graph(), unipath));
- }
-};
-
-template<class Graph>
-class CuttingContigConstructor : public ContigConstructor<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
-
- bool ShouldCut(VertexId v) const {
- const Graph &g = this->graph();
- vector<EdgeId> edges;
- push_back_all(edges, g.OutgoingEdges(v));
- if(edges.size() == 0)
- return false;
- for(size_t i = 1; i < edges.size(); i++) {
- if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
- return false;
- }
- edges.clear();
- push_back_all(edges, g.IncomingEdges(v));
- for(size_t i = 0; i < edges.size(); i++)
- for(size_t j = i + 1; j < edges.size(); j++) {
- if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
- return true;
- }
- return false;
- }
-
-public:
-
- CuttingContigConstructor(const Graph &graph, ContigCorrector<Graph> &corrector) : ContigConstructor<Graph>(graph, corrector) {
- }
-
- pair<string, double> construct(EdgeId e) {
- string result = this->correct(e);
- if(result.size() > this->graph().k() && ShouldCut(this->graph().EdgeEnd(e))) {
- result = result.substr(0, result.size() - this->graph().k());
- }
- if(result.size() > this->graph().k() && ShouldCut(this->graph().conjugate(this->graph().EdgeStart(e)))) {
- result = result.substr(this->graph().k(), result.size());
- }
- return make_pair(result, this->graph().coverage(e));
- }
-};
-
-struct ExtendedContigIdT {
- string full_id_;
- string short_id_;
-
- ExtendedContigIdT(): full_id_(""), short_id_("") {}
-
- ExtendedContigIdT(string full_id, string short_id): full_id_(full_id), short_id_(short_id) {}
-};
-
-template <class Graph>
-void MakeContigIdMap(const Graph& graph, map<EdgeId, ExtendedContigIdT>& ids, const ConnectedComponentCounter &cc_counter_, string prefix) {
- int counter = 0;
- for (auto it = graph.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- EdgeId e = *it;
- if (ids.count(e) == 0) {
- string id;
- if (cfg::get().pd) {
- size_t c_id = cc_counter_.GetComponent(e);
- id = io::MakeContigComponentId(++counter, graph.length(e) + graph.k(), graph.coverage(e), c_id, prefix);
- }
- else
- id = io::MakeContigId(++counter, graph.length(e) + graph.k(), graph.coverage(e), prefix);
- ids[e] = ExtendedContigIdT(id, ToString(counter) + "+");
- if (e != graph.conjugate(e))
- ids[graph.conjugate(e)] = ExtendedContigIdT(id + "'", ToString(counter) + "-");
- }
- }
-}
-
-template<class Graph>
-class ContigPrinter {
-private:
- const Graph &graph_;
- ContigConstructor<Graph> &constructor_;
- template<class sequence_stream>
- void ReportEdge(sequence_stream& oss
- , const pair<string, double> sequence_data) {
- oss << sequence_data.second;
- oss << sequence_data.first;
- }
-
- void ReportEdge(io::osequencestream_for_fastg& oss,
- const string& sequence,
- const string& id,
- const set<string>& nex_ids) {
- oss.set_header(id);
- oss << nex_ids;
- oss << sequence;
- }
-
-public:
- ContigPrinter(const Graph &graph, ContigConstructor<Graph> &constructor) : graph_(graph), constructor_(constructor) {
- }
-
- template<class sequence_stream>
- void PrintContigs(sequence_stream &os) {
- for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- ReportEdge<sequence_stream>(os, constructor_.construct(*it));
- }
- }
-
- template<class sequence_stream>
- void PrintContigsFASTG(sequence_stream &os, const ConnectedComponentCounter & cc_counter) {
- map<EdgeId, ExtendedContigIdT> ids;
- MakeContigIdMap(graph_, ids, cc_counter, "EDGE");
- for (auto it = graph_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- set<string> next;
- VertexId v = graph_.EdgeEnd(*it);
- auto edges = graph_.OutgoingEdges(v);
- for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) {
- next.insert(ids[*next_it].full_id_);
- }
- ReportEdge(os, constructor_.construct(*it).first, ids[*it].full_id_, next);
- if (*it != graph_.conjugate(*it))
- {
- set<string> next_conj;
- v = graph_.EdgeEnd(graph_.conjugate(*it));
- edges = graph_.OutgoingEdges(v);
- for (auto next_it = edges.begin(); next_it != edges.end(); ++next_it) {
- next_conj.insert(ids[*next_it].full_id_);
- }
- ReportEdge(os, constructor_.construct(graph_.conjugate(*it)).first, ids[graph_.conjugate(*it)].full_id_, next_conj);
- }
- }
- }
-};
-
-template<class Graph>
-bool PossibleECSimpleCheck(const Graph& g
- , typename Graph::EdgeId e) {
- return g.OutgoingEdgeCount(g.EdgeStart(e)) > 1 && g.IncomingEdgeCount(g.EdgeEnd(e)) > 1;
-}
-
-template<class Graph>
-void ReportEdge(io::osequencestream_cov& oss
- , const Graph& g
- , typename Graph::EdgeId e
- , bool output_unipath = false
- , size_t solid_edge_length_bound = 0) {
- typedef typename Graph::EdgeId EdgeId;
- if (!output_unipath || (PossibleECSimpleCheck(g, e) && g.length(e) <= solid_edge_length_bound)) {
- TRACE("Outputting edge " << g.str(e) << " as single edge");
- oss << g.coverage(e);
- oss << g.EdgeNucls(e);
- } else {
- TRACE("Outputting edge " << g.str(e) << " as part of unipath");
- vector<EdgeId> unipath = Unipath(g, e);
- TRACE("Unipath is " << g.str(unipath));
- oss << stats::AvgCoverage(g, unipath);
- TRACE("Merged sequence is of length " << MergeSequences(g, unipath).size());
- oss << MergeSequences(g, unipath);
- }
-}
-
-inline void OutputContigs(ConjugateDeBruijnGraph &g, const string &contigs_output_filename, bool output_unipath) {
- INFO("Outputting contigs to " << contigs_output_filename << ".fasta");
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
- io::osequencestream_cov oss(contigs_output_filename + ".fasta");
-
- if(!output_unipath) {
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
-
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
- } else {
- UnipathConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigs(oss);
- }
-
-// {
-// osequencestream_cov oss(contigs_output_filename);
-// set<ConjugateDeBruijnGraph::EdgeId> edges;
-// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
-// if (edges.count(*it) == 0) {
-// ReportEdge(oss, g, *it, output_unipath, solid_edge_length_bound + ".oppa.fasta");
-// edges.insert(g.conjugate(*it));
-// }
-// // oss << g.EdgeNucls(*it);
-// }
-// DEBUG("Contigs written");
-// }
-// if(!output_unipath) {
-// OutputContigs(g, contigs_output_filename + ".2.fasta", true, solid_edge_length_bound);
-// }
-}
-
-inline void OutputContigsToFASTG(ConjugateDeBruijnGraph& g,
- const string& contigs_output_filename, const ConnectedComponentCounter & cc_counter) {
-
- INFO("Outputting graph to " << contigs_output_filename << ".fastg");
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
- DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
- io::osequencestream_for_fastg ossfg(contigs_output_filename + ".fastg");
- ContigPrinter<ConjugateDeBruijnGraph>(g, constructor).PrintContigsFASTG(ossfg, cc_counter);
-}
-
-
-
-
-inline bool ShouldCut(ConjugateDeBruijnGraph& g, VertexId v) {
- vector<EdgeId> edges;
- push_back_all(edges, g.OutgoingEdges(v));
-
- if(edges.size() == 0)
- return false;
- for(size_t i = 1; i < edges.size(); i++) {
- if(g.EdgeNucls(edges[i])[g.k()] != g.EdgeNucls(edges[0])[g.k()])
- return false;
- }
- edges.clear();
- push_back_all(edges, g.IncomingEdges(v));
- for(size_t i = 0; i < edges.size(); i++)
- for(size_t j = i + 1; j < edges.size(); j++) {
- if(g.EdgeNucls(edges[i])[g.length(edges[i]) - 1] != g.EdgeNucls(edges[j])[g.length(edges[j]) - 1])
- return true;
- }
- return false;
-}
-
-inline void OutputCutContigs(ConjugateDeBruijnGraph& g,
- const string& contigs_output_filename,
- bool /*output_unipath*/ = false,
- size_t /*solid_edge_length_bound*/ = 0) {
- INFO("Outputting contigs to " << contigs_output_filename);
- DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(g);
- io::osequencestream_cov oss(contigs_output_filename);
- CuttingContigConstructor<ConjugateDeBruijnGraph> constructor(g, corrector);
-
-// osequencestream_cov oss(contigs_output_filename);
-// set<ConjugateDeBruijnGraph::EdgeId> edges;
-// for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
-// EdgeId e = *it;
-// cout << g.length(e) << endl;
-// if (edges.count(e) == 0) {
-// Sequence s = g.EdgeNucls(e);
-// cout << s.size() << endl;
-// cout << "oppa " << ShouldCut(g, g.EdgeEnd(e)) << endl;
-// if(s.size() > g.k() && ShouldCut(g, g.EdgeEnd(e))) {
-// s = s.Subseq(0, s.size() - g.k());
-// cout << s.size() << endl;
-// }
-// cout << "oppa1 " << ShouldCut(g, g.conjugate(g.EdgeStart(e))) << endl;
-// if(s.size() > g.k() && ShouldCut(g, g.conjugate(g.EdgeStart(e)))) {
-// s = s.Subseq(g.k(), s.size());
-// cout << s.size() << endl;
-// }
-// oss << g.coverage(e);
-// oss << s;
-// edges.insert(g.conjugate(*it));
-// }
-// // oss << g.EdgeNucls(*it);
-// }
-}
-
-inline void OutputSingleFileContigs(ConjugateDeBruijnGraph& g,
- const string& contigs_output_dir) {
- INFO("Outputting contigs to " << contigs_output_dir);
- int n = 0;
- make_dir(contigs_output_dir);
- char n_str[20];
- set<ConjugateDeBruijnGraph::EdgeId> edges;
- for (auto it = g.SmartEdgeBegin(); !it.IsEnd(); ++it) {
- if (edges.count(*it) == 0) {
- sprintf(n_str, "%d.fa", n);
- edges.insert(g.conjugate(*it));
- io::osequencestream oss(contigs_output_dir + n_str);
- oss << g.EdgeNucls(*it);
- n++;
- }
- }
- DEBUG("SingleFileContigs(Conjugate) written");
-}
-
-}
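UnipathConstructor above builds a contig by gluing consecutive edge sequences that overlap by exactly k characters (MergeOverlappingSequences/MergeSequences). A standalone sketch of that merge step follows, with plain std::string in place of edge sequences; the example assumes k = 3 and toy data.

    #include <cassert>
    #include <string>
    #include <vector>

    // Glues strings that pairwise overlap by exactly `overlap` characters:
    // keep the first `overlap` characters of the first string, then append
    // everything past the overlap from each string in turn.
    std::string MergeOverlappingSequences(const std::vector<std::string> &ss,
                                          size_t overlap) {
        if (ss.empty())
            return "";
        std::string result = ss.front().substr(0, overlap);
        for (const std::string &s : ss)
            result += s.substr(overlap);
        return result;
    }

    int main() {
        // Edges of a unipath for k = 3: each string starts with the last
        // three characters of its predecessor.
        std::vector<std::string> path = {"ACGTA", "GTACC", "ACCGT"};
        assert(MergeOverlappingSequences(path, 3) == "ACGTACCGT");
        return 0;
    }
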
diff --git a/src/modules/assembly_graph/graph_support/detail_coverage.hpp b/src/modules/assembly_graph/graph_support/detail_coverage.hpp
deleted file mode 100644
index a203d75..0000000
--- a/src/modules/assembly_graph/graph_support/detail_coverage.hpp
+++ /dev/null
@@ -1,258 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "data_structures/indices/perfect_hash_map.hpp"
-#include "assembly_graph/graph_core/coverage.hpp"
-#include "assembly_graph/graph_core/action_handlers.hpp"
-#include "dev_support/verify.hpp"
-#include <vector>
-#include <map>
-#include <set>
-#include <string>
-#include <iostream>
-#include <fstream>
-
-namespace debruijn_graph {
-
-template<class Graph>
-class FlankingCoverage : public omnigraph::GraphActionHandler<Graph>,
- public omnigraph::AbstractFlankingCoverage<Graph> {
- typedef omnigraph::GraphActionHandler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef pair<EdgeId, unsigned> Pos;
-
- Graph& g_;
- const size_t averaging_range_;
-
- void SetRawCoverage(EdgeId e, unsigned cov) {
- g_.data(e).set_flanking_coverage(cov);
- }
-
- unsigned RawCoverage(EdgeId e) const {
- return g_.data(e).flanking_coverage();
- }
-
- size_t EdgeAveragingRange(EdgeId e) const {
- return std::min(this->g().length(e), averaging_range_);
- }
-
- double AverageFlankingCoverage(EdgeId e) const {
- return double(RawCoverage(e)) / double(EdgeAveragingRange(e));
- }
-
- unsigned InterpolateCoverage(EdgeId e, size_t l) const {
- VERIFY(l <= averaging_range_);
- VERIFY(l < g_.length(e));
- return unsigned(math::round(AverageFlankingCoverage(e) * double(l)));
- }
-
- void SetCoverageSimilarToAverageFlanking(EdgeId target, EdgeId source) {
- SetRawCoverage(target, unsigned(math::round(AverageFlankingCoverage(source) * double(EdgeAveragingRange(target)))));
- }
-
- void SetCoverageSimilarToAverageGlobal(EdgeId target, EdgeId source) {
- SetRawCoverage(target, unsigned(math::round(g_.coverage(source) * double(EdgeAveragingRange(target)))));
- }
-
-public:
-
- //todo think about interactions with gap closer
- FlankingCoverage(Graph& g, size_t averaging_range)
- : base(g, "FlankingCoverage"), g_(g),
- averaging_range_(averaging_range) {
- }
-
- size_t averaging_range() const {
- return averaging_range_;
- }
-
- //todo currently kept for compatibility with old saves! remove later!
- template<class CoverageIndex>
- void Fill(const CoverageIndex& count_index) {
- TRACE("Filling flanking coverage from index");
-
- for (auto I = count_index.value_cbegin(), E = count_index.value_cend();
- I != E; ++I) {
- const auto& edge_info = *I;
- EdgeId e = edge_info.edge_id;
- unsigned offset = edge_info.offset;
- unsigned count = edge_info.count;
- VERIFY(offset != -1u);
- VERIFY(e.get() != NULL);
- if (offset < averaging_range_) {
- IncRawCoverage(e, count);
- }
- }
- }
-
- void IncRawCoverage(EdgeId e, unsigned count) {
- g_.data(e).inc_flanking_coverage(count);
- }
-
- double CoverageOfStart(EdgeId e) const {
- return AverageFlankingCoverage(e);
- }
-
- double CoverageOfEnd(EdgeId e) const {
- return CoverageOfStart(this->g().conjugate(e));
- }
-
- virtual void HandleAdd(EdgeId /*e*/) {
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
-// SetRawCoverage(new_edge, RawCoverage(old_edges.front()));
- size_t kpomers_left = averaging_range_;
- unsigned acc = 0;
- for (EdgeId e : old_edges) {
- if (kpomers_left >= g_.length(e)) {
- acc += RawCoverage(e);
- kpomers_left -= g_.length(e);
- } else {
- if (kpomers_left != 0)
- acc += InterpolateCoverage(e, kpomers_left);
- break;
- }
- }
- SetRawCoverage(new_edge, acc);
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- SetRawCoverage(new_edge, RawCoverage(edge1) + RawCoverage(edge2));
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
- EdgeId new_edge_2) {
- //todo maybe improve later
- SetCoverageSimilarToAverageFlanking(new_edge_1, old_edge);
- SetCoverageSimilarToAverageGlobal(new_edge_2, old_edge);
- if (old_edge == g_.conjugate(old_edge)) {
- SetCoverageSimilarToAverageGlobal(g_.conjugate(new_edge_1), old_edge);
- }
- }
-
- virtual void HandleDelete(EdgeId e) {
- SetRawCoverage(e, 0);
- }
-
- double LocalCoverage(EdgeId e, VertexId v) const {
- if (this->g().EdgeStart(e) == v) {
- return GetInCov(e);
- } else if (this->g().EdgeEnd(e) == v) {
- return GetOutCov(e);
- } else {
- VERIFY(false);
- return 0.0;
- }
- }
-
- //left for compatibility
- //todo rename
- double GetInCov(EdgeId e) const {
- return CoverageOfStart(e);
- }
-
- //todo rename
- double GetOutCov(EdgeId e) const {
- return CoverageOfEnd(e);
- }
-
- //////////////////////////
-
- void Save(EdgeId e, ostream& out) const {
- out << RawCoverage(e);
- }
-
- void Load(EdgeId e, istream& in) {
- unsigned cov;
- in >> cov;
- SetRawCoverage(e, cov);
- }
-
- /*
- * Is thread safe if different threads process different edges.
- */
- bool IsThreadSafe() const {
- return true;
- }
-
-private:
- DECL_LOGGER("FlankingCoverage")
- ;
-};
-
-template<class StoringType>
-struct SimultaneousCoverageCollector {
-};
-
-template<>
-struct SimultaneousCoverageCollector<SimpleStoring> {
- template<class SimultaneousCoverageFiller, class Info>
- static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
- filler.inc_coverage(edge_info);
- }
-};
-
-template<>
-struct SimultaneousCoverageCollector<InvertableStoring> {
- template<class SimultaneousCoverageFiller, class Info>
- static void CollectCoverage(SimultaneousCoverageFiller& filler, const Info &edge_info) {
- filler.inc_coverage(edge_info);
- filler.inc_coverage(edge_info.conjugate(filler.k()));
- }
-};
-
-template<class Graph, class CountIndex>
-class SimultaneousCoverageFiller {
- const Graph& g_;
- const CountIndex& count_index_;
- FlankingCoverage<Graph>& flanking_coverage_;
- omnigraph::CoverageIndex<Graph>& coverage_index_;
- typedef typename CountIndex::Value Value;
-public:
- SimultaneousCoverageFiller(const Graph& g, const CountIndex& count_index,
- FlankingCoverage<Graph>& flanking_coverage,
- omnigraph::CoverageIndex<Graph>& coverage_index) :
- g_(g),
- count_index_(count_index),
- flanking_coverage_(flanking_coverage),
- coverage_index_(coverage_index) {
- }
-
- size_t k() const {
- return count_index_.k();
- }
-
- void inc_coverage(const Value &edge_info) {
- coverage_index_.IncRawCoverage(edge_info.edge_id, edge_info.count);
- if (edge_info.offset < flanking_coverage_.averaging_range()) {
- flanking_coverage_.IncRawCoverage(edge_info.edge_id, edge_info.count);
- }
- }
-
- void Fill() {
- for (auto I = count_index_.value_cbegin(), E = count_index_.value_cend();
- I != E; ++I) {
- const auto& edge_info = *I;
- VERIFY(edge_info.valid());
- VERIFY(edge_info.edge_id.get() != NULL);
- SimultaneousCoverageCollector<typename CountIndex::storing_type>::CollectCoverage(*this, edge_info);
- }
- }
-};
-
-template<class Graph, class CountIndex>
-void FillCoverageAndFlanking(const CountIndex& count_index, Graph& g,
- FlankingCoverage<Graph>& flanking_coverage) {
- SimultaneousCoverageFiller<Graph, CountIndex> filler(g, count_index, flanking_coverage, g.coverage_index());
- filler.Fill();
-}
-
-}
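FlankingCoverage above stores, per edge, a raw k-mer count accumulated over the first min(length, averaging_range) positions and derives average and interpolated coverage from it; HandleMerge walks the merged edges until that range is exhausted, interpolating the last partially covered edge. A small standalone sketch of this arithmetic follows; ToyEdge, kAveragingRange and the numbers are illustrative.

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <vector>

    struct ToyEdge {
        size_t length;         // number of k-mer positions on the edge
        unsigned raw_flanking; // k-mer count accumulated over the flank
    };

    const size_t kAveragingRange = 50;

    size_t EdgeAveragingRange(const ToyEdge &e) {
        return std::min(e.length, kAveragingRange);
    }

    double AverageFlankingCoverage(const ToyEdge &e) {
        return double(e.raw_flanking) / double(EdgeAveragingRange(e));
    }

    // Estimated raw count over the first l positions of the edge.
    unsigned InterpolateCoverage(const ToyEdge &e, size_t l) {
        return unsigned(std::round(AverageFlankingCoverage(e) * double(l)));
    }

    // Raw flanking count of a merged edge: sum whole edges while the
    // averaging range lasts, interpolate the edge on which it runs out.
    unsigned MergedRawCoverage(const std::vector<ToyEdge> &old_edges) {
        size_t kmers_left = kAveragingRange;
        unsigned acc = 0;
        for (const ToyEdge &e : old_edges) {
            if (kmers_left >= e.length) {
                acc += e.raw_flanking;
                kmers_left -= e.length;
            } else {
                if (kmers_left != 0)
                    acc += InterpolateCoverage(e, kmers_left);
                break;
            }
        }
        return acc;
    }

    int main() {
        ToyEdge a{30, 600};  // 20x average flanking coverage over 30 positions
        ToyEdge b{100, 500}; // 10x average flanking coverage over 50 positions
        std::vector<ToyEdge> merged = {a, b};
        // 600 from edge a plus the 20 remaining positions of b at ~10x.
        assert(MergedRawCoverage(merged) == 800);
        return 0;
    }
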
diff --git a/src/modules/assembly_graph/graph_support/genomic_quality.hpp b/src/modules/assembly_graph/graph_support/genomic_quality.hpp
deleted file mode 100644
index ee9e75a..0000000
--- a/src/modules/assembly_graph/graph_support/genomic_quality.hpp
+++ /dev/null
@@ -1,554 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "visualization/visualization.hpp"
-#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-#include "assembly_graph/graph_core/action_handlers.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class EdgeQuality: public omnigraph::GraphLabeler<Graph>, public omnigraph::GraphActionHandler<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- map<EdgeId, size_t> quality_;
- size_t k_;
-
- template<class Index>
- void FillQuality(const Index &index
- , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
- if (genome.size() < k_)
- return;
- runtime_k::RtSeq cur = genome.start<runtime_k::RtSeq>(k_);
- cur >>= 0;
- for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
- cur <<= genome[i + k_ - 1];
- auto corr_cur = kmer_mapper.Substitute(cur);
- if (index.contains(corr_cur)) {
- quality_[index.get(corr_cur).first]++;
- }
- }
- }
-
-public:
-
- template<class Index>
- void Fill(const Index &index
- , const KmerMapper<Graph>& kmer_mapper
- , const Sequence &genome) {
- FillQuality(index, kmer_mapper, genome);
- FillQuality(index, kmer_mapper, !genome);
- }
-
- EdgeQuality(const Graph &graph) :
- omnigraph::GraphActionHandler<Graph>(graph, "EdgeQuality"),
- k_(graph.k() + 1) {
- }
-
- virtual ~EdgeQuality() {
- }
-
- virtual void HandleAdd(EdgeId /*e*/) {
- }
-
- virtual void HandleDelete(EdgeId e) {
- quality_.erase(e);
- }
-
- virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
- size_t res = 0;
- for (size_t i = 0; i < old_edges.size(); i++) {
- res += quality_[old_edges[i]];
- }
- quality_[new_edge] += res;
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- quality_[new_edge] += quality_[edge2];
- quality_[new_edge] += quality_[edge1];
- }
-
- virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
- EdgeId new_edge2) {
- if (old_edge == this->g().conjugate(old_edge)) {
- WARN("EdgeQuality does not support self-conjugate splits");
- return;
- }
- VERIFY(old_edge != this->g().conjugate(old_edge));
- quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
- / (this->g().length(new_edge1) + this->g().length(new_edge2));
- quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
- / (this->g().length(new_edge1) + this->g().length(new_edge2));
- }
-
- double quality(EdgeId edge) const {
- auto it = quality_.find(edge);
- if (it == quality_.end())
- return 0.;
- else
- return 1. * (double) it->second / (double) this->g().length(edge);
- }
-
- bool IsPositiveQuality(EdgeId edge) const {
- return math::gr(quality(edge), 0.);
- }
-
- bool IsZeroQuality(EdgeId edge) const {
- return math::eq(quality(edge), 0.);
- }
-
- virtual std::string label(VertexId /*vertexId*/) const {
- return "";
- }
-
- virtual std::string label(EdgeId edge) const {
- double q = quality(edge);
- return (q == 0) ? "" : "quality: " + ToString(q);
- }
-
- void clear() {
- quality_.clear();
- }
-
-};
-
-template<class Graph>
-class QualityLoggingRemovalHandler {
- typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
- const EdgeQuality<Graph>& quality_handler_;
- size_t black_removed_;
- size_t total_;
- bool handle_all_;
-
- virtual void HandlePositiveQuality(EdgeId /*e*/) {
-
- }
-
-public:
- QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph>& quality_handler,
- bool handle_all = false) :
- g_(g), quality_handler_(quality_handler), black_removed_(0), total_(0), handle_all_(handle_all) {
- }
-
- void HandleDelete(EdgeId e) {
- total_++;
- if (handle_all_ || math::gr(quality_handler_.quality(e), 0.)) {
- TRACE("Deleting good edge id = " << g_.int_id(e)
- << "; length = " << g_.length(e)
- << "; quality = " << quality_handler_.quality(e)
- << "; cov = " << g_.coverage(e));
- HandlePositiveQuality(e);
- } else {
- black_removed_++;
- }
- }
-
- const Graph& g() const {
- return g_;
- }
-
- const EdgeQuality<Graph>& quality_handler() const {
- return quality_handler_;
- }
-
- virtual ~QualityLoggingRemovalHandler() {
- TRACE("Overall stats: total removed = " << total_
- << "; bad removed = " << black_removed_
- << "; good removed = " << total_ - black_removed_);
- }
-
-private:
- DECL_LOGGER("QualityLoggingRemovalHandler");
-};
-
-template<class Graph>
-class QualityEdgeLocalityPrintingRH : public QualityLoggingRemovalHandler<Graph> {
- typedef QualityLoggingRemovalHandler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- omnigraph::visualization::LocalityPrintingRH<Graph> printing_rh_;
-public:
- QualityEdgeLocalityPrintingRH(const Graph& g
- , const EdgeQuality<Graph>& quality_handler
- , const omnigraph::GraphLabeler<Graph>& labeler
- , std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer
- , const string& output_folder, bool handle_all = false) :
- base(g, quality_handler, handle_all),
- printing_rh_(g, labeler, colorer, output_folder)
- {}
-
- virtual void HandlePositiveQuality(EdgeId e) {
- printing_rh_.HandleDelete(e, "_" + ToString(this->quality_handler().quality(e)));
- }
-
-private:
- DECL_LOGGER("QualityEdgeLocalityPrintingRH");
-};
-
-//earlier version from rel_cov branch
-//template<class Graph>
-//class EdgeNeighborhoodFinder: public omnigraph::GraphSplitter<Graph> {
-//private:
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// EdgeId edge_;
-// size_t max_size_;
-// size_t edge_length_bound_;
-// bool finished_;
-//public:
-// EdgeNeighborhoodFinder(const Graph &graph, EdgeId edge, size_t max_size
-// , size_t edge_length_bound) :
-// GraphSplitter<Graph>(graph), edge_(edge), max_size_(
-// max_size), edge_length_bound_(edge_length_bound), finished_(
-// false) {
-// }
-//
-// GraphComponent<Graph> NextComponent() {
-// CountingDijkstra<Graph> cf(this->graph(), max_size_,
-// edge_length_bound_);
-// set<VertexId> result_set;
-// cf.run(this->graph().EdgeStart(edge_));
-// vector<VertexId> result_start = cf.ReachedVertices();
-// result_set.insert(result_start.begin(), result_start.end());
-// cf.run(this->graph().EdgeEnd(edge_));
-// vector<VertexId> result_end = cf.ReachedVertices();
-// result_set.insert(result_end.begin(), result_end.end());
-//
-// ComponentCloser<Graph> cc(this->graph(), edge_length_bound_);
-// cc.CloseComponent(result_set);
-//
-// finished_ = true;
-// return GraphComponent<Graph>(this->graph(), result_set.begin(), result_set.end());
-// }
-//
-// /*virtual*/ bool Finished() {
-// return finished_;
-// }
-//};
-//
-//template<class Graph>
-//class EdgeLocalityPrintingRH {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// const Graph& g_;
-// const GraphLabeler<Graph>& labeler_;
-// const string& output_folder_;
-// std::function<double (EdgeId)>& quality_f_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// EdgeLocalityPrintingRH(const Graph& g
-// , const GraphLabeler<Graph>& labeler
-// , const string& output_folder
-// , std::function<double (EdgeId)> quality_f = 0) :
-// g_(g),
-// labeler_(labeler), output_folder_(output_folder),
-// quality_f_(quality_f){
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// TRACE("Deleting edge " << g_.str(edge));
-// if (quality_f_ && math::gr(quality_f_(edge), 0.))
-// INFO("EdgeLocalityPrintRH handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
-//
-// string folder = output_folder_ + "edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-// map<EdgeId, string> empty_coloring;
-// omnigraph::visualization::WriteComponent(g_, EdgeNeighborhood<Graph>(g_, edge, 50, 250),
-// folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
-// }
-//
-//private:
-// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
-// ;
-//};
-
-//template<class Graph, class Index>
-//class EdgeQuality: public GraphLabeler<Graph>, public GraphActionHandler<Graph> {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// map<EdgeId, size_t> quality_;
-// size_t k_;
-//
-//public:
-//
-// void FillQuality(const Index &index
-// , const KmerMapper<Graph>& kmer_mapper, const Sequence &genome) {
-// if (genome.size() < k_)
-// return;
-// runtime_k::RtSeq cur = genome.start<runtime_k::RtSeq>(k_);
-// cur >>= 0;
-// for (size_t i = 0; i + k_ - 1 < genome.size(); i++) {
-// cur <<= genome[i + k_ - 1];
-// auto corr_cur = kmer_mapper.Substitute(cur);
-// if (index.contains(corr_cur)) {
-// quality_[index.get(corr_cur).first]++;
-// }
-// }
-// }
-//
-// EdgeQuality(const Graph &graph, const Index &index,
-// const KmerMapper<Graph>& kmer_mapper,
-// const Sequence &genome) :
-//
-// GraphActionHandler<Graph>(graph, "EdgeQualityLabeler"),
-// k_(kmer_mapper.get_k()) {
-// FillQuality(index, kmer_mapper, genome);
-// FillQuality(index, kmer_mapper, !genome);
-// }
-//
-// virtual ~EdgeQuality() {
-// }
-//
-// virtual void HandleAdd(EdgeId /*e*/) {
-// }
-//
-// virtual void HandleDelete(EdgeId e) {
-// quality_.erase(e);
-// }
-//
-// virtual void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) {
-// size_t res = 0;
-// for (size_t i = 0; i < old_edges.size(); i++) {
-// res += quality_[old_edges[i]];
-// }
-// quality_[new_edge] += res;
-// }
-//
-// virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
-// quality_[new_edge] += quality_[edge2];
-// quality_[new_edge] += quality_[edge1];
-// }
-//
-// virtual void HandleSplit(EdgeId old_edge, EdgeId new_edge1,
-// EdgeId new_edge2) {
-// quality_[new_edge1] = quality_[old_edge] * this->g().length(new_edge1)
-// / (this->g().length(new_edge1) + this->g().length(new_edge2));
-// quality_[new_edge2] = quality_[old_edge] * this->g().length(new_edge2)
-// / (this->g().length(new_edge1) + this->g().length(new_edge2));
-// }
-//
-// double quality(EdgeId edge) const {
-// auto it = quality_.find(edge);
-// if (it == quality_.end())
-// return 0.;
-// else
-// return 1. * (double) it->second / (double) this->g().length(edge);
-// }
-//
-// bool IsPositiveQuality(EdgeId edge) const {
-// return math::gr(quality(edge), 0.);
-// }
-//
-// virtual std::string label(VertexId /*vertexId*/) const {
-// return "";
-// }
-//
-// virtual std::string label(EdgeId edge) const {
-// double q = quality(edge);
-// return (q == 0) ? "" : "quality: " + ToString(q);
-// }
-//
-//};
-//
-//template<class Graph, class Index>
-//class QualityLoggingRemovalHandler {
-// typedef typename Graph::EdgeId EdgeId;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// QualityLoggingRemovalHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
-// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
-// 0)*/{
-//
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// if (math::gr(quality_handler_.quality(edge), 0.)) {
-// TRACE("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
-// } else {
-//// TRACE("Deleting edge " << g_.int_id(edge) << " with zero quality");
-// }
-//// if (math::gr(quality_handler_.quality(edge), 0.))
-//// colored_removed_++;
-//// else
-//// black_removed_++;
-// }
-//
-//private:
-// DECL_LOGGER("QualityLoggingRemovalHandler")
-// ;
-//};
-//
-//template<class Graph, class Index>
-//class QualityLoggingRemovalCountHandler {
-// typedef typename Graph::EdgeId EdgeId;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-// size_t black_removed_;
-// size_t total;
-//
-//public:
-// QualityLoggingRemovalCountHandler(const Graph& g, const EdgeQuality<Graph, Index>& quality_handler) :
-// g_(g), quality_handler_(quality_handler)/*, black_removed_(0), colored_removed_(
-// 0)*/{
-// black_removed_ = 0;
-// total = 0;
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// total++;
-// if (math::gr(quality_handler_.quality(edge), 0.)) {
-// TRACE("Deleting good edge " << g_.int_id(edge) << " with quality " << quality_handler_.quality(edge) << " cov " << g_.coverage(edge) << " length " << g_.length(edge));
-// }else{
-// black_removed_++;
-// }
-// if ((total % (1<<10)) != 0)
-// TRACE("Removed still " << black_removed_ << " " << total);
-// }
-//
-//private:
-//};
-//
-//template<class Graph, class Index>
-//class QualityEdgeLocalityPrintingRH {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-// const omnigraph::GraphLabeler<Graph>& labeler_;
-// const omnigraph::visualization::GraphColorer<Graph>& colorer_;
-// const string& output_folder_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// QualityEdgeLocalityPrintingRH(const Graph& g
-// , const EdgeQuality<Graph, Index>& quality_handler
-// , const omnigraph::GraphLabeler<Graph>& labeler
-// , const omnigraph::visualization::GraphColorer<Graph>& colorer
-// , const string& output_folder) :
-// g_(g), quality_handler_(quality_handler),
-// labeler_(labeler), colorer_(colorer), output_folder_(output_folder){
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// if (quality_handler_.IsPositiveQuality(edge)) {
-// DEBUG("Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge));
-// string folder = output_folder_ + "colored_edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-//// map<EdgeId, string> empty_coloring;
-// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
-// omnigraph::visualization::WriteComponents(g_, *splitter/*, "locality_of_edge_" + ToString(g_.int_id(edge))*/
-// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
-// , colorer_, labeler_);
-// } else {
-// TRACE("Deleting edge " << g_.str(edge) << " with zero quality");
-// }
-// }
-//
-//private:
-// DECL_LOGGER("QualityEdgeLocalityPrintingRH")
-// ;
-//};
-//
-//template<class Graph, class Index>
-//class QualityPairInfoHandler {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// typedef omnigraph::PairInfo<EdgeId> PairInfo;
-// typedef vector<PairInfo> PairInfos;
-// const Graph& g_;
-// const EdgeQuality<Graph, Index>& quality_handler_;
-// const GraphLabeler<Graph>& labeler_;
-// const string& output_folder_;
-// const PairedInfoIndex<ConjugateDeBruijnGraph>& index_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// QualityPairInfoHandler(const Graph& g
-// , const EdgeQuality<Graph, Index>& quality_handler
-// , const GraphLabeler<Graph>& labeler
-// , const string& output_folder
-// , const PairedInfoIndex<ConjugateDeBruijnGraph>& index) :
-// g_(g), quality_handler_(quality_handler),
-// labeler_(labeler), output_folder_(output_folder), index_(index) {
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// if (quality_handler_.IsPositiveQuality(edge)) {
-// cout << "Deleting edge " << g_.str(edge) << " with quality " << quality_handler_.quality(edge) << endl;
-// string folder = output_folder_ + "colored_edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-// PairInfos infos = index_.GetEdgeInfo(edge);
-// if (infos.size() > 0){
-// for (size_t i = 0; i<infos.size(); i++){
-// cout << "Tip Info " << g_.int_id(infos[i].first) << " " << g_.int_id(infos[i].second) << " " << infos[i].d << " " << infos[i].weight << " " << infos[i].variance << endl;
-// }
-// }
-// map<EdgeId, string> empty_coloring;
-// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50,
-// 250);
-//
-// omnigraph::visualization::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
-// , folder + "edge_" + ToString(g_.int_id(edge)) + "_" + ToString(quality_handler_.quality(edge)) + ".dot"
-// , empty_coloring, labeler_);
-// }
-// }
-//
-//private:
-//};
-//
-////todo what is the difference with QELPRH?!
-//template<class Graph>
-//class EdgeLocalityPrintingRH {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// const Graph& g_;
-// const GraphLabeler<Graph>& labeler_;
-// const string& output_folder_;
-// std::function<double (EdgeId)>& quality_f_;
-//// size_t black_removed_;
-//// size_t colored_removed_;
-//public:
-// EdgeLocalityPrintingRH(const Graph& g
-// , const GraphLabeler<Graph>& labeler
-// , const string& output_folder
-// , std::function<double (EdgeId)> quality_f = 0) :
-// g_(g),
-// labeler_(labeler), output_folder_(output_folder),
-// quality_f_(quality_f){
-// }
-//
-// void HandleDelete(EdgeId edge) {
-// TRACE("Deleting edge " << g_.str(edge));
-// if (quality_f_ && math::gr(quality_f_(edge), 0.))
-// INFO("Handling the edge with positive quality : " << quality_f_(edge) << " " << g_.str(edge));
-//
-// string folder = output_folder_ + "edges_deleted/";
-// path::make_dir(folder);
-// //todo magic constant
-// map<EdgeId, string> empty_coloring;
-// shared_ptr<GraphSplitter<Graph>> splitter = EdgeNeighborhoodFinder<Graph>(g_, edge, 50, 250);
-// omnigraph::visualization::WriteComponents(g_, *splitter, TrueFilter<vector<VertexId>>(), "locality_of_edge_" + ToString(g_.int_id(edge))
-// , folder + "edge_" + ToString(g_.int_id(edge)) + ".dot", empty_coloring, labeler_);
-// }
-//
-//private:
-// DECL_LOGGER("EdgeLocalityPrintingRH")
-// ;
-//};
-
-}
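EdgeQuality above counts how many genomic (k+1)-mers land on each edge and reports quality as that count divided by the edge length; on a split, the count is redistributed proportionally to the new edge lengths. A compact standalone sketch of this bookkeeping follows, with plain integer edge ids; ToyEdgeQuality is an illustrative name, not a SPAdes type.

    #include <cassert>
    #include <cmath>
    #include <map>

    struct ToyEdgeQuality {
        std::map<int, size_t> hits;   // genomic k-mer hits per edge id
        std::map<int, size_t> length; // edge length per edge id

        double quality(int e) const {
            auto it = hits.find(e);
            if (it == hits.end())
                return 0.0;
            return double(it->second) / double(length.at(e));
        }

        // Mirrors EdgeQuality::HandleSplit: distribute the old count between
        // the two new edges proportionally to their lengths.
        void HandleSplit(int old_e, int new_e1, int new_e2,
                         size_t len1, size_t len2) {
            length[new_e1] = len1;
            length[new_e2] = len2;
            size_t old_hits = hits[old_e];
            hits[new_e1] = old_hits * len1 / (len1 + len2);
            hits[new_e2] = old_hits * len2 / (len1 + len2);
        }
    };

    int main() {
        ToyEdgeQuality q;
        q.length[1] = 100;
        q.hits[1] = 100; // every position hit by a genomic k-mer
        assert(std::fabs(q.quality(1) - 1.0) < 1e-9);

        q.HandleSplit(1, 2, 3, 60, 40);
        assert(q.hits[2] == 60 && q.hits[3] == 40);
        assert(std::fabs(q.quality(2) - 1.0) < 1e-9);
        return 0;
    }
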
diff --git a/src/modules/assembly_graph/graph_support/graph_processing_algorithm.hpp b/src/modules/assembly_graph/graph_support/graph_processing_algorithm.hpp
deleted file mode 100644
index cce6c20..0000000
--- a/src/modules/assembly_graph/graph_support/graph_processing_algorithm.hpp
+++ /dev/null
@@ -1,262 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/func.hpp"
-#include <boost/none.hpp>
-#include <atomic>
-#include "assembly_graph/graph_core/graph_iterators.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-#include "math/pred.hpp"
-#include "dev_support/logger/logger.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-using HandlerF = std::function<void(typename Graph::EdgeId)>;
-
-template<class Graph>
-class EdgeProcessingAlgorithm {
- typedef typename Graph::EdgeId EdgeId;
- typedef pred::TypedPredicate<EdgeId> ProceedConditionT;
-
- Graph& g_;
- bool conjugate_symmetry_;
- protected:
-
- Graph& g() {
- return g_;
- }
-
- const Graph& g() const {
- return g_;
- }
-
- virtual bool ProcessEdge(EdgeId e) = 0;
-
- public:
- EdgeProcessingAlgorithm(Graph& g,
- bool conjugate_symmetry = false)
- : g_(g), conjugate_symmetry_(conjugate_symmetry) {
-
- }
-
- virtual ~EdgeProcessingAlgorithm() {
- }
-
-// bool conjugate_symmetry() const {
-// return conjugate_symmetry_;
-// }
-
- template<class Comparator = std::less<EdgeId>>
- bool Run(const Comparator& comp = Comparator(), ProceedConditionT proceed_condition = pred::AlwaysTrue<EdgeId>()) {
- bool triggered = false;
- for (auto it = g_.SmartEdgeBegin(comp, conjugate_symmetry_); !it.IsEnd(); ++it) {
- EdgeId e = *it;
- TRACE("Current edge " << g_.str(e));
- if (!proceed_condition(e)) {
- TRACE("Stop condition was reached.");
- break;
- }
-
- TRACE("Processing edge " << this->g().str(e));
- triggered |= ProcessEdge(e);
- };
- return triggered;
- }
-
- private:
- DECL_LOGGER("EdgeProcessingAlgorithm");
-};
-
-template<class Graph>
-class CountingCallback {
- typedef typename Graph::EdgeId EdgeId;
- bool report_on_destruction_;
- std::atomic<size_t> cnt_;
-
-public:
- CountingCallback(bool report_on_destruction = false) :
- report_on_destruction_(report_on_destruction), cnt_(0) {
- }
-
- ~CountingCallback() {
- if (report_on_destruction_)
- Report();
- }
-
- void HandleDelete(EdgeId /*e*/) {
- cnt_++;
- }
-
- void Report() {
- TRACE(cnt_ << " edges were removed.")
- cnt_ = 0;
- }
-
-private:
- DECL_LOGGER("CountingCallback");
-};
-
-template<class Graph>
-std::function<void(typename Graph::EdgeId)> AddCountingCallback(CountingCallback<Graph>& cnt_callback, std::function<void(typename Graph::EdgeId)> handler) {
- std::function<void(typename Graph::EdgeId)> cnt_handler = std::bind(&CountingCallback<Graph>::HandleDelete, std::ref(cnt_callback), std::placeholders::_1);
- return func::Composition<typename Graph::EdgeId>(handler, cnt_handler);
-}
-template<class Graph>
-void RemoveIsolatedOrCompress(Graph& g, typename Graph::VertexId v) {
- if (g.IsDeadStart(v) && g.IsDeadEnd(v)) {
- g.DeleteVertex(v);
- } else {
- g.CompressVertex(v);
- }
-}
-
-template<class Graph>
-class EdgeRemover {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(EdgeId)> HandlerF;
-
- Graph& g_;
- HandlerF removal_handler_;
-
- public:
- EdgeRemover(Graph& g, HandlerF removal_handler = nullptr)
- : g_(g),
- removal_handler_(removal_handler) {
- }
-
- void DeleteEdge(EdgeId e) {
- VertexId start = g_.EdgeStart(e);
- VertexId end = g_.EdgeEnd(e);
- DeleteEdgeWithNoCompression(e);
- // NOTE: e here is already dead!
- TRACE("Compressing locality");
- if (!g_.RelatedVertices(start, end)) {
- TRACE("Vertices not related");
- TRACE("Processing end");
- RemoveIsolatedOrCompress(g_, end);
- TRACE("End processed");
- }
- TRACE("Processing start");
- RemoveIsolatedOrCompress(g_, start);
- TRACE("Start processed");
- }
-
- void DeleteEdgeWithNoCompression(EdgeId e) {
- TRACE("Deletion of edge " << g_.str(e));
- TRACE("Start " << g_.str(g_.EdgeStart(e)));
- TRACE("End " << g_.str(g_.EdgeEnd(e)));
- if (removal_handler_) {
- TRACE("Calling handler");
- removal_handler_(e);
- }
- TRACE("Deleting edge");
- g_.DeleteEdge(e);
- }
-
- private:
- DECL_LOGGER("EdgeRemover");
-};
-
-template<class Graph>
-class EdgeRemovingAlgorithm : public EdgeProcessingAlgorithm<Graph> {
- typedef EdgeProcessingAlgorithm<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
-
- pred::TypedPredicate<EdgeId> remove_condition_;
- EdgeRemover<Graph> edge_remover_;
-
- protected:
- bool ProcessEdge(EdgeId e) {
- TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
- if (remove_condition_(e)) {
- TRACE("Check passed, removing");
- edge_remover_.DeleteEdge(e);
- return true;
- }
- TRACE("Check not passed");
- return false;
- }
-
- public:
- EdgeRemovingAlgorithm(Graph& g,
- pred::TypedPredicate<EdgeId> remove_condition,
-                          std::function<void (EdgeId)> removal_handler = nullptr,
- bool conjugate_symmetry = false)
- : base(g, conjugate_symmetry),
- remove_condition_(remove_condition),
- edge_remover_(g, removal_handler) {}
-
- private:
- DECL_LOGGER("EdgeRemovingAlgorithm");
-};
-
-//todo rewrite with SmartSetIterator
-template<class Graph>
-class ComponentRemover {
- public:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::function<void(const std::set<EdgeId>&)> HandlerF;
-
- private:
- Graph& g_;
- HandlerF removal_handler_;
-
- template<class ElemType>
- void InsertIfNotConjugate(std::set<ElemType>& elems, ElemType elem) {
- if (elems.count(g_.conjugate(elem)) == 0) {
- elems.insert(elem);
- }
- }
-
- public:
- ComponentRemover(Graph& g, HandlerF removal_handler = 0)
- : g_(g),
- removal_handler_(removal_handler) {
- }
-
- template<class EdgeIt>
- void DeleteComponent(EdgeIt begin, EdgeIt end, bool alter_vertices = true) {
- using std::set;
- set<EdgeId> edges;
- set<VertexId> vertices;
-
- //cleaning conjugates and gathering vertices
- for (EdgeIt it = begin; it != end; ++it) {
- EdgeId e = *it;
- InsertIfNotConjugate(edges, e);
- InsertIfNotConjugate(vertices, g_.EdgeStart(e));
- InsertIfNotConjugate(vertices, g_.EdgeEnd(e));
- }
-
- if (removal_handler_) {
- removal_handler_(edges);
- }
-
- for (EdgeId e: edges) {
- g_.DeleteEdge(e);
- }
-
- if (alter_vertices) {
- for (VertexId v: vertices) {
- RemoveIsolatedOrCompress(g_, v);
- }
- }
- }
-
- template<class Container>
- void DeleteComponent(const Container& container, bool alter_vertices = true) {
- DeleteComponent(container.begin(), container.end(), alter_vertices);
- }
-
-};
-
-}
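
The file removed above combines three small pieces: a predicate that decides which edges should go, an EdgeRemover that deletes an edge and then compresses or removes its endpoint vertices, and an optional removal handler (such as CountingCallback) invoked on every deletion. Below is a minimal self-contained sketch of that predicate-plus-handler pattern; Edge and ToyGraph are hypothetical stand-ins, not the SPAdes graph API.

// Toy illustration of the predicate + removal-handler pattern from the
// removed graph_processing_algorithm.hpp; Edge and ToyGraph are hypothetical
// stand-ins, not SPAdes types.
#include <cstddef>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

struct Edge { int id; std::size_t length; };

class ToyGraph {
    std::vector<Edge> edges_;
public:
    explicit ToyGraph(std::vector<Edge> edges) : edges_(std::move(edges)) {}
    const std::vector<Edge>& edges() const { return edges_; }
    void DeleteEdge(int id) {
        for (auto it = edges_.begin(); it != edges_.end(); ++it)
            if (it->id == id) { edges_.erase(it); return; }
    }
};

// Analogue of EdgeRemovingAlgorithm::Run: test the predicate on every edge,
// call the handler before each deletion, report whether anything was removed.
bool RemoveEdges(ToyGraph& g,
                 const std::function<bool(const Edge&)>& remove_condition,
                 const std::function<void(const Edge&)>& removal_handler) {
    bool triggered = false;
    std::vector<Edge> snapshot = g.edges();   // iterate over a copy
    for (const Edge& e : snapshot) {
        if (!remove_condition(e)) continue;
        if (removal_handler) removal_handler(e);
        g.DeleteEdge(e.id);
        triggered = true;
    }
    return triggered;
}

int main() {
    ToyGraph g({{1, 40}, {2, 500}, {3, 12}});
    std::size_t removed = 0;                  // analogue of CountingCallback
    bool any = RemoveEdges(g,
                           [](const Edge& e) { return e.length < 100; },
                           [&removed](const Edge&) { ++removed; });
    std::cout << removed << " edges removed, triggered = " << any << "\n";
}

The real classes differ mainly in that they walk smart edge iterators and can respect conjugate symmetry; the split between condition, remover and handler is the same.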
diff --git a/src/modules/assembly_graph/graph_support/parallel_processing.hpp b/src/modules/assembly_graph/graph_support/parallel_processing.hpp
deleted file mode 100644
index 9b5084b..0000000
--- a/src/modules/assembly_graph/graph_support/parallel_processing.hpp
+++ /dev/null
@@ -1,290 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/logger/logger.hpp"
-#include "assembly_graph/graph_core/graph_iterators.hpp"
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-#include "dev_support/openmp_wrapper.h"
-
-namespace omnigraph {
-
-template<class ItVec, class SmartIt, class Predicate>
-void FillInterestingFromChunkIterators(const ItVec& chunk_iterators,
- SmartIt& smart_it,
- const Predicate& predicate) {
- VERIFY(chunk_iterators.size() > 1);
- typedef typename Predicate::checked_type ElementType;
- std::vector<std::vector<ElementType>> of_interest(omp_get_max_threads());
-
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < chunk_iterators.size() - 1; ++i) {
- for (auto it = chunk_iterators[i], end = chunk_iterators[i + 1]; it != end; ++it) {
- ElementType t = *it;
- if (predicate(t)) {
- of_interest[omp_get_thread_num()].push_back(t);
- }
- }
- }
-
- for (auto& chunk : of_interest) {
- smart_it.insert(chunk.begin(), chunk.end());
- chunk.clear();
- }
-}
-
-template<class Graph, class ElementId = typename Graph::EdgeId>
-class TrivialInterestingElementFinder {
-public:
-
- TrivialInterestingElementFinder() {
- }
-
- template<class SmartIt>
- bool Run(SmartIt& /*it*/) const {
- return false;
- }
-};
-
-template<class Graph, class ElementId = typename Graph::EdgeId>
-class SimpleInterestingElementFinder {
- typedef GraphEdgeIterator<Graph> EdgeIt;
-
- const Graph& g_;
- pred::TypedPredicate<ElementId> condition_;
-public:
-
- SimpleInterestingElementFinder(const Graph& g,
- pred::TypedPredicate<ElementId> condition = pred::AlwaysTrue<ElementId>())
- : g_(g), condition_(condition) {}
-
- template<class SmartIt>
- bool Run(SmartIt& interest) const {
- for (EdgeIt it = EdgeIt(g_, g_.begin()), end = EdgeIt(g_, g_.end()); it != end; ++it) {
- if (condition_(*it)) {
- interest.push(*it);
- }
- }
- return false;
- }
-};
-
-template<class Graph, class ElementId = typename Graph::EdgeId>
-class ParallelInterestingElementFinder {
- typedef GraphEdgeIterator<Graph> EdgeIt;
-
- const Graph& g_;
- pred::TypedPredicate<ElementId> condition_;
- const size_t chunk_cnt_;
-public:
-
- ParallelInterestingElementFinder(const Graph& g,
- pred::TypedPredicate<ElementId> condition,
- size_t chunk_cnt)
- : g_(g), condition_(condition), chunk_cnt_(chunk_cnt) {}
-
- template<class SmartIt>
- bool Run(SmartIt& it) const {
- TRACE("Looking for interesting elements");
- TRACE("Splitting graph into " << chunk_cnt_ << " chunks");
- FillInterestingFromChunkIterators(IterationHelper<Graph, ElementId>(g_).Chunks(chunk_cnt_), it, condition_);
- TRACE("Found " << it.size() << " interesting elements");
- return false;
- }
-private:
- DECL_LOGGER("ParallelInterestingElementFinder");
-};
-
-template<class Graph>
-class PersistentAlgorithmBase {
- Graph& g_;
-protected:
-
- PersistentAlgorithmBase(Graph& g) : g_(g) {}
-
- Graph& g() { return g_; }
- const Graph& g() const { return g_; }
-public:
- virtual ~PersistentAlgorithmBase() {}
- virtual bool Run(bool force_primary_launch = false) = 0;
-};
-
-//todo use add_condition in it_
-template<class Graph, class ElementId, class InterestingElementFinder,
- class Comparator = std::less<ElementId>>
-class PersistentProcessingAlgorithm : public PersistentAlgorithmBase<Graph> {
- InterestingElementFinder interest_el_finder_;
-
- SmartSetIterator<Graph, ElementId, Comparator> it_;
- //todo remove
- bool tracking_;
- size_t total_iteration_estimate_;
-
- size_t curr_iteration_;
-
-protected:
-
- virtual bool Process(ElementId el) = 0;
- virtual bool Proceed(ElementId /*el*/) const { return true; }
-
- virtual void PrepareIteration(size_t /*it_cnt*/, size_t /*total_it_estimate*/) {}
-
-public:
-
- PersistentProcessingAlgorithm(Graph& g,
- const InterestingElementFinder& interest_el_finder,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true,
- size_t total_iteration_estimate = -1ul) :
- PersistentAlgorithmBase<Graph>(g),
- interest_el_finder_(interest_el_finder),
- it_(g, true, comp, canonical_only),
- tracking_(track_changes),
- total_iteration_estimate_(total_iteration_estimate),
- curr_iteration_(0) {
- it_.Detach();
- }
-
- bool Run(bool force_primary_launch = false) {
- bool primary_launch = !tracking_ || (curr_iteration_ == 0) || force_primary_launch ;
- if (!it_.IsAttached()) {
- it_.Attach();
- }
- if (primary_launch) {
- it_.clear();
- TRACE("Primary launch.");
- TRACE("Start preprocessing");
- interest_el_finder_.Run(it_);
- TRACE(it_.size() << " edges to process after preprocessing");
- } else {
- TRACE(it_.size() << " edges to process");
- VERIFY(tracking_);
- }
-
- if (curr_iteration_ >= total_iteration_estimate_) {
- PrepareIteration(total_iteration_estimate_ - 1, total_iteration_estimate_);
- } else {
- PrepareIteration(curr_iteration_, total_iteration_estimate_);
- }
-
- bool triggered = false;
- TRACE("Start processing");
- for (; !it_.IsEnd(); ++it_) {
- ElementId el = *it_;
- if (!Proceed(el)) {
- TRACE("Proceed condition turned false on element " << this->g().str(el));
- it_.ReleaseCurrent();
- break;
- }
- TRACE("Processing edge " << this->g().str(el));
- triggered |= Process(el);
- }
- TRACE("Finished processing. Triggered = " << triggered);
- if (!tracking_)
- it_.Detach();
-
- curr_iteration_++;
- return triggered;
- }
-
-};
-
-template<class Graph, class InterestingEdgeFinder,
- class Comparator = std::less<typename Graph::EdgeId>>
-class PersistentEdgeRemovingAlgorithm : public PersistentProcessingAlgorithm<Graph,
- typename Graph::EdgeId,
- InterestingEdgeFinder, Comparator> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentProcessingAlgorithm<Graph, EdgeId, InterestingEdgeFinder, Comparator> base;
- EdgeRemover<Graph> edge_remover_;
-public:
- PersistentEdgeRemovingAlgorithm(Graph& g,
- const InterestingEdgeFinder& interest_edge_finder,
-                                    std::function<void(EdgeId)> removal_handler = nullptr,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true,
- size_t total_iteration_estimate = -1ul)
- : base(g, interest_edge_finder,
- canonical_only, comp, track_changes,
- total_iteration_estimate),
- edge_remover_(g, removal_handler) {
-
- }
-
-protected:
-
- virtual bool ShouldRemove(EdgeId e) const = 0;
-
- bool Process(EdgeId e) override {
- TRACE("Checking edge " << this->g().str(e) << " for the removal condition");
- if (ShouldRemove(e)) {
- TRACE("Check passed, removing");
- edge_remover_.DeleteEdge(e);
- return true;
- }
- TRACE("Check not passed");
- return false;
- }
-
-};
-
-template<class Graph, class InterestingEdgeFinder,
- class Comparator = std::less<typename Graph::EdgeId>>
-class ConditionEdgeRemovingAlgorithm : public PersistentEdgeRemovingAlgorithm<Graph,
- InterestingEdgeFinder, Comparator> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentEdgeRemovingAlgorithm<Graph, InterestingEdgeFinder, Comparator> base;
- pred::TypedPredicate<EdgeId> remove_condition_;
-protected:
-
- bool ShouldRemove(EdgeId e) const override {
- return remove_condition_(e);
- }
-
-public:
- ConditionEdgeRemovingAlgorithm(Graph& g,
- const InterestingEdgeFinder& interest_edge_finder,
- pred::TypedPredicate<EdgeId> remove_condition,
-                                   std::function<void(EdgeId)> removal_handler = nullptr,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true)
- : base(g, interest_edge_finder,
- removal_handler,
- canonical_only, comp, track_changes),
- remove_condition_(remove_condition) {
-
- }
-};
-
-template<class Graph, class Comparator = std::less<typename Graph::EdgeId>>
-class ParallelEdgeRemovingAlgorithm : public ConditionEdgeRemovingAlgorithm<Graph,
- ParallelInterestingElementFinder<Graph>, Comparator> {
- typedef ConditionEdgeRemovingAlgorithm<Graph,
- ParallelInterestingElementFinder<Graph>, Comparator> base;
- typedef typename Graph::EdgeId EdgeId;
-
-public:
- ParallelEdgeRemovingAlgorithm(Graph& g,
- pred::TypedPredicate<EdgeId> remove_condition,
- size_t chunk_cnt,
-                                  std::function<void(EdgeId)> removal_handler = nullptr,
- bool canonical_only = false,
- const Comparator& comp = Comparator(),
- bool track_changes = true)
- : base(g,
- ParallelInterestingElementFinder<Graph>(g, remove_condition, chunk_cnt),
- remove_condition, removal_handler,
- canonical_only, comp, track_changes) {
- }
-
-};
-
-}
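
The parallel helpers removed above rest on one idea, visible in FillInterestingFromChunkIterators: each OpenMP thread filters its chunk of elements into a private buffer, and the buffers are merged into the shared container sequentially afterwards, so no locking is needed during the scan. A self-contained toy sketch of that pattern over a plain vector follows; the names are illustrative, not the SPAdes API, and it must be built with OpenMP enabled (e.g. -fopenmp).

// Per-thread buffers plus a sequential merge, as in FillInterestingFromChunkIterators.
#include <iostream>
#include <vector>
#include <omp.h>

int main() {
    std::vector<int> elements(1000);
    for (int i = 0; i < 1000; ++i) elements[i] = i;

    auto predicate = [](int x) { return x % 7 == 0; };      // "interesting" elements

    std::vector<std::vector<int>> of_interest(omp_get_max_threads());

    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < (int) elements.size(); ++i) {
        if (predicate(elements[i]))
            of_interest[omp_get_thread_num()].push_back(elements[i]);   // private buffer
    }

    // Merge the per-thread buffers into the final container without any locking.
    std::vector<int> interesting;
    for (auto& chunk : of_interest) {
        interesting.insert(interesting.end(), chunk.begin(), chunk.end());
        chunk.clear();
    }
    std::cout << interesting.size() << " interesting elements\n";
}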
diff --git a/src/modules/assembly_graph/graph_support/scaff_supplementary.cpp b/src/modules/assembly_graph/graph_support/scaff_supplementary.cpp
deleted file mode 100644
index afb3779..0000000
--- a/src/modules/assembly_graph/graph_support/scaff_supplementary.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-#include "scaff_supplementary.hpp"
-#include <algorithm>
-
-using namespace std;
-namespace path_extend {
-
-
-void ScaffoldingUniqueEdgeAnalyzer::SetCoverageBasedCutoff() {
- vector <pair<double, size_t>> coverages;
- map <EdgeId, size_t> long_component;
- size_t total_len = 0, short_len = 0, cur_len = 0;
-
- for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- if (gp_.g.length(*iter) > length_cutoff_) {
- coverages.push_back(make_pair(gp_.g.coverage(*iter), gp_.g.length(*iter)));
- total_len += gp_.g.length(*iter);
- long_component[*iter] = 0;
- } else {
- short_len += gp_.g.length(*iter);
- }
- }
- if (total_len == 0) {
- WARN("not enough edges longer than "<< length_cutoff_);
- return;
- }
- sort(coverages.begin(), coverages.end());
-    size_t i = 0;
-    while (cur_len < total_len / 2 && i < coverages.size()) {
-        cur_len += coverages[i].second;
-        i++;
-    }
-    //take the coverage of the edge at which the accumulated length reaches half of the total;
-    //indexing with i itself could step one element past the end of the vector
-    median_coverage_ = coverages[i > 0 ? i - 1 : 0].first;
-}
-
-
-void ScaffoldingUniqueEdgeAnalyzer::FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_) {
- storage_.unique_edges_.clear();
- size_t total_len = 0;
- size_t unique_len = 0;
- size_t unique_num = 0;
- storage_.SetMinLength(length_cutoff_);
- for (auto iter = gp_.g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
- size_t tlen = gp_.g.length(*iter);
- total_len += tlen;
- if (gp_.g.length(*iter) >= length_cutoff_ && gp_.g.coverage(*iter) > median_coverage_ * (1 - relative_coverage_variation_)
- && gp_.g.coverage(*iter) < median_coverage_ * (1 + relative_coverage_variation_) ) {
- storage_.unique_edges_.insert(*iter);
- unique_len += tlen;
- unique_num ++;
- }
- }
- for (auto iter = storage_.begin(); iter != storage_.end(); ++iter) {
- DEBUG (gp_.g.int_id(*iter) << " " << gp_.g.coverage(*iter) << " " << gp_.g.length(*iter) );
- }
-    INFO("With length cutoff: " << length_cutoff_ << ", median long edge coverage: " << median_coverage_
-         << ", and relative coverage variation: " << relative_coverage_variation_);
-    INFO("Unique edges quantity: " << unique_num << ", unique edges length: " << unique_len << ", total edges length: " << total_len);
- if (unique_len * 2 < total_len) {
- WARN("Less than half of genome in unique edges!");
- }
-
-}
-
-
-
-}
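
SetCoverageBasedCutoff above computes a length-weighted median: the coverages of all edges longer than the cutoff are sorted, and edge lengths are accumulated until half of the total long-edge length is reached. A standalone toy illustration of that computation with made-up numbers:

// Length-weighted median coverage, as computed in SetCoverageBasedCutoff (toy data).
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // (coverage, length) pairs for the edges longer than the length cutoff
    std::vector<std::pair<double, std::size_t>> coverages = {
        {30.0, 10000}, {5.0, 2000}, {28.0, 50000}, {300.0, 3000}};

    std::size_t total_len = 0;
    for (const auto& c : coverages) total_len += c.second;

    std::sort(coverages.begin(), coverages.end());           // sort by coverage

    std::size_t cur_len = 0, i = 0;
    while (i < coverages.size() && cur_len < total_len / 2)
        cur_len += coverages[i++].second;

    // coverage of the edge at which the accumulated length crosses half of the total
    double median_coverage = coverages[i - 1].first;
    std::cout << "median long-edge coverage: " << median_coverage << "\n";   // 28
    // FillUniqueEdgeStorage then keeps edges whose coverage lies within
    // median_coverage * (1 +/- relative_coverage_variation).
}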
diff --git a/src/modules/assembly_graph/graph_support/scaff_supplementary.hpp b/src/modules/assembly_graph/graph_support/scaff_supplementary.hpp
deleted file mode 100644
index 71522f6..0000000
--- a/src/modules/assembly_graph/graph_support/scaff_supplementary.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-#pragma once
-
-#include "assembly_graph/graph_core/graph.hpp"
-#include "pipeline/graph_pack.hpp"
-#include "dev_support/logger/logger.hpp"
-
-namespace path_extend {
- typedef debruijn_graph::EdgeId EdgeId;
-
-/* Storage of presumably unique, relatively long edges. Filled by ScaffoldingUniqueEdgeAnalyzer
- *
- */
- class ScaffoldingUniqueEdgeStorage {
- friend class ScaffoldingUniqueEdgeAnalyzer;
- private:
- set <EdgeId> unique_edges_;
- size_t min_unique_length_;
- public:
- ScaffoldingUniqueEdgeStorage(): unique_edges_(){
- DEBUG("storage created, empty");
- }
-
- bool IsUnique(EdgeId e) const {
- return (unique_edges_.find(e) != unique_edges_.end());
- }
-
- decltype(unique_edges_.begin()) begin() const {
- return unique_edges_.begin();
- }
-
- decltype(unique_edges_.end()) end() const {
- return unique_edges_.end();
- }
-
- size_t size() const {
- return unique_edges_.size();
- }
- size_t GetMinLength() const {
- return min_unique_length_;
- }
- void SetMinLength(size_t min_length) {
- min_unique_length_ = min_length;
- }
-
- const set<EdgeId>& GetSet() const {
- return unique_edges_;
- }
- protected:
- DECL_LOGGER("ScaffoldingUniqueEdgeStorage")
-
- };
-
-/* Auxiliary class that fills in the unique edge storage.
- *
- */
-    class ScaffoldingUniqueEdgeAnalyzer {
-    private:
- const debruijn_graph::conj_graph_pack &gp_;
- size_t length_cutoff_;
- double median_coverage_;
- double relative_coverage_variation_;
- protected:
- DECL_LOGGER("ScaffoldingUniqueEdgeAnalyzer")
-
-
- void SetCoverageBasedCutoff();
- public:
- ScaffoldingUniqueEdgeAnalyzer(const debruijn_graph::conj_graph_pack &gp, size_t apriori_length_cutoff, double max_relative_coverage):gp_(gp), length_cutoff_(apriori_length_cutoff), relative_coverage_variation_(max_relative_coverage){
- SetCoverageBasedCutoff();
- }
- void FillUniqueEdgeStorage(ScaffoldingUniqueEdgeStorage &storage_);
- };
-}
-
-
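
For orientation, here is a hedged sketch of how the two classes declared above are meant to be used together. It is not compilable on its own: it assumes the removed headers, an already built debruijn_graph::conj_graph_pack named gp, and purely illustrative cutoff values.

// Sketch only: gp and the numeric cutoffs are assumptions, not fixed SPAdes defaults.
path_extend::ScaffoldingUniqueEdgeStorage unique_storage;
path_extend::ScaffoldingUniqueEdgeAnalyzer analyzer(gp,
                                                    /*apriori_length_cutoff*/ 2000,
                                                    /*max_relative_coverage*/ 0.5);
analyzer.FillUniqueEdgeStorage(unique_storage);
// Downstream code can now ask unique_storage.IsUnique(e) for any EdgeId e,
// iterate over unique_storage.GetSet(), and query unique_storage.GetMinLength().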
diff --git a/src/modules/assembly_graph/handlers/edge_labels_handler.hpp b/src/modules/assembly_graph/handlers/edge_labels_handler.hpp
deleted file mode 100644
index 4a8c653..0000000
--- a/src/modules/assembly_graph/handlers/edge_labels_handler.hpp
+++ /dev/null
@@ -1,226 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- *
- * Keeps, across graph transformations, the labeling of new_graph edges by edges of the
- * unresolved graph (old_graph): for each new edge, the ordered list of old edges it covers,
- * and for each old edge, the set of new edges that contain it.
- *
- * Created on: Aug 5, 2011
- * Author: undead
- */
-
-#ifndef EDGE_LABELS_HANDLER_HPP_
-#define EDGE_LABELS_HANDLER_HPP_
-
-//#include "utils.hpp"
-#include "visualization/graph_labeler.hpp"
-#include "dev_support/simple_tools.hpp"
-#include <unordered_map>
-#include <map>
-
-using namespace omnigraph;
-
-namespace omnigraph {
-using std::map;
-
-//todo ask Shurik to remove new_graph_
-template<class Graph>
-class EdgeLabelHandler : public GraphActionHandler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-private:
- Graph &new_graph_;
- Graph &old_graph_;
- //From new edge to sequence of old
-public:
- map<EdgeId, vector<EdgeId> > edge_labels;
- //From old edge to set of new ones, containing it.
- map<EdgeId, set<EdgeId> > edge_inclusions;
-public:
- //TODO: integrate this to resolver, remove "from_resolve" parameter
- EdgeLabelHandler(Graph &new_graph, Graph &old_graph,
- const std::map<EdgeId, EdgeId> &from_resolve)
-            : GraphActionHandler<Graph>(new_graph, "EdgeLabelHandler"),
- new_graph_(new_graph),
- old_graph_(old_graph) {
- // printing from resolve
- FillLabels(from_resolve);
- /* for(auto iter = from_resolve.begin(); iter != from_resolve.end(); ++iter) {
- if (edge_inclusions.find(iter->second) == edge_inclusions.end()){
- set<EdgeId> tmp;
- edge_inclusions.insert(make_pair(iter->second, tmp));
- }
- edge_inclusions[iter->second].insert(iter->first);
-
- if (edge_labels.find(iter->first) == edge_labels.end()) {
- set<EdgeId> tmp;
- edge_labels.insert(make_pair(iter->first, tmp));
- }
- edge_labels[iter->second].push_back(iter->second);
- }
- */}
-
- EdgeLabelHandler(Graph &new_graph, Graph &old_graph)
-            : GraphActionHandler<Graph>(new_graph, "EdgeLabelHandler"),
- new_graph_(new_graph),
- old_graph_(old_graph) {
- }
-
- void FillLabels(const map<EdgeId, EdgeId> &from_resolve) {
- for (auto iter = from_resolve.begin(); iter != from_resolve.end();
- ++iter) {
- if (edge_inclusions.find(iter->second) == edge_inclusions.end()) {
- set<EdgeId> tmp;
- edge_inclusions.insert(make_pair(iter->second, tmp));
- }
- edge_inclusions.find(iter->second)->second.insert(iter->first);
-
- if (edge_labels.find(iter->first) == edge_labels.end()) {
- vector<EdgeId> tmp;
- edge_labels.insert(make_pair(iter->first, tmp));
- }
- edge_labels[iter->first].push_back(iter->second);
- }
- }
-
- virtual ~EdgeLabelHandler() {
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
- TRACE("Handle glue");
- if (edge_labels[edge1] != edge_labels[edge2])
-            WARN("Gluing two edges with different labels at this step is unsafe; EdgeLabelHandler may produce incorrect results");
- vector<EdgeId> tmp;
-        for (size_t i = 0; i < edge_labels[edge1].size(); i++) {
-            edge_inclusions.find(edge_labels[edge1][i])->second.insert(new_edge);
-            edge_inclusions.find(edge_labels[edge1][i])->second.erase(edge1);
-            tmp.push_back(edge_labels[edge1][i]);
-        }
-        //erase only after the loop: erasing the map entry inside it cut the iteration short
-        edge_labels.erase(edge1);
-        for (size_t i = 0; i < edge_labels[edge2].size(); i++) {
-            edge_inclusions.find(edge_labels[edge2][i])->second.insert(new_edge);
-            edge_inclusions.find(edge_labels[edge2][i])->second.erase(edge2);
-            //tmp.push_back(edge_labels[edge1][i]);
-        }
-        edge_labels.erase(edge2);
-
- edge_labels.insert(make_pair(new_edge, tmp));
-
- }
-
- virtual void HandleSplit(EdgeId /*oldEdge*/, EdgeId /*newEdge1*/, EdgeId /*newEdge2*/) {
-        WARN("EdgeLabelHandler does not support splits");
- }
-
- virtual void HandleMerge(const vector<EdgeId> &oldEdges, EdgeId newEdge) {
- TRACE("HandleMerge by edge labels handler");
- size_t n = oldEdges.size();
- vector<EdgeId> tmp;
- for (size_t j = 0; j < n; j++) {
- TRACE("Edge " << oldEdges[j] << " was labeled by " << edge_labels[oldEdges[j]]);
- for (size_t i = 0; i < edge_labels[oldEdges[j]].size(); i++) {
- edge_inclusions[edge_labels[oldEdges[j]][i]].insert(newEdge);
- edge_inclusions[edge_labels[oldEdges[j]][i]].erase(oldEdges[j]);
- tmp.push_back(edge_labels[oldEdges[j]][i]);
- }
- edge_labels.erase(oldEdges[j]);
- }
- if (edge_labels.find(newEdge) != edge_labels.end()) {
- DEBUG("Unexpected finding of new edge labels");
- };
- edge_labels[newEdge] = tmp;
-
- }
-
- /*
- virtual void HandleAdd(VertexId v) {
- AddVertexIntId(v);
- }
- virtual void HandleDelete(VertexId v) {
- ClearVertexId(v);
- }
- */
- virtual void HandleAdd(EdgeId e) {
- TRACE("Add edge " << e);
-
- }
-
- virtual void HandleDelete(EdgeId e) {
- for (size_t i = 0; i < edge_labels[e].size(); i++) {
- edge_inclusions[edge_labels[e][i]].erase(e);
- }
- edge_labels.erase(e);
- }
-
- std::string str(EdgeId edgeId) const {
- std::stringstream ss;
-
- auto it = edge_labels.find(edgeId);
- if (it != edge_labels.end()) {
- TRACE("Number of labels " << it->second.size());
- for (auto label_it = it->second.begin(), end = it->second.end();
- label_it != end; ++label_it) {
- ss << this->g().str(*label_it) << "\\n";
- }
- }
- return ss.str();
- }
-
- vector<pair<EdgeId, size_t> > resolvedPositions(EdgeId old_edge, size_t position_on_edge) {
- vector<pair<EdgeId, size_t> > res;
- for (auto it = edge_inclusions[old_edge].begin(); it != edge_inclusions[old_edge].end(); it++) {
- EdgeId cur_edge = *it;
- size_t cur_shift = 0;
- for (size_t i = 0; i < edge_labels[cur_edge].size(); i++) {
- if (edge_labels[cur_edge][i] == old_edge) {
- res.push_back(make_pair(cur_edge, cur_shift + position_on_edge));
- }
- cur_shift += old_graph_.length(edge_labels[cur_edge][i]);
- }
- }
- return res;
- }
-
-};
-
-template<class Graph>
-class EdgesLabelsGraphLabeler : public GraphLabeler<Graph> {
-
-protected:
- typedef GraphLabeler<Graph> super;
- typedef typename super::EdgeId EdgeId;
- typedef typename super::VertexId VertexId;
- Graph &g_;
-public:
- EdgeLabelHandler<Graph> &EdgesLabels;
-
- EdgesLabelsGraphLabeler(Graph &g, EdgeLabelHandler<Graph> &EdgesLab)
- : g_(g),
- EdgesLabels(EdgesLab) {
- }
-
- virtual std::string label(VertexId vertexId) const {
- return g_.str(vertexId);
- }
-
- virtual std::string label(EdgeId edgeId) const {
- return EdgesLabels.str(edgeId) + ": " + g_.str(edgeId);
- }
-
- virtual ~EdgesLabelsGraphLabeler() {
- TRACE("~EdgesPosGraphLabeler");
- }
-
-};
-}
-
-#endif /* EDGE_LABELS_HANDLER_HPP_ */
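
The handler removed above maintains two mirrored maps: edge_labels, from a new-graph edge to the ordered list of old-graph edges it covers, and edge_inclusions, from an old-graph edge back to the set of new-graph edges containing it. The toy program below (hypothetical integer ids, not the SPAdes types) shows the bookkeeping HandleMerge performs on those maps:

// Toy version of the EdgeLabelHandler bookkeeping around a merge.
#include <iostream>
#include <map>
#include <set>
#include <vector>

using EdgeIdT = int;   // hypothetical stand-in for Graph::EdgeId

int main() {
    std::map<EdgeIdT, std::vector<EdgeIdT>> edge_labels;    // new edge -> old edge sequence
    std::map<EdgeIdT, std::set<EdgeIdT>> edge_inclusions;   // old edge -> new edges containing it

    // Two edges of the new graph, each labelled by old-graph edges.
    edge_labels[101] = {1, 2};
    edge_labels[102] = {3};
    for (const auto& kv : edge_labels)
        for (EdgeIdT old_label : kv.second)
            edge_inclusions[old_label].insert(kv.first);

    // Merge edges 101 and 102 into 103: concatenate labels, redirect inclusions.
    const EdgeIdT merged = 103;
    std::vector<EdgeIdT> merged_labels;
    for (EdgeIdT old_edge : {101, 102}) {
        for (EdgeIdT old_label : edge_labels[old_edge]) {
            edge_inclusions[old_label].erase(old_edge);
            edge_inclusions[old_label].insert(merged);
            merged_labels.push_back(old_label);
        }
        edge_labels.erase(old_edge);                        // only after the loop over its labels
    }
    edge_labels[merged] = merged_labels;

    std::cout << "edge 103 covers " << edge_labels[merged].size() << " old edges\n";   // 3
}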
diff --git a/src/modules/assembly_graph/handlers/edges_position_handler.hpp b/src/modules/assembly_graph/handlers/edges_position_handler.hpp
deleted file mode 100644
index aaa9af0..0000000
--- a/src/modules/assembly_graph/handlers/edges_position_handler.hpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * edges_position_handler.hpp
- *
- * Created on: 22.07.2011
- *
- */
-
-#ifndef EDGES_POSITION_HANDLER_HPP_
-#define EDGES_POSITION_HANDLER_HPP_
-
-//#include "utils.hpp"
-#include "visualization/graph_labeler.hpp"
-#include "dev_support/simple_tools.hpp"
-#include "assembly_graph/paths/mapping_path.hpp"
-#include "assembly_graph/graph_core/action_handlers.hpp"
-
-namespace omnigraph {
-
-struct EdgePosition {
- string contigId;
- MappingRange mr;
- EdgePosition(string _contigId, MappingRange _mr) : contigId(_contigId), mr(_mr) {
- }
-
- EdgePosition() {
- }
-};
-
-inline ostream& operator <<(ostream& os, const EdgePosition& ep) {
- return os << ep.contigId << " " << ep.mr;
-}
-
-template<class Graph>
-class EdgesPositionHandler: public GraphActionHandler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- size_t max_mapping_gap_;
- size_t max_gap_diff_;
- map<EdgeId, map<string, std::set<MappingRange>>> edges_positions_;
- //TODO extract set<MappingRange> as a storage class
-
- MappingRange EraseAndExtract(set<MappingRange> &ranges, set<MappingRange>::iterator &position, const MappingRange &new_pos) {
-        //copy before erasing: a reference into the set would dangle once the element is removed
-        MappingRange old_pos = *position;
- if(old_pos.IntersectLeftOf(new_pos) || old_pos.StrictlyContinuesWith(new_pos, max_mapping_gap_, max_gap_diff_)) {
- ranges.erase(position);
- return old_pos.Merge(new_pos);
- } else if(new_pos.IntersectLeftOf(old_pos) || new_pos.StrictlyContinuesWith(old_pos, max_mapping_gap_, max_gap_diff_)) {
- ranges.erase(position);
- return new_pos.Merge(old_pos);
- } else {
- return new_pos;
- }
- }
-
-public:
- MappingRange EraseAndExtract(set<MappingRange> &ranges, MappingRange new_pos) {
- auto it = ranges.lower_bound(new_pos);
- if(it != ranges.end()) {
- new_pos = EraseAndExtract(ranges, it, new_pos);
- it = ranges.lower_bound(new_pos);
- }
- if(it != ranges.begin()) {
- new_pos = EraseAndExtract(ranges, --it, new_pos);
- }
- return new_pos;
- }
-
- set<MappingRange> GetEdgePositions(EdgeId edge, string contig_id) const {
- VERIFY(this->IsAttached());
- auto edge_it = edges_positions_.find(edge);
- if(edge_it == edges_positions_.end())
- return set<MappingRange>();
- const auto& positions = edge_it->second;
- auto it = positions.find(contig_id);
- if(it == positions.end())
- return set<MappingRange>();
- else
- return it->second;
- }
-
- vector<EdgePosition> GetEdgePositions(EdgeId edge) const {
- VERIFY(this->IsAttached());
- auto edge_it = edges_positions_.find(edge);
- if(edge_it == edges_positions_.end())
- return vector<EdgePosition>();
- vector<EdgePosition> result;
- for(auto it = edge_it->second.begin(); it != edge_it->second.end(); ++it) {
- for(auto pos_it = it->second.begin(); pos_it != it->second.end(); ++pos_it) {
- result.push_back(EdgePosition(it->first, *pos_it));
- }
- }
- return result;
- }
-
- void AddEdgePosition(EdgeId edge, string contig_id, size_t start, size_t end, size_t m_start, size_t m_end) {
- VERIFY(this->IsAttached());
- AddEdgePosition(edge, contig_id, MappingRange(start, end, m_start, m_end));
- }
-
- void AddEdgePosition(EdgeId edge, string contig_id, MappingRange new_pos) {
- VERIFY(this->IsAttached());
- if(new_pos.empty())
- return;
- set<MappingRange> &new_set = edges_positions_[edge][contig_id];
- new_pos = EraseAndExtract(new_set, new_pos);
- new_set.insert(new_pos);
- }
-
- void AddAndShiftEdgePositions(EdgeId edge, const map<string, set<MappingRange>> &contig_map, int shift = 0) {
- VERIFY(this->IsAttached());
- for(auto contig_it = contig_map.begin(); contig_it != contig_map.end(); ++contig_it) {
- for(auto it = contig_it->second.begin(); it != contig_it->second.end(); ++it) {
- AddEdgePosition(edge, contig_it->first, it->Shift(shift).Fit(this->g().length(edge)));
- }
- }
- }
-
- template<typename Iter>
- void AddEdgePositions(EdgeId edge, Iter begin, Iter end) {
- VERIFY(this->IsAttached());
- for(auto it = begin; it != end; ++it) {
- AddEdgePosition(edge, it->contigId, it->mr);
- }
- }
-
- std::string str(EdgeId edge) const {
- VERIFY(this->IsAttached());
- std::stringstream ss;
- vector<EdgePosition> positions = GetEdgePositions(edge);
- size_t counter = 0;
- for (auto pos_it = positions.begin(), end = positions.end(); pos_it != end; ++pos_it) {
- ss << "(" << pos_it->contigId << ": " << pos_it->mr << ")\\n";
- counter++;
- if(counter > 30) {
- ss << "and many more. Totally " << positions.size() << " positions.";
- break;
- }
- }
- return ss.str();
- }
-
- /**
- * @param max_mapping_gap - maximal difference in positions of
- * original sequence for two mapping ranges to be merged.
- * @param max_gap_diff - maximal difference between gaps in initial and mapped ranges for
- * mapping ranges to be merged
- */
- EdgesPositionHandler(const Graph &g, size_t max_mapping_gap, size_t max_gap_diff = 0) :
- GraphActionHandler<Graph>(g, "EdgePositionHandler"),
- max_mapping_gap_(max_mapping_gap),
- max_gap_diff_(max_gap_diff) {
- }
-
- virtual ~EdgesPositionHandler() {
- TRACE("~EdgePositionHandler ok");
- }
-
- virtual void HandleGlue(EdgeId new_edge, EdgeId edge1, EdgeId edge2) {
-// TRACE("Handle glue ");
- auto positions1 = GetEdgePositions(edge1);
- auto positions2 = GetEdgePositions(edge2);
- AddEdgePositions(new_edge, positions1.begin(), positions1.end());
- AddEdgePositions(new_edge, positions2.begin(), positions2.end());
- }
-
- virtual void HandleSplit(EdgeId oldEdge, EdgeId newEdge1, EdgeId newEdge2) {
- if (oldEdge == this->g().conjugate(oldEdge)) {
- WARN("EdgesPositionHandler does not support self-conjugate splits");
- return;
- }
- if (edges_positions_.count(oldEdge) != 0) {
- auto contig_map = edges_positions_[oldEdge];
- AddAndShiftEdgePositions(newEdge1, contig_map, 0);
- AddAndShiftEdgePositions(newEdge2, contig_map, -int(this->g().length(newEdge1)));
- }
- }
-
- virtual void HandleMerge(const vector<EdgeId>& oldEdges, EdgeId newEdge) {
- int shift = 0;
- for(auto it = oldEdges.begin(); it != oldEdges.end(); ++it) {
- if (edges_positions_.count(*it) != 0) {
- AddAndShiftEdgePositions(newEdge, edges_positions_[*it], shift);
- }
- shift += int(this->g().length(*it));
- }
- }
-
- virtual void HandleAdd(EdgeId /*e*/) {
- }
-
- virtual void HandleDelete(EdgeId e) {
- edges_positions_.erase(e);
- }
-
- void clear() {
- edges_positions_.clear();
- }
-
-private:
- DECL_LOGGER("EdgesPositionHandler");
-};
-
-}
-
-#endif /* EDGES_POSITION_HANDLER_HPP_ */
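
The core of the handler removed above is EraseAndExtract: when a new mapping range is added for a contig, the neighbours found via lower_bound are merged into it if they intersect it or continue it within the allowed gap. A self-contained toy version of that merge-on-insert idea over half-open intervals follows; Interval is a hypothetical stand-in for MappingRange, and the merge rule is simplified to a single gap threshold.

// Merge-on-insert over a sorted set of intervals, in the spirit of EraseAndExtract.
#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>

struct Interval {
    long start, end;   // half-open [start, end)
    bool operator<(const Interval& other) const { return start < other.start; }
};

void Insert(std::set<Interval>& ranges, Interval pos, long max_gap) {
    // a comes before b: merge them if they overlap or are separated by at most max_gap
    auto mergeable = [max_gap](const Interval& a, const Interval& b) {
        return a.start <= b.start && a.end + max_gap >= b.start;
    };
    auto it = ranges.lower_bound(pos);
    if (it != ranges.end() && mergeable(pos, *it)) {                // merge with the right neighbour
        pos.end = std::max(pos.end, it->end);
        it = ranges.erase(it);
    }
    if (it != ranges.begin() && mergeable(*std::prev(it), pos)) {   // then with the left one
        auto left = std::prev(it);
        pos.start = left->start;
        pos.end = std::max(pos.end, left->end);
        ranges.erase(left);
    }
    ranges.insert(pos);
}

int main() {
    std::set<Interval> ranges;
    Insert(ranges, {0, 100}, 5);
    Insert(ranges, {200, 300}, 5);
    Insert(ranges, {103, 196}, 5);   // bridges both neighbours within the gap of 5
    for (const Interval& r : ranges)
        std::cout << "[" << r.start << ", " << r.end << ")\n";   // single [0, 300)
}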
diff --git a/src/modules/assembly_graph/handlers/id_track_handler.hpp b/src/modules/assembly_graph/handlers/id_track_handler.hpp
deleted file mode 100644
index 7ab0ec8..0000000
--- a/src/modules/assembly_graph/handlers/id_track_handler.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <unordered_map>
-//#include "utils.hpp"
-#include "visualization/graph_labeler.hpp"
-#include "dev_support/simple_tools.hpp"
-#include "assembly_graph/graph_core/action_handlers.hpp"
-using namespace omnigraph;
-
-namespace omnigraph {
-template<class Graph>
-class GraphElementFinder : public GraphActionHandler<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- unordered_map<size_t, VertexId> id2vertex_;
- unordered_map<size_t, EdgeId> id2edge_;
-
-public:
- GraphElementFinder(const Graph &graph) : GraphActionHandler<Graph>(graph, "Graph element finder") {
- }
-
- virtual ~GraphElementFinder() {
- }
-
- virtual void HandleAdd(EdgeId e) {
-#pragma omp critical
- {
- id2edge_[e.int_id()] = e;
- }
- }
-
- virtual void HandleAdd(VertexId v) {
-#pragma omp critical
- {
- id2vertex_[v.int_id()] = v;
- }
- }
-
- virtual void HandleDelete(EdgeId e) {
- id2edge_[e.int_id()] = e;
- }
-
- virtual void HandleDelete(VertexId v) {
- id2vertex_[v.int_id()] = v;
- }
-
- VertexId ReturnVertexId(size_t id) const {
- auto it = id2vertex_.find(id);
- if(it == id2vertex_.end())
- return VertexId();
- else
- return it->second;
- }
-
- EdgeId ReturnEdgeId(size_t id) const {
- auto it = id2edge_.find(id);
- if(it == id2edge_.end())
- return EdgeId();
- else
- return it->second;
- }
-
- void Init() {
- for(auto it = this->g().begin(); it != this->g().end(); ++it) {
- HandleAdd(*it);
- for(auto eit = this->g().OutgoingEdges(*it).begin(); eit != this->g().OutgoingEdges(*it).end(); ++eit) {
- HandleAdd(*eit);
- }
- }
- }
-};
-
-template<class VertexId, class EdgeId>
-class BaseIdTrackHandler {
-public:
- BaseIdTrackHandler() {
- }
-
- size_t ReturnIntId(EdgeId e) const {
- return e.int_id();
- }
-
- size_t ReturnIntId(VertexId v) const {
- return v.int_id();
- }
-};
-
-template<class Graph>
-class IdTrackHandler : public BaseIdTrackHandler<typename Graph::VertexId, typename Graph::EdgeId> {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph &graph_;
-public:
- IdTrackHandler(const Graph& g) : graph_(g) {
- }
-
- ~IdTrackHandler() {
- }
-};
-
-}
diff --git a/src/modules/assembly_graph/paths/bidirectional_path.cpp b/src/modules/assembly_graph/paths/bidirectional_path.cpp
deleted file mode 100644
index 0718c2c..0000000
--- a/src/modules/assembly_graph/paths/bidirectional_path.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * bidirectional_path.cpp
- *
- * Created on: Jun 25, 2015
- * Author: andrey
- */
-
-#include "dev_support/standard_base.hpp"
-#include "assembly_graph/paths/bidirectional_path.hpp"
-
-namespace path_extend {
-
-std::atomic<uint64_t> BidirectionalPath::path_id_{0};
-
-}
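
The only definition in the removed .cpp is the static atomic counter that hands out unique path ids; each BidirectionalPath constructor takes path_id_++. A tiny standalone illustration of that idiom (toy type, not the SPAdes class):

// Unique ids from a shared std::atomic counter, as with BidirectionalPath::path_id_.
#include <atomic>
#include <cstdint>
#include <iostream>

struct ToyPath {
    static std::atomic<uint64_t> next_id;
    const uint64_t id;
    ToyPath() : id(next_id++) {}   // atomic fetch-and-increment, safe across threads
};

std::atomic<uint64_t> ToyPath::next_id{0};

int main() {
    ToyPath a, b, c;
    std::cout << a.id << " " << b.id << " " << c.id << "\n";   // 0 1 2
}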
diff --git a/src/modules/assembly_graph/paths/bidirectional_path.hpp b/src/modules/assembly_graph/paths/bidirectional_path.hpp
deleted file mode 100644
index 36e6030..0000000
--- a/src/modules/assembly_graph/paths/bidirectional_path.hpp
+++ /dev/null
@@ -1,1087 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * bidirectional_path.h
- *
- * Created on: Nov 14, 2011
- * Author: andrey
- */
-#pragma once
-
-#include <algorithm>
-#include <atomic>
-#include "assembly_graph/graph_core/graph.hpp"
-#include "assembly_graph/components/connected_component.hpp"
-
-using debruijn_graph::Graph;
-using debruijn_graph::EdgeId;
-using debruijn_graph::VertexId;
-
-namespace path_extend {
-
-class BidirectionalPath;
-
-struct Gap {
- int gap_;
- uint32_t trash_previous_;
- uint32_t trash_current_;
- Gap(int gap)
- : gap_(gap), trash_previous_(0), trash_current_(0)
- { }
-
- Gap(int gap, uint32_t trash_previous, uint32_t trash_current)
- : gap_(gap), trash_previous_(trash_previous), trash_current_(trash_current)
- { }
-};
-
-
-class PathListener {
-public:
- virtual void FrontEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath * path, Gap gap) = 0;
- virtual void FrontEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
- virtual void BackEdgeRemoved(EdgeId e, BidirectionalPath * path) = 0;
- virtual ~PathListener() {
- }
-};
-
-
-class BidirectionalPath : public PathListener {
-private:
- static std::atomic<uint64_t> path_id_;
-
-
-public:
- BidirectionalPath(const Graph& g)
- : g_(g),
- data_(),
- conj_path_(NULL),
- cumulative_len_(),
- gap_len_(),
- listeners_(),
- id_(path_id_++),
- weight_(1.0),
- has_overlaped_begin_(false),
- has_overlaped_end_(false),
- overlap_(false) {
- }
-
- BidirectionalPath(const Graph& g, const std::vector<EdgeId>& path)
- : BidirectionalPath(g) {
- for (size_t i = 0; i < path.size(); ++i) {
- PushBack(path[i]);
- }
- RecountLengths();
- }
-
- BidirectionalPath(const Graph& g, EdgeId startingEdge)
- : BidirectionalPath(g) {
- PushBack(startingEdge);
- }
-
- BidirectionalPath(const BidirectionalPath& path)
- : g_(path.g_),
- data_(path.data_),
- conj_path_(NULL),
- cumulative_len_(path.cumulative_len_),
- gap_len_(path.gap_len_),
- listeners_(),
- id_(path_id_++),
- weight_(path.weight_),
- has_overlaped_begin_(path.has_overlaped_begin_),
- has_overlaped_end_(path.has_overlaped_end_),
- overlap_(path.overlap_) {
- }
-
-public:
- void Subscribe(PathListener * listener) {
- listeners_.push_back(listener);
- }
-
-    void Unsubscribe(PathListener * listener) {
-        //remove the listener rather than adding it again
-        auto it = std::find(listeners_.begin(), listeners_.end(), listener);
-        if (it != listeners_.end())
-            listeners_.erase(it);
-    }
-
- void SetConjPath(BidirectionalPath* path) {
- conj_path_ = path;
- }
-
- const BidirectionalPath* GetConjPath() const {
- return conj_path_;
- }
-
- BidirectionalPath* GetConjPath() {
- return conj_path_;
- }
-
- void SetWeight(float w) {
- weight_ = w;
- }
-
- double GetWeight() const {
- return weight_;
- }
-
- size_t Size() const {
- return data_.size();
- }
-
- const Graph& graph() const {
- return g_;
- }
-
- bool Empty() const {
- return data_.empty();
- }
-
- size_t Length() const {
- if (gap_len_.size() == 0 || cumulative_len_.size() == 0) {
- return 0;
- }
- return cumulative_len_[0] + gap_len_[0].gap_;
- }
-
- //TODO iterators forward/reverse
- EdgeId operator[](size_t index) const {
- return data_[index];
- }
-
- EdgeId At(size_t index) const {
- return data_[index];
- }
-
- EdgeId ReverseAt(size_t index) const {
- return data_[data_.size() - index - 1];
- }
-
-
- // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN)
- size_t LengthAt(size_t index) const {
- return cumulative_len_[index];
- }
-
- int GapAt(size_t index) const {
- return gap_len_[index].gap_;
- }
-
- uint32_t TrashCurrentAt(size_t index) const {
- return gap_len_[index].trash_current_;
- }
-
- uint32_t TrashPreviousAt(size_t index) const {
- return gap_len_[index].trash_previous_;
- }
-
- size_t GetId() const {
- return id_;
- }
-
- EdgeId Back() const {
- return data_.back();
- }
-
- EdgeId Front() const {
- return data_.front();
- }
-
- void PushBack(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
- data_.push_back(e);
- Gap gap_struct(gap, trash_previous, trash_current);
- gap_len_.push_back(gap_struct);
- IncreaseLengths(g_.length(e), gap_struct);
- NotifyBackEdgeAdded(e, gap_struct);
- }
-
- void PushBack(EdgeId e, Gap gap) {
- data_.push_back(e);
- gap_len_.push_back(gap);
- IncreaseLengths(g_.length(e), gap);
- NotifyBackEdgeAdded(e, gap);
- }
-
- void PushBack(const BidirectionalPath& path) {
- for (size_t i = 0; i < path.Size(); ++i) {
- PushBack(path.At(i), path.GapAt(i), path.TrashPreviousAt(i), path.TrashCurrentAt(i));
- }
- }
-
- void PopBack() {
- if (data_.empty()) {
- return;
- }
- EdgeId e = data_.back();
- DecreaseLengths();
- gap_len_.pop_back();
- data_.pop_back();
- NotifyBackEdgeRemoved(e);
- }
-
- void PopBack(size_t count) {
- for (size_t i = 0; i < count; ++i) {
- PopBack();
- }
- }
-
- void Clear() {
- while (!Empty()) {
- PopBack();
- }
- }
-
- virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, int) {
- }
-
- virtual void FrontEdgeAdded(EdgeId, BidirectionalPath*, Gap) {
- }
-
-
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, int gap) {
- PushFront(g_.conjugate(e), gap);
- }
-
- virtual void BackEdgeAdded(EdgeId e, BidirectionalPath*, Gap gap) {
- PushFront(g_.conjugate(e), gap);
- }
-
- virtual void FrontEdgeRemoved(EdgeId, BidirectionalPath*) {
- }
-
- virtual void BackEdgeRemoved(EdgeId, BidirectionalPath *) {
- PopFront();
- }
-
- int FindFirst(EdgeId e) const {
- for (size_t i = 0; i < Size(); ++i) {
- if (data_[i] == e) {
- return (int) i;
- }
- }
- return -1;
- }
-
- int FindLast(EdgeId e) const {
- for (int i = (int) Size() - 1; i >= 0; --i) {
- if (data_[i] == e) {
- return i;
- }
- }
- return -1;
- }
-
- bool Contains(EdgeId e) const {
- return FindFirst(e) != -1;
- }
-
- bool Contains(VertexId v) const {
- for(auto edge : data_) {
- if(g_.EdgeEnd(edge) == v || g_.EdgeStart(edge) == v ) {
- return true;
- }
- }
- return false;
- }
-
- vector<size_t> FindAll(EdgeId e, size_t start = 0) const {
- vector<size_t> result;
- for (size_t i = start; i < Size(); ++i) {
- if (data_[i] == e) {
- result.push_back(i);
- }
- }
- return result;
- }
-
- bool CompareFrom(size_t from, const BidirectionalPath& sample) const {
- if (from + sample.Size() > Size()) {
- return false;
- }
-
- for (size_t i = 0; i < sample.Size(); ++i) {
- if (At(from + i) != sample[i]) {
- return false;
- }
- }
- return true;
- }
-
- size_t CommonEndSize(const BidirectionalPath& p) const {
- if (p.Size() == 0) {
- return 0;
- }
- std::vector<size_t> begins = FindAll(p.At(0));
- for (size_t i = 0; i < begins.size(); ++i) {
- size_t it1 = begins[i];
- size_t it2 = 0;
- while (it2 < p.Size() and At(it1) == p.At(it2)) {
- it1++;
- it2++;
- if (it1 == Size()) {
- return it2;
- }
- }
- }
- return 0;
- }
-
- size_t OverlapEndSize(const BidirectionalPath* path2) const {
- if (Size() == 0) {
- return 0;
- }
- int last1 = (int) Size() - 1;
- int max_over = 0;
- vector<size_t> begins2 = path2->FindAll(At(last1));
- for (size_t i = 0; i < begins2.size(); ++i) {
- int begin2 = (int) begins2[i];
- int cur1 = last1;
- while (begin2 > 0 && cur1 > 0 && path2->At(begin2 - 1) == At(cur1 - 1)) {
- cur1--;
- begin2--;
- }
- int over = last1 - cur1 + 1;
- if (begin2 == 0 && cur1 > 0 && over > max_over) {
- max_over = over;
- }
- }
- return (size_t) max_over;
- }
-
- int FindFirst(const BidirectionalPath& path, size_t from = 0) const {
- if (path.Size() > Size()) {
- return -1;
- }
- for (size_t i = from; i <= Size() - path.Size(); ++i) {
- if (CompareFrom(i, path)) {
- return (int) i;
- }
- }
- return -1;
- }
-//TODO: Why just naive search?
- int FindLast(const BidirectionalPath& path) const {
- if (path.Size() > Size()) {
- return -1;
- }
- for (int i = (int) (Size() - path.Size()); i >= 0; --i) {
- if (CompareFrom((size_t) i, path)) {
- return i;
- }
- }
- return -1;
- }
-
- bool Contains(const BidirectionalPath& path) const {
- return FindFirst(path) != -1;
- }
-
- bool Equal(const BidirectionalPath& path) const {
- return operator==(path);
- }
-
- bool operator==(const BidirectionalPath& path) const {
- return Size() == path.Size() && CompareFrom(0, path);
- }
-
- bool operator!=(const BidirectionalPath& path) const {
- return !operator==(path);
- }
-
- void CheckConjugateEnd(size_t max_repeat_length) {
- size_t prev_size = 0;
- while (prev_size != Size()) {
- prev_size = Size();
- FindConjEdges(max_repeat_length);
- }
- }
-
- size_t GetComponent(const debruijn_graph::ConnectedComponentCounter &component_counter) const {
- std::unordered_map <size_t, size_t> component_sizes;
- for (size_t i = 0; i < this->Size(); i++) {
- auto e = this->At(i);
- size_t comp_id = component_counter.GetComponent(e);
- if (component_sizes.find(comp_id) == component_sizes.end())
- component_sizes[comp_id] = 0;
- component_sizes[comp_id] += g_.length(e);
- }
- size_t ans = 0;
- size_t maxans = 0;
- for (auto pp: component_sizes) {
- if (pp.second > maxans) {
- ans = pp.first;
- maxans = pp.second;
- }
- }
- return ans;
- }
-
- void FindConjEdges(size_t max_repeat_length) {
- for (size_t begin_pos = 0; begin_pos < Size(); ++begin_pos) {
- size_t begin = begin_pos;
- vector<size_t> conj_pos = FindAll(g_.conjugate(At(begin_pos)), begin + 1);
- for (auto end_pos = conj_pos.rbegin(); end_pos != conj_pos.rend(); ++end_pos) {
- VERIFY(*end_pos < Size());
- size_t end = *end_pos;
- if (end <= begin) {
- continue;
- }
- while (begin < end && At(begin) == g_.conjugate(At(end))) {
- begin++;
- end--;
- }
- DEBUG("Found palindromic fragment from " << begin_pos << " to " << *end_pos);
- Print();
- VERIFY(*end_pos < Size());
- size_t tail_size = Size() - *end_pos - 1;
- size_t head_size = begin_pos;
- size_t palindrom_half_size = begin - begin_pos;
- size_t head_len = Length() - LengthAt(begin_pos);
- size_t tail_len = *end_pos < Size() - 1 ? LengthAt(*end_pos + 1) : 0;
-//TODO : this is not true in case of gaps inside the palindrom_len;
- size_t palindrom_len = (size_t) max((int) LengthAt(begin_pos) - (int) LengthAt(begin), 0);
- size_t between = (size_t) max(0, (int) LengthAt(begin) - (int) (end < Size() - 1 ? LengthAt(end + 1) : 0));
- DEBUG("tail len " << tail_len << " head len " << head_len << " palindrom_len "<< palindrom_len << " between " << between);
- if (palindrom_len <= max_repeat_length) {
- if (palindrom_len < head_len && palindrom_len < tail_len) {
- DEBUG("too big head and end");
- continue;
- }
- if (between > palindrom_len) {
- DEBUG("too big part between");
- continue;
- }
- }
- bool delete_tail = tail_size < head_size;
- if (tail_size == head_size) {
- delete_tail = tail_len < head_len;
- }
- if (delete_tail) {
- PopBack(tail_size + palindrom_half_size);
- DEBUG("Deleting tail because of palindrom removal");
- return;
- } else {
- GetConjPath()->PopBack(head_size + palindrom_half_size);
- DEBUG("Deleting head because of palindrom removal");
- return;
- }
- }
- }
- }
-
- BidirectionalPath SubPath(size_t from, size_t to) const {
- BidirectionalPath result(g_);
- for (size_t i = from; i < min(to, Size()); ++i) {
- result.PushBack(data_[i], gap_len_[i]);
- }
- return result;
- }
-
- BidirectionalPath SubPath(size_t from) const {
- return SubPath(from, Size());
- }
-
- double Coverage() const {
- double cov = 0.0;
-
- for (size_t i = 0; i < Size(); ++i) {
- cov += g_.coverage(data_[i]) * (double) g_.length(data_[i]);
- }
- return cov / (double) Length();
- }
-
- BidirectionalPath Conjugate() const {
- BidirectionalPath result(g_);
- if (Empty()) {
- return result;
- }
- result.PushBack(g_.conjugate(Back()), 0);
- for (int i = ((int) Size()) - 2; i >= 0; --i) {
- result.PushBack(g_.conjugate(data_[i]), gap_len_[i + 1].gap_ + gap_len_[i + 1].trash_current_ - gap_len_[i + 1].trash_previous_, gap_len_[i + 1].trash_current_, gap_len_[i + 1].trash_previous_);
- }
-
- return result;
- }
-
- vector<EdgeId> ToVector() const {
- return vector<EdgeId>(data_.begin(), data_.end());
- }
-
- bool CameToInterstrandBulge() const {
- if (Empty())
- return false;
-
- EdgeId lastEdge = Back();
- VertexId lastVertex = g_.EdgeEnd(lastEdge);
-
- if (g_.OutgoingEdgeCount(lastVertex) == 2) {
- vector<EdgeId> bulgeEdges(g_.out_begin(lastVertex), g_.out_end(lastVertex));
- VertexId nextVertex = g_.EdgeEnd(bulgeEdges[0]);
-
- if (bulgeEdges[0] == g_.conjugate(bulgeEdges[1]) && nextVertex == g_.EdgeEnd(bulgeEdges[1]) && g_.CheckUniqueOutgoingEdge(nextVertex)
- && *(g_.out_begin(nextVertex)) == g_.conjugate(lastEdge)) {
-
- DEBUG("Came to interstrand bulge " << g_.int_id(lastEdge));
- return true;
- }
- }
- return false;
- }
-
- bool IsInterstrandBulge() const {
- if (Empty())
- return false;
-
- EdgeId lastEdge = Back();
- VertexId lastVertex = g_.EdgeEnd(lastEdge);
- VertexId prevVertex = g_.EdgeStart(lastEdge);
-
- if (g_.OutgoingEdgeCount(prevVertex) == 2 && g_.IncomingEdgeCount(lastVertex) == 2 && g_.CheckUniqueOutgoingEdge(lastVertex)
- && g_.CheckUniqueIncomingEdge(prevVertex) && *(g_.in_begin(prevVertex)) == g_.conjugate(*(g_.out_begin(lastVertex)))) {
-
- vector<EdgeId> bulgeEdges(g_.out_begin(prevVertex), g_.out_end(prevVertex));
- EdgeId bulgeEdge = bulgeEdges[0] == lastEdge ? bulgeEdges[1] : bulgeEdges[0];
-
- if (bulgeEdge == g_.conjugate(lastEdge)) {
- DEBUG("In interstrand bulge " << g_.int_id(lastEdge));
- return true;
- }
- }
- return false;
- }
-
- void Print() const {
- DEBUG("Path " << id_);
- DEBUG("Length " << Length());
- DEBUG("Weight " << weight_);
- DEBUG("#, edge, length, gap length, trash length, total length, total length from begin");
- for (size_t i = 0; i < Size(); ++i) {
- DEBUG(i << ", " << g_.int_id(At(i)) << ", "
- << g_.length(At(i)) << ", " << GapAt(i) << ", "
- << TrashPreviousAt(i) << "-" << TrashCurrentAt(i)
- << ", " << LengthAt(i) << ", "
- << ((Length() < LengthAt(i)) ? 0 : Length() - LengthAt(i)));
- }
- }
-
- void PrintInString() const {
- stringstream str;
- for (size_t i = 0; i < Size(); ++i) {
- str << g_.int_id(At(i)) << " ";
- }
- DEBUG(str.str());
- }
- void PrintInfo() const {
- INFO("Path " << id_);
- INFO("Length " << Length());
- INFO("Weight " << weight_);
- INFO("#, edge, length, gap length, total length");
- for (size_t i = 0; i < Size(); ++i) {
- INFO(i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i));
- }
- }
-
- void Print(std::ostream& os) {
- if (Empty()) {
- return;
- }
- os << "Path " << GetId() << endl;
- os << "Length " << Length() << endl;
- os << "#, edge, length, gap, total length" << endl;
- for (size_t i = 0; i < Size(); ++i) {
- os << i << ", " << g_.int_id(At(i)) << ", " << g_.length(At(i)) << ", " << GapAt(i) << ", " << LengthAt(i) << endl;
- }
- }
-
- void SetOverlapedBeginTo(BidirectionalPath* to) {
- if (has_overlaped_begin_) {
- to->SetOverlapBegin();
- }
- SetOverlapBegin();
- to->SetOverlapEnd();
- }
-
- void SetOverlapedEndTo(BidirectionalPath* to) {
- if (has_overlaped_end_) {
- to->SetOverlapEnd();
- }
- SetOverlapEnd();
- to->SetOverlapBegin();
- }
-
- void SetOverlap(bool overlap = true) {
- overlap_ = overlap;
- conj_path_->overlap_ = overlap;
- }
-
- bool HasOverlapedBegin() const {
- return has_overlaped_begin_;
- }
-
- bool HasOverlapedEnd() const {
- return has_overlaped_end_;
- }
-
- bool IsOverlap() const {
- return overlap_;
- }
-
- void ResetOverlaps() {
- overlap_ = false;
- has_overlaped_begin_ = false;
- has_overlaped_end_ = false;
- conj_path_->overlap_ = false;
- conj_path_->has_overlaped_begin_ = false;
- conj_path_->has_overlaped_end_ = false;
- }
-private:
-
- void RecountLengths() {
- cumulative_len_.clear();
- size_t currentLength = 0;
- for (auto iter = data_.rbegin(); iter != data_.rend(); ++iter) {
- currentLength += g_.length((EdgeId) *iter);
- cumulative_len_.push_front(currentLength);
- }
- }
-
- void IncreaseLengths(size_t length, Gap gap_struct) {
- for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
- *iter += length + gap_struct.gap_ - gap_struct.trash_previous_;
- }
- cumulative_len_.push_back(length);
- }
-
- void DecreaseLengths() {
- size_t length = g_.length(data_.back()) + gap_len_.back().gap_ - gap_len_.back().trash_previous_;
-
- for (auto iter = cumulative_len_.begin(); iter != cumulative_len_.end(); ++iter) {
- *iter -= length;
- }
- cumulative_len_.pop_back();
- }
-
- void NotifyFrontEdgeAdded(EdgeId e, int gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->FrontEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyFrontEdgeAdded(EdgeId e, Gap gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->FrontEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyBackEdgeAdded(EdgeId e, int gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->BackEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyBackEdgeAdded(EdgeId e, Gap gap) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->BackEdgeAdded(e, this, gap);
- }
- }
-
- void NotifyFrontEdgeRemoved(EdgeId e) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->FrontEdgeRemoved(e, this);
- }
- }
-
- void NotifyBackEdgeRemoved(EdgeId e) {
- for (auto i = listeners_.begin(); i != listeners_.end(); ++i) {
- (*i)->BackEdgeRemoved(e, this);
- }
- }
-
- void PushFront(EdgeId e, Gap gap) {
- PushFront(e, gap.gap_ + gap.trash_current_ - gap.trash_previous_, gap.trash_current_, gap.trash_previous_);
- }
-
- void PushFront(EdgeId e, int gap = 0, uint32_t trash_previous = 0, uint32_t trash_current = 0) {
- data_.push_front(e);
- if (gap_len_.size() > 0) {
- gap_len_[0].gap_ += gap;
- gap_len_[0].trash_previous_ += trash_previous;
- gap_len_[0].trash_current_ += trash_current;
- }
- gap_len_.push_front(Gap(0, 0, 0));
-
- int length = (int) g_.length(e);
- if (cumulative_len_.empty()) {
- cumulative_len_.push_front(length);
- } else {
- cumulative_len_.push_front(length + cumulative_len_.front() + gap - trash_previous );
- }
- NotifyFrontEdgeAdded(e, gap);
- }
-
- void PopFront() {
- EdgeId e = data_.front();
- if (gap_len_.size() > 1) {
- gap_len_[1].gap_ = 0;
- gap_len_[1].trash_previous_ = 0;
- gap_len_[1].trash_current_ = 0;
- }
- data_.pop_front();
- gap_len_.pop_front();
-
- cumulative_len_.pop_front();
- NotifyFrontEdgeRemoved(e);
- }
-
- void SetOverlapBegin(bool overlap = true) {
- if (has_overlaped_begin_ != overlap) {
- has_overlaped_begin_ = overlap;
- }
- if (GetConjPath()->has_overlaped_end_ != overlap) {
- GetConjPath()->has_overlaped_end_ = overlap;
- }
- }
-
- void SetOverlapEnd(bool overlap = true) {
- GetConjPath()->SetOverlapBegin(overlap);
- }
-
- const Graph& g_;
- std::deque<EdgeId> data_;
- BidirectionalPath* conj_path_;
- std::deque<size_t> cumulative_len_; // Length from beginning of i-th edge to path end for forward directed path: L(e1 + e2 + ... + eN) ... L(eN)
- std::deque<Gap> gap_len_; // e1 - gap2 - e2 - ... - gapN - eN
- std::vector<PathListener *> listeners_;
- const uint64_t id_; //Unique ID
- float weight_;
- bool has_overlaped_begin_;
- bool has_overlaped_end_;
- bool overlap_;
- DECL_LOGGER("BidirectionalPath");
-};
-
-inline int SkipOneGap(EdgeId end, const BidirectionalPath& path, int gap, int pos, bool forward) {
- size_t len = 0;
- while (pos < (int) path.Size() && pos >= 0 && end != path.At(pos) && (int) len < 2 * gap) {
- len += path.graph().length(path.At(pos));
- forward ? pos++ : pos--;
- }
- if (pos < (int) path.Size() && pos >= 0 && end == path.At(pos)) {
- return pos;
- }
- return -1;
-}
-
-inline void SkipGaps(const BidirectionalPath& path1, size_t& cur_pos1, int gap1, const BidirectionalPath& path2, size_t& cur_pos2, int gap2, bool use_gaps,
- bool forward) {
- if (use_gaps) {
- if (gap1 > 0 && gap2 <= 0) {
- int temp2 = SkipOneGap(path1.At(cur_pos1), path2, gap1, (int) cur_pos2, forward);
- if (temp2 >= 0) {
- cur_pos2 = (size_t) temp2;
- }
- } else if (gap2 > 0 && gap1 <= 0) {
- int temp1 = SkipOneGap(path2.At(cur_pos2), path1, gap2, (int) cur_pos1, forward);
- if (temp1 >= 0) {
- cur_pos1 = (size_t) temp1;
- }
- } else if (gap1 > 0 && gap2 > 0 && gap1 != gap2) {
- DEBUG("not equal gaps in two paths!!!");
- }
- }
-}
-
-inline size_t FirstNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- int cur_pos1 = (int) pos1;
- int cur_pos2 = (int) pos2;
- int gap1 = path1.GapAt(cur_pos1);
- int gap2 = path2.GapAt(cur_pos2);
- while (cur_pos1 >= 0 && cur_pos2 >= 0) {
- if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
- cur_pos1--;
- cur_pos2--;
- } else {
- DEBUG("Not Equal at " << cur_pos1 << " and " << cur_pos2);
- return cur_pos1;
- }
- if (cur_pos1 >= 0 && cur_pos2 >= 0) {
- size_t p1 = (size_t) cur_pos1;
- size_t p2 = (size_t) cur_pos2;
- SkipGaps(path1, p1, gap1, path2, p2, gap2, use_gaps, false);
- cur_pos1 = (int) p1;
- cur_pos2 = (int) p2;
- gap1 = path1.GapAt(cur_pos1);
- gap2 = path2.GapAt(cur_pos2);
- }
- }
- DEBUG("Equal!!");
- return -1UL;
-}
-inline bool EqualBegins(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- DEBUG("Checking for equal begins");
- return FirstNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
-}
-
-inline size_t LastNotEqualPosition(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- size_t cur_pos1 = pos1;
- size_t cur_pos2 = pos2;
- while (cur_pos1 < path1.Size() && cur_pos2 < path2.Size()) {
- if (path1.At(cur_pos1) == path2.At(cur_pos2)) {
- cur_pos1++;
- cur_pos2++;
- } else {
- return cur_pos1;
- }
- int gap1 = cur_pos1 < path1.Size() ? path1.GapAt(cur_pos1) : 0;
- int gap2 = cur_pos2 < path2.Size() ? path2.GapAt(cur_pos2) : 0;
- SkipGaps(path1, cur_pos1, gap1, path2, cur_pos2, gap2, use_gaps, true);
- }
- return -1UL;
-}
-
-inline bool EqualEnds(const BidirectionalPath& path1, size_t pos1, const BidirectionalPath& path2, size_t pos2, bool use_gaps) {
- return LastNotEqualPosition(path1, pos1, path2, pos2, use_gaps) == -1UL;
-}
-
-inline bool PathIdCompare(const BidirectionalPath* p1, const BidirectionalPath* p2) {
- return p1->GetId() < p2->GetId();
-}
-
-
-
-typedef std::pair<BidirectionalPath*, BidirectionalPath*> PathPair;
-
-inline bool compare_path_pairs(const PathPair& p1, const PathPair& p2) {
- if (p1.first->Length() != p2.first->Length() || p1.first->Size() == 0 || p2.first->Size() == 0) {
- return p1.first->Length() > p2.first->Length();
- }
- const Graph& g = p1.first->graph();
- return g.int_id(p1.first->Front()) < g.int_id(p2.first->Front());
-}
-
-class PathComparator {
-public:
- bool operator()(const BidirectionalPath& p1, const BidirectionalPath& p2) const {
- return p1.GetId() < p2.GetId();
- }
-
- bool operator()(const BidirectionalPath* p1, const BidirectionalPath* p2) const {
- return p1->GetId() < p2->GetId();
- }
-};
-
-typedef set<BidirectionalPath*, PathComparator> BidirectionalPathSet;
-
-template<class Value>
-using BidirectionalPathMap = map<BidirectionalPath*, Value, PathComparator>;
-
-typedef std::multiset <BidirectionalPath *, PathComparator> BidirectionalPathMultiset;
-
-class PathContainer {
-
-public:
-
- typedef std::vector<PathPair> PathContainerT;
-
- class Iterator : public PathContainerT::iterator {
- public:
- Iterator(const PathContainerT::iterator& iter)
- : PathContainerT::iterator(iter) {
- }
- BidirectionalPath* get() const {
- return this->operator *().first;
- }
- BidirectionalPath* getConjugate() const {
- return this->operator *().second;
- }
- };
-
- class ConstIterator : public PathContainerT::const_iterator {
- public:
- ConstIterator(const PathContainerT::const_iterator& iter)
- : PathContainerT::const_iterator(iter) {
- }
- BidirectionalPath* get() const {
- return this->operator *().first;
- }
- BidirectionalPath* getConjugate() const {
- return this->operator *().second;
- }
- };
-
- PathContainer() {
- }
-
- BidirectionalPath& operator[](size_t index) const {
- return *(data_[index].first);
- }
-
- BidirectionalPath* Get(size_t index) const {
- return data_[index].first;
- }
-
- BidirectionalPath* GetConjugate(size_t index) const {
- return data_[index].second;
- }
-
- void DeleteAllPaths() {
- for (size_t i = 0; i < data_.size(); ++i) {
- delete data_[i].first;
- delete data_[i].second;
- }
- clear();
- }
-
- size_t size() const {
- return data_.size();
- }
-
- void clear() {
- data_.clear();
- }
-
- void reserve(size_t size) {
- data_.reserve(size);
- }
-
- bool AddPair(BidirectionalPath* p, BidirectionalPath* cp) {
- p->SetConjPath(cp);
- cp->SetConjPath(p);
- p->Subscribe(cp);
- cp->Subscribe(p);
- data_.push_back(std::make_pair(p, cp));
- return true;
- }
-
- void SortByLength() {
- std::stable_sort(data_.begin(), data_.end(), compare_path_pairs);
- }
-
- Iterator begin() {
- return Iterator(data_.begin());
- }
-
- Iterator end() {
- return Iterator(data_.end());
- }
-
-
- ConstIterator begin() const {
- return ConstIterator(data_.begin());
- }
-
- ConstIterator end() const {
- return ConstIterator(data_.end());
- }
-
- Iterator erase(Iterator iter) {
- return Iterator(data_.erase(iter));
- }
-
- void print() const {
- for (size_t i = 0; i < size(); ++i) {
- Get(i)->Print();
- GetConjugate(i)->Print();
- }
- }
-
- void FilterEmptyPaths() {
- DEBUG ("try to delete empty paths");
- for (Iterator iter = begin(); iter != end();) {
- if (iter.get()->Size() == 0) {
- // FIXME: This is trash. PathContainer should own paths
- delete iter.get();
- delete iter.getConjugate();
- iter = erase(iter);
- } else {
- ++iter;
- }
- }
- DEBUG("empty paths are removed");
- }
-
- void FilterInterstandBulges() {
- DEBUG ("Try to delete paths with interstand bulges");
- for (Iterator iter = begin(); iter != end(); ++iter) {
- if (iter.get()->IsInterstrandBulge()) {
- iter.get()->PopBack();
- }
- if (iter.getConjugate()->IsInterstrandBulge()) {
- iter.getConjugate()->PopBack();
- }
- }
- DEBUG("deleted paths with interstand bulges");
- }
-
-private:
- std::vector<PathPair> data_;
-
-protected:
- DECL_LOGGER("BidirectionalPath");
-
-};
-
-inline pair<size_t, size_t> ComparePaths(size_t start_pos1, size_t start_pos2, const BidirectionalPath& path1, const BidirectionalPath& path2,
- size_t max_diff) {
- path1.Print();
- path2.Print();
- if (start_pos1 >= path1.Size() || start_pos2 >= path2.Size()) {
- return make_pair(start_pos1, start_pos2);
- }
- const Graph& g = path1.graph();
- size_t cur_pos = start_pos1;
- size_t last2 = start_pos2;
- size_t last1 = cur_pos;
- cur_pos++;
- size_t diff_len = 0;
- while (cur_pos < path1.Size()) {
- if (diff_len > max_diff) {
- return make_pair(last1, last2);
- }
- EdgeId e = path1[cur_pos];
- vector<size_t> poses2 = path2.FindAll(e);
- bool found = false;
- for (size_t pos2 = 0; pos2 < poses2.size(); ++pos2) {
- if (poses2[pos2] > last2) {
- if (path2.LengthAt(last2) - path2.LengthAt(poses2[pos2]) - g.length(path2.At(last2)) - path2.GapAt(poses2[pos2]) > max_diff) {
- break;
- }
- last2 = poses2[pos2];
- last1 = cur_pos;
- DEBUG("found " << cur_pos);
- found = true;
- break;
- }
- }
- if (!found) {
- diff_len += g.length(e) + path1.GapAt(cur_pos);
- DEBUG("not found " << cur_pos << " now diff len " << diff_len);
- } else {
- diff_len = 0;
- }
- cur_pos++;
- }
- return make_pair(last1, last2);
-}
-
-inline void DeletePaths(BidirectionalPathSet& paths) {
- for (auto i = paths.begin(); i != paths.end(); ++i) {
- delete (*i);
- }
-}
-
-inline void DeletePaths(vector<BidirectionalPath*>& paths) {
- for (auto i = paths.begin(); i != paths.end(); ++i) {
- delete (*i);
- }
-}
-
-inline void DeleteMapWithPaths(map<EdgeId, BidirectionalPath*> m) {
- for (auto i = m.begin(); i != m.end(); ++i){
- delete i->second;
- }
-}
-
-} // path extend
-
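
For reference, the cumulative_len_ bookkeeping in the BidirectionalPath class removed above is a suffix-sum structure: entry i holds the total length of edges i..N-1, rebuilt from scratch by RecountLengths() and patched incrementally by PushFront()/IncreaseLengths(), which additionally fold in gap lengths and trimmed ("trash") bases. A minimal standalone sketch of the basic invariant, with plain size_t values standing in for Graph::length(EdgeId) (the function name recount_lengths is ours, not the project's):

    #include <cassert>
    #include <cstddef>
    #include <deque>
    #include <vector>

    // Rebuild suffix sums the way the removed RecountLengths() does:
    // cumulative[i] == edge_len[i] + edge_len[i+1] + ... + edge_len[N-1].
    std::deque<std::size_t> recount_lengths(const std::vector<std::size_t>& edge_len) {
        std::deque<std::size_t> cumulative;
        std::size_t current = 0;
        for (auto it = edge_len.rbegin(); it != edge_len.rend(); ++it) {
            current += *it;
            cumulative.push_front(current);
        }
        return cumulative;
    }

    int main() {
        auto c = recount_lengths({100, 250, 50});
        assert(c[0] == 400 && c[1] == 300 && c[2] == 50);  // c[i] plays the role of LengthAt(i)
    }
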
diff --git a/src/modules/assembly_graph/paths/mapping_path.hpp b/src/modules/assembly_graph/paths/mapping_path.hpp
deleted file mode 100644
index 2cb6076..0000000
--- a/src/modules/assembly_graph/paths/mapping_path.hpp
+++ /dev/null
@@ -1,232 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/range.hpp"
-
-namespace omnigraph {
-
-/**
- * This class represents how a certain sequence is mapped to the genome. Needs further adjustment.
- */
-template<typename ElementId>
-class Path {
- std::vector<ElementId> sequence_;
- size_t start_pos_;
- size_t end_pos_;
- public:
- typedef typename vector<ElementId>::const_iterator iterator;
-
- Path(const vector<ElementId>& sequence, size_t start_pos, size_t end_pos)
- : sequence_(sequence), start_pos_(start_pos), end_pos_( end_pos) {
- }
-
- Path()
- : sequence_(),
- start_pos_(-1ul),
- end_pos_(-1ul) {
- }
-
- size_t start_pos() const { return start_pos_; }
- size_t end_pos() const { return end_pos_; }
-
- size_t size() const { return sequence_.size(); }
-
- const std::vector<ElementId>& sequence() const { return sequence_; }
- ElementId operator[](size_t index) const { return sequence_[index]; }
-
- iterator begin() const { return sequence_.begin(); }
- iterator end() const { return sequence_.end(); }
-};
-
-struct MappingRange {
-// on genome/contig/whatever
- Range initial_range;
-// on edge
- Range mapped_range;
-
- MappingRange() {
- }
-
- MappingRange(Range initial_range, Range mapped_range)
- : initial_range(initial_range), mapped_range(mapped_range) {}
-
- MappingRange(size_t i_start, size_t i_end, size_t m_start, size_t m_end)
- : initial_range(i_start, i_end), mapped_range(m_start, m_end) {}
-
- MappingRange Merge(const MappingRange &other) const {
- return MappingRange(initial_range.Merge(other.initial_range), mapped_range.Merge(other.mapped_range));
- }
-
- MappingRange ShiftInitial(int shift) const {
- MappingRange result(*this);
- result.initial_range.shift(shift);
- return result;
- }
-
- MappingRange Shift(int shift) const {
- VERIFY(initial_range.end_pos >= initial_range.start_pos);
- if(empty())
- return MappingRange();
- MappingRange result(*this);
- if(int(result.mapped_range.end_pos) <= -shift)
- return MappingRange();
- result.mapped_range.end_pos += shift;
- if(int(result.mapped_range.start_pos) <= -shift) {
- result.initial_range.start_pos -= result.mapped_range.start_pos + shift;
- if(result.initial_range.start_pos >= result.initial_range.end_pos)
- result.initial_range.start_pos = result.initial_range.end_pos - 1;
- result.mapped_range.start_pos = 0;
- } else {
- result.mapped_range.start_pos += shift;
- }
- return result;
- }
-
- MappingRange Fit(size_t length) const {
- VERIFY(initial_range.end_pos >= initial_range.start_pos);
- if(empty())
- return MappingRange();
- MappingRange result(*this);
- if(result.mapped_range.start_pos >= length)
- return MappingRange();
- if(result.mapped_range.end_pos >= length) {
- if(result.initial_range.end_pos + length < result.mapped_range.end_pos)
- return MappingRange();
- result.initial_range.end_pos -= result.mapped_range.end_pos - length;
- result.mapped_range.end_pos = length;
- }
- return result;
- }
-
- bool empty() const {
- return initial_range.empty() || mapped_range.empty();
- }
-
- bool operator<(const MappingRange &other) const {
- if(this->initial_range != other.initial_range)
- return this->initial_range < other.initial_range;
- return this->mapped_range < other.mapped_range;
- }
- MappingRange operator = (const MappingRange & other) {
- initial_range = other.initial_range;
- mapped_range = other.mapped_range;
- return *this;
- }
-
- bool Intersect(const MappingRange &other) {
- return initial_range.Intersect(other.initial_range) && mapped_range.Intersect(other.mapped_range);
- }
-
- bool IntersectLeftOf(const MappingRange &other) const {
- return initial_range.IntersectLeftOf(other.initial_range) && mapped_range.IntersectLeftOf(other.mapped_range);
- }
-
- bool StrictlyContinuesWith(const MappingRange &other, size_t max_gap, size_t gap_diff = 0) const {
- return this->initial_range.end_pos <= other.initial_range.start_pos
- && this->mapped_range.end_pos <= other.mapped_range.start_pos
- && other.initial_range.start_pos - this->initial_range.end_pos
- <= other.mapped_range.start_pos - this->mapped_range.end_pos + gap_diff
- && other.mapped_range.start_pos - this->mapped_range.end_pos
- <= other.initial_range.start_pos - this->initial_range.end_pos + gap_diff
- && other.initial_range.start_pos - this->initial_range.end_pos <= max_gap;
- }
-
- bool operator==(const MappingRange &that) const {
- return initial_range == that.initial_range && mapped_range == that.mapped_range;
- }
-
- bool operator!=(const MappingRange &that) const {
- return !(*this == that);
- }
-
-};
-
-inline std::ostream& operator<<(std::ostream& os, const MappingRange& map_range) {
- os << map_range.initial_range << " --> " << map_range.mapped_range;
- return os;
-}
-
-template<typename ElementId>
-class MappingPath {
- public:
- MappingPath() {}
-
- MappingPath(const ElementId &edge,
- const MappingRange &range_mapping)
- : edges_({ edge }),
- range_mappings_({ range_mapping }) {}
-
- MappingPath(const std::vector<ElementId>& edges,
- const std::vector<MappingRange> range_mappings)
- : edges_(edges),
- range_mappings_(range_mappings) {}
-
- size_t size() const { return edges_.size(); }
-
- std::pair<const ElementId, const MappingRange> operator[](size_t idx) const {
- return std::make_pair(edges_[idx], range_mappings_[idx]);
- }
-
- std::pair<const ElementId, const MappingRange> front() const {
- return std::make_pair(edges_.front(), range_mappings_.front());
- }
-
- std::pair<const ElementId, const MappingRange> back() const {
- return std::make_pair(edges_.back(), range_mappings_.back());
- }
-
- size_t start_pos() const {
- return range_mappings_.front().mapped_range.start_pos;
- }
-
- size_t end_pos() const {
- return range_mappings_.back().mapped_range.end_pos;
- }
-
- Path<ElementId> path() const {
- if (edges_.size() != 0)
- return Path<ElementId>(edges_,
- range_mappings_[0].mapped_range.start_pos,
- range_mappings_[range_mappings_.size() - 1].mapped_range.end_pos);
- else
- return Path<ElementId>();
- }
-
- const std::vector<ElementId>& simple_path() const {
- return edges_;
- }
-
- void join(const MappingPath<ElementId>& that, int pos_shift = 0) {
- for (size_t i = 0; i < that.size(); ++i) {
- edges_.push_back(that.edges_[i]);
- range_mappings_.push_back(that.range_mappings_[i].ShiftInitial(pos_shift));
- }
- }
-
- void push_back(ElementId id, MappingRange range) {
- edges_.push_back(id);
- range_mappings_.push_back(range);
- }
-
- private:
- std::vector<ElementId> edges_;
- std::vector<MappingRange> range_mappings_;
-};
-
-template <typename ElementId>
-inline std::ostream& operator<<(std::ostream& os, const MappingPath<ElementId>& mp) {
- os << "MappingPath ( ";
- for(size_t i = 0; i < mp.size(); i++) {
- os << mp[i] << " ";
- }
- os << " )";
- return os;
-}
-
-}
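
The StrictlyContinuesWith() predicate in the removed MappingRange above checks that a second mapping starts after the first one ends in both coordinate systems (genome and edge), that the two gap sizes differ by at most gap_diff, and that the genomic gap does not exceed max_gap. A self-contained restatement, where Range is a minimal stand-in for the project's dev_support/range.hpp type and strictly_continues is a hypothetical free-function form of the same logic:

    #include <cassert>
    #include <cstddef>

    struct Range { std::size_t start_pos, end_pos; };  // stand-in: only the fields used below

    bool strictly_continues(Range a_init, Range a_map, Range b_init, Range b_map,
                            std::size_t max_gap, std::size_t gap_diff = 0) {
        if (a_init.end_pos > b_init.start_pos || a_map.end_pos > b_map.start_pos) return false;
        std::size_t init_gap = b_init.start_pos - a_init.end_pos;  // gap on the genome
        std::size_t map_gap  = b_map.start_pos - a_map.end_pos;    // gap on the edge
        return init_gap <= map_gap + gap_diff && map_gap <= init_gap + gap_diff && init_gap <= max_gap;
    }

    int main() {
        // genome [0,100) mapped to edge [10,110), then genome [105,200) mapped to edge [115,210)
        assert(strictly_continues({0, 100}, {10, 110}, {105, 200}, {115, 210}, /*max_gap=*/10));
        assert(!strictly_continues({0, 100}, {10, 110}, {150, 200}, {160, 210}, /*max_gap=*/10));
    }
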
diff --git a/src/modules/assembly_graph/paths/path_finders.hpp b/src/modules/assembly_graph/paths/path_finders.hpp
deleted file mode 100644
index 40f5add..0000000
--- a/src/modules/assembly_graph/paths/path_finders.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-#pragma once
-
-#include "assembly_graph/graph_core/directions.hpp"
-
-namespace omnigraph {
-template<class Graph>
-class UniquePathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- const Graph& graph_;
-public:
- //todo use length bound if needed
- UniquePathFinder(const Graph& graph, size_t /*length_bound*/ =
- std::numeric_limits<size_t>::max())
- : graph_(graph) {}
-
- std::vector<EdgeId> operator()(EdgeId e,
- const AbstractDirection<Graph> &direction) const {
- std::vector<EdgeId> answer;
- EdgeId curr = e;
- answer.push_back(curr);
- std::set<EdgeId> was;
- while (direction.CheckUniqueOutgoingEdge(direction.EdgeEnd(curr))) {
- curr = direction.GetUniqueOutgoingEdge(direction.EdgeEnd(curr));
- if (was.count(curr) > 0)
- break;
- was.insert(curr);
- answer.push_back(curr);
- }
- return answer;
- }
-
- std::vector<EdgeId> UniquePathForward(EdgeId e) const {
- return this->operator()(e, ForwardDirection<Graph>(graph_));
- }
-
- std::vector<EdgeId> UniquePathBackward(EdgeId e) const {
- auto tmp = this->operator()(e, BackwardDirection<Graph>(graph_));
- return std::vector<EdgeId>(tmp.rbegin(), tmp.rend());
- }
-
-};
-
-template<class Graph>
-class TrivialPathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
-public:
- TrivialPathFinder(const Graph&, size_t = 0) {}
-
- std::vector<EdgeId> operator()(EdgeId e, const AbstractDirection<Graph> &) const {
- return {e};
- }
-
-};
-
-template<class Graph>
-class PlausiblePathFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- //todo remove graph_ field???
- const Graph& graph_;
- const size_t length_bound_;
-
- class DFS {
- private:
- const Graph &graph_;
- const AbstractDirection<Graph> &direction_;
- const size_t length_bound_;
-
- std::pair<size_t, EdgeId> find(EdgeId edge, size_t length) {
- length += graph_.length(edge);
- VertexId cross = direction_.EdgeEnd(edge);
- auto result = make_pair(length, edge);
- if (length < length_bound_
- && direction_.CheckUniqueIncomingEdge(cross)) {
- std::vector<EdgeId> outgoing = direction_.OutgoingEdges(cross);
- for (auto it = outgoing.begin(); it != outgoing.end(); ++it) {
- auto candidate = find(*it, length);
- if (candidate.first > result.first)
- result = candidate;
- }
- }
- return result;
- }
-
- std::vector<EdgeId> RestoreAnswer(EdgeId start, EdgeId end) {
- std::vector<EdgeId> result;
- while (end != start) {
- result.push_back(end);
- end = direction_.GetUniqueIncomingEdge(direction_.EdgeStart(end));
- }
- result.push_back(start);
- return std::vector<EdgeId>(result.rbegin(), result.rend());
- }
-
- public:
- DFS(const Graph &graph, const AbstractDirection<Graph> &direction,
- size_t length_bound)
- : graph_(graph),
- direction_(direction),
- length_bound_(length_bound) {
- }
-
- std::vector<EdgeId> find(EdgeId edge) {
- return RestoreAnswer(edge, find(edge, 0).second);
- }
- };
-
-public:
- PlausiblePathFinder(const Graph& graph, size_t length_bound)
- : graph_(graph),
- length_bound_(length_bound) {}
-
- std::vector<EdgeId> operator()(EdgeId e,
- const AbstractDirection<Graph> &direction) const {
- return DFS(graph_, direction, length_bound_).find(e);
- }
-
-};
-}
\ No newline at end of file
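
UniquePathFinder in the removed path_finders.hpp extends an edge while the next vertex has exactly one outgoing edge, stopping at a branch or when an edge repeats. The same walk expressed over a plain adjacency list (hypothetical int vertex ids instead of the project's EdgeId/AbstractDirection machinery; the seen set plays the role of the was set above):

    #include <cassert>
    #include <set>
    #include <unordered_map>
    #include <vector>

    // Follow successors while each vertex has exactly one outgoing edge;
    // stop on a dead end, a branch, or a repeated vertex.
    std::vector<int> unique_path_forward(const std::unordered_map<int, std::vector<int>>& out,
                                         int start) {
        std::vector<int> path{start};
        std::set<int> seen{start};
        int curr = start;
        while (true) {
            auto it = out.find(curr);
            if (it == out.end() || it->second.size() != 1) break;  // dead end or branch
            curr = it->second.front();
            if (!seen.insert(curr).second) break;                  // cycle guard
            path.push_back(curr);
        }
        return path;
    }

    int main() {
        std::unordered_map<int, std::vector<int>> g{{1, {2}}, {2, {3}}, {3, {4, 5}}};
        assert((unique_path_forward(g, 1) == std::vector<int>{1, 2, 3}));  // stops at the branch after 3
    }
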
diff --git a/src/modules/assembly_graph/paths/path_processor.hpp b/src/modules/assembly_graph/paths/path_processor.hpp
deleted file mode 100644
index 5f3d3b6..0000000
--- a/src/modules/assembly_graph/paths/path_processor.hpp
+++ /dev/null
@@ -1,441 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-#include "utils/adt/bag.hpp"
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
-
-namespace omnigraph {
-
-template<class Graph>
-const string PrintPath(const Graph& g, const vector<typename Graph::EdgeId>& edges) {
- string delim = "";
- std::stringstream ss;
- for (size_t i = 0; i < edges.size(); ++i) {
- ss << delim << g.str(edges[i]);
- delim = " -> ";
- }
- return ss.str();
-}
-
-
-template<class Graph>
-class PathProcessor {
-
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef vector<EdgeId> Path;
- typedef typename DijkstraHelper<Graph>::BoundedDijkstra DijkstraT;
-public:
- class Callback {
-
- public:
- virtual ~Callback() {
- }
-
- virtual void Flush() {
- }
-
- virtual void HandleReversedPath(const vector<EdgeId>& reversed_path) = 0;
-
-
- protected:
- Path ReversePath(const Path& path) const {
- Path result;
- for (auto I = path.rbegin(), E = path.rend(); I != E; ++I)
- result.push_back(*I);
- return result;
- }
- };
-
-private:
-
- class Traversal {
- const PathProcessor& outer_;
- VertexId end_;
- size_t min_len_;
- size_t max_len_;
- Callback& callback_;
- size_t edge_depth_bound_;
-
- size_t curr_len_;
- size_t curr_depth_;
- size_t call_cnt_;
- Path reversed_edge_path_;
- bag<VertexId> vertex_cnts_;
-
- const Graph& g_;
- const DijkstraT& dijkstra_;
-
- void Push(EdgeId e, VertexId start_v) {
- TRACE("Pushing edge " << g_.str(e));
- curr_len_ += g_.length(e);
- curr_depth_++;
- reversed_edge_path_.push_back(e);
- vertex_cnts_.put(start_v);
- }
-
- void Pop() {
- VERIFY(!reversed_edge_path_.empty());
- EdgeId e = reversed_edge_path_.back();
- size_t len = g_.length(e);
- VERIFY(curr_len_ >= len);
-
- TRACE("Popping edge " << g_.str(e));
- vertex_cnts_.take(g_.EdgeStart(e));
- reversed_edge_path_.pop_back();
- curr_len_ -= len;
- curr_depth_--;
- }
-
- bool CanGo(EdgeId e, VertexId start_v) {
- if (!dijkstra_.DistanceCounted(start_v))
- return false;
- if (dijkstra_.GetDistance(start_v) + g_.length(e) + curr_len_ > max_len_)
- return false;
- if (curr_depth_ >= edge_depth_bound_)
- return false;
- if (vertex_cnts_.mult(start_v) >= PathProcessor::MAX_VERTEX_USAGE)
- return false;
- return true;
- }
-
- bool Go(VertexId v, const size_t min_len) {
- TRACE("Got to vertex " << g_.str(v));
- if (++call_cnt_ >= PathProcessor::MAX_CALL_CNT) {
- TRACE("Maximal count " << MAX_CALL_CNT << " of recursive calls was exceeded!");
- return true;
- }
-
- if (v == outer_.start_ && curr_len_ >= min_len) {
- //TRACE("New path found: " << PrintPath(g_, path_));
- callback_.HandleReversedPath(reversed_edge_path_);
- }
-
- TRACE("Iterating through incoming edges of vertex " << g_.int_id(v))
- //TODO: doesn't work with parallel simplification
- vector<EdgeId> incoming;
- incoming.reserve(4);
- std::copy_if(g_.in_begin(v), g_.in_end(v), std::back_inserter(incoming), [&] (EdgeId e) {
- return dijkstra_.DistanceCounted(g_.EdgeStart(e));
- });
-
- std::sort(incoming.begin(), incoming.end(), [&] (EdgeId e1, EdgeId e2) {
- return dijkstra_.GetDistance(g_.EdgeStart(e1)) < dijkstra_.GetDistance(g_.EdgeStart(e2));
- });
-
- for (EdgeId e : incoming) {
- VertexId start_v = g_.EdgeStart(e);
- if (CanGo(e, start_v)) {
- Push(e, start_v);
- bool exceeded_limits = Go(start_v, min_len);
- Pop();
- if (exceeded_limits)
- return true;
- }
- }
- return false;
- }
-
- public:
- Traversal(const PathProcessor& outer, VertexId end,
- size_t min_len, size_t max_len,
- Callback& callback, size_t edge_depth_bound) :
- outer_(outer), end_(end),
- min_len_(min_len), max_len_(max_len),
- callback_(callback),
- edge_depth_bound_(edge_depth_bound),
- curr_len_(0), curr_depth_(0), call_cnt_(0),
- g_(outer.g_),
- dijkstra_(outer.dijkstra_) {
- reversed_edge_path_.reserve(PathProcessor::MAX_CALL_CNT);
- vertex_cnts_.put(end_);
- }
-
- //returns true iff limits were exceeded
- bool Go() {
- bool code = Go(end_, min_len_);
- VERIFY(curr_len_ == 0);
- VERIFY(curr_depth_ == 0);
- vertex_cnts_.take(end_);
- VERIFY(vertex_cnts_.size() == 0);
- return code;
- }
- };
-
- friend class Traversal;
-
-public:
-
- PathProcessor(const Graph& g, VertexId start, size_t length_bound) :
- g_(g),
- start_(start),
- dijkstra_(DijkstraHelper<Graph>::CreateBoundedDijkstra(g, length_bound, MAX_DIJKSTRA_VERTICES)) {
- TRACE("Dijkstra launched");
- dijkstra_.Run(start);
- TRACE("Dijkstra finished");
- }
-
- // DFS from the end vertex back towards start_
- // return codes: 3 = both problems, 2 = Dijkstra vertex limit exceeded, 1 = DFS call limit exceeded, 0 = OK
- int Process(VertexId end, size_t min_len, size_t max_len, Callback& callback, size_t edge_depth_bound = -1ul) const {
- TRACE("Process launched");
- int error_code = 0;
-
- if (dijkstra_.VertexLimitExceeded()) {
- TRACE("dijkstra : vertex limit exceeded");
- error_code = 2;
- }
-
- TRACE("Start vertex is " << g_.str(start_));
- TRACE("Bounds are " << min_len << " " << max_len);
- TRACE("End vertex " << g_.str(end));
-
- Traversal traversal(*this, end, min_len, max_len, callback, edge_depth_bound);
- error_code |= int(traversal.Go());
-
- callback.Flush();
- TRACE("Process finished with error code " << error_code);
- return error_code;
- }
-
-private:
- static const size_t MAX_CALL_CNT = 3000;
- static const size_t MAX_DIJKSTRA_VERTICES = 3000;
- static const size_t MAX_VERTEX_USAGE = 5;
-
- const Graph& g_;
- VertexId start_;
- DijkstraT dijkstra_;
-
- DECL_LOGGER("PathProcessor")
-};
-
-template<class Graph>
-int ProcessPaths(const Graph& g, size_t min_len, size_t max_len,
- typename Graph::VertexId start, typename Graph::VertexId end,
- typename PathProcessor<Graph>::Callback& callback, size_t max_edge_cnt = -1ul) {
- PathProcessor<Graph> processor(g, start, max_len);
- return processor.Process(end, min_len, max_len, callback, max_edge_cnt);
-}
-
-template<class Graph>
-class CompositeCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- void AddProcessor(typename PathProcessor<Graph>::Callback& processor) {
- processors_.push_back(&processor);
- }
-
- void Flush() override {
- for (auto it = processors_.begin(); it != processors_.end(); ++it) {
- (*it)->Flush();
- }
- }
-
- void HandleReversedPath(const Path& path) override {
- for (auto it = processors_.begin(); it != processors_.end(); ++it) {
- (*it)->HandleReversedPath(path);
- }
- }
-
-private:
- vector<typename PathProcessor<Graph>::Callback*> processors_;
-};
-
-template<class Graph, class Comparator>
-class BestPathStorage: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-public:
- BestPathStorage(const Graph& g, Comparator comparator) :
- g_(g), cnt_(0), comparator_(comparator) {
- }
-
- void HandleReversedPath(const vector<EdgeId>& path) override {
- cnt_++;
- if(best_path_.size() == 0 || comparator_(path, best_path_))
- best_path_ = path;
- }
-
- vector<EdgeId> BestPath() const {
- return best_path_;
- }
-
- size_t size() const {
- return cnt_;
- }
-
-private:
- const Graph& g_;
- size_t cnt_;
- Comparator comparator_;
- Path best_path_;
-};
-
-
-template<class Graph>
-class PathStorageCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- PathStorageCallback(const Graph& g) :
- g_(g) {
- }
-
- void Flush() override {
- all_paths_.push_back(cur_paths_);
- cur_paths_.clear();
- }
-
- void HandleReversedPath(const vector<EdgeId>& path) override {
- cur_paths_.push_back(this->ReversePath(path));
- }
-
- size_t size(size_t k = 0) const {
- return all_paths_[k].size();
- }
-
- const vector<Path>& paths(size_t k = 0) const {
- return all_paths_[k];
- }
-
-private:
- const Graph& g_;
- vector<vector<Path>> all_paths_;
- vector<Path> cur_paths_;
-};
-
-template<class Graph>
-class NonEmptyPathCounter: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- NonEmptyPathCounter(const Graph& g) :
- g_(g), count_(0) {
- }
-
- void Flush() override {
- all_paths_.push_back(cur_paths_);
- counts_.push_back(count_);
- cur_paths_.clear();
- }
-
- void HandleReversedPath(const Path& path) override {
- if (path.size() > 0) {
- ++count_;
- cur_paths_.push_back(this->ReversePath(path));
- }
- }
-
- size_t count(size_t k = 0) const {
- return counts_[k];
- }
-
- const vector<Path>& paths(size_t k = 0) const {
- return all_paths_[k];
- }
-
-private:
- const Graph& g_;
- vector<size_t> counts_;
- size_t count_;
- vector<vector<Path> > all_paths_;
- vector<Path> cur_paths_;
-};
-
-template<class Graph>
-class VertexLabelerCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef vector<EdgeId> Path;
-
-public:
- VertexLabelerCallback(const Graph& g) :
- g_(g), count_(0) {
- }
-
- void Flush() override {
- all_vertices_.push_back(vertices_);
- vertices_.clear();
- counts_.push_back(count_);
- }
-
- void HandleReversedPath(const Path& path) override {
- for (auto it = path.rbegin(); it != path.rend(); ++it) {
- if (path.size() > 0) {
- vertices_.insert(g_.EdgeStart(*it));
- vertices_.insert(g_.EdgeEnd(*it));
- ++count_;
- }
- }
- }
-
- const set<VertexId>& vertices(size_t k = 0) const {
- return all_vertices_[k];
- }
-
- size_t count(size_t k = 0) const {
- return counts_[k];
- }
-
-private:
- const Graph& g_;
- vector<size_t> counts_;
- vector<set<VertexId>> all_vertices_;
- size_t count_;
- set<VertexId> vertices_;
-};
-
-template<class Graph>
-class DistancesLengthsCallback: public PathProcessor<Graph>::Callback {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
-public:
- DistancesLengthsCallback(const Graph& g) :
- g_(g) {
- }
-
- void Flush() override {
- all_distances_.push_back(distances_);
- distances_.clear();
- }
-
- void HandleReversedPath(const Path& path) override {
- size_t path_length = PathLength(path);
- distances_.insert(path_length);
- }
-
- vector<size_t> distances(size_t k = 0) const {
- VERIFY(k < all_distances_.size());
- const set<size_t>& tmp = all_distances_[k];
- return vector<size_t>(tmp.begin(), tmp.end());
- }
-
-private:
- size_t PathLength(const Path& path) const {
- size_t res = 0;
- for (auto I = path.begin(); I != path.end(); ++I)
- res += g_.length(*I);
- return res;
- }
-
- const Graph& g_;
- set<size_t> distances_;
- vector<set<size_t>> all_distances_;
-
- DECL_LOGGER("DistancesLengthsCallback");
-};
-
-}
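
PathProcessor in the removed header runs a depth-first search backwards from the end vertex, pruned by a forward Dijkstra from the start, and hands every path whose length lies in [min_len, max_len] to a Callback; Process() returns 2 when the Dijkstra vertex limit is exceeded, adds 1 when the DFS call limit is hit, and 0 means a complete search. A heavily simplified standalone analogue of the bounded search (no Dijkstra pruning, no call-count caps; Edge, Graph and collect are illustration names, and edge lengths are assumed positive):

    #include <cassert>
    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    struct Edge { int to; std::size_t len; };
    using Graph = std::unordered_map<int, std::vector<Edge>>;

    // Collect every path from v to end whose total length lies in [min_len, max_len].
    void collect(const Graph& g, int v, int end, std::size_t min_len, std::size_t max_len,
                 std::size_t cur_len, std::vector<int>& stack, std::vector<std::vector<int>>& out) {
        if (v == end && cur_len >= min_len) out.push_back(stack);
        auto it = g.find(v);
        if (it == g.end()) return;
        for (const Edge& e : it->second) {
            if (cur_len + e.len > max_len) continue;  // length bound prunes the branch
            stack.push_back(e.to);
            collect(g, e.to, end, min_len, max_len, cur_len + e.len, stack, out);
            stack.pop_back();
        }
    }

    int main() {
        Graph g{{1, {{2, 5}, {3, 2}}}, {2, {{4, 5}}}, {3, {{4, 9}}}};
        std::vector<int> stack{1};
        std::vector<std::vector<int>> paths;
        collect(g, 1, 4, 0, 10, 0, stack, paths);
        assert(paths.size() == 1);  // only 1->2->4 (length 10) fits; 1->3->4 has length 11
    }
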
diff --git a/src/modules/assembly_graph/paths/path_utils.hpp b/src/modules/assembly_graph/paths/path_utils.hpp
deleted file mode 100644
index 212c81c..0000000
--- a/src/modules/assembly_graph/paths/path_utils.hpp
+++ /dev/null
@@ -1,128 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * path_utils.hpp
- *
- */
-
-#pragma once
-
-#include "assembly_graph/paths/path_processor.hpp"
-
-namespace debruijn_graph {
-
- // TODO: rewrite this function
- template<class Graph>
- vector<typename Graph::EdgeId> GetCommonPathsEnd(
- const Graph& g,
- typename Graph::EdgeId e1,
- typename Graph::EdgeId e2,
- size_t min_dist,
- size_t max_dist,
- const omnigraph::PathProcessor<Graph>& path_processor)
- {
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<EdgeId> Path;
-
- //PathProcessor<Graph> path_processor(g,
- //min_dist - g.length(e1),
- //max_dist - g.length(e1),
- //g.EdgeEnd(e1), g.EdgeStart(e2), callback);
-
- omnigraph::PathStorageCallback<Graph> callback(g);
- int error_code = path_processor.Process(g.EdgeStart(e2), min_dist - g.length(e1),
- max_dist - g.length(e1), callback);
- vector<Path> paths = callback.paths();
-
- vector<EdgeId> result;
- if (error_code != 0) {
- DEBUG("Edge " << g.int_id(e1) << " path_processor problem")
- return result;
- }
- if (paths.size() == 0)
- return result;
- if (paths.size() == 1)
- return paths[0];
- size_t j = 0;
- while (j < paths[0].size()) {
- for (size_t i = 1; i < paths.size(); ++i) {
- if (j == paths[i].size()) {
- vector<EdgeId> result(paths[0].begin()+(paths[0].size() - j), paths[0].end());
- return result;
- } else {
- if (paths[0][paths[0].size()-1-j] != paths[i][paths[i].size()-1-j]) {
- vector<EdgeId> result(paths[0].begin()+(paths[0].size() - j), paths[0].end());
- return result;
- }
- }
- }
- ++j;
- }
- return paths[0];
- }
-
-
-
- template<class Graph>
- vector<vector<typename Graph::EdgeId> > GetAllPathsBetweenEdges(
- const Graph& g,
- typename Graph::EdgeId& e1,
- typename Graph::EdgeId& e2, size_t min_dist,
- size_t max_dist) {
- omnigraph::PathStorageCallback<Graph> callback(g);
- ProcessPaths(g,
- min_dist,
- max_dist, //0, *cfg::get().ds.IS - K + size_t(*cfg::get().ds.is_var),
- g.EdgeEnd(e1), g.EdgeStart(e2),
- callback);
- auto paths = callback.paths();
- return paths;
- }
-
-template<class graph_pack>
-size_t GetAllPathsQuantity(const graph_pack& origin_gp,
- const typename graph_pack::graph_t::EdgeId& e1,
- const typename graph_pack::graph_t::EdgeId& e2, double d, double is_var) {
- omnigraph::PathStorageCallback<typename graph_pack::graph_t> callback(origin_gp.g);
- omnigraph::PathProcessor<typename graph_pack::graph_t>
- path_processor(origin_gp.g,
- (size_t) d - origin_gp.g.length(e1) - size_t(is_var),
- (size_t) d - origin_gp.g.length(e1) + size_t(is_var),
- origin_gp.g.EdgeEnd(e1),
- origin_gp.g.EdgeStart(e2),
- callback);
- path_processor.Process();
- auto paths = callback.paths();
- TRACE(e1.int_id() << " " << e2.int_id() << " " << paths.size());
- return paths.size();
-}
-
-template<class Graph>
-Sequence MergeSequences(const Graph& g,
- const vector<typename Graph::EdgeId>& continuous_path) {
- vector < Sequence > path_sequences;
- path_sequences.push_back(g.EdgeNucls(continuous_path[0]));
- for (size_t i = 1; i < continuous_path.size(); ++i) {
- VERIFY(
- g.EdgeEnd(continuous_path[i - 1])
- == g.EdgeStart(continuous_path[i]));
- path_sequences.push_back(g.EdgeNucls(continuous_path[i]));
- }
- return MergeOverlappingSequences(path_sequences, g.k());
-}
-
-template<class Graph>
-Sequence PathSequence(const Graph& g, const omnigraph::Path<typename Graph::EdgeId>& path) {
- Sequence path_sequence = MergeSequences(g, path.sequence());
- size_t start = path.start_pos();
- size_t end = path_sequence.size()
- - g.length(path[path.size() - 1]) + path.end_pos();
- return path_sequence.Subseq(start, end);
-}
-
-}
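
GetCommonPathsEnd() in the removed path_utils.hpp walks the candidate paths backwards and returns the longest suffix of edges that all of them share. The core comparison, extracted as a standalone sketch over plain int edge ids (common_suffix is an illustration name; the input is assumed non-empty):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Longest common suffix of a non-empty set of sequences.
    std::vector<int> common_suffix(const std::vector<std::vector<int>>& paths) {
        std::size_t j = 0;
        while (true) {
            for (const auto& p : paths) {
                if (j == p.size() || p[p.size() - 1 - j] != paths[0][paths[0].size() - 1 - j])
                    return std::vector<int>(paths[0].end() - j, paths[0].end());
            }
            ++j;
        }
    }

    int main() {
        std::vector<std::vector<int>> paths{{7, 1, 2, 3}, {9, 9, 2, 3}, {2, 3}};
        assert((common_suffix(paths) == std::vector<int>{2, 3}));
    }
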
diff --git a/src/modules/assembly_graph/stats/picture_dump.hpp b/src/modules/assembly_graph/stats/picture_dump.hpp
deleted file mode 100644
index 18c6d39..0000000
--- a/src/modules/assembly_graph/stats/picture_dump.hpp
+++ /dev/null
@@ -1,447 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "statistics.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-
-#include "pipeline/graph_pack.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-#include "pipeline/graphio.hpp"
-//FIXME awful dependency to get write_lib_data
-#include "pipeline/config_struct.hpp"
-#include "visualization/position_filler.hpp"
-
-#include "visualization/visualization.hpp"
-#include "assembly_graph/handlers/edges_position_handler.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-#include "io/reads_io/rc_reader_wrapper.hpp"
-#include "io/reads_io/delegating_reader_wrapper.hpp"
-#include "io/reads_io/io_helper.hpp"
-#include "io/reads_io/wrapper_collection.hpp"
-#include "io/reads_io/osequencestream.hpp"
-#include "io/dataset_support/dataset_readers.hpp"
-#include "dev_support/copy_file.hpp"
-
-#include <boost/algorithm/string.hpp>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <cmath>
-
-namespace debruijn_graph {
-
-namespace stats {
-
-template<class Graph, class Index>
-MappingPath<typename Graph::EdgeId>
-FindGenomeMappingPath(const Sequence& genome, const Graph& g,
- const Index& index,
- const KmerMapper<Graph>& kmer_mapper) {
- NewExtendedSequenceMapper<Graph, Index> srt(g, index, kmer_mapper);
- return srt.MapSequence(genome);
-}
-
-template<class graph_pack>
-MappingPath<typename graph_pack::graph_t::EdgeId>
-FindGenomeMappingPath(const Sequence& genome, const graph_pack& gp) {
- return FindGenomeMappingPath(genome, gp.g, gp.index, gp.kmer_mapper);
-}
-
-template <class graph_pack>
-shared_ptr<omnigraph::visualization::GraphColorer<Graph>> DefaultColorer(const graph_pack& gp) {
- return omnigraph::visualization::DefaultColorer(gp.g,
- FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path(),
- FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index, gp.kmer_mapper).path());
-}
-
-template <class graph_pack>
-void CollectContigPositions(graph_pack &gp) {
- if (!cfg::get().pos.contigs_for_threading.empty() &&
- path::FileExists(cfg::get().pos.contigs_for_threading))
- FillPos(gp, cfg::get().pos.contigs_for_threading, "thr_", true);
-
- if (!cfg::get().pos.contigs_to_analyze.empty() &&
- path::FileExists(cfg::get().pos.contigs_to_analyze))
- FillPos(gp, cfg::get().pos.contigs_to_analyze, "anlz_", true);
-}
-
-template<class Graph, class Index>
-class GenomeMappingStat: public AbstractStatCounter {
- private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- const Index& index_;
- Sequence genome_;
- size_t k_;
- public:
- GenomeMappingStat(const Graph &graph, const Index &index, GenomeStorage genome, size_t k) :
- graph_(graph), index_(index), genome_(genome.GetSequence()), k_(k) {}
-
- virtual ~GenomeMappingStat() {}
-
- virtual void Count() {
- INFO("Mapping genome");
- size_t break_number = 0;
- size_t covered_kp1mers = 0;
- size_t fail = 0;
- if (genome_.size() <= k_)
- return;
-
- runtime_k::RtSeq cur = genome_.start<runtime_k::RtSeq>(k_ + 1);
- cur >>= 0;
- bool breaked = true;
- pair<EdgeId, size_t> cur_position;
- for (size_t cur_nucl = k_; cur_nucl < genome_.size(); cur_nucl++) {
- cur <<= genome_[cur_nucl];
- if (index_.contains(cur)) {
- pair<EdgeId, size_t> next = index_.get(cur);
- if (!breaked
- && cur_position.second + 1
- < graph_.length(cur_position.first)) {
- if (next.first != cur_position.first
- || cur_position.second + 1 != next.second) {
- fail++;
- }
- }
- cur_position = next;
- covered_kp1mers++;
- breaked = false;
- } else {
- if (!breaked) {
- breaked = true;
- break_number++;
- }
- }
- }
- INFO("Genome mapped");
- INFO("Genome mapping results:");
- INFO("Covered k+1-mers:" << covered_kp1mers << " of " << (genome_.size() - k_) << " which is "
- << (100.0 * (double) covered_kp1mers / (double) (genome_.size() - k_)) << "%");
- INFO("Covered k+1-mers form " << break_number + 1 << " contigious parts");
- INFO("Continuity failtures " << fail);
- }
-};
-
-template<class Graph>
-void WriteErrorLoc(const Graph &g,
- const string& folder_name,
- std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> genome_colorer,
- const omnigraph::GraphLabeler<Graph>& labeler) {
- INFO("Writing error localities for graph to folder " << folder_name);
- GraphComponent<Graph> all(g, g.begin(), g.end());
- set<typename Graph::EdgeId> edges = genome_colorer->ColoredWith(all.edges().begin(),
- all.edges().end(), "black");
- set<typename Graph::VertexId> to_draw;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- to_draw.insert(g.EdgeEnd(*it));
- to_draw.insert(g.EdgeStart(*it));
- }
- shared_ptr<GraphSplitter<Graph>> splitter = StandardSplitter(g, to_draw);
- WriteComponents(g, folder_name, splitter, genome_colorer, labeler);
- INFO("Error localities written written to folder " << folder_name);
-}
-
-template<class graph_pack>
-void CountStats(const graph_pack& gp) {
- typedef typename graph_pack::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
- INFO("Counting stats");
- StatList stats;
- Path<EdgeId> path1 = FindGenomeMappingPath(gp.genome.GetSequence(), gp.g, gp.index,
- gp.kmer_mapper).path();
- Path<EdgeId> path2 = FindGenomeMappingPath(!gp.genome.GetSequence(), gp.g, gp.index,
- gp.kmer_mapper).path();
- stats.AddStat(new VertexEdgeStat<Graph>(gp.g));
- stats.AddStat(new BlackEdgesStat<Graph>(gp.g, path1, path2));
- stats.AddStat(new NStat<Graph>(gp.g, path1, 50));
- stats.AddStat(new SelfComplementStat<Graph>(gp.g));
- stats.AddStat(
- new GenomeMappingStat<Graph, Index>(gp.g, gp.index,
- gp.genome, gp.k_value));
- stats.AddStat(new IsolatedEdgesStat<Graph>(gp.g, path1, path2));
- stats.Count();
- INFO("Stats counted");
-}
-
-template<class Graph>
-void WriteGraphComponentsAlongGenome(const Graph& g,
- const GraphLabeler<Graph>& labeler,
- const string& folder,
- const Path<typename Graph::EdgeId>& path1,
- const Path<typename Graph::EdgeId>& path2) {
- INFO("Writing graph components along genome");
-
- make_dir(folder);
- omnigraph::visualization::WriteComponentsAlongPath(g, path1, folder, omnigraph::visualization::DefaultColorer(g, path1, path2), labeler);
-
- INFO("Writing graph components along genome finished");
-}
-
-//todo refactoring needed: use graph pack instead!!!
-template<class Graph, class Mapper>
-void WriteGraphComponentsAlongContigs(const Graph& g,
- Mapper &mapper,
- const std::string& folder,
- std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph>& labeler) {
- INFO("Writing graph components along contigs");
- auto contigs_to_thread = io::EasyStream(cfg::get().pos.contigs_to_analyze, false);
- contigs_to_thread->reset();
- io::SingleRead read;
- while (!contigs_to_thread->eof()) {
- (*contigs_to_thread) >> read;
- make_dir(folder + read.name());
- omnigraph::visualization::WriteComponentsAlongPath(g, mapper.MapSequence(read.sequence()).simple_path(), folder + read.name() + "/",
- colorer, labeler);
- }
- INFO("Writing graph components along contigs finished");
-}
-
-template<class Graph>
-void WriteKmerComponent(conj_graph_pack &gp, runtime_k::RtSeq const& kp1mer, const std::string& file,
- std::shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer,
- const omnigraph::GraphLabeler<Graph>& labeler) {
- if(!gp.index.contains(kp1mer)) {
- WARN("no such kmer in the graph");
- return;
- }
- VERIFY(gp.index.contains(kp1mer));
- auto pos = gp.index.get(kp1mer);
- typename Graph::VertexId v = pos.second * 2 < gp.g.length(pos.first) ? gp.g.EdgeStart(pos.first) : gp.g.EdgeEnd(pos.first);
- GraphComponent<Graph> component = omnigraph::VertexNeighborhood<Graph>(gp.g, v);
- omnigraph::visualization::WriteComponent<Graph>(component, file, colorer, labeler);
-}
-
-inline
-optional<runtime_k::RtSeq> FindCloseKP1mer(const conj_graph_pack &gp,
- size_t genome_pos, size_t k) {
- VERIFY(gp.genome.size() > 0);
- VERIFY(genome_pos < gp.genome.size());
- static const size_t magic_const = 200;
- for (size_t diff = 0; diff < magic_const; diff++) {
- for (int dir = -1; dir <= 1; dir += 2) {
- size_t pos = (gp.genome.size() - k + genome_pos + dir * diff) % (gp.genome.size() - k);
- runtime_k::RtSeq kp1mer = gp.kmer_mapper.Substitute(
- runtime_k::RtSeq (k + 1, gp.genome.GetSequence(), pos));
- if (gp.index.contains(kp1mer))
- return optional<runtime_k::RtSeq>(kp1mer);
- }
- }
- return boost::none;
-}
-
-inline
-void PrepareForDrawing(conj_graph_pack &gp) {
- gp.EnsureDebugInfo();
- CollectContigPositions(gp);
-}
-
-
-struct detail_info_printer {
- detail_info_printer(conj_graph_pack &gp,
- const omnigraph::GraphLabeler<Graph>& labeler,
- const string& folder)
- : gp_(gp),
- labeler_(labeler),
- folder_(folder) {
- }
-
- void operator() (config::info_printer_pos pos,
- const string& folder_suffix = "") {
- string pos_name = ModeName(pos, config::InfoPrinterPosNames());
-
- ProduceDetailedInfo(pos_name + folder_suffix, pos);
- }
-
- private:
-
- void ProduceDetailedInfo(const string &pos_name,
- config::info_printer_pos pos) {
- static size_t call_cnt = 0;
-
- auto it = cfg::get().info_printers.find(pos);
- VERIFY(it != cfg::get().info_printers.end());
-
- const config::debruijn_config::info_printer & config = it->second;
-
- if (config.basic_stats) {
- VertexEdgeStat<conj_graph_pack::graph_t> stats(gp_.g);
- INFO("Number of vertices : " << stats.vertices() << ", number of edges : "
- << stats.edges() << ", sum length of edges : " << stats.edge_length());
- }
-
- if (config.save_graph_pack) {
- string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
- ToString(call_cnt++, 2) + "_" + pos_name + "/");
- path::make_dirs(saves_folder);
- graphio::ConjugateDataPrinter<conj_graph_pack::graph_t> printer(gp_.g);
- graphio::PrintGraphPack(saves_folder + "graph_pack", printer, gp_);
- //TODO: separate
- graphio::PrintClusteredIndices(saves_folder + "graph_pack", printer, gp_.clustered_indices);
- }
-
- if (config.save_all) {
- string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
- ToString(call_cnt++, 2) + "_" + pos_name);
- path::make_dirs(saves_folder);
- string p = saves_folder + "/saves";
- INFO("Saving current state to " << p);
-
- debruijn_graph::graphio::PrintAll(p, gp_);
- debruijn_graph::config::write_lib_data(p);
- }
-
- if (config.save_full_graph) {
- string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
- ToString(call_cnt++, 2) + "_" + pos_name + "/");
- path::make_dirs(saves_folder);
- graphio::ConjugateDataPrinter<conj_graph_pack::graph_t> printer(gp_.g);
- graphio::PrintBasicGraph(saves_folder + "graph", printer);
- }
-
- if (config.lib_info) {
- string saves_folder = path::append_path(path::append_path(folder_, "saves/"),
- ToString(call_cnt++, 2) + "_" + pos_name + "/");
- path::make_dirs(saves_folder);
- config::write_lib_data(saves_folder + "lib_info");
- }
-
- if (config.extended_stats) {
- VERIFY(cfg::get().developer_mode);
- CountStats(gp_);
- }
-
- if (!(config.write_error_loc ||
- config.write_full_graph ||
- config.write_full_nc_graph ||
- config.write_components ||
- !config.components_for_kmer.empty() ||
- config.write_components_along_genome ||
- config.write_components_along_contigs ||
- !config.components_for_genome_pos.empty())) {
- return;
- }
-
- VERIFY(cfg::get().developer_mode);
- string pics_folder = path::append_path(path::append_path(folder_, "pictures/"),
- ToString(call_cnt++, 2) + "_" + pos_name + "/");
- path::make_dirs(pics_folder);
- PrepareForDrawing(gp_);
-
- auto path1 = FindGenomeMappingPath(gp_.genome.GetSequence(), gp_.g, gp_.index,
- gp_.kmer_mapper).path();
-
- auto colorer = DefaultColorer(gp_);
-
- if (config.write_error_loc) {
- make_dir(pics_folder + "error_loc/");
- WriteErrorLoc(gp_.g, pics_folder + "error_loc/", colorer, labeler_);
- }
-
- if (config.write_full_graph) {
- WriteComponent(GraphComponent<Graph>(gp_.g, gp_.g.begin(), gp_.g.end()), pics_folder + "full_graph.dot", colorer, labeler_);
- }
-
- if (config.write_full_nc_graph) {
- WriteSimpleComponent(GraphComponent<Graph>(gp_.g, gp_.g.begin(), gp_.g.end()), pics_folder + "nc_full_graph.dot", colorer, labeler_);
- }
-
- if (config.write_components) {
- make_dir(pics_folder + "components/");
- omnigraph::visualization::WriteComponents(gp_.g, pics_folder + "components/", omnigraph::ReliableSplitter<Graph>(gp_.g), colorer, labeler_);
- }
-
- if (!config.components_for_kmer.empty()) {
- string kmer_folder = path::append_path(pics_folder, "kmer_loc/");
- make_dir(kmer_folder);
- auto kmer = runtime_k::RtSeq(gp_.k_value + 1, config.components_for_kmer.substr(0, gp_.k_value + 1).c_str());
- string file_name = path::append_path(kmer_folder, pos_name + ".dot");
- WriteKmerComponent(gp_, kmer, file_name, colorer, labeler_);
- }
-
- if (config.write_components_along_genome) {
- make_dir(pics_folder + "along_genome/");
- omnigraph::visualization::WriteComponentsAlongPath(gp_.g, path1.sequence(), pics_folder + "along_genome/", colorer, labeler_);
- }
-
- if (config.write_components_along_contigs) {
- make_dir(pics_folder + "along_contigs/");
- NewExtendedSequenceMapper<Graph, Index> mapper(gp_.g, gp_.index, gp_.kmer_mapper);
- WriteGraphComponentsAlongContigs(gp_.g, mapper, pics_folder + "along_contigs/", colorer, labeler_);
- }
-
- if (!config.components_for_genome_pos.empty()) {
- string pos_loc_folder = path::append_path(pics_folder, "pos_loc/");
- make_dir(pos_loc_folder);
- vector<string> positions;
- boost::split(positions, config.components_for_genome_pos,
- boost::is_any_of(" ,"), boost::token_compress_on);
- for (auto it = positions.begin(); it != positions.end(); ++it) {
- boost::optional<runtime_k::RtSeq> close_kp1mer = FindCloseKP1mer(gp_,
- std::stoi(*it), gp_.k_value);
- if (close_kp1mer) {
- string locality_folder = path::append_path(pos_loc_folder, *it + "/");
- make_dir(locality_folder);
- WriteKmerComponent(gp_, *close_kp1mer, path::append_path(locality_folder, pos_name + ".dot"), colorer, labeler_);
- } else {
- WARN(
- "Failed to find genome kp1mer close to the one at position "
- << *it << " in the graph. Which is " << runtime_k::RtSeq (gp_.k_value + 1, gp_.genome.GetSequence(), std::stoi(*it)));
- }
- }
- }
- }
-
- conj_graph_pack& gp_;
- const omnigraph::GraphLabeler<Graph>& labeler_;
- string folder_;
-};
-
-inline
-std::string ConstructComponentName(std::string file_name, size_t cnt) {
- stringstream ss;
- ss << cnt;
- string res = file_name;
- res.insert(res.length(), ss.str());
- return res;
-}
-
-template<class Graph>
-double AvgCoverage(const Graph& g,
- const std::vector<typename Graph::EdgeId>& edges) {
- double total_cov = 0.;
- size_t total_length = 0;
- for (auto it = edges.begin(); it != edges.end(); ++it) {
- total_cov += g.coverage(*it) * (double) g.length(*it);
- total_length += g.length(*it);
- }
- return total_cov / (double) total_length;
-}
-
-template<class Graph>
-size_t Nx(Graph &g, double percent) {
- size_t sum_edge_length = 0;
- vector<size_t> lengths;
- for (auto iterator = g.ConstEdgeBegin(); !iterator.IsEnd(); ++iterator) {
- lengths.push_back(g.length(*iterator));
- sum_edge_length += g.length(*iterator);
- }
- sort(lengths.begin(), lengths.end());
- double len_perc = (1.0 - percent * 0.01) * (double) (sum_edge_length);
- for (size_t i = 0; i < lengths.size(); i++) {
- if (lengths[i] >= len_perc)
- return lengths[i];
- else
- len_perc -= (double) lengths[i];
- }
- return 0;
-}
-
-}
-}
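
The Nx() helper in the removed picture_dump.hpp computes an N50-style statistic: edge lengths are sorted, and the function returns the first length at which the running sum, accumulated from the shortest edges upward, reaches (100 - percent)% of the total edge length. The same loop over a plain vector of lengths (nx is an illustration name; the graph iteration is replaced by an explicit vector):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    std::size_t nx(std::vector<std::size_t> lengths, double percent) {
        std::size_t total = 0;
        for (std::size_t l : lengths) total += l;
        std::sort(lengths.begin(), lengths.end());
        double remaining = (1.0 - percent * 0.01) * static_cast<double>(total);
        for (std::size_t l : lengths) {
            if (static_cast<double>(l) >= remaining) return l;  // running sum just reached the threshold
            remaining -= static_cast<double>(l);
        }
        return 0;
    }

    int main() {
        assert(nx({1, 1, 8}, 50) == 8);  // the 8bp edge alone covers half of the 10bp total
    }
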
diff --git a/src/modules/assembly_graph/stats/statistics.hpp b/src/modules/assembly_graph/stats/statistics.hpp
deleted file mode 100644
index 3ab53a5..0000000
--- a/src/modules/assembly_graph/stats/statistics.hpp
+++ /dev/null
@@ -1,273 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/simple_tools.hpp"
-#include "math/xmath.h"
-#include "pipeline/config_struct.hpp"
-#include "assembly_graph/paths/mapping_path.hpp"
-
-#include <iostream>
-#include <fstream>
-#include <map>
-
-namespace debruijn_graph {
-namespace stats {
-
-using namespace math;
-using namespace omnigraph;
-
-class AbstractStatCounter {
-public:
- AbstractStatCounter() {
- }
-
- virtual ~AbstractStatCounter() {
- }
-
- virtual void Count() = 0;
- //protected:
- // DECL_LOGGER("StatCounter")
-};
-
-class StatList : AbstractStatCounter {
-private:
- vector<AbstractStatCounter *> to_count_;
-public:
- StatList(vector<AbstractStatCounter *> to_count =
- vector<AbstractStatCounter *>()) :
- to_count_(to_count) {
- }
-
- virtual ~StatList() {
- }
-
- void AddStat(AbstractStatCounter *new_stat) {
- to_count_.push_back(new_stat);
- }
-
- const vector<AbstractStatCounter *> stats() {
- return to_count_;
- }
-
- virtual void Count() {
- for (size_t i = 0; i < to_count_.size(); i++) {
- to_count_[i]->Count();
- }
- }
-
- void DeleteStats() {
- for (size_t i = 0; i < to_count_.size(); i++)
- delete to_count_[i];
- to_count_.clear();
- }
-};
-
-template<class Graph>
-class VertexEdgeStat : public AbstractStatCounter {
-private:
- const Graph &graph_;
-public:
- VertexEdgeStat(const Graph &graph) :
- graph_(graph) {
- }
-
- virtual ~VertexEdgeStat() {
- }
-
- size_t vertices() {
- return graph_.size();
- }
-
- size_t edges() {
- size_t edgeNumber = 0;
- size_t sum_edge_length = 0;
- for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
- ++iterator) {
- edgeNumber++;
- // if (graph_.coverage(*iterator) > 30) {
- sum_edge_length += graph_.length(*iterator);
- // }
- }
- return edgeNumber;
- }
-
- size_t edge_length() {
- size_t sum_edge_length = 0;
- for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
- ++iterator) {
- if (graph_.coverage(*iterator) > 30) {
- sum_edge_length += graph_.length(*iterator);
- }
- }
- return sum_edge_length;
- }
-
- virtual void Count() {
- INFO(
- "Vertex count=" << vertices() << "; Edge count=" << edges());
- INFO(
- "sum length of edges " << edge_length());
- }
-};
-
-template<class Graph>
-class BlackEdgesStat : public AbstractStatCounter {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- Path<EdgeId> path1_;
- Path<EdgeId> path2_;
-public:
- BlackEdgesStat(const Graph &graph, Path<EdgeId> path1, Path<EdgeId> path2) :
- graph_(graph), path1_(path1), path2_(path2) {
- }
-
- virtual ~BlackEdgesStat() {
- }
-
- virtual void Count() {
- size_t black_count = 0;
- size_t edge_count = 0;
- const vector <EdgeId> path_edges1 = path1_.sequence();
- const vector <EdgeId> path_edges2 = path2_.sequence();
- set <EdgeId> colored_edges;
- colored_edges.insert(path_edges1.begin(), path_edges1.end());
- colored_edges.insert(path_edges2.begin(), path_edges2.end());
- size_t sum_length = 0;
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- edge_count++;
- if (colored_edges.count(*it) == 0) {
- black_count++;
- sum_length += graph_.length(*it);
- }
- }
- if (edge_count > 0) {
- INFO("Error edges count: " << black_count << " which is " <<
- 100.0 * (double) black_count / (double) edge_count << "% of all edges");
- INFO("Total length of all black edges: " << sum_length << ". While double genome length is " <<
- (2 * cfg::get().ds.reference_genome.size()));
- } else {
- INFO("Error edges count: " << black_count << " which is 0% of all edges");
- }
- }
-};
-
-template<class Graph>
-class NStat : public AbstractStatCounter {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- Path<EdgeId> path_;
- size_t perc_;
-public:
- NStat(const Graph &graph, Path<EdgeId> path, size_t perc = 50) :
- graph_(graph), path_(path), perc_(perc) {
- }
-
- virtual ~NStat() {
- }
-
- virtual void Count() {
- vector <size_t> lengths;
- size_t sum_all = 0;
- for (size_t i = 0; i < path_.size(); i++) {
- lengths.push_back(graph_.length(path_[i]));
- sum_all += graph_.length(path_[i]);
- }
- sort(lengths.begin(), lengths.end());
- size_t sum = 0;
- size_t current = lengths.size();
- while (current > 0 && (double) sum < (double) perc_ * 0.01 * (double) sum_all) {
- current--;
- sum += lengths[current];
- }
- if (current < lengths.size())
- INFO("N" << perc_ << ": " << lengths[current]);
- }
-};
-
-template<class Graph>
-class IsolatedEdgesStat : public AbstractStatCounter {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
- set <EdgeId> black_edges_;
- vector <size_t> lengths;
-public:
- IsolatedEdgesStat(const Graph &graph, Path<EdgeId> path1,
- Path<EdgeId> path2) :
- graph_(graph) {
- for (auto it = graph.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- black_edges_.insert(*it);
- }
- for (size_t i = 0; i < path1.size(); i++) {
- black_edges_.erase(path1[i]);
- }
- for (size_t i = 0; i < path2.size(); i++) {
- black_edges_.erase(path2[i]);
- }
- }
-
- virtual ~IsolatedEdgesStat() {
- }
-
- virtual void Count() {
- lengths.clear();
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it) {
- EdgeId edge = *it;
- if (graph_.IsDeadEnd(graph_.EdgeEnd(edge))
- && graph_.IsDeadStart(graph_.EdgeStart(edge))
- && black_edges_.count(edge) == 0) {
- lengths.push_back(graph_.length(edge));
- }
- }
- INFO("Isolated not black edges: " << lengths.size());
- WriteLengths(cfg::get().output_dir, "isolated_edges.txt");
- }
-
- void WriteLengths(string folder_name, string file_name) {
- ofstream os;
- os.open((folder_name + "/" + file_name).c_str());
- WriteLengths(os);
- os.close();
- }
-
- void WriteLengths(ostream &os) {
- sort(lengths.begin(), lengths.end());
- for (size_t i = 0; i < lengths.size(); i++) {
- os << lengths[i] << endl;
- }
- }
-};
-
-template<class Graph>
-class SelfComplementStat : public AbstractStatCounter {
-private:
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-public:
- SelfComplementStat(const Graph &graph) :
- graph_(graph) {
- }
-
- virtual ~SelfComplementStat() {
- }
-
- virtual void Count() {
- size_t sc_number = 0;
- for (auto iterator = graph_.ConstEdgeBegin(); !iterator.IsEnd();
- ++iterator)
- if (graph_.conjugate(*iterator) == (*iterator))
- sc_number++;
- // INFO("Self-complement count failed!!! ");
- INFO("Self-complement count=" << sc_number);
- }
-};
-}
-}
diff --git a/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp b/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp
deleted file mode 100644
index 7a293f5..0000000
--- a/src/modules/data_structures/debruijn_graph/debruijn_graph_constructor.hpp
+++ /dev/null
@@ -1,548 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "assembly_graph/graph_core/graph.hpp"
-#include "assembly_graph/graph_core/construction_helper.hpp"
-#include "dev_support/standard_base.hpp"
-#include "data_structures/indices/kmer_extension_index.hpp"
-#include "dev_support/openmp_wrapper.h"
-#include "dev_support/parallel_wrapper.hpp"
-
-namespace debruijn_graph {
-
-/*
- * Constructs a de Bruijn graph from a DeBruijn index via "new DeBruijnGraphConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)"
- */
-template<class Graph, class Index>
-class DeBruijnGraphConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef Index DeBruijn;
- typedef typename Graph::VertexId VertexId;
- typedef typename Index::KMer Kmer;
- typedef typename DeBruijn::KeyWithHash KeyWithHash;
- typedef typename DeBruijn::kmer_iterator kmer_iterator;
-
- Graph &graph_;
- DeBruijn &origin_;
- size_t kmer_size_;
-
- bool StepRightIfPossible(KeyWithHash &kwh) {
- // VERIFY(origin_.contains(edge));
- if (origin_.RivalEdgeCount(kwh) == 1
- && origin_.NextEdgeCount(kwh) == 1) {
- kwh = origin_.NextEdge(kwh);
- // VERIFY(origin_.contains(next_edge));
- return true;
- }
- return false;
- }
-
- KeyWithHash &GoRight(KeyWithHash &kwh) {
- KeyWithHash initial = kwh;
- while (StepRightIfPossible(kwh) && kwh != initial) {
- ;
- }
- return kwh;
- }
-
- KeyWithHash &GoLeft(KeyWithHash &kwh) {
- //Written this way to avoid making copies of kwh
- kwh = !kwh;
- kwh = !GoRight(kwh);
- return kwh;
- }
-
- Sequence ConstructSeqGoingRight(KeyWithHash &kwh) {
- SequenceBuilder s;
- s.append(kwh.key());
- KeyWithHash initial = kwh;
- while (StepRightIfPossible(kwh) && kwh != initial) {
- s.append(kwh[kmer_size_]);
- }
- return s.BuildSequence();
- }
-
- Sequence ConstructSequenceWithEdge(const KeyWithHash &kwh) {
- KeyWithHash tmp = kwh;
- return ConstructSeqGoingRight(GoLeft(tmp));
- }
-
- VertexId FindVertexByOutgoingEdges(Kmer kmer) {
- for (char c = 0; c < 4; ++c) {
- KeyWithHash edge = origin_.ConstructKWH(kmer.pushBack(c));
- if (origin_.contains(edge))
- return graph_.EdgeStart(origin_.get_value(edge).edge_id);
- }
- return VertexId(NULL);
- }
-
- VertexId FindVertexByIncomingEdges(Kmer kmer) {
- for (char c = 0; c < 4; ++c) {
- KeyWithHash edge = origin_.ConstructKWH(kmer.pushFront(c));
- if (origin_.contains(edge)) {
- return graph_.EdgeEnd(origin_.get_value(edge).edge_id);
- }
- }
- return VertexId(NULL);
- }
-
- VertexId FindVertex(Kmer kmer) {
- VertexId v = FindVertexByOutgoingEdges(kmer);
- return v == VertexId(NULL) ? FindVertexByIncomingEdges(kmer) : v;
- }
-
- VertexId FindVertexMaybeMissing(Kmer kmer) {
- VertexId v = FindVertex(kmer);
- return v != VertexId(NULL) ? v : graph_.AddVertex();
- }
-
- VertexId FindEndMaybeMissing(const ConjugateDeBruijnGraph& graph,
- VertexId start, Kmer start_kmer, Kmer end_kmer) {
- if (start_kmer == end_kmer) {
- return start;
- } else if (start_kmer == !end_kmer) {
- return graph.conjugate(start);
- } else {
- return FindVertexMaybeMissing(end_kmer);
- }
- }
-
- void ConstructPart(const std::vector<KeyWithHash>& kwh_list,
- std::vector<Sequence>& sequences) {
- for (size_t i = 0; i < sequences.size(); ++i) {
- if (origin_.contains(kwh_list[i])) {
- continue;
- }
-
- Kmer start_kmer = sequences[i].start < Kmer > (kmer_size_);
- Kmer end_kmer = sequences[i].end < Kmer > (kmer_size_);
-
- VertexId start = FindVertexMaybeMissing(start_kmer);
- VertexId end = FindEndMaybeMissing(graph_, start, start_kmer,
- end_kmer);
-
- graph_.AddEdge(start, end, sequences[i]);
- }
- }
-
- void AddKmers(kmer_iterator &it, kmer_iterator &end, size_t queueSize,
- std::vector<KeyWithHash>& kwh_list) {
- for (; kwh_list.size() != queueSize && it != end; ++it) {
- KeyWithHash kwh = origin_.ConstructKWH(Kmer(unsigned(kmer_size_ + 1), (*it).data()));
-
- if (!origin_.contains(kwh))
- kwh_list.push_back(kwh);
- }
- }
-
- void CalculateSequences(std::vector<KeyWithHash> &kwh_list,
- std::vector<Sequence> &sequences) {
- size_t size = kwh_list.size();
- sequences.resize(size);
-
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < size; ++i) {
- sequences[i] = ConstructSequenceWithEdge(kwh_list[i]);
- }
- }
-
-public:
- DeBruijnGraphConstructor(Graph& graph, DeBruijn &origin) :
- graph_(graph), origin_(origin), kmer_size_(graph_.k()) {
- }
-
- void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate) {
- kmer_iterator it = origin_.kmer_begin();
- kmer_iterator end = origin_.kmer_end();
- size_t queueSize = queueMinSize;
- std::vector<KeyWithHash> kwh_list;
- std::vector<Sequence> sequences;
- kwh_list.reserve(queueSize);
- sequences.reserve(queueMaxSize);
- while (it != end) {
- AddKmers(it, end, queueSize, kwh_list); // format a queue of kmers that are not in index
- CalculateSequences(kwh_list, sequences); // in parallel
- ConstructPart(kwh_list, sequences);
- kwh_list.clear();
- queueSize = min(size_t(double(queueSize) * queueGrowthRate), queueMaxSize);
- }
- }
-
-private:
- DECL_LOGGER("DeBruijnGraphConstructor")
-};
-
-class UnbranchingPathFinder {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef runtime_k::RtSeq Kmer;
- typedef Index::kmer_iterator kmer_iterator;
- typedef Index::KeyWithHash KeyWithHash;
- typedef Index::DeEdge DeEdge;
-
- Index &origin_;
- size_t kmer_size_;
- bool clean_condensed_;
-
-
-public:
- UnbranchingPathFinder(Index &origin, size_t kmer_size) : origin_(origin), kmer_size_(kmer_size) {
- }
-
- bool StepRightIfPossible(DeEdge &edge) {
- if (origin_.CheckUniqueOutgoing(edge.end) && origin_.CheckUniqueIncoming(edge.end)) {
- edge = DeEdge(edge.end, origin_.GetUniqueOutgoing(edge.end));
- return true;
- }
- return false;
- }
-
- Sequence ConstructSeqGoingRight(DeEdge edge) {
- SequenceBuilder s;
- s.append(edge.start.key());
- s.append(edge.end[kmer_size_ - 1]);
- DeEdge initial = edge;
- while (StepRightIfPossible(edge) && edge != initial) {
- s.append(edge.end[kmer_size_ - 1]);
- }
- return s.BuildSequence();
- }
-
- Sequence ConstructSequenceWithEdge(DeEdge edge) {
- return ConstructSeqGoingRight(edge);
- }
-
-//TODO Think about what happens to self-rc perfect loops
- Sequence ConstructLoopFromVertex(const KeyWithHash &kh) {
- DeEdge break_point(kh, origin_.GetUniqueOutgoing(kh));
- Sequence result = ConstructSequenceWithEdge(break_point);
- if (clean_condensed_)
- origin_.IsolateVertex(kh);
- return result;
- }
-};
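As a toy illustration of the walk implemented by StepRightIfPossible above: the extension stops as soon as the vertex ahead is a junction, i.e. it no longer has exactly one incoming and one outgoing edge. The sketch below replays that rule on a hypothetical adjacency map; it does not use the SPAdes index types.

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
    // a -> b -> c, and c branches into d and e, so the unbranching path stops at c
    std::map<char, std::vector<char>> out = {{'a', {'b'}}, {'b', {'c'}}, {'c', {'d', 'e'}}};
    std::map<char, std::vector<char>> in  = {{'b', {'a'}}, {'c', {'b'}}, {'d', {'c'}}, {'e', {'c'}}};

    std::string path(1, 'a');
    char cur = 'a';
    // extend while the current vertex has a unique outgoing edge and its
    // successor has a unique incoming edge (no junction in between)
    while (out[cur].size() == 1 && in[out[cur][0]].size() == 1) {
        cur = out[cur][0];
        path += cur;
    }
    std::cout << "unbranching path: " << path << "\n";  // prints "abc"
}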
-
-class UnbranchingPathExtractor {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef runtime_k::RtSeq Kmer;
- typedef Index::kmer_iterator kmer_iterator;
- typedef Index::DeEdge DeEdge;
- typedef Index::KeyWithHash KeyWithHash;
-
- Index &origin_;
- size_t kmer_size_;
-
- bool IsJunction(KeyWithHash kh) const {
- return !(origin_.CheckUniqueOutgoing(kh) && origin_.CheckUniqueIncoming(kh));
- }
-
- void AddStartDeEdgesForVertex(KeyWithHash kh, std::vector<DeEdge>& start_edges) const {
- for (char next = 0; next < 4; next++) {
- if (origin_.CheckOutgoing(kh, next)) {
- TRACE("Added to queue " << DeEdge(kh, origin_.GetOutgoing(kh, next)));
- start_edges.push_back(DeEdge(kh, origin_.GetOutgoing(kh, next)));
- }
- }
- }
-
- void AddStartDeEdges(kmer_iterator &it, size_t queueSize,
- std::vector<DeEdge>& start_edges) const {
- for (; start_edges.size() < queueSize && it.good(); ++it) {
- KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
- if (IsJunction(kh)) {
- AddStartDeEdgesForVertex(kh, start_edges);
- KeyWithHash kh_inv = !kh;
- if(!(kh_inv.is_minimal())) {
- AddStartDeEdgesForVertex(kh_inv, start_edges);
- }
- }
- }
- }
-
- void CalculateSequences(std::vector<DeEdge> &edges,
- std::vector<Sequence> &sequences, UnbranchingPathFinder &finder) const {
- size_t size = edges.size();
- size_t start = sequences.size();
- sequences.resize(start + size);
-
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < size; ++i) {
- sequences[start + i] = finder.ConstructSequenceWithEdge(edges[i]);
- TRACE("From " << edges[i] << " calculated sequence");
- TRACE(sequences[start + i]);
- }
- }
-
- void CleanCondensed(const Sequence &sequence) {
- Kmer kmer = sequence.start<Kmer>(kmer_size_);
- KeyWithHash kwh = origin_.ConstructKWH(kmer);
- origin_.IsolateVertex(kwh);
- for(size_t pos = kmer_size_; pos < sequence.size(); pos++) {
- kwh = kwh << sequence[pos];
- origin_.IsolateVertex(kwh);
- }
- }
-
- void CleanCondensed(const std::vector<Sequence> &sequences) {
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < sequences.size(); ++i) {
- CleanCondensed(sequences[i]);
- }
- }
-
- //This method collects all perfect loops that were not extracted as unbranching paths, because loops contain no junctions.
- //TODO make parallel
- const std::vector<Sequence> CollectLoops() {
- INFO("Collecting perfect loops");
- UnbranchingPathFinder finder(origin_, kmer_size_);
- std::vector<Sequence> result;
- for (kmer_iterator it = origin_.kmer_begin(); it.good(); ++it) {
- KeyWithHash kh = origin_.ConstructKWH(Kmer(kmer_size_, *it));
- if (!IsJunction(kh)) {
- Sequence loop = finder.ConstructLoopFromVertex(kh);
- result.push_back(loop);
- CleanCondensed(loop);
- if(loop != (!loop)) {
- CleanCondensed(!loop);
- result.push_back(!loop);
- }
- }
- }
- INFO("Collecting perfect loops finished. " << result.size() << " loops collected");
- return result;
- }
-
-public:
- UnbranchingPathExtractor(Index &origin, size_t k) : origin_(origin), kmer_size_(k) {
- }
-
- //TODO a very large vector is returned by value here; avoiding that would require intrusive changes.
- const std::vector<Sequence> ExtractUnbranchingPaths(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate) {
- INFO("Extracting unbranching paths");
- UnbranchingPathFinder finder(origin_, kmer_size_);
- std::vector<Sequence> result;
- size_t queueSize = queueMinSize;
- std::vector<DeEdge> start_edges;
- std::vector<Sequence> sequences;
- start_edges.reserve(queueSize);
- auto it = origin_.kmer_begin();
- while (it.good()) {
- AddStartDeEdges(it, queueSize, start_edges); // format a queue of junction kmers
- CalculateSequences(start_edges, sequences, finder); // in parallel
- start_edges.clear();
- queueSize = min((size_t) ((double) queueSize * queueGrowthRate), queueMaxSize);
- }
- INFO("Extracting unbranching paths finished. " << sequences.size() << " sequences extracted");
- return sequences;
- }
-
- const std::vector<Sequence> ExtractUnbranchingPathsAndLoops(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate) {
- std::vector<Sequence> result = ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
- CleanCondensed(result);
- std::vector<Sequence> loops = CollectLoops();
- for(auto it = loops.begin(); it != loops.end(); ++it) {
- result.push_back(*it);
- }
- return result;
- }
-
-private:
- DECL_LOGGER("UnbranchingPathExtractor")
-};
-
-/*
- * Only works for Conjugate dbg
- */
-template<class Graph>
-class FastGraphFromSequencesConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef runtime_k::RtSeq Kmer;
- typedef DeBruijnExtensionIndex<> Index;
- size_t kmer_size_;
- Index &origin_;
-
- class LinkRecord {
- private:
- size_t hash_and_mask_;
- EdgeId edge_;
-
- size_t BitBool(bool flag) const {
- if(flag)
- return 1;
- return 0;
- }
-
- public:
- size_t GetHash() const {
- return hash_and_mask_ >> 2;
- }
-
- bool IsRC() const {
- return hash_and_mask_ & 2;
- }
-
- bool IsStart() const {
- return hash_and_mask_ & 1;
- }
-
-
- EdgeId GetEdge() const {
- return edge_;
- }
-
- LinkRecord(size_t hash, EdgeId edge, bool is_start, bool is_rc) :
- hash_and_mask_((hash << 2) | (BitBool(is_rc) << 1)| BitBool(is_start)), edge_(edge) {
- }
-
- LinkRecord() :
- hash_and_mask_(-1ul), edge_(0) {
- }
-
- bool IsInvalid() {
- return hash_and_mask_ + 1 == 0 && edge_ == EdgeId(0);
- }
-
- bool operator<(const LinkRecord &other) const {
- if(this->hash_and_mask_ == other.hash_and_mask_)
- return this->edge_ < other.edge_;
- return this->hash_and_mask_ < other.hash_and_mask_;
- }
- };
-
- LinkRecord StartLink(const EdgeId &edge, const Sequence &sequence) const {
- Kmer kmer(kmer_size_, sequence);
- Kmer kmer_rc = !kmer;
- if(kmer < kmer_rc)
- return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, true, false);
- else
- return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, true, true);
- }
-
- LinkRecord EndLink(const EdgeId &edge, const Sequence &sequence) const {
- Kmer kmer(kmer_size_, sequence, sequence.size() - kmer_size_);
- Kmer kmer_rc = !kmer;
- if(kmer < kmer_rc)
- return LinkRecord(origin_.ConstructKWH(kmer).idx(), edge, false, false);
- else
- return LinkRecord(origin_.ConstructKWH(kmer_rc).idx(), edge, false, true);
- }
-
- void CollectLinkRecords(typename Graph::HelperT &helper, const Graph &graph, vector<LinkRecord> &records, const vector<Sequence> &sequences) const {
- size_t size = sequences.size();
- records.resize(size * 2, LinkRecord(0, EdgeId(0), false, false));
- restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
-# pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < size; ++i) {
- size_t j = i << 1;
- auto id_distributor = id_storage.GetSegmentIdDistributor(j, j + 2);//indices for two edges are required
- EdgeId edge = helper.AddEdge(DeBruijnEdgeData(sequences[i]), id_distributor);
- records[j] = StartLink(edge, sequences[i]);
- if(graph.conjugate(edge) != edge)
- records[j + 1] = EndLink(edge, sequences[i]);
- else
- records[j + 1] = LinkRecord();
- }
- }
-
- void LinkEdge(typename Graph::HelperT &helper, const Graph &graph, const VertexId v, const EdgeId edge, const bool is_start, const bool is_rc) const {
- VertexId v1 = v;
- if(is_rc) {
- v1 = graph.conjugate(v);
- }
- if(is_start) {
- helper.LinkOutgoingEdge(v1, edge);
- } else {
- helper.LinkIncomingEdge(v1, edge);
- }
- }
-
-public:
- FastGraphFromSequencesConstructor(size_t k, Index &origin) : kmer_size_(k), origin_(origin) {
- }
-
- void ConstructGraph(Graph &graph, const vector<Sequence> &sequences) const {
- typename Graph::HelperT helper = graph.GetConstructionHelper();
- vector<LinkRecord> records;
- CollectLinkRecords(helper, graph, records, sequences);//TODO make parallel
- parallel::sort(records.begin(), records.end());
- size_t size = records.size();
- vector<vector<VertexId>> vertices_list(omp_get_max_threads());
- restricted::IdSegmentStorage id_storage = helper.graph().GetGraphIdDistributor().Reserve(size * 2);
-# pragma omp parallel for schedule(guided)
- for(size_t i = 0; i < size; i++) {
- if(i != 0 && records[i].GetHash() == records[i - 1].GetHash()) {
- continue;
- }
- if(records[i].IsInvalid())
- continue;
- auto id_distributor = id_storage.GetSegmentIdDistributor(i << 1, (i << 1) + 2);
- VertexId v = helper.CreateVertex(DeBruijnVertexData(), id_distributor);
- vertices_list[omp_get_thread_num()].push_back(v);
- for(size_t j = i; j < size && records[j].GetHash() == records[i].GetHash(); j++) {
- LinkEdge(helper, graph, v, records[j].GetEdge(), records[j].IsStart(), records[j].IsRC());
- }
- }
- for(size_t i = 0; i < vertices_list.size(); i++)
- helper.AddVerticesToGraph(vertices_list[i].begin(), vertices_list[i].end());
- }
-};
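The per-vertex grouping in ConstructGraph above relies on the LinkRecord packing: the k-mer hash occupies the high bits of hash_and_mask_, while the two low bits carry the is_rc and is_start flags, so sorting the records clusters all links of one vertex together. A self-contained sketch of the same packing (illustrative struct, not the original class):

#include <cassert>
#include <cstddef>

struct PackedLink {
    size_t hash_and_mask;
    PackedLink(size_t hash, bool is_start, bool is_rc)
        : hash_and_mask((hash << 2) | (size_t(is_rc) << 1) | size_t(is_start)) {}
    size_t GetHash() const { return hash_and_mask >> 2; }
    bool IsRC() const { return (hash_and_mask & 2) != 0; }
    bool IsStart() const { return (hash_and_mask & 1) != 0; }
};

int main() {
    PackedLink rec(12345, /*is_start=*/true, /*is_rc=*/false);
    assert(rec.GetHash() == 12345);
    assert(rec.IsStart() && !rec.IsRC());
    return 0;
}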
-
-/*
- * Constructs a de Bruijn graph from a DeBruijnExtensionIndex via "new DeBruijnGraphExtentionConstructor(DeBruijn).ConstructGraph(DeBruijnGraph, Index)"
- */
-template<class Graph>
-class DeBruijnGraphExtentionConstructor {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef DeBruijnExtensionIndex<> DeBruijn;
- typedef typename Graph::VertexId VertexId;
- typedef runtime_k::RtSeq Kmer;
-
- Graph &graph_;
- DeBruijn &origin_;
- size_t kmer_size_;
-
- void FilterRC(std::vector<Sequence> &edge_sequences) {
- size_t size = 0;
- for(size_t i = 0; i < edge_sequences.size(); i++) {
- if(!(edge_sequences[i] < !edge_sequences[i])) {
- edge_sequences[size] = edge_sequences[i];
- size++;
- }
- }
- edge_sequences.resize(size);
- }
-
-public:
- DeBruijnGraphExtentionConstructor(Graph& graph, DeBruijn &origin) :
- graph_(graph), origin_(origin), kmer_size_(graph.k()) {
- }
-
- void ConstructGraph(size_t queueMinSize, size_t queueMaxSize,
- double queueGrowthRate, bool keep_perfect_loops) {
- std::vector<Sequence> edge_sequences;
- if(keep_perfect_loops)
- edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPathsAndLoops(queueMinSize, queueMaxSize, queueGrowthRate);
- else
- edge_sequences = UnbranchingPathExtractor(origin_, kmer_size_).ExtractUnbranchingPaths(queueMinSize, queueMaxSize, queueGrowthRate);
- FilterRC(edge_sequences);
- FastGraphFromSequencesConstructor<Graph>(kmer_size_, origin_).ConstructGraph(graph_, edge_sequences);
- }
-
-private:
- DECL_LOGGER("DeBruijnGraphConstructor")
-};
-
-}
diff --git a/src/modules/data_structures/debruijn_graph/early_simplification.hpp b/src/modules/data_structures/debruijn_graph/early_simplification.hpp
deleted file mode 100644
index 3fc9d55..0000000
--- a/src/modules/data_structures/debruijn_graph/early_simplification.hpp
+++ /dev/null
@@ -1,192 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "dev_support/standard_base.hpp"
-#include "data_structures/indices/perfect_hash_map.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
-#include "data_structures/mph_index/kmer_index.hpp"
-
-namespace debruijn_graph {
-
-class LinkCleaner {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef Index::KMer Kmer;
- typedef Index::KeyWithHash KeyWithHash;
- Index &index_;
-
- void CleanForwardLinks(KeyWithHash &kh, char i) {
- if(index_.CheckOutgoing(kh, i)) {
- KeyWithHash next_kh = index_.GetOutgoing(kh, i);
- if(!index_.CheckIncoming(next_kh, kh[0])) {
- index_.DeleteOutgoing(kh, i);
- }
- }
- }
-
- void CleanBackwardLinks(KeyWithHash &kh, char i) {
- if(index_.CheckIncoming(kh, i)) {
- KeyWithHash prev_kh = index_.GetIncoming(kh, i);
- if(!index_.CheckOutgoing(prev_kh, kh[index_.k() - 1])) {
- index_.DeleteIncoming(kh, i);
- }
- }
- }
-
-public:
- LinkCleaner(Index &index) : index_(index) {}
-
- //TODO make parallel
- void CleanLinks() {
- vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * omp_get_max_threads());
-# pragma omp parallel for schedule(guided)
- for(size_t i = 0; i < iters.size(); i++) {
- for (Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
- KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
- if (kh.is_minimal()) {
- for (char i = 0; i < 4; i++) {
- CleanForwardLinks(kh, i);
- CleanBackwardLinks(kh, i);
- }
- }
- }
- }
- }
-};
-
-class AlternativeEarlyTipClipper {
-private:
- typedef DeBruijnExtensionIndex<> Index;
- typedef Index::KMer Kmer;
- typedef Index::KeyWithHash KeyWithHash;
- Index &index_;
- size_t length_bound_;
-
- /*
- * This method starts from the kmer that is second in the tip, counting from the junction vertex, and records all kmers of the tip into the tip vector.
- * The method returns the length of the tip.
- * If the walk does not end as a tip, or if the tip is too long, the tip vector is cleared and an effectively infinite length is returned.
- * Thus the tip vector contains only kmers to be removed, while the returned length indicates what happened.
- */
- size_t FindForward(KeyWithHash kh, vector<KeyWithHash> &tip) {
- while(tip.size() < length_bound_ && index_.CheckUniqueIncoming(kh) && index_.CheckUniqueOutgoing(kh)) {
- tip.push_back(kh);
- kh = index_.GetUniqueOutgoing(kh);
- }
- tip.push_back(kh);
- if(index_.CheckUniqueIncoming(kh) && index_.IsDeadEnd(kh)) {
- return tip.size();
- }
- tip.clear();
- return -1;
- }
-
- size_t FindBackward(KeyWithHash kh, vector<KeyWithHash> &tip) {
- while(tip.size() < length_bound_ && index_.CheckUniqueOutgoing(kh) && index_.CheckUniqueIncoming(kh)) {
- tip.push_back(kh);
- kh = index_.GetUniqueIncoming(kh);
- }
- tip.push_back(kh);
- if(index_.CheckUniqueOutgoing(kh) && index_.IsDeadStart(kh)) {
- return tip.size();
- }
- tip.clear();
- return -1;
- }
-
- size_t RemoveTip(vector<KeyWithHash > &tip) {
- for(size_t i = 0; i < tip.size(); i++)
- index_.IsolateVertex(tip[i]);
- return tip.size();
- }
-
- size_t RemoveTips(vector<vector<KeyWithHash > > tips, size_t max) {
- size_t result = 0;
- for(char c = 0; c < 4; c++) {
- if(tips[c].size() < max) {
- result += RemoveTip(tips[c]);
- }
- }
- return result;
- }
-
- size_t RemoveForward(KeyWithHash kh) {
- vector<vector<KeyWithHash >> tips;
- tips.resize(4);
- size_t max = 0;
- for(char c = 0; c < 4; c++) {
- if(index_.CheckOutgoing(kh, c)) {
- KeyWithHash khc = index_.GetOutgoing(kh, c);
- size_t len = FindForward(khc, tips[c]);
- if(len > max)
- max = len;
- }
- }
- return RemoveTips(tips, max);
- }
-
- size_t RemoveBackward(KeyWithHash kh) {
- vector<vector<KeyWithHash >> tips;
- tips.resize(4);
- size_t max = 0;
- for(char c = 0; c < 4; c++) {
- if(index_.CheckIncoming(kh, c)) {
- KeyWithHash khc = index_.GetIncoming(kh, c);
- size_t len = FindBackward(khc, tips[c]);
- if(len > max)
- max = len;
- }
- }
- return RemoveTips(tips, max);
- }
-
- //TODO make parallel
- size_t RoughClipTips() {
- vector<Index::kmer_iterator> iters = index_.kmer_begin(10 * omp_get_max_threads());
- vector<size_t> result(iters.size());
-# pragma omp parallel for schedule(guided)
- for(size_t i = 0; i < iters.size(); i++) {
- for(Index::kmer_iterator &it = iters[i]; it.good(); ++it) {
- KeyWithHash kh = index_.ConstructKWH(runtime_k::RtSeq(index_.k(), *it));
- if(kh.is_minimal()) {
- if (index_.OutgoingEdgeCount(kh) >= 2) {
- result[i] += RemoveForward(kh);
- }
- if (index_.IncomingEdgeCount(kh) >= 2) {
- result[i] += RemoveBackward(kh);
- }
- }
- }
- }
- size_t sum = 0;
- for(size_t i = 0; i < result.size(); i++)
- sum += result[i];
- return sum;
- }
-
-
-public:
- AlternativeEarlyTipClipper(Index &index, size_t length_bound) : index_(index), length_bound_(length_bound) {
- }
-
- /*
- * Method returns the number of removed edges
- */
- size_t ClipTips() {
- INFO("Early tip clipping");
- size_t result = RoughClipTips();
- LinkCleaner(index_).CleanLinks();
- INFO(result << " " << (index_.k()+1) <<"-mers were removed by early tip clipper");
- return result;
- }
-protected:
- DECL_LOGGER("Early tip clipping");
-};
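To make the removal policy of RemoveTips above concrete: among the candidate branches collected at a junction, every branch strictly shorter than the longest one is treated as a tip and erased, and the longest branch survives. A toy, index-free sketch of that rule:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // three outgoing branches of lengths 2, 1 and 3 (in k-mers)
    std::vector<std::vector<int>> branches = {{1, 2}, {3}, {4, 5, 6}};
    size_t max_len = 0;
    for (const auto &b : branches) max_len = std::max(max_len, b.size());
    size_t removed = 0;
    for (auto &b : branches) {
        if (b.size() < max_len) {   // strictly shorter than the longest: a tip
            removed += b.size();
            b.clear();
        }
    }
    std::cout << "removed " << removed << " k-mers, kept a branch of length "
              << max_len << "\n";   // removed 3 k-mers, kept a branch of length 3
}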
-
-}
diff --git a/src/modules/data_structures/indices/edge_index_builders.hpp b/src/modules/data_structures/indices/edge_index_builders.hpp
deleted file mode 100644
index 5281bbc..0000000
--- a/src/modules/data_structures/indices/edge_index_builders.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "edge_position_index.hpp"
-#include "perfect_hash_map_builder.hpp"
-
-namespace debruijn_graph {
-
-template<class Index>
-class GraphPositionFillingIndexBuilder {
-public:
- typedef Index IndexT;
- typedef typename Index::KMer Kmer;
-
- template<class Graph>
- void BuildIndexFromGraph(Index &index,
- const Graph/*T*/ &g, size_t read_buffer_size = 0) const {
- debruijn_graph::BuildIndexFromGraph(index, g, read_buffer_size);
-
- // Now use the index to fill the coverage and EdgeId's
- INFO("Collecting k-mer coverage information from graph, this takes a while.");
- EdgeInfoUpdater<Index, Graph> updater(g, index);
- updater.UpdateAll();
- }
-
-};
-
-template<typename> struct Void { typedef void type; };
-
-template<typename T, typename Sfinae = void>
-struct has_contains: std::false_type {};
-
-template<typename T>
-struct has_contains<
- T
- , typename Void<
- //decltype( std::declval<T&>().contains(typename T::KMerIdx(0), typename T::KMer()) )
- decltype( ((T*)(0))->contains(*((typename T::KeyWithHash*)(0))) )
- >::type
->: std::true_type {};
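The has_contains trait above is a member-detection idiom: the partial specialization participates only when the decltype expression inside Void is well-formed, i.e. when T has a contains(KeyWithHash) member. A minimal self-contained example of the same idiom, using an invented member name foo:

#include <type_traits>
#include <utility>

template<typename> struct VoidT { typedef void type; };

template<typename T, typename = void>
struct has_foo : std::false_type {};

template<typename T>
struct has_foo<T, typename VoidT<decltype(std::declval<T&>().foo(0))>::type>
    : std::true_type {};

struct WithFoo { void foo(int) {} };
struct WithoutFoo {};

static_assert(has_foo<WithFoo>::value, "WithFoo::foo(int) is detected");
static_assert(!has_foo<WithoutFoo>::value, "WithoutFoo has no foo member");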
-
-template <class Builder>
-class CoverageFillingEdgeIndexBuilder : public Builder {
- typedef Builder base;
- public:
- typedef typename Builder::IndexT IndexT;
- typedef typename IndexT::KMer Kmer;
- typedef typename IndexT::KMerIdx KmerIdx;
- typedef typename IndexT::KeyWithHash KeyWithHash;
-
- private:
-
-
- bool ContainsWrap(bool check_contains, IndexT& index, const KeyWithHash &kwh, std::true_type) const {
- return !check_contains || index.contains(kwh);
- }
-
- bool ContainsWrap(bool /*check_contains*/, IndexT&/* index*/, const KeyWithHash &/*kwh*/, std::false_type) const {
- VERIFY(false);
-// VERIFY(!check_contains);
- return true;
- }
-
- template<class ReadStream>
- size_t FillCoverageFromStream(ReadStream &stream,
- IndexT &index, bool check_contains) const {
- unsigned k = index.k();
- size_t rl = 0;
-
- while (!stream.eof()) {
- typename ReadStream::ReadT r;
- stream >> r;
- rl = std::max(rl, r.size());
-
- const Sequence &seq = r.sequence();
- if (seq.size() < k)
- continue;
-
- KeyWithHash kwh = index.ConstructKWH(seq.start<Kmer>(k) >> 'A');
- for (size_t j = k - 1; j < seq.size(); ++j) {
- kwh <<= seq[j];
- //contains is not used since the index might still be empty here
- if (kwh.is_minimal() && index.valid(kwh) && ContainsWrap(check_contains, index, kwh, has_contains<IndexT>())) {
-# pragma omp atomic
- index.get_raw_value_reference(kwh).count += 1;
- }
- }
- }
-
- return rl;
- }
-
- public:
-
- template<class Streams>
- size_t ParallelFillCoverage(IndexT &index,
- Streams &streams,
- bool check_contains = true) const {
- INFO("Collecting k-mer coverage information from reads, this takes a while.");
- unsigned nthreads = (unsigned) streams.size();
- size_t rl = 0;
- streams.reset();
-#pragma omp parallel for num_threads(nthreads) shared(rl)
- for (size_t i = 0; i < nthreads; ++i) {
- size_t crl = FillCoverageFromStream(streams[i], index, check_contains);
-
- // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :(
-#pragma omp flush(rl)
- if (crl > rl)
-#pragma omp critical
- {
- rl = std::max(rl, crl);
- }
- }
-
- // Contigs have zero coverage!
-#if 0
- if (contigs_stream) {
- contigs_stream->reset();
- FillCoverageFromStream(*contigs_stream, index, check_contains);
- }
-#endif
-
-//todo if this verify is needed, put it outside
-//#ifndef NDEBUG
-// for (auto idx = index.kmer_idx_begin(), eidx = index.kmer_idx_end();
-// idx != eidx; ++idx) {
-//
-// Kmer k = index.kmer(idx);
-//
-// VERIFY(index[k].count == index[!k].count);
-// }
-//#endif
-
- return rl;
- }
-
- template<class Streams>
- size_t BuildIndexFromStream(IndexT &index,
- Streams &streams,
- io::SingleStream* contigs_stream = 0) const {
- debruijn_graph::BuildIndexFromStream(index, streams, contigs_stream);
-
- return ParallelFillCoverage(index, streams, false);
- }
-
-// template<class Streams>
-// size_t BuildIndexWithCoverageFromGraph(
-// GraphT &graph, IndexT &index,
-// Streams &streams,
-// SingleReadStream* contigs_stream = 0) const {
-// this->BuildIndexFromGraph(index, graph);
-//
-// return ParallelFillCoverage(index, streams, contigs_stream, true);
-// }
-};
-
-template<class Index>
-struct EdgeIndexHelper {
- typedef typename Index::KMer Kmer;
- typedef typename Index::KMerIdx KMerIdx;
- typedef typename Index::traits_t traits_t;
- typedef CoverageFillingEdgeIndexBuilder<Index> CoverageFillingEdgeIndexBuilderT;
- typedef GraphPositionFillingIndexBuilder<Index> GraphPositionFillingIndexBuilderT;
- typedef CoverageFillingEdgeIndexBuilder<GraphPositionFillingIndexBuilderT> CoverageAndGraphPositionFillingIndexBuilderT;
-};
-
-}
diff --git a/src/modules/data_structures/indices/edge_info_updater.hpp b/src/modules/data_structures/indices/edge_info_updater.hpp
deleted file mode 100644
index ce957f6..0000000
--- a/src/modules/data_structures/indices/edge_info_updater.hpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-#include "dev_support/openmp_wrapper.h"
-#include "modules/data_structures/sequence/sequence.hpp"
-#include "modules/assembly_graph/graph_core/graph_iterators.hpp"
-
-namespace debruijn_graph {
-
-template<typename Index, typename Graph>
-class EdgeInfoUpdater {
- typedef typename Index::KMer Kmer;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Index::KeyWithHash KeyWithHash;
- typedef typename Index::Value EdgeInfo;
-
- const Graph &g_;
- Index &index_;
-
-
- void PutInIndex(const KeyWithHash &kwh, EdgeId id, size_t offset) {
- if (index_.valid(kwh)) {
- auto &entry = index_.get_raw_value_reference(kwh);
- if (!entry.valid() || index_.contains(kwh)) {
- index_.put_value(kwh, EdgeInfo(id, (unsigned)offset, entry.count));
- }
- }
- }
-
- //todo why do we need to check equality???!!!
- bool DeleteIfEqual(const KeyWithHash &kwh, EdgeId e) {
- if (!index_.contains(kwh))
- return false;
- if (index_.get_value(kwh).edge_id == e) {
- index_.get_raw_value_reference(kwh).invalidate();
- return true;
- }
- return false;
- }
-
- void UpdateKMers(const Sequence &nucls, EdgeId e) {
- VERIFY(nucls.size() >= index_.k());
- KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
- index_.PutInIndex(kwh, e, 0);
- for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
- kwh <<= nucls[i];
- index_.PutInIndex(kwh, e, i - index_.k() + 1);
- }
- }
-
- void DeleteKMers(const Sequence &nucls, EdgeId e) {
- VERIFY(nucls.size() >= index_.k());
- KeyWithHash kwh = index_.ConstructKWH(Kmer(index_.k(), nucls));
- DeleteIfEqual(kwh, e);
- for (size_t i = index_.k(), n = nucls.size(); i < n; ++i) {
- kwh <<= nucls[i];
- DeleteIfEqual(kwh, e);
- }
- }
-
- public:
- /**
- * Creates an EdgeInfoUpdater for the specified graph and index
- * @param g graph to be indexed
- * @param index index to be synchronized with graph
- */
- EdgeInfoUpdater(const Graph& g, Index& index)
- : g_(g),
- index_(index) {
- }
-
- void UpdateKmers(EdgeId e) {
- Sequence nucls = g_.EdgeNucls(e);
- UpdateKMers(nucls, e);
- }
-
- void DeleteKmers(EdgeId e) {
- Sequence nucls = g_.EdgeNucls(e);
- DeleteKMers(nucls, e);
- }
-
- void UpdateAll() {
- unsigned nthreads = omp_get_max_threads();
-
- omnigraph::IterationHelper<Graph, EdgeId> edges(g_);
- auto iters = edges.Chunks(16 * nthreads);
-
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < iters.size() - 1; ++i) {
- TRACE("Processing chunk #" << i);
- for (auto it = iters[i]; it != iters[i + 1]; ++it) {
- UpdateKmers(*it);
- }
- }
- }
-
- private:
- DECL_LOGGER("EdgeInfoUpdater")
-};
-
-}
diff --git a/src/modules/data_structures/indices/edge_multi_index.hpp b/src/modules/data_structures/indices/edge_multi_index.hpp
deleted file mode 100644
index c514e55..0000000
--- a/src/modules/data_structures/indices/edge_multi_index.hpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "perfect_hash_map.hpp"
-#include "edge_info_updater.hpp"
-#include "edge_position_index.hpp"
-
-#include <folly/SmallLocks.h>
-
-namespace debruijn_graph {
-
-template<class IdType>
-class EdgeInfoStorage {
-public:
- typedef vector<EdgeInfo<IdType>> Content;
- typedef typename Content::iterator iterator;
- typedef typename Content::const_iterator const_iterator;
- Content content_;
- folly::MicroSpinLock lock_;
-
- EdgeInfoStorage(const Content &content) : content_(content) {
- lock_.init();
- }
-
- EdgeInfoStorage() {
- lock_.init();
- }
-
- EdgeInfo<IdType> &operator[](size_t i) {
- return content_[i];
- }
-
- iterator begin() {
- return content_.begin();
- }
-
- iterator end() {
- return content_.end();
- }
-
- const_iterator begin() const {
- return content_.cbegin();
- }
-
- const_iterator end() const {
- return content_.cend();
- }
-
- iterator find(const EdgeInfo<IdType> &info) {
- return content_.find(info);
- }
-
- const_iterator find(const EdgeInfo<IdType> &info) const {
- return content_.find(info);
- }
-
- void push_back(const EdgeInfo<IdType> &info) {
- folly::MSLGuard g(lock_);
- content_.push_back(info);
- }
-
- template<class... Args>
- void emplace_back(Args&&... args) {
- folly::MSLGuard g(lock_);
- content_.emplace_back(std::forward<Args>(args)...);
- }
-
- size_t size() const{
- return content_.size();
- }
-
- bool valid() const {
- //what's invalid edge info storage?
- return true;
- }
-
- EdgeInfoStorage conjugate(size_t k) const {
- EdgeInfoStorage result;
- for(auto it = content_.rbegin(); it != content_.rend(); ++it) {
- result.push_back(it->conjugate(k));
- }
- return result;
- }
-};
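EdgeInfoStorage above guards its appends with a folly::MicroSpinLock, presumably so that several threads can push edge occurrences into the same bucket concurrently. A reduced sketch of that pattern using only the standard library (std::mutex in place of the spinlock; the class name is illustrative):

#include <cstddef>
#include <mutex>
#include <vector>

template<class T>
class LockedVector {
    std::vector<T> content_;
    std::mutex lock_;
public:
    // analogous to EdgeInfoStorage::push_back guarded by folly::MSLGuard
    void push_back(const T &value) {
        std::lock_guard<std::mutex> g(lock_);
        content_.push_back(value);
    }
    std::size_t size() const { return content_.size(); }
};

int main() {
    LockedVector<int> v;
    v.push_back(42);   // safe to call concurrently from several threads
    return 0;
}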
-
-//todo it is not handling graph events!!!
-template<class IdType, class Seq = runtime_k::RtSeq,
- class traits = kmer_index_traits<Seq>, class StoringType = SimpleStoring >
-class DeBruijnEdgeMultiIndex : public KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > {
- typedef KeyStoringMap<Seq, EdgeInfoStorage<IdType>, traits, StoringType > base;
- public:
- typedef StoringType storing_type;
- typedef typename base::traits_t traits_t;
- typedef typename base::KMer KMer;
- typedef typename base::KMerIdx KMerIdx;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef EdgeInfoStorage<IdType> Value;
-
- using base::ConstructKWH;
-// typedef typename base::IdType IdType;
- //todo move this typedef up in hierarchy (need some c++ tricks)
-
- DeBruijnEdgeMultiIndex(unsigned k, const std::string &workdir)
- : base(k, workdir) {
- INFO("Constructing multi-kmer index");
- }
-
- ~DeBruijnEdgeMultiIndex() {}
-
-
- Value get(const KeyWithHash &kwh) const {
- VERIFY(contains(kwh));
- return base::get_value(kwh);
- }
-
- bool contains(const KeyWithHash &kwh) const {
- if (!base::valid(kwh))
- return false;
- return this->get_raw_value_reference(kwh).valid();
- }
-
- bool valid(const KMer &kmer) const {
- KeyWithHash kwh = base::ConstructKWH(kmer);
- return base::valid(kwh);
- }
-
- void PutInIndex(const KeyWithHash &kwh, IdType id, size_t offset) {
- if (!contains(kwh))
- return;
-
- EdgeInfoStorage<IdType> &entry = this->get_raw_value_reference(kwh);
- entry.emplace_back(id, (unsigned int)offset);
- }
-
- const EdgeInfoStorage<IdType> get(const KMer& kmer) const {
- auto kwh = base::ConstructKWH(kmer);
- auto entry = this->get_value(kwh);
- return entry;
- }
-
- //todo delete if equal seems to work improperly!!!
- bool DeleteIfEqual(const KeyWithHash &, IdType) {
- VERIFY(false);
- return false;
- }
-
-};
-
-}
diff --git a/src/modules/data_structures/indices/edge_position_index.hpp b/src/modules/data_structures/indices/edge_position_index.hpp
deleted file mode 100644
index 76f3502..0000000
--- a/src/modules/data_structures/indices/edge_position_index.hpp
+++ /dev/null
@@ -1,184 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "perfect_hash_map.hpp"
-#include "edge_info_updater.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
-#include "modules/io/reads/single_read.hpp"
-
-namespace debruijn_graph {
-
-template<class IdType>
-struct EdgeInfo {
- IdType edge_id;
- unsigned offset;
- unsigned count;
-
- EdgeInfo(IdType edge_id_ = IdType(), unsigned offset_ = -1u, unsigned count_ = 0) :
- edge_id(edge_id_), offset(offset_), count(count_) { }
-
- template<class KWH>
- EdgeInfo conjugate(const KWH &kwh) const {
- return conjugate(kwh.key().size());
- }
-
- EdgeInfo conjugate(size_t k) const {
- if(!valid()) {
- return EdgeInfo(IdType(0), unsigned(-1), count);
- } else {
- return EdgeInfo(edge_id->conjugate(), (unsigned)edge_id->length(k) - offset, count);
- }
- }
-
- void invalidate() {
- offset = unsigned(-1);
- }
-
- bool valid() const {
- return offset != unsigned(-1);
- }
-};
-
-template<class stream, class IdType>
-stream &operator<<(stream &s, const EdgeInfo<IdType> &info) {
- return s << "EdgeInfo[" << info.edge_id << ", " << info.offset << ", " << info.count << "]";
-}
-
-template<class Graph, class Seq = runtime_k::RtSeq, class traits = kmer_index_traits<Seq>, class StoringType = DefaultStoring>
-class KmerFreeEdgeIndex : public KeyIteratingMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> {
- typedef KeyIteratingMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> base;
- const Graph &graph_;
-
-public:
- typedef typename base::traits_t traits_t;
- typedef StoringType storing_type;
- typedef typename base::KMer KMer;
- typedef typename base::KMerIdx KMerIdx;
- typedef Graph GraphT;
- typedef typename Graph::EdgeId IdType;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef EdgeInfo<typename Graph::EdgeId> Value;
- using base::valid;
- using base::ConstructKWH;
-
-public:
-
- KmerFreeEdgeIndex(const Graph &graph, const std::string &workdir)
- : base(unsigned(graph.k() + 1), workdir), graph_(graph) {}
-
- /**
- * Returns whether the kmer has an entry associated with it
- */
- bool contains(const KeyWithHash &kwh) const {
- // Sanity check
- if (!valid(kwh))
- return false;
-
- Value entry = base::get_value(kwh);
- if (entry.offset == -1u)
- return false;
-
- return graph_.EdgeNucls(entry.edge_id).contains(kwh.key(), entry.offset);
- }
-
- void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
- if (!valid(kwh))
- return;
-
- auto &entry = this->get_raw_value_reference(kwh);
- if (!entry.valid() || contains(kwh)) {
- this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
- }
- }
-
- //Only coverage is loaded
- template<class Writer>
- void BinWrite(Writer &writer) const {
- this->index_.serialize(writer);
- size_t sz = this->data_.size();
- writer.write((char*)&sz, sizeof(sz));
- for (size_t i = 0; i < sz; ++i)
- writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string/* &FileName*/) {
- this->clear();
- this->index_.deserialize(reader);
- size_t sz = 0;
- reader.read((char*)&sz, sizeof(sz));
- this->data_.resize(sz);
- for (size_t i = 0; i < sz; ++i)
- reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- }
-};
-
-template<class Graph, class Seq = runtime_k::RtSeq, class traits = kmer_index_traits<Seq>, class StoringType = DefaultStoring>
-class KmerStoringEdgeIndex : public KeyStoringMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> {
- typedef KeyStoringMap<Seq, EdgeInfo<typename Graph::EdgeId>, traits, StoringType> base;
-
-public:
- typedef typename base::traits_t traits_t;
- typedef StoringType storing_type;
- typedef typename base::KMer KMer;
- typedef typename base::KMerIdx KMerIdx;
- typedef Graph GraphT;
- typedef typename Graph::EdgeId IdType;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef EdgeInfo<typename Graph::EdgeId> Value;
- using base::valid;
- using base::ConstructKWH;
-
-
- KmerStoringEdgeIndex(const Graph& g, const std::string &workdir)
- : base(unsigned(g.k() + 1), workdir) {}
-
- ~KmerStoringEdgeIndex() {}
-
- /**
- * Shows if kmer has some entry associated with it
- */
- bool contains(const KeyWithHash &kwh) const {
- if (!base::valid(kwh))
- return false;
- return this->get_raw_value_reference(kwh).valid();
- }
-
- template<class Writer>
- void BinWrite(Writer &writer) const {
- this->index_.serialize(writer);
- size_t sz = this->data_.size();
- writer.write((char*)&sz, sizeof(sz));
- for (size_t i = 0; i < sz; ++i)
- writer.write((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- this->BinWriteKmers(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &FileName) {
- this->clear();
- this->index_.deserialize(reader);
- size_t sz = 0;
- reader.read((char*)&sz, sizeof(sz));
- this->data_.resize(sz);
- for (size_t i = 0; i < sz; ++i)
- reader.read((char*)&(this->data_[i].count), sizeof(this->data_[0].count));
- this->BinReadKmers(reader, FileName);
- }
- void PutInIndex(KeyWithHash &kwh, IdType id, size_t offset) {
- if (valid(kwh)) {
- auto &entry = this->get_raw_value_reference(kwh);
- if (!entry.valid() || contains(kwh)) {
- this->put_value(kwh, Value(id, (unsigned)offset, entry.count));
- }
- }
- }
-};
-
-}
diff --git a/src/modules/data_structures/indices/editable_index.hpp b/src/modules/data_structures/indices/editable_index.hpp
deleted file mode 100644
index 204bf3f..0000000
--- a/src/modules/data_structures/indices/editable_index.hpp
+++ /dev/null
@@ -1,270 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "perfect_hash_map.hpp"
-
-namespace debruijn_graph {
-
-//template<class ValueType, class traits>
-//class EditableDeBruijnKMerIndex: public DeBruijnKMerIndex<ValueType, traits> {
-//public:
-// typedef size_t KMerIdx;
-//private:
-// typedef typename traits::SeqType KMer;
-// typedef KMerIndex<traits> KMerIndexT;
-// typedef ValueType KMerIndexValueType;
-// typedef std::vector<KMerIndexValueType> KMerIndexStorageType;
-// typedef boost::bimap<KMer, size_t> KMerPushBackIndexType;
-//
-// KMerPushBackIndexType push_back_index_;
-// KMerIndexStorageType push_back_buffer_;
-//
-// using DeBruijnKMerIndex<ValueType, traits>::index_;
-// using DeBruijnKMerIndex<ValueType, traits>::data_;
-// using DeBruijnKMerIndex<ValueType, traits>::kmers;
-// using DeBruijnKMerIndex<ValueType, traits>::K_;
-// using DeBruijnKMerIndex<ValueType, traits>::InvalidKMerIdx;
-//public:
-// EditableDeBruijnKMerIndex(unsigned K, const std::string &workdir) :
-// DeBruijnKMerIndex<ValueType, traits>(K, workdir) {
-// }
-//
-// KMerIdx seq_idx(const KMer &s) const {
-// KMerIdx idx = index_.seq_idx(s);
-//
-// // First, check whether we're insert index itself.
-// if (contains(idx, s, /* check push back */false))
-// return idx;
-//
-// // Maybe we're inside push_back buffer then?
-// auto it = push_back_index_.left.find(s);
-// if (it != push_back_index_.left.end())
-// return data_.size() + it->second;
-//
-// return InvalidKMerIdx;
-// }
-//
-// KMerIndexValueType &operator[](const KMer &s) {
-// return operator[](index_.seq_idx(s));
-// }
-//
-// const KMerIndexValueType &operator[](const KMer &s) const {
-// return operator[](index_.seq_idx(s));
-// }
-//
-//
-// const KMerIndexValueType &operator[](KMerIdx idx) const {
-// if (idx < this->data_.size())
-// return this->data_[idx];
-// return push_back_buffer_[idx - this->data_.size()];
-// }
-//
-// KMerIndexValueType &operator[](KMerIdx idx) {
-// if (idx < this->data_.size())
-// return this->data_[idx];
-//
-// return push_back_buffer_[idx - this->data_.size()];
-// }
-//
-// size_t size() const {
-// return this->data_.size() + push_back_buffer_.size();
-// }
-//
-// bool contains(const KMer &k) const {
-// KMerIdx idx = seq_idx(k);
-//
-// return idx != InvalidKMerIdx;
-// }
-// bool contains(KMerIdx idx) const {
-// return idx < size();
-// }
-//
-// size_t insert(const KMer &s, const KMerIndexValueType &value) {
-// size_t idx = push_back_buffer_.size();
-// push_back_index_.insert(
-// typename KMerPushBackIndexType::value_type(s, idx));
-// push_back_buffer_.push_back(value);
-//
-// return idx;
-// }
-//
-// KMer kmer(KMerIdx idx) const {
-// VERIFY(contains(idx));
-//
-// if (idx < this->data_.size()) {
-// auto it = kmers->begin() + idx;
-// return (typename traits::raw_create()(K_, *it));
-// }
-//
-// idx -= this->data_.size();
-// return push_back_index_.right.find(idx)->second;
-// }
-//
-// template<class Writer>
-// void BinWrite(Writer &writer) const {
-// index_.serialize(writer);
-// size_t sz = this->data_.size();
-// writer.write((char*) &sz, sizeof(sz));
-// writer.write((char*) &this->data_[0], sz * sizeof(data_[0]));
-// sz = push_back_buffer_.size();
-// writer.write((char*) &sz, sizeof(sz));
-// writer.write((char*) &push_back_buffer_[0],
-// sz * sizeof(push_back_buffer_[0]));
-// for (auto it = push_back_index_.left.begin(), e =
-// push_back_index_.left.end(); it != e; ++it) {
-// size_t idx = it->second;
-// KMer::BinWrite(writer, it->first);
-// writer.write((char*) &idx, sizeof(idx));
-// sz -= 0;
-// }
-// VERIFY(sz == 0);
-// traits::raw_serialize(writer, kmers);
-// }
-//
-// template<class Reader>
-// void BinRead(Reader &reader, const std::string &FileName) {
-// clear();
-// index_.deserialize(reader);
-// size_t sz = 0;
-// reader.read((char*) &sz, sizeof(sz));
-// data_.resize(sz);
-// reader.read((char*) &data_[0], sz * sizeof(data_[0]));
-// reader.read((char*) &sz, sizeof(sz));
-// push_back_buffer_.resize(sz);
-// reader.read((char*) &push_back_buffer_[0],
-// sz * sizeof(push_back_buffer_[0]));
-// for (size_t i = 0; i < sz; ++i) {
-// KMer s(K_);
-// size_t idx;
-//
-// s.BinRead(reader);
-// reader.read((char*) &idx, sizeof(idx));
-//
-// push_back_index_.insert(
-// typename KMerPushBackIndexType::value_type(s, idx));
-// }
-//
-// kmers = traits::raw_deserialize(reader, FileName);
-// }
-//
-// void clear() {
-// index_.clear();
-// this->data_.clear();
-// KMerIndexStorageType().swap(data_);
-// push_back_index_.clear();
-// push_back_buffer_.clear();
-// delete kmers;
-// kmers = NULL;
-// }
-//
-//protected:
-// bool contains(KMerIdx idx, const KMer &k,
-// bool check_push_back = true) const {
-// // Sanity check
-// if (idx == InvalidKMerIdx || idx >= size())
-// return false;
-//
-// if (idx < data_.size()) {
-// auto it = kmers->begin() + idx;
-// return (typename traits::raw_equal_to()(k, *it));
-// }
-//
-// if (check_push_back) {
-// auto it = push_back_index_.right.find(idx - data_.size());
-// return (it != push_back_index_.right.end() && it->second == k);
-// }
-//
-// return false;
-// }
-//
-//};
-
-//template <class kmer_index_traits>
-//class EditableDeBruijnKMerIndexBuilder {
-// public:
-// template <class IdType, class Read>
-// size_t BuildIndexFromStream(EditableDeBruijnKMerIndex<IdType, kmer_index_traits> &index,
-// io::ReadStreamVector<io::IReader<Read> > &streams,
-// SingleReadStream* contigs_stream = 0) const;
-//
-// template <class IdType, class Graph>
-// void BuildIndexFromGraph(EditableDeBruijnKMerIndex<IdType, kmer_index_traits> &index,
-// const Graph &g) const;
-//
-// protected:
-// template <class KMerCounter, class Index>
-// void SortUniqueKMers(KMerCounter &counter, Index &index) const;
-//
-// protected:
-// DECL_LOGGER("K-mer Index Building");
-//};
-
-//template <>
-//class EditableDeBruijnKMerIndexBuilder<kmer_index_traits<runtime_k::RtSeq>> {
-// public:
-// template <class IdType, class Read>
-// size_t BuildIndexFromStream(EditableDeBruijnKMerIndex<IdType, kmer_index_traits<runtime_k::RtSeq>> &index,
-// io::ReadStreamVector<io::IReader<Read> > &streams,
-// SingleReadStream* contigs_stream = 0) const {
-// DeBruijnReadKMerSplitter<Read> splitter(index.workdir(),
-// index.K(), 0,
-// streams, contigs_stream);
-// KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
-// KMerIndexBuilder<typename DeBruijnKMerIndex<IdType, kmer_index_traits<runtime_k::RtSeq>>::KMerIndexT> builder(index.workdir(), 16, streams.size());
-// size_t sz = builder.BuildIndex(index.index_, counter, /* save final */ true);
-// index.data_.resize(sz);
-//
-// if (!index.kmers)
-// index.kmers = counter.GetFinalKMers();
-//
-// SortUniqueKMers(counter, index);
-//
-// return 0;
-// }
-//
-// template <class IdType, class Graph>
-// void BuildIndexFromGraph(EditableDeBruijnKMerIndex<IdType, runtime_k::RtSeq> &index,
-// const Graph &g) const {
-// DeBruijnGraphKMerSplitter<Graph> splitter(index.workdir(), index.K(), g);
-// KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
-// KMerIndexBuilder<typename DeBruijnKMerIndex<typename Graph::EdgeId, kmer_index_traits<runtime_k::RtSeq>>::KMerIndexT> builder(index.workdir(), 16, 1);
-// size_t sz = builder.BuildIndex(index.index_, counter, /* save final */ true);
-// index.data_.resize(sz);
-//
-// if (!index.kmers)
-// index.kmers = counter.GetFinalKMers();
-//
-// SortUniqueKMers(counter, index);
-// }
-//
-// protected:
-// template <class KMerCounter, class Index>
-// void SortUniqueKMers(KMerCounter &counter, Index &index) const {
-// size_t swaps = 0;
-// INFO("Arranging kmers in hash map order");
-// for (auto I = index.kmers->begin(), E = index.kmers->end(); I != E; ++I) {
-// size_t cidx = I - index.kmers->begin();
-// size_t kidx = index.raw_seq_idx(*I);
-// while (cidx != kidx) {
-// auto J = index.kmers->begin() + kidx;
-// using std::swap;
-// swap(*I, *J);
-// swaps += 1;
-//
-// kidx = index.raw_seq_idx(*I);
-// }
-// }
-// INFO("Done. Total swaps: " << swaps);
-// }
-//
-// protected:
-// DECL_LOGGER("K-mer Index Building");
-//};
-
-}
diff --git a/src/modules/data_structures/indices/key_with_hash.hpp b/src/modules/data_structures/indices/key_with_hash.hpp
deleted file mode 100644
index 81026ae..0000000
--- a/src/modules/data_structures/indices/key_with_hash.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "storing_traits.hpp"
-
-namespace debruijn_graph {
-
-template<typename Key, class HashFunction>
-class SimpleKeyWithHash {
-public:
- typedef Key KeyType;
-private:
- typedef typename HashFunction::IdxType IdxType;
- const HashFunction &hash_;
- Key key_;
- mutable IdxType idx_; //lazy computation
- mutable bool ready_;
-
- void CountIdx() const {
- ready_ = true;
- idx_ = hash_.seq_idx(key_);
- }
-
- void SetKey(const Key &key) {
- ready_ = false;
- key_ = key;
- }
-public:
-
- SimpleKeyWithHash(Key key, const HashFunction &hash) : hash_(hash), key_(key), idx_(0), ready_(false) {
- }
-
- Key key() const {
- return key_;
- }
-
- IdxType idx() const {
- if(!ready_) {
- CountIdx();
- }
- return idx_;
- }
-
- SimpleKeyWithHash &operator=(const SimpleKeyWithHash &that) {
- VERIFY(&this->hash_ == &that.hash_);
- this->key_= that.key_;
- this->idx_ = that.idx_;
- this->ready_ = that.ready_;
- return *this;
- }
-
- bool operator==(const SimpleKeyWithHash &that) const {
- VERIFY(&this->hash_ == &that.hash_);
- return this->key_ == that.key_;
- }
-
- bool operator!=(const SimpleKeyWithHash &that) const {
- VERIFY(&this->hash_ == &that.hash_);
- return this->key_ != that.key_;
- }
-
- SimpleKeyWithHash operator!() const {
- return SimpleKeyWithHash(!key_, hash_);
- }
-
- SimpleKeyWithHash operator<<(char nucl) const {
- return SimpleKeyWithHash(key_ << nucl, hash_);
- }
-
- SimpleKeyWithHash operator>>(char nucl) const {
- return SimpleKeyWithHash(key_ >> nucl, hash_);
- }
-
- void operator<<=(char nucl) {
- SetKey(key_ << nucl);
- }
-
- void operator>>=(char nucl) {
- SetKey(key_ >> nucl);
- }
-
- char operator[](size_t i) const {
- return key_[i];
- }
-
- bool is_minimal() const {
- return true;
- }
-};
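SimpleKeyWithHash above defers the hash computation: idx_ and ready_ are mutable, so the index is computed on the first call to idx() and cached for later const accesses. A stripped-down sketch of that lazy-caching pattern (hypothetical class, arbitrary stand-in hash):

#include <iostream>

class LazyHashed {
    int key_;
    mutable long idx_;    // cached result
    mutable bool ready_;  // has idx_ been computed yet?

    long Compute() const {
        std::cout << "computing hash once\n";
        return key_ * 31L;  // stand-in for hash_.seq_idx(key_)
    }

public:
    explicit LazyHashed(int key) : key_(key), idx_(0), ready_(false) {}

    long idx() const {
        if (!ready_) {
            idx_ = Compute();
            ready_ = true;
        }
        return idx_;
    }
};

int main() {
    LazyHashed h(7);
    std::cout << h.idx() << "\n";  // triggers the computation
    std::cout << h.idx() << "\n";  // served from the cache
}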
-
-template<class stream, class Key, class Index>
-stream &operator<<(stream &s, const SimpleKeyWithHash<Key, Index> &kwh) {
- return s << "SKWH[" << kwh.key() << ", " << kwh.idx() << "]";
-}
-
-//Would it make sense to also store the inverted kmer for a non-minimal kwh?
-template<typename Key, class HashFunction>
-class InvertableKeyWithHash {
-private:
- typedef typename HashFunction::IdxType IdxType;
-
- const HashFunction &hash_;
- Key key_;
- mutable IdxType idx_; //lazy computation
- mutable bool is_minimal_;
- mutable bool ready_;
-
- void CountIdx() const {
- ready_ = true;
- is_minimal_ = key_.IsMinimal();
- if(is_minimal_)
- idx_ = hash_.seq_idx(key_);
- else{
- idx_ = hash_.seq_idx(!key_);
- }
- }
-
- InvertableKeyWithHash(Key key, const HashFunction &hash, bool is_minimal,
- size_t idx, bool ready)
- : hash_(hash), key_(key), idx_(idx),
- is_minimal_(is_minimal), ready_(ready) {
- }
- public:
-
- InvertableKeyWithHash(Key key, const HashFunction &hash)
- : hash_(hash), key_(key), idx_(0), is_minimal_(false), ready_(false) {}
-
- const Key &key() const {
- return key_;
- }
-
- IdxType idx() const {
- if (!ready_)
- CountIdx();
-
- return idx_;
- }
-
- bool is_minimal() const {
- if(!ready_) {
- return key_.IsMinimal();
- }
- return is_minimal_;
- }
-
- bool ready() const {
- return ready_;
- }
-
- InvertableKeyWithHash &operator=(const InvertableKeyWithHash &that) {
- VERIFY(&this->hash_ == &that.hash_);
- this->key_= that.key_;
- this->idx_ = that.idx_;
- this->ready_ = that.ready_;
- this->is_minimal_ = that.is_minimal_;
- return *this;
- }
-
- bool operator==(const InvertableKeyWithHash &that) const {
- VERIFY(&this->hash_ == &that.hash_);
- return this->key_ == that.key_;
- }
-
- bool operator!=(const InvertableKeyWithHash &that) const {
- VERIFY(&this->hash_ == &that.hash_);
- return this->key_ != that.key_;
- }
-
- InvertableKeyWithHash operator!() const {
- if(!ready_)
- return InvertableKeyWithHash(!key_, hash_);
- return InvertableKeyWithHash(!key_, hash_, !is_minimal_, idx_, ready_);
- }
-
- InvertableKeyWithHash operator<<(char nucl) const {
- return InvertableKeyWithHash(key_ << nucl, hash_);
- }
-
- InvertableKeyWithHash operator>>(char nucl) const {
- return InvertableKeyWithHash(key_ >> nucl, hash_);
- }
-
- void operator<<=(char nucl) {
- key_ <<= nucl;
- ready_ = false;
- }
-
- void operator>>=(char nucl) {
- key_ >>= nucl;
- ready_ = false;
- }
-
- char operator[](size_t i) const {
- return key_[i];
- }
-};
-
-template<class stream, class Key, class Index>
-stream &operator<<(stream &s, const InvertableKeyWithHash<Key, Index> &kwh) {
- s << "IKWH[" << kwh.key();
- if(kwh.ready()) {
- return s << ", " << kwh.is_minimal() << ", " << kwh.idx() << "]";
- } else {
- return s << ", not ready]";
- }
-}
-
-template<class K, class Index, class StoringType>
-struct StoringTraits;
-
-template<class K, class Index>
-struct StoringTraits<K, Index, SimpleStoring> {
- typedef SimpleKeyWithHash<K, Index> KeyWithHash;
-};
-
-template<class K, class Index>
-struct StoringTraits<K, Index, InvertableStoring> {
- typedef InvertableKeyWithHash<K, Index> KeyWithHash;
-};
-
-}
diff --git a/src/modules/data_structures/indices/kmer_extension_index.hpp b/src/modules/data_structures/indices/kmer_extension_index.hpp
deleted file mode 100644
index 9e7cc55..0000000
--- a/src/modules/data_structures/indices/kmer_extension_index.hpp
+++ /dev/null
@@ -1,309 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "perfect_hash_map.hpp"
-#include "dev_support/simple_tools.hpp"
-#include "storing_traits.hpp"
-#include <bitset>
-
-namespace debruijn_graph {
-
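-// Bit-reversal of a byte: for an InOutMask this swaps the incoming/outgoing nibbles and complements each nucleotide, i.e. it yields the mask of the reverse-complement k-mer.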
-inline uint8_t invert_byte_slow(uint8_t a) {
- size_t res = 0;
- for(size_t i = 0; i < 8; i++) {
- res <<= 1;
- res += a & 1;
- a = uint8_t(a >> 1);
- }
- return uint8_t(res);
-}
-
-inline vector<uint8_t> count_invert_byte() {
- vector<uint8_t> result;
- for (size_t a = 0; a < 256; a++) {
- result.push_back(invert_byte_slow((uint8_t)a));
- }
- return result;
-}
-
-inline uint8_t invert_byte(uint8_t a) {
- static vector<uint8_t> precalc = count_invert_byte();
- return precalc[a];
-}
-
-class InOutMask {
-private:
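- // Bits 0-3 flag outgoing nucleotides (A, C, G, T), bits 4-7 flag incoming ones; positions are mirrored via inv_position() for non-canonical k-mers.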
- uint8_t mask_;
-
- bool CheckUnique(uint8_t mask) const {
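- // unique[m] is 1 iff exactly one of the four low bits of m is set.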
- static bool unique[] =
- { 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 };
- return unique[mask];
- }
-
- char GetUnique(uint8_t mask) const {
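- // next[m] is the index of the single set bit of m; -1 marks masks that are not one-hot.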
- static char next[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1,
- -1, -1, -1 };
- VERIFY(next[mask] != -1);
- return next[mask];
- }
-
- size_t Count(uint8_t mask) const {
- static char count[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
- return count[mask];
- }
-
-
- char inv_position(char nucl, bool as_is) const {
- if(as_is)
- return nucl;
- else
- return char(7 - nucl);
- }
-
-public:
- explicit InOutMask(uint8_t mask = 0) : mask_(mask){
- }
-
- uint8_t get_mask() const {
- return mask_;
- }
-
- template<class Key>
- InOutMask conjugate(const Key & /*k*/) const {
- return InOutMask(invert_byte(mask_));
- }
-
- void AddOutgoing(char nnucl, bool as_is) {
- unsigned nmask = (unsigned) (1 << inv_position(nnucl, as_is));
- if (!(mask_ & nmask)) {
-# pragma omp atomic
- mask_ |= (unsigned char) nmask;
- }
- }
-
- void AddIncoming(char pnucl, bool as_is) {
- unsigned pmask = (unsigned) (1 << inv_position(char(pnucl + 4), as_is));
- if (!(mask_ & pmask)) {
-# pragma omp atomic
- mask_ |= (unsigned char) pmask;
- }
- }
-
- void DeleteOutgoing(char nnucl, bool as_is) {
- unsigned nmask = (1 << inv_position(nnucl, as_is));
- if (mask_ & nmask) {
-# pragma omp atomic
- mask_ &= (unsigned char) ~nmask;
- }
- }
-
- void DeleteIncoming(char pnucl, bool as_is) {
- unsigned pmask = (1 << inv_position(char(pnucl + 4), as_is));
- if (mask_ & pmask) {
-# pragma omp atomic
- mask_ &= (unsigned char) ~pmask;
- }
- }
-
- void IsolateVertex() {
- mask_ = 0;
- }
-
- bool CheckOutgoing(char nucl) const {
- return mask_ & (1 << nucl);
- }
-
- bool CheckIncoming(char nucl) const {
- return mask_ & (1 << (4 + nucl));
- }
-
- bool IsDeadEnd() const {
- return !(mask_ & 15);
- }
-
- bool IsDeadStart() const {
- return !(mask_ >> 4);
- }
-
- bool CheckUniqueOutgoing() const {
- return CheckUnique(mask_ & 15);
- }
-
- bool CheckUniqueIncoming() const {
- return CheckUnique(uint8_t(mask_ >> 4));
- }
-
- char GetUniqueOutgoing() const {
- return GetUnique(mask_ & 15);
- }
-
- char GetUniqueIncoming() const {
- return GetUnique(uint8_t(mask_ >> 4));
- }
-
- size_t OutgoingEdgeCount() const {
- return Count(mask_ & 15);
- }
-
- size_t IncomingEdgeCount() const {
- return Count(uint8_t(mask_ >> 4));
- }
-};
-
-template<class Stream>
-Stream &operator<<(Stream& stream, const InOutMask &mask) {
- return stream << std::bitset<8>(mask.get_mask());
-}
-
-template<class Seq>
-struct slim_kmer_index_traits : public kmer_index_traits<Seq> {
- typedef kmer_index_traits<Seq> __super;
-
- typedef MMappedRecordReader<typename Seq::DataType> FinalKMerStorage;
-
- template<class Writer>
- static void raw_serialize(Writer&, typename __super::RawKMerStorage*) {
- VERIFY(false && "Cannot save extension index");
- }
-
- template<class Reader>
- static typename __super::RawKMerStorage *raw_deserialize(
- Reader&, const std::string &) {
- VERIFY(false && "Cannot load extension index");
- return NULL;
- }
-
-};
-
-template<typename KeyWithHash>
-struct AbstractDeEdge {
- KeyWithHash start;
- KeyWithHash end;
- AbstractDeEdge(KeyWithHash _start, KeyWithHash _end) : start(_start), end(_end) {
- }
-
- AbstractDeEdge<KeyWithHash> &operator=(const AbstractDeEdge<KeyWithHash> &that) {
- this->start = that.start;
- this->end = that.end;
- return *this;
- }
-
- bool operator==(const AbstractDeEdge &other) {
- return start.idx() == other.start.idx() && end.idx() == other.end.idx();
- }
-
- bool operator!=(const AbstractDeEdge &other) {
- return !(*this == other);
- }
-};
-
-template<class stream, class KWH>
-stream &operator<<(stream &s, const AbstractDeEdge<KWH> de_edge) {
- return s << "DeEdge[" << de_edge.start << ", " << de_edge.end << "]";
-}
-
-template<class traits = slim_kmer_index_traits<runtime_k::RtSeq>, class StoringType = DefaultStoring>
-class DeBruijnExtensionIndex : public KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> {
- typedef KeyIteratingMap<typename traits::SeqType, InOutMask, traits, StoringType> base;
-
-public:
- typedef typename base::traits_t traits_t;
- typedef StoringType storing_type;
- typedef typename base::KeyType KMer;
- typedef typename base::IdxType KMerIdx;
- typedef typename base::KeyWithHash KeyWithHash;
- typedef AbstractDeEdge<KeyWithHash> DeEdge;
- using base::ConstructKWH;
-
- DeBruijnExtensionIndex(unsigned K, const std::string &workdir)
- : base((size_t) K, workdir) {
- }
-
- void AddOutgoing(const KeyWithHash &kwh, char nucl) {
- TRACE("Add outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).AddOutgoing(nucl, kwh.is_minimal());
- }
-
- void AddIncoming(const KeyWithHash &kwh, char nucl) {
- TRACE("Add incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).AddIncoming(nucl, kwh.is_minimal());
- }
-
- void DeleteOutgoing(const KeyWithHash &kwh, char nucl) {
- TRACE("Delete outgoing " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).DeleteOutgoing(nucl, kwh.is_minimal());
- }
-
- void DeleteIncoming(const KeyWithHash &kwh, char nucl) {
- TRACE("Delete incoming " << kwh << " " << size_t(nucl) << " " << kwh.is_minimal());
- this->get_raw_value_reference(kwh).DeleteIncoming(nucl, kwh.is_minimal());
- }
-
- void IsolateVertex(const KeyWithHash &kwh) {
- TRACE("Isolate vertex " << kwh);
- this->get_raw_value_reference(kwh).IsolateVertex();
- }
-
- bool CheckOutgoing(const KeyWithHash &kwh, char nucl) const {
- return this->get_value(kwh).CheckOutgoing(nucl);
- }
-
- KeyWithHash GetOutgoing(const KeyWithHash &kwh, char nucl) const {
- return kwh << nucl;
- }
-
- bool CheckIncoming(const KeyWithHash &kwh, char nucl) const {
- return this->get_value(kwh).CheckIncoming(nucl);
- }
-
- KeyWithHash GetIncoming(const KeyWithHash &kwh, char nucl) const {
- return kwh >> nucl;
- }
-
- bool IsDeadEnd(const KeyWithHash &kwh) const {
- return this->get_value(kwh).IsDeadEnd();
- }
-
- bool IsDeadStart(const KeyWithHash &kwh) const {
- return this->get_value(kwh).IsDeadStart();
- }
-
- bool CheckUniqueOutgoing(const KeyWithHash &kwh) const {
- return this->get_value(kwh).CheckUniqueOutgoing();
- }
-
- KeyWithHash GetUniqueOutgoing(const KeyWithHash &kwh) const {
- return GetOutgoing(kwh, this->get_value(kwh).GetUniqueOutgoing());
- }
-
- bool CheckUniqueIncoming(const KeyWithHash &kwh) const {
- return this->get_value(kwh).CheckUniqueIncoming();
- }
-
- KeyWithHash GetUniqueIncoming(const KeyWithHash &kwh) const {
- return GetIncoming(kwh, this->get_value(kwh).GetUniqueIncoming());
- }
-
- size_t OutgoingEdgeCount(const KeyWithHash &kwh) const {
- return this->get_value(kwh).OutgoingEdgeCount();
- }
-
- size_t IncomingEdgeCount(const KeyWithHash &kwh) const {
- return this->get_value(kwh).IncomingEdgeCount();
- }
-
- ~DeBruijnExtensionIndex() {
- }
-
-private:
- DECL_LOGGER("ExtentionIndex");
-};
-
-}
diff --git a/src/modules/data_structures/indices/kmer_extension_index_builder.hpp b/src/modules/data_structures/indices/kmer_extension_index_builder.hpp
deleted file mode 100644
index 6f4f9fc..0000000
--- a/src/modules/data_structures/indices/kmer_extension_index_builder.hpp
+++ /dev/null
@@ -1,106 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2016 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "kmer_extension_index.hpp"
-#include "kmer_splitters.hpp"
-
-class DeBruijnExtensionIndexBuilder {
-public:
- template<class ReadStream, class Index>
- size_t FillExtensionsFromStream(ReadStream &stream, Index &index) const {
- unsigned k = index.k();
- size_t rl = 0;
-
- while (!stream.eof()) {
- typename ReadStream::read_type r;
- stream >> r;
- rl = std::max(rl, r.size());
-
- const Sequence &seq = r.sequence();
- if (seq.size() < k + 1)
- continue;
-
- typename Index::KeyWithHash kwh = index.ConstructKWH(seq.start<runtime_k::RtSeq>(k));
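- // Slide a (k+1)-base window: its last base is recorded as an outgoing extension of the prefix k-mer and its first base as an incoming extension of the suffix k-mer.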
- for (size_t j = k; j < seq.size(); ++j) {
- char nnucl = seq[j], pnucl = kwh[0];
- index.AddOutgoing(kwh, nnucl);
- kwh <<= nnucl;
- index.AddIncoming(kwh, pnucl);
- }
- }
-
- return rl;
- }
-
- template<class Index>
- void FillExtensionsFromIndex(const std::string &KPlusOneMersFilename,
- Index &index) const {
- unsigned KPlusOne = index.k() + 1;
-
- typename Index::kmer_iterator it(KPlusOneMersFilename,
- runtime_k::RtSeq::GetDataSize(KPlusOne));
- for (; it.good(); ++it) {
- runtime_k::RtSeq kpomer(KPlusOne, *it);
-
- char pnucl = kpomer[0], nnucl = kpomer[KPlusOne - 1];
- TRACE("processing k+1-mer " << kpomer);
- index.AddOutgoing(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer)),
- nnucl);
- // FIXME: This is extremely ugly. We need start / end methods to extract the first / last N symbols...
- index.AddIncoming(index.ConstructKWH(runtime_k::RtSeq(KPlusOne - 1, kpomer << 0)),
- pnucl);
- }
- }
-
-public:
- template<class Index, class Streams>
- ReadStatistics BuildExtensionIndexFromStream(Index &index, Streams &streams, io::SingleStream* contigs_stream = 0,
- size_t read_buffer_size = 0) const {
- unsigned nthreads = (unsigned) streams.size();
-
- // First, build a k+1-mer index
- DeBruijnReadKMerSplitter<typename Streams::ReadT,
- StoringTypeFilter<typename Index::storing_type>>
- splitter(index.workdir(), index.k() + 1, 0xDEADBEEF, streams,
- contigs_stream, read_buffer_size);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
- counter.CountAll(nthreads, nthreads, /* merge */false);
-
- // Now, count unique k-mers from k+1-mers
- DeBruijnKMerKMerSplitter<StoringTypeFilter<typename Index::storing_type> >
- splitter2(index.workdir(), index.k(),
- index.k() + 1, Index::storing_type::IsInvertable(), read_buffer_size);
- for (unsigned i = 0; i < nthreads; ++i)
- splitter2.AddKMers(counter.GetMergedKMersFname(i));
- KMerDiskCounter<runtime_k::RtSeq> counter2(index.workdir(), splitter2);
-
- BuildIndex(index, counter2, 16, nthreads);
-
- // Build the kmer extensions
- INFO("Building k-mer extensions from k+1-mers");
-# pragma omp parallel for num_threads(nthreads)
- for (unsigned i = 0; i < nthreads; ++i)
- FillExtensionsFromIndex(counter.GetMergedKMersFname(i), index);
- INFO("Building k-mer extensions from k+1-mers finished.");
-
- return splitter.stats();
- }
-
-private:
- DECL_LOGGER("DeBruijnExtensionIndexBuilder");
-};
-
-template<class Index>
-struct ExtensionIndexHelper {
- using IndexT = Index;
- typedef typename Index::traits_t traits_t;
- typedef typename Index::KMer Kmer;
- typedef typename Index::KMerIdx KMerIdx;
- using DeBruijnExtensionIndexBuilderT = DeBruijnExtensionIndexBuilder;
-};
-
diff --git a/src/modules/data_structures/indices/kmer_splitters.hpp b/src/modules/data_structures/indices/kmer_splitters.hpp
deleted file mode 100644
index 9e35934..0000000
--- a/src/modules/data_structures/indices/kmer_splitters.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "io/reads_io/io_helper.hpp"
-#include "storing_traits.hpp"
-
-#include "dev_support/file_limit.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
-#include "data_structures/mph_index/kmer_index_builder.hpp"
-
-namespace debruijn_graph {
-
-template<class StoringType>
-struct StoringTypeFilter {
-};
-
-template<>
-struct StoringTypeFilter<SimpleStoring> {
- template<class Kmer>
- bool filter(const Kmer &/*kmer*/) const {
- return true;
- }
-};
-
-template<>
-struct StoringTypeFilter<InvertableStoring> {
- template<class Kmer>
- bool filter(const Kmer &kmer) const {
- return kmer.IsMinimal();
- }
-};
-
-using RtSeqKMerSplitter = ::KMerSortingSplitter<runtime_k::RtSeq>;
-
-template<class KmerFilter>
-class DeBruijnKMerSplitter : public RtSeqKMerSplitter {
- private:
- KmerFilter kmer_filter_;
- protected:
- size_t read_buffer_size_;
- protected:
- bool FillBufferFromSequence(const Sequence &seq,
- unsigned thread_id) {
- if (seq.size() < this->K_)
- return false;
-
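- // Prime the window with the first K-1 bases behind a dummy 'A', so that the first <<= below produces the first full k-mer.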
- runtime_k::RtSeq kmer = seq.start<runtime_k::RtSeq>(this->K_) >> 'A';
- bool stop = false;
- for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
- kmer <<= seq[j];
- if (!kmer_filter_.filter(kmer))
- continue;
-
- stop |= this->push_back_internal(kmer, thread_id);
- }
-
- return stop;
- }
-
- public:
- DeBruijnKMerSplitter(const std::string &work_dir,
- unsigned K, KmerFilter kmer_filter, size_t read_buffer_size = 0, uint32_t seed = 0)
- : RtSeqKMerSplitter(work_dir, K, seed), kmer_filter_(kmer_filter), read_buffer_size_(read_buffer_size) {
- }
- protected:
- DECL_LOGGER("DeBruijnKMerSplitter");
-};
-
-struct ReadStatistics {
- size_t reads_;
- size_t max_read_length_;
- size_t bases_;
-};
-
-template<class Read, class KmerFilter>
-class DeBruijnReadKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
- io::ReadStreamList<Read> &streams_;
- io::SingleStream *contigs_;
-
- template<class ReadStream>
- ReadStatistics
- FillBufferFromStream(ReadStream& stream, unsigned thread_id);
-
- ReadStatistics rs_;
-
- public:
- DeBruijnReadKMerSplitter(const std::string &work_dir,
- unsigned K, uint32_t seed,
- io::ReadStreamList<Read>& streams,
- io::SingleStream* contigs_stream = 0,
- size_t read_buffer_size = 0)
- : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size, seed),
- streams_(streams), contigs_(contigs_stream), rs_({0, 0, 0}) {}
-
- path::files_t Split(size_t num_files) override;
-
- size_t read_length() const { return rs_.max_read_length_; }
- ReadStatistics stats() const { return rs_; }
-};
-
-template<class Read, class KmerFilter> template<class ReadStream>
-ReadStatistics
-DeBruijnReadKMerSplitter<Read, KmerFilter>::FillBufferFromStream(ReadStream &stream,
- unsigned thread_id) {
- typename ReadStream::ReadT r;
- size_t reads = 0, rl = 0, bases = 0;
-
- while (!stream.eof()) {
- stream >> r;
- rl = std::max(rl, r.size());
- reads += 1;
- bases += r.size();
-
- if (this->FillBufferFromSequence(r.sequence(), thread_id))
- break;
- }
- return { reads, rl, bases };
-}
-
-template<class Read, class KmerFilter>
-path::files_t DeBruijnReadKMerSplitter<Read, KmerFilter>::Split(size_t num_files) {
- unsigned nthreads = (unsigned) streams_.size();
-
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
- path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
-
- size_t counter = 0, rl = 0, bases = 0, n = 15;
- streams_.reset();
- while (!streams_.eof()) {
-# pragma omp parallel for num_threads(nthreads) reduction(+ : counter) reduction(+ : bases) shared(rl)
- for (unsigned i = 0; i < nthreads; ++i) {
- ReadStatistics stats = FillBufferFromStream(streams_[i], i);
- counter += stats.reads_;
- bases += stats.bases_;
-
- // There is no max reduction in C/C++ OpenMP... Only in FORTRAN :(
-# pragma omp flush(rl)
- if (stats.max_read_length_ > rl)
-# pragma omp critical
- {
- rl = std::max(rl, stats.max_read_length_);
- }
- }
-
- this->DumpBuffers(out);
-
- if (counter >> n) {
- INFO("Processed " << counter << " reads");
- n += 1;
- }
- }
-
- if (contigs_) {
- INFO("Adding contigs from previous K");
- unsigned cnt = 0;
- contigs_->reset();
- while (!contigs_->eof()) {
- FillBufferFromStream(*contigs_, cnt);
- this->DumpBuffers(out);
- if (++cnt >= nthreads)
- cnt = 0;
- }
- }
-
- INFO("Used " << counter << " reads. Maximum read length " << rl);
- INFO("Average read length " << double(bases) / double(counter));
- rs_ = { counter, rl, bases };
-
- return out;
-}
-
-template<class Graph, class KmerFilter>
-class DeBruijnGraphKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
- typedef typename Graph::ConstEdgeIt EdgeIt;
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph &g_;
-
- size_t FillBufferFromEdges(EdgeIt &edge, unsigned thread_id);
-
- public:
- DeBruijnGraphKMerSplitter(const std::string &work_dir,
- unsigned K, const Graph &g, size_t read_buffer_size = 0)
- : DeBruijnKMerSplitter<KmerFilter>(work_dir, K, KmerFilter(), read_buffer_size), g_(g) {}
-
- path::files_t Split(size_t num_files) override;
-};
-
-template<class Graph, class KmerFilter>
-size_t
-DeBruijnGraphKMerSplitter<Graph, KmerFilter>::FillBufferFromEdges(EdgeIt &edge,
- unsigned thread_id) {
- size_t seqs = 0;
- for (; !edge.IsEnd(); ++edge) {
- const Sequence &nucls = g_.EdgeNucls(*edge);
-
- seqs += 1;
- if (this->FillBufferFromSequence(nucls, thread_id))
- break;
- }
-
- return seqs;
-}
-
-template<class Graph, class KmerFilter>
-path::files_t DeBruijnGraphKMerSplitter<Graph, KmerFilter>::Split(size_t num_files) {
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- path::files_t out = this->PrepareBuffers(num_files, 1, this->read_buffer_size_);
-
- size_t counter = 0, n = 10;
- for (auto it = g_.ConstEdgeBegin(); !it.IsEnd(); ) {
- counter += FillBufferFromEdges(it, 0);
-
- this->DumpBuffers(out);
-
- if (counter >> n) {
- INFO("Processed " << counter << " edges");
- n += 1;
- }
- }
-
- INFO("Used " << counter << " sequences.");
-
- return out;
-}
-
-
-template<class KmerFilter>
-class DeBruijnKMerKMerSplitter : public DeBruijnKMerSplitter<KmerFilter> {
- typedef MMappedFileRecordArrayIterator<runtime_k::RtSeq::DataType> kmer_iterator;
-
- unsigned K_source_;
- std::vector<std::string> kmers_;
- bool add_rc_;
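- // When set, the reverse complement of every source sequence is split into k-mers and buffered as well.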
-
- size_t FillBufferFromKMers(kmer_iterator &kmer,
- unsigned thread_id);
-
- public:
- DeBruijnKMerKMerSplitter(const std::string &work_dir,
- unsigned K_target, unsigned K_source, bool add_rc, size_t read_buffer_size = 0)
- : DeBruijnKMerSplitter<KmerFilter>(work_dir, K_target, KmerFilter(), read_buffer_size),
- K_source_(K_source), add_rc_(add_rc) {}
-
- void AddKMers(const std::string &file) {
- kmers_.push_back(file);
- }
-
- path::files_t Split(size_t num_files) override;
-};
-
-template<class KmerFilter>
-inline size_t DeBruijnKMerKMerSplitter<KmerFilter>::FillBufferFromKMers(kmer_iterator &kmer,
- unsigned thread_id) {
- size_t seqs = 0;
- for (; kmer.good(); ++kmer) {
- Sequence nucls(runtime_k::RtSeq(K_source_, *kmer));
- seqs += 1;
-
- bool stop = this->FillBufferFromSequence(nucls, thread_id);
- if (add_rc_)
- stop |= this->FillBufferFromSequence(!nucls, thread_id);
-
- if (stop)
- break;
- }
-
- return seqs;
-}
-
-template<class KmerFilter>
-path::files_t DeBruijnKMerKMerSplitter<KmerFilter>::Split(size_t num_files) {
- unsigned nthreads = (unsigned) kmers_.size();
-
- INFO("Splitting kmer instances into " << num_files << " buckets. This might take a while.");
-
- path::files_t out = this->PrepareBuffers(num_files, nthreads, this->read_buffer_size_);
-
- size_t counter = 0, n = 10;
- std::vector<kmer_iterator> its;
- its.reserve(nthreads);
- for (auto it = kmers_.begin(), et = kmers_.end(); it != et; ++it)
- its.emplace_back(*it, runtime_k::RtSeq::GetDataSize(K_source_));
-
- while (std::any_of(its.begin(), its.end(),
- [](const kmer_iterator &it) { return it.good(); })) {
-# pragma omp parallel for num_threads(nthreads) reduction(+ : counter)
- for (unsigned i = 0; i < nthreads; ++i)
- counter += FillBufferFromKMers(its[i], i);
-
- this->DumpBuffers(out);
-
- if (counter >> n) {
- INFO("Processed " << counter << " kmers");
- n += 1;
- }
- }
-
- INFO("Used " << counter << " kmers.");
-
- return out;
-}
-
-
-}
diff --git a/src/modules/data_structures/indices/perfect_hash_map.hpp b/src/modules/data_structures/indices/perfect_hash_map.hpp
deleted file mode 100644
index 941acba..0000000
--- a/src/modules/data_structures/indices/perfect_hash_map.hpp
+++ /dev/null
@@ -1,318 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dev_support/openmp_wrapper.h"
-#include "dev_support/path_helper.hpp"
-#include "io/kmers_io/kmer_iterator.hpp"
-
-#include "data_structures/mph_index/kmer_index.hpp"
-
-#include "key_with_hash.hpp"
-#include "values.hpp"
-#include "storing_traits.hpp"
-
-#include <vector>
-#include <cstdlib>
-#include <cstdint>
-
-namespace debruijn_graph {
-
-template<class K, class traits>
-class IndexWrapper {
- static const size_t InvalidIdx = size_t(-1);
-public:
- typedef size_t IdxType;
- typedef K KeyType;
- typedef traits traits_t;
-protected:
- typedef KMerIndex<traits> KMerIndexT;
- //these fields are protected only for reduction of storage in edge indices BinWrite
- KMerIndexT index_;
-private:
- std::string workdir_;
- unsigned k_;
-
-protected:
- size_t raw_seq_idx(const typename KMerIndexT::KMerRawReference s) const {
- return index_.raw_seq_idx(s);
- }
-
- bool valid(const size_t idx) const {
- return idx != InvalidIdx && idx < index_.size();
- }
-public:
- IndexWrapper(size_t k, const std::string &workdir) : k_((unsigned) k) {
- //fixme string literal
- workdir_ = path::make_temp_dir(workdir, "kmeridx");
- }
-
- ~IndexWrapper() {
- path::remove_dir(workdir_);
- }
-
- void clear() {
- index_.clear();
- }
-
- unsigned k() const { return k_; }
-
-public:
- template<class Writer>
- void BinWrite(Writer &writer) const {
- index_.serialize(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &) {
- clear();
- index_.deserialize(reader);
- }
-
- const std::string &workdir() const {
- return workdir_;
- }
-};
-
-template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
-class PerfectHashMap : public ValueArray<V>, public IndexWrapper<K, traits> {
-public:
- typedef size_t IdxType;
- typedef K KeyType;
- typedef ValueArray<V> ValueBase;
- typedef IndexWrapper<KeyType, traits> KeyBase;
- using KeyBase::index_;
- typedef typename KeyBase::KMerIndexT KMerIndexT;
- typedef typename StoringTraits<K, KMerIndexT, StoringType>::KeyWithHash KeyWithHash;
-
- KeyWithHash ConstructKWH(const KeyType &key) const {
- return KeyWithHash(key, index_);
- }
-
- bool valid(const KeyWithHash &kwh) const {
- return KeyBase::valid(kwh.idx());
- }
-
- PerfectHashMap(size_t k, const std::string &workdir) : KeyBase(k, workdir) {
- }
-
- ~PerfectHashMap() {
- }
-
- void clear() {
- KeyBase::clear();
- ValueBase::clear();
- }
-
- const V get_value(const KeyWithHash &kwh) const {
- return StoringType::get_value(*this, kwh);
- }
-
- //Think twice or ask AntonB if you want to use it!
- V &get_raw_value_reference(const KeyWithHash &kwh) {
- return ValueBase::operator[](kwh.idx());
- }
-
- const V &get_raw_value_reference(const KeyWithHash &kwh) const {
- return ValueBase::operator[](kwh.idx());
- }
-
- void put_value(const KeyWithHash &kwh, const V &value) {
- StoringType::set_value(*this, kwh, value);
- }
-
- template<class Writer>
- void BinWrite(Writer &writer) const {
- ValueBase::BinWrite(writer);
- KeyBase::BinWrite(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &tmp) {
- KeyBase::BinRead(reader, tmp);
- ValueBase::BinRead(reader, tmp);
- }
-
- friend struct PerfectHashMapBuilder;
-};
-
-
-template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
-class KeyStoringMap : public PerfectHashMap<K, V, traits, StoringType> {
-private:
- typedef PerfectHashMap<K, V, traits, StoringType> base;
-
-public:
- typedef traits traits_t;
- typedef K KMer;
- typedef typename base::IdxType KMerIdx;
- typedef typename traits::FinalKMerStorage::iterator kmer_iterator;
- typedef typename traits::FinalKMerStorage::const_iterator const_kmer_iterator;
- typedef typename base::KeyWithHash KeyWithHash;
- using base::ConstructKWH;
-
-private:
- std::unique_ptr<typename traits::FinalKMerStorage> kmers_;
-
- void SortUniqueKMers() const {
- size_t swaps = 0;
- INFO("Arranging kmers in hash map order");
- for (auto I = kmers_->begin(), E = kmers_->end(); I != E; ++I) {
- size_t cidx = I - kmers_->begin();
- size_t kidx = this->raw_seq_idx(*I);
- while (cidx != kidx) {
- auto J = kmers_->begin() + kidx;
- using std::swap;
- swap(*I, *J);
- swaps += 1;
- kidx = this->raw_seq_idx(*I);
- }
- }
- INFO("Done. Total swaps: " << swaps);
- }
-
-protected:
- template<class Writer>
- void BinWriteKmers(Writer &writer) const {
- traits::raw_serialize(writer, this->kmers_);
- }
-
- template<class Reader>
- void BinReadKmers(Reader &reader, const std::string &FileName) {
- this->kmers_ = traits_t::raw_deserialize(reader, FileName);
- }
-
- template<class Writer>
- void BinWrite(Writer &writer) const {
- base::BinWrite(writer);
- BinWriteKmers(writer);
- }
-
- template<class Reader>
- void BinRead(Reader &reader, const std::string &FileName) {
- base::BinRead(reader, FileName);
- BinReadKmers(reader, FileName);
- }
-
-public:
-
- KeyStoringMap(size_t k, const std::string &workdir)
- : base(k, workdir), kmers_(nullptr) {}
-
- ~KeyStoringMap() {}
-
- KMer true_kmer(KeyWithHash kwh) const {
- VERIFY(this->valid(kwh));
-
- auto it = this->kmers_->begin() + kwh.idx();
- return (typename traits_t::raw_create()(this->k(), *it));
- }
-
- void clear() {
- base::clear();
- kmers_ = nullptr;
- }
-
- kmer_iterator kmer_begin() {
- return kmers_->begin();
- }
- const_kmer_iterator kmer_begin() const {
- return kmers_->cbegin();
- }
-
- kmer_iterator kmer_end() {
- return kmers_->end();
- }
- const_kmer_iterator kmer_end() const {
- return kmers_->cend();
- }
-
- bool valid(const KeyWithHash &kwh) const {
- if (!base::valid(kwh))
- return false;
-
- auto it = this->kmers_->begin() + kwh.idx();
- if (!kwh.is_minimal())
- return (typename traits_t::raw_equal_to()(!kwh.key(), *it));
- else
- return (typename traits_t::raw_equal_to()(kwh.key(), *it));
- }
-
- /**
- * Number of edges going out of the given edge's end
- */
- unsigned NextEdgeCount(const KeyWithHash &kwh) const {
- unsigned res = 0;
- for (char c = 0; c < 4; ++c)
- if (valid(kwh << c))
- res += 1;
-
- return res;
- }
-
- KeyWithHash NextEdge(const KeyWithHash &kwh) const { // returns any next edge
- for (char c = 0; c < 4; ++c) {
- if (valid(kwh << c))
- //hack for this code to work with long seqs! (otherwise return s is totally fine)
- return ConstructKWH(true_kmer(kwh));//s;
- }
-
- VERIFY_MSG(false, "Couldn't find requested edge!");
- return ConstructKWH(KMer(this->k()));
- // Unreachable: no next edges were found even though one was requested.
- }
-
- /**
- * Number of edges coming into the given edge's end
- */
- unsigned RivalEdgeCount(const KeyWithHash &kwh) const {
- KeyWithHash next = kwh << 'A';
- unsigned res = 0;
- for (char c = 0; c < 4; ++c)
- if (valid(next >> c))
- res += 1;
-
- return res;
- }
-
- friend struct KeyStoringIndexBuilder;
-};
-
-template<class K, class V, class traits = kmer_index_traits<K>, class StoringType = SimpleStoring>
-class KeyIteratingMap : public PerfectHashMap<K, V, traits, StoringType> {
- typedef PerfectHashMap<K, V, traits, StoringType> base;
-
- std::string KMersFilename_;
-
-public:
- typedef StoringType storing_type;
- typedef typename base::traits_t traits_t;
- typedef typename base::KeyType KMer;
- typedef typename base::IdxType KMerIdx;
- using base::ConstructKWH;
-
-public:
-
- KeyIteratingMap(size_t k, const std::string &workdir)
- : base(k, workdir), KMersFilename_("") {}
-
- ~KeyIteratingMap() {}
-
- typedef MMappedFileRecordArrayIterator<typename KMer::DataType> kmer_iterator;
-
- kmer_iterator kmer_begin() const {
- return kmer_iterator(this->KMersFilename_, KMer::GetDataSize(base::k()));
- }
-
- std::vector<kmer_iterator> kmer_begin(size_t parts) const {
- return io::make_kmer_iterator<KMer>(this->KMersFilename_, base::k(), parts);
- }
-
- friend struct KeyIteratingIndexBuilder;
-};
-
-}
diff --git a/src/modules/data_structures/indices/perfect_hash_map_builder.hpp b/src/modules/data_structures/indices/perfect_hash_map_builder.hpp
deleted file mode 100644
index b94a596..0000000
--- a/src/modules/data_structures/indices/perfect_hash_map_builder.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2016 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "data_structures/mph_index/kmer_index_builder.hpp"
-
-#include "perfect_hash_map.hpp"
-#include "kmer_splitters.hpp"
-
-namespace debruijn_graph {
-
-struct PerfectHashMapBuilder {
- template<class K, class V, class traits, class StoringType, class Counter>
- void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
- Counter& counter, size_t bucket_num,
- size_t thread_num, bool save_final = true) const {
- using KMerIndex = typename PerfectHashMap<K, V, traits, StoringType>::KMerIndexT;
-
- KMerIndexBuilder<KMerIndex> builder(index.workdir(),
- (unsigned) bucket_num,
- (unsigned) thread_num);
- size_t sz = builder.BuildIndex(index.index_, counter, save_final);
- index.resize(sz);
- }
-};
-
-struct KeyStoringIndexBuilder {
- template<class K, class V, class traits, class StoringType, class Counter>
- void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
- Counter& counter, size_t bucket_num,
- size_t thread_num, bool save_final = true) const {
- phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
- VERIFY(!index.kmers_.get());
- index.kmers_ = counter.GetFinalKMers();
- VERIFY(index.kmers_.get());
- index.SortUniqueKMers();
- }
-
- private:
- PerfectHashMapBuilder phm_builder_;
-};
-
-struct KeyIteratingIndexBuilder {
- template<class K, class V, class traits, class StoringType, class Counter>
- void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
- Counter& counter, size_t bucket_num,
- size_t thread_num, bool save_final = true) const {
- phm_builder_.BuildIndex(index, counter, bucket_num, thread_num, save_final);
- index.KMersFilename_ = counter.GetFinalKMersFname();
- }
-
- private:
- PerfectHashMapBuilder phm_builder_;
-};
-
-template<class K, class V, class traits, class StoringType, class Counter>
-void BuildIndex(KeyIteratingMap<K, V, traits, StoringType> &index,
- Counter& counter, size_t bucket_num,
- size_t thread_num, bool save_final = true) {
- KeyIteratingIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
-}
-
-
-template<class K, class V, class traits, class StoringType, class Counter>
-void BuildIndex(KeyStoringMap<K, V, traits, StoringType> &index,
- Counter& counter, size_t bucket_num,
- size_t thread_num, bool save_final = true) {
- KeyStoringIndexBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
-}
-
-template<class K, class V, class traits, class StoringType, class Counter>
-void BuildIndex(PerfectHashMap<K, V, traits, StoringType> &index,
- Counter& counter, size_t bucket_num,
- size_t thread_num, bool save_final = true) {
- PerfectHashMapBuilder().BuildIndex(index, counter, bucket_num, thread_num, save_final);
-}
-
-template<class Index, class Streams>
-size_t BuildIndexFromStream(Index &index,
- Streams &streams,
- io::SingleStream* contigs_stream = 0) {
- DeBruijnReadKMerSplitter<typename Streams::ReadT,
- StoringTypeFilter<typename Index::storing_type>>
- splitter(index.workdir(), index.k(), 0, streams, contigs_stream);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
- BuildIndex(index, counter, 16, streams.size());
- return 0;
-}
-
-template<class Index, class Graph>
-void BuildIndexFromGraph(Index &index, const Graph &g, size_t read_buffer_size = 0) {
- DeBruijnGraphKMerSplitter<Graph,
- StoringTypeFilter<typename Index::storing_type>>
- splitter(index.workdir(), index.k(), g, read_buffer_size);
- KMerDiskCounter<runtime_k::RtSeq> counter(index.workdir(), splitter);
- BuildIndex(index, counter, 16, 1);
-}
-
-}
diff --git a/src/modules/data_structures/indices/storing_traits.hpp b/src/modules/data_structures/indices/storing_traits.hpp
deleted file mode 100644
index b91406f..0000000
--- a/src/modules/data_structures/indices/storing_traits.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-/*
- * storing_traits.hpp
- *
- * Created on: Nov 7, 2013
- * Author: anton
- */
-
-#include "values.hpp"
-
-namespace debruijn_graph {
-
-
-struct SimpleStoring {
- template<class K, class V>
- static V get_value(const ValueArray<V> &values, const K& key) {
- return values[key.idx()];
- }
-
- template<class K, class V>
- static void set_value(ValueArray<V> &values, const K& key, const V& value) {
- values[key.idx()] = value;
- }
-
- static bool IsInvertable() {
- return false;
- }
-};
-
-struct InvertableStoring {
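- // For a non-minimal key the stored value belongs to its reverse complement, so values are conjugated both when read and when written.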
- template<class K, class V>
- static V get_value(const ValueArray<V> &values, const K& key) {
- if(key.is_minimal())
- return values[key.idx()];
- else
- return values[key.idx()].conjugate(key);
- }
-
- template<class K, class V>
- static void set_value(ValueArray<V> &values, const K& key, const V& value) {
- if(key.is_minimal())
- values[key.idx()] = value;
- else
- values[key.idx()] = value.conjugate(key);
- }
-
- static bool IsInvertable() {
- return true;
- }
-};
-
-typedef InvertableStoring DefaultStoring;
-
-}
diff --git a/src/modules/data_structures/mph_index/hypergraph_sorter_seq.hpp b/src/modules/data_structures/mph_index/hypergraph_sorter_seq.hpp
deleted file mode 100644
index 649be20..0000000
--- a/src/modules/data_structures/mph_index/hypergraph_sorter_seq.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-#pragma once
-
-#include <cassert>
-#include <cstdint>
-#include <tuple>
-#include <cmath>
-#include <vector>
-#include <iterator>
-#include <algorithm>
-#include <stdexcept>
-
-#include "common.hpp"
-#include "hypergraph.hpp"
-
-#include "dev_support/logger/logger.hpp"
-
-namespace emphf {
-
- template <typename HypergraphType>
- class hypergraph_sorter_seq {
- public:
- typedef HypergraphType hg;
- typedef typename hg::node_t node_t;
- typedef typename hg::hyperedge hyperedge;
- typedef typename hg::xored_adj_list xored_adj_list;
-
- hypergraph_sorter_seq()
- {}
-
- template <typename Range, typename EdgeGenerator>
- bool try_generate_and_sort(Range const& input_range,
- EdgeGenerator const& edge_gen,
- size_t n,
- size_t hash_domain,
- bool verbose = true)
- {
- using std::get;
- std::vector<xored_adj_list> adj_lists;
-
- size_t m = hash_domain * 3;
-
- // do all the allocations upfront
- m_peeling_order.clear();
- m_peeling_order.reserve(n);
- adj_lists.resize(m);
-
- // generate edges
- if (verbose) {
- //logger() << "Generating hyperedges and populating adjacency lists"
- // << std::endl;
- }
-
- for (auto const& val: input_range) {
- auto edge = edge_gen(val);
- // canonical by construction
- assert(orientation(edge) == 0);
-
- adj_lists[edge.v0].add_edge(edge);
-
- std::swap(edge.v0, edge.v1);
- adj_lists[edge.v0].add_edge(edge);
-
- std::swap(edge.v0, edge.v2);
- adj_lists[edge.v0].add_edge(edge);
- }
-
- // peel
- if (verbose) {
- // logger() << "Peeling" << std::endl;
- }
-
- auto visit = [&](node_t v0) {
- if (adj_lists[v0].degree == 1) {
- auto edge = adj_lists[v0].edge_from(v0);
- m_peeling_order.push_back(edge);
-
- edge = canonicalize_edge(edge);
- adj_lists[edge.v0].delete_edge(edge);
-
- std::swap(edge.v0, edge.v1);
- adj_lists[edge.v0].delete_edge(edge);
-
- std::swap(edge.v0, edge.v2);
- adj_lists[edge.v0].delete_edge(edge);
- }
- };
-
- size_t queue_position = 0;
- for (node_t v0 = 0; v0 < m; ++v0) {
- visit(v0);
-
- while (queue_position < m_peeling_order.size()) {
- auto const& cur_edge = m_peeling_order[queue_position];
-
- visit(cur_edge.v1);
- visit(cur_edge.v2);
- queue_position += 1;
- }
- }
-
- if (m_peeling_order.size() < n) {
- if (verbose) {
- // logger() << "Hypergraph is not peelable: "
- // << (n - m_peeling_order.size()) << " edges remaining"
- // << std::endl;
- }
- return false;
- }
-
- assert(m_peeling_order.size() == n);
-
- return true;
- }
-
- typedef typename std::vector<hyperedge>::const_reverse_iterator
- peeling_iterator;
-
- std::pair<peeling_iterator, peeling_iterator>
- get_peeling_order() const
- {
- return std::make_pair(m_peeling_order.crbegin(),
- m_peeling_order.crend());
- }
-
- private:
-
- size_t m_hash_domain;
- std::vector<hyperedge> m_peeling_order;
- };
-}
diff --git a/src/modules/data_structures/mph_index/kmer_index_builder.hpp b/src/modules/data_structures/mph_index/kmer_index_builder.hpp
deleted file mode 100644
index 9993ba1..0000000
--- a/src/modules/data_structures/mph_index/kmer_index_builder.hpp
+++ /dev/null
@@ -1,404 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "kmer_index.hpp"
-
-#include "io/kmers_io/mmapped_reader.hpp"
-#include "io/kmers_io/mmapped_writer.hpp"
-#include "utils/adt/pointer_iterator.hpp"
-#include "utils/adt/kmer_vector.hpp"
-
-#include "dev_support/openmp_wrapper.h"
-
-#include "dev_support/logger/logger.hpp"
-#include "dev_support/path_helper.hpp"
-
-#include "dev_support/memory_limit.hpp"
-#include "dev_support/file_limit.hpp"
-
-#include "mphf.hpp"
-#include "base_hash.hpp"
-#include "hypergraph.hpp"
-#include "hypergraph_sorter_seq.hpp"
-
-#include <libcxx/sort.hpp>
-
-#include <algorithm>
-#ifdef USE_GLIBCXX_PARALLEL
-#include <parallel/algorithm>
-#endif
-
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-# include <jemalloc/jemalloc.h>
-#endif
-
-#include <fstream>
-#include <vector>
-#include <cmath>
-
-template<class Seq>
-class KMerSplitter {
- public:
- typedef typename Seq::hash hash_function;
-
- KMerSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
- : work_dir_(work_dir), K_(K), seed_(seed) {}
-
- virtual ~KMerSplitter() {}
-
- virtual path::files_t Split(size_t num_files) = 0;
-
- size_t kmer_size() const {
- return Seq::GetDataSize(K_) * sizeof(typename Seq::DataType);
- }
-
- unsigned K() const { return K_; }
-
- protected:
- const std::string &work_dir_;
- hash_function hash_;
- unsigned K_;
- uint32_t seed_;
-
- DECL_LOGGER("K-mer Splitting");
-};
-
-template<class Seq>
-class KMerSortingSplitter : public KMerSplitter<Seq> {
- public:
- KMerSortingSplitter(const std::string &work_dir, unsigned K, uint32_t seed = 0)
- : KMerSplitter<Seq>(work_dir, K, seed), cell_size_(0), num_files_(0) {}
-
- protected:
- using SeqKMerVector = KMerVector<Seq>;
- using KMerBuffer = std::vector<SeqKMerVector>;
-
- std::vector<KMerBuffer> kmer_buffers_;
- size_t cell_size_;
- size_t num_files_;
-
- path::files_t PrepareBuffers(size_t num_files, unsigned nthreads, size_t reads_buffer_size) {
- num_files_ = num_files;
-
- // Determine the set of output files
- path::files_t out;
- for (unsigned i = 0; i < num_files_; ++i)
- out.push_back(this->GetRawKMersFname(i));
-
- size_t file_limit = num_files_ + 2*nthreads;
- size_t res = limit_file(file_limit);
- if (res < file_limit) {
- WARN("Failed to setup necessary limit for number of open files. The process might crash later on.");
- WARN("Do 'ulimit -n " << file_limit << "' in the console to overcome the limit");
- }
-
- if (reads_buffer_size == 0) {
- reads_buffer_size = 536870912ull;
- size_t mem_limit = (size_t)((double)(get_free_memory()) / (nthreads * 3));
- INFO("Memory available for splitting buffers: " << (double)mem_limit / 1024.0 / 1024.0 / 1024.0 << " Gb");
- reads_buffer_size = std::min(reads_buffer_size, mem_limit);
- }
- cell_size_ = reads_buffer_size / (num_files_ * this->kmer_size());
- // Set sane minimum cell size
- if (cell_size_ < 16384)
- cell_size_ = 16384;
-
- INFO("Using cell size of " << cell_size_);
- kmer_buffers_.resize(nthreads);
- for (unsigned i = 0; i < nthreads; ++i) {
- KMerBuffer &entry = kmer_buffers_[i];
- entry.resize(num_files_, KMerVector<Seq>(this->K_, (size_t) (1.1 * (double) cell_size_)));
- }
-
- return out;
- }
-
- bool push_back_internal(const Seq &seq, unsigned thread_id) {
- KMerBuffer &entry = kmer_buffers_[thread_id];
-
- size_t idx = this->GetFileNumForSeq(seq, (unsigned)num_files_);
- entry[idx].push_back(seq);
- return entry[idx].size() > cell_size_;
- }
-
- void DumpBuffers(const path::files_t &ostreams) {
- VERIFY(ostreams.size() == num_files_ && kmer_buffers_[0].size() == num_files_);
-
-# pragma omp parallel for
- for (unsigned k = 0; k < num_files_; ++k) {
- // Below k is thread id!
-
- size_t sz = 0;
- for (size_t i = 0; i < kmer_buffers_.size(); ++i)
- sz += kmer_buffers_[i][k].size();
-
- KMerVector<Seq> SortBuffer(this->K_, sz);
- for (auto & entry : kmer_buffers_) {
- const auto &buffer = entry[k];
- for (size_t j = 0; j < buffer.size(); ++j)
- SortBuffer.push_back(buffer[j]);
- }
- libcxx::sort(SortBuffer.begin(), SortBuffer.end(), typename KMerVector<Seq>::less2_fast());
- auto it = std::unique(SortBuffer.begin(), SortBuffer.end(), typename KMerVector<Seq>::equal_to());
-
-# pragma omp critical
- {
- FILE *f = fopen(ostreams[k].c_str(), "ab");
- VERIFY_MSG(f, "Cannot open temporary file to write");
- fwrite(SortBuffer.data(), SortBuffer.el_data_size(), it - SortBuffer.begin(), f);
- fclose(f);
- }
- }
-
- for (auto & entry : kmer_buffers_)
- for (auto & eentry : entry)
- eentry.clear();
- }
-
- std::string GetRawKMersFname(unsigned suffix) const {
- return path::append_path(this->work_dir_, "kmers.raw." + std::to_string(suffix));
- }
-
- unsigned GetFileNumForSeq(const Seq &s, unsigned total) const {
- return (unsigned)(this->hash_(s, this->seed_) % total);
- }
-
-};
-
-template<class Seq, class traits = kmer_index_traits<Seq> >
-class KMerCounter {
- public:
- typedef typename traits::raw_data_iterator iterator;
- typedef typename traits::raw_data_const_iterator const_iterator;
- typedef typename traits::RawKMerStorage RawKMerStorage;
- typedef typename traits::FinalKMerStorage FinalKMerStorage;
-
- virtual size_t kmer_size() const = 0;
-
- virtual size_t Count(unsigned num_buckets, unsigned num_threads) = 0;
- virtual size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) = 0;
- virtual void MergeBuckets(unsigned num_buckets) = 0;
-
- virtual std::unique_ptr<RawKMerStorage> GetBucket(size_t idx, bool unlink = true) = 0;
- virtual std::unique_ptr<FinalKMerStorage> GetFinalKMers() = 0;
-
- virtual ~KMerCounter() {}
-
-protected:
- DECL_LOGGER("K-mer Counting");
-};
-
-template<class Seq, class traits = kmer_index_traits<Seq> >
-class KMerDiskCounter : public KMerCounter<Seq> {
- typedef KMerCounter<Seq, traits> __super;
- typedef typename traits::RawKMerStorage BucketStorage;
-public:
- KMerDiskCounter(const std::string &work_dir, KMerSplitter<Seq> &splitter)
- : work_dir_(work_dir), splitter_(splitter) {
- std::string prefix = path::append_path(work_dir, "kmers_XXXXXX");
- char *tempprefix = strcpy(new char[prefix.length() + 1], prefix.c_str());
- VERIFY_MSG(-1 != (fd_ = ::mkstemp(tempprefix)), "Cannot create temporary file");
- kmer_prefix_ = tempprefix;
- delete[] tempprefix;
- }
-
- ~KMerDiskCounter() {
- ::close(fd_);
- ::unlink(kmer_prefix_.c_str());
- }
-
- size_t kmer_size() const override {
- return Seq::GetDataSize(splitter_.K()) * sizeof(typename Seq::DataType);
- }
-
- std::unique_ptr<BucketStorage> GetBucket(size_t idx, bool unlink = true) override {
- unsigned K = splitter_.K();
- return std::unique_ptr<BucketStorage>(new BucketStorage(GetMergedKMersFname((unsigned)idx), Seq::GetDataSize(K), unlink));
- }
-
- size_t Count(unsigned num_buckets, unsigned num_threads) override {
- unsigned K = splitter_.K();
-
- // Split k-mers into buckets.
- path::files_t raw_kmers = splitter_.Split(num_buckets * num_threads);
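- // Each of the num_buckets * num_threads raw files is sorted and deduplicated independently; files congruent modulo num_buckets form one bucket and are concatenated below.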
-
- INFO("Starting k-mer counting.");
- size_t kmers = 0;
-# pragma omp parallel for shared(raw_kmers) num_threads(num_threads) schedule(dynamic) reduction(+:kmers)
- for (unsigned iFile = 0; iFile < raw_kmers.size(); ++iFile) {
- kmers += MergeKMers(raw_kmers[iFile], GetUniqueKMersFname(iFile), K);
- }
- INFO("K-mer counting done. There are " << kmers << " kmers in total. ");
-
- INFO("Merging temporary buckets.");
- for (unsigned i = 0; i < num_buckets; ++i) {
- std::string ofname = GetMergedKMersFname(i);
- std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
- for (unsigned j = 0; j < num_threads; ++j) {
- BucketStorage ins(GetUniqueKMersFname(i + j * num_buckets), Seq::GetDataSize(K), /* unlink */ true);
- ofs.write((const char*)ins.data(), ins.data_size());
- }
- }
-
- return kmers;
- }
-
- void MergeBuckets(unsigned num_buckets) override {
- unsigned K = splitter_.K();
-
- INFO("Merging final buckets.");
-
- MMappedRecordArrayWriter<typename Seq::DataType> os(GetFinalKMersFname(), Seq::GetDataSize(K));
- std::string ofname = GetFinalKMersFname();
- std::ofstream ofs(ofname.c_str(), std::ios::out | std::ios::binary);
- for (unsigned j = 0; j < num_buckets; ++j) {
- auto bucket = GetBucket(j, /* unlink */ true);
- ofs.write((const char*)bucket->data(), bucket->data_size());
- }
- ofs.close();
- }
-
- size_t CountAll(unsigned num_buckets, unsigned num_threads, bool merge = true) override {
- size_t kmers = Count(num_buckets, num_threads);
- if (merge)
- MergeBuckets(num_buckets);
-
- return kmers;
- }
-
- std::unique_ptr<typename __super::FinalKMerStorage> GetFinalKMers() override {
- unsigned K = splitter_.K();
- return std::unique_ptr<typename __super::FinalKMerStorage>(new typename __super::FinalKMerStorage(GetFinalKMersFname(), Seq::GetDataSize(K), /* unlink */ true));
- }
-
- std::string GetMergedKMersFname(unsigned suffix) const {
- return kmer_prefix_ + ".merged." + std::to_string(suffix);
- }
-
- std::string GetFinalKMersFname() const {
- return kmer_prefix_ + ".final";
- }
-
-private:
- std::string work_dir_;
- KMerSplitter<Seq> &splitter_;
- int fd_;
- std::string kmer_prefix_;
-
- std::string GetUniqueKMersFname(unsigned suffix) const {
- return kmer_prefix_ + ".unique." + std::to_string(suffix);
- }
-
- size_t MergeKMers(const std::string &ifname, const std::string &ofname,
- unsigned K) {
- MMappedRecordArrayReader<typename Seq::DataType> ins(ifname, Seq::GetDataSize(K), /* unlink */ true);
-
- // Sort the stuff
- libcxx::sort(ins.begin(), ins.end(), array_less<typename Seq::DataType>());
-
- // FIXME: Use something like parallel version of unique_copy but with explicit
- // resizing.
- auto it = std::unique(ins.begin(), ins.end(), array_equal_to<typename Seq::DataType>());
-
- MMappedRecordArrayWriter<typename Seq::DataType> os(ofname, Seq::GetDataSize(K));
- os.resize(it - ins.begin());
- std::copy(ins.begin(), it, os.begin());
-
- return it - ins.begin();
- }
-};
-
-template<class Index>
-class KMerIndexBuilder {
- typedef typename Index::KMerSeq Seq;
- typedef typename Index::kmer_index_traits kmer_index_traits;
-
- std::string work_dir_;
- unsigned num_buckets_;
- unsigned num_threads_;
-
- public:
- KMerIndexBuilder(const std::string &workdir,
- unsigned num_buckets, unsigned num_threads)
- : work_dir_(workdir), num_buckets_(num_buckets), num_threads_(num_threads) {}
- size_t BuildIndex(Index &out, KMerCounter<Seq> &counter,
- bool save_final = false);
-
- unsigned num_buckets() const { return num_buckets_; }
-
- private:
-
- DECL_LOGGER("K-mer Index Building");
-};
-
-template<class Index>
-size_t KMerIndexBuilder<Index>::BuildIndex(Index &index, KMerCounter<Seq> &counter,
- bool save_final) {
- index.clear();
-
- INFO("Building kmer index ");
-
- // First, count the unique k-mers
- size_t kmers = counter.Count(num_buckets_, num_threads_);
-
- index.num_buckets_ = num_buckets_;
- index.bucket_starts_.resize(num_buckets_ + 1);
- index.index_ = new typename KMerIndex<kmer_index_traits>::KMerDataIndex[num_buckets_];
-
- INFO("Building perfect hash indices");
-
- // Index building requires up to 40 bytes per k-mer. Limit number of threads depending on the memory limit.
- unsigned num_threads = num_threads_;
-# ifdef SPADES_USE_JEMALLOC
- const size_t *cmem = 0;
- size_t clen = sizeof(cmem);
-
- je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
- size_t bucket_size = (36 * kmers + kmers * counter.kmer_size()) / num_buckets_;
- num_threads = std::min<unsigned>((unsigned) ((get_memory_limit() - *cmem) / bucket_size), num_threads);
- if (num_threads < 1)
- num_threads = 1;
- if (num_threads < num_threads_)
- WARN("Number of threads was limited down to " << num_threads << " in order to fit the memory limits during the index construction");
-# endif
-
-# pragma omp parallel for shared(index) num_threads(num_threads)
- for (unsigned iFile = 0; iFile < num_buckets_; ++iFile) {
- typename KMerIndex<kmer_index_traits>::KMerDataIndex &data_index = index.index_[iFile];
- auto bucket = counter.GetBucket(iFile, !save_final);
- size_t sz = bucket->end() - bucket->begin();
- index.bucket_starts_[iFile + 1] = sz;
- typename kmer_index_traits::KMerRawReferenceAdaptor adaptor;
- size_t max_nodes = (size_t(std::ceil(double(sz) * 1.23)) + 2) / 3 * 3;
- if (max_nodes >= uint64_t(1) << 32) {
- emphf::hypergraph_sorter_seq<emphf::hypergraph<uint64_t> > sorter;
- typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(bucket->begin(), bucket->end()),
- adaptor).swap(data_index);
- } else {
- emphf::hypergraph_sorter_seq<emphf::hypergraph<uint32_t> > sorter;
- typename KMerIndex<kmer_index_traits>::KMerDataIndex(sorter,
- sz, emphf::range(bucket->begin(), bucket->end()),
- adaptor).swap(data_index);
- }
- }
-
- // Finally, record the sizes of buckets.
- for (unsigned iFile = 1; iFile < num_buckets_; ++iFile)
- index.bucket_starts_[iFile] += index.bucket_starts_[iFile - 1];
-
- if (save_final)
- counter.MergeBuckets(num_buckets_);
-
- double bits_per_kmer = 8.0 * (double)index.mem_size() / (double)kmers;
- INFO("Index built. Total " << index.mem_size() << " bytes occupied (" << bits_per_kmer << " bits per kmer).");
- index.count_size();
- return kmers;
-}
diff --git a/src/modules/data_structures/mph_index/kmer_index_traits.hpp b/src/modules/data_structures/mph_index/kmer_index_traits.hpp
deleted file mode 100644
index c9ef67b..0000000
--- a/src/modules/data_structures/mph_index/kmer_index_traits.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-#pragma once
-//***************************************************************************
-//* Copyright (c) 2016 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "io/kmers_io/mmapped_reader.hpp"
-#include "mphf.hpp"
-
-template<class Seq>
-struct kmer_index_traits {
- typedef Seq SeqType;
- typedef MMappedRecordArrayReader<typename Seq::DataType> RawKMerStorage;
- typedef MMappedRecordArrayReader<typename Seq::DataType> FinalKMerStorage;
- typedef typename RawKMerStorage::iterator raw_data_iterator;
- typedef typename RawKMerStorage::const_iterator raw_data_const_iterator;
- typedef typename RawKMerStorage::iterator::value_type KMerRawData;
- typedef typename RawKMerStorage::iterator::reference KMerRawReference;
- typedef typename RawKMerStorage::const_iterator::reference KMerRawConstReference;
-
- struct raw_equal_to {
- bool operator()(const Seq &lhs, const KMerRawReference rhs) {
- return (array_equal_to<typename Seq::DataType>()(lhs.data(), lhs.data_size(), rhs));
- }
- };
-
- struct raw_create {
- Seq operator()(unsigned K, const KMerRawReference kmer) {
- return Seq(K, kmer.data());
- }
- Seq operator()(unsigned K, const KMerRawConstReference kmer) {
- return Seq(K, kmer.data());
- }
- };
-
- struct hash_function {
- uint64_t operator()(const Seq &k) const{
- return typename Seq::hash()(k);
- }
- uint64_t operator()(const KMerRawReference k) const {
- return typename Seq::hash()(k.data(), k.size());
- }
- };
-
- struct KMerRawReferenceAdaptor {
- emphf::byte_range_t operator()(const KMerRawReference k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size());
- }
- };
-
- struct KMerSeqAdaptor {
- emphf::byte_range_t operator()(const Seq &k) const {
- const uint8_t * data = (const uint8_t*)k.data();
- return std::make_pair(data, data + k.data_size() * sizeof(typename Seq::DataType));
- }
- };
-
- template<class Writer>
- static void raw_serialize(Writer &writer, RawKMerStorage *data) {
- size_t sz = data->data_size(), elcnt = data->elcnt();
- unsigned PageSize = getpagesize();
- writer.write((char*)&sz, sizeof(sz));
- writer.write((char*)&elcnt, sizeof(elcnt));
- // Make sure data is aligned to the page boundary
- size_t cpos = writer.tellp();
- size_t pos = (cpos + PageSize - 1 + sizeof(size_t)) / PageSize * PageSize;
- size_t off = pos - writer.tellp();
- writer.write((char*)&off, sizeof(off));
- writer.seekp(pos);
- writer.write((char*)data->data(), data->data_size());
- }
-
- template<class Reader>
- static std::unique_ptr<RawKMerStorage> raw_deserialize(Reader &reader, const std::string &FileName) {
- size_t sz, off, elcnt;
- reader.read((char*)&sz, sizeof(sz));
- reader.read((char*)&elcnt, sizeof(elcnt));
- reader.read((char*)&off, sizeof(off));
- off -= sizeof(off);
- off += reader.tellg();
-
- return std::unique_ptr<RawKMerStorage>(new RawKMerStorage(FileName, elcnt, false, off, sz));
- }
-
-};
diff --git a/src/modules/data_structures/mph_index/mphf.hpp b/src/modules/data_structures/mph_index/mphf.hpp
deleted file mode 100644
index 6c364ca..0000000
--- a/src/modules/data_structures/mph_index/mphf.hpp
+++ /dev/null
@@ -1,136 +0,0 @@
-#pragma once
-
-#include <random>
-
-#include "bitpair_vector.hpp"
-#include "ranked_bitpair_vector.hpp"
-
-#include "dev_support/logger/logger.hpp"
-
-namespace emphf {
-
- template <typename BaseHasher>
- class mphf {
- public:
- mphf()
- {}
-
- template <typename HypergraphSorter, typename Range, typename Adaptor>
- mphf(HypergraphSorter& sorter, size_t n,
- Range const& input_range, Adaptor adaptor,
- double gamma = 1.23)
- : m_n(n)
- , m_hash_domain(std::max((size_t(std::ceil(double(m_n) * gamma)) + 2) / 3, size_t(2)))
- {
- typedef typename HypergraphSorter::node_t node_t;
- typedef typename HypergraphSorter::hyperedge hyperedge;
- typedef decltype(*std::begin(input_range)) value_type;
-
- size_t nodes_domain = m_hash_domain * 3;
-
- if (nodes_domain >= std::numeric_limits<node_t>::max()) {
- throw std::invalid_argument("Too many nodes for node_t");
- }
-
- auto edge_gen = [&](value_type s) {
- using std::get;
- auto hashes = m_hasher(adaptor(s));
- return hyperedge((node_t)(get<0>(hashes) % m_hash_domain),
- (node_t)(m_hash_domain +
- (get<1>(hashes) % m_hash_domain)),
- (node_t)(2 * m_hash_domain +
- (get<2>(hashes) % m_hash_domain)));
- };
-
- std::mt19937_64 rng(37); // deterministic seed
-
- for (size_t trial = 0; ; ++trial) {
- //logger() << "Hypergraph generation: trial " << trial << std::endl;
-
- m_hasher = BaseHasher::generate(rng);
- if (sorter.try_generate_and_sort(input_range, edge_gen,
- m_n, m_hash_domain)) break;
- }
-
- auto peeling_order = sorter.get_peeling_order();
- bitpair_vector bv(nodes_domain);
-
- //logger() << "Assigning values" << std::endl;
-
- for (auto edge = peeling_order.first;
- edge != peeling_order.second;
- ++edge) {
-
- uint64_t target = orientation(*edge);
- uint64_t assigned = bv[edge->v1] + bv[edge->v2];
-
- // "assigned values" must be nonzeros to be ranked, so
- // if the result is 0 we assign 3
- bv.set(edge->v0, ((target - assigned + 9) % 3) ?: 3);
- }
-
- m_bv.build(std::move(bv));
- }
-
- uint64_t size() const
- {
- return m_n;
- }
-
- size_t mem_size() const {
- return m_bv.mem_size();
- }
-
- BaseHasher const& base_hasher() const
- {
- return m_hasher;
- }
-
- template <typename T, typename Adaptor>
- uint64_t lookup(const T &val, Adaptor adaptor)
- {
- using std::get;
- auto hashes = m_hasher(adaptor(val));
- uint64_t nodes[3] = {get<0>(hashes) % m_hash_domain,
- m_hash_domain + (get<1>(hashes) % m_hash_domain),
- 2 * m_hash_domain + (get<2>(hashes) % m_hash_domain)};
-
- uint64_t hidx = (m_bv[nodes[0]] + m_bv[nodes[1]] + m_bv[nodes[2]]) % 3;
- return m_bv.rank(nodes[hidx]);
- }
-
- void swap(mphf& other)
- {
- std::swap(m_n, other.m_n);
- std::swap(m_hash_domain, other.m_hash_domain);
- m_hasher.swap(other.m_hasher);
- m_bv.swap(other.m_bv);
- }
-
- void save(std::ostream& os) const
- {
- os.write(reinterpret_cast<char const*>(&m_n), sizeof(m_n));
- os.write(reinterpret_cast<char const*>(&m_hash_domain),
- sizeof(m_hash_domain));
- m_hasher.save(os);
- m_bv.save(os);
- }
-
- void load(std::istream& is)
- {
- is.read(reinterpret_cast<char*>(&m_n), sizeof(m_n));
- is.read(reinterpret_cast<char*>(&m_hash_domain),
- sizeof(m_hash_domain));
- m_hasher.load(is);
- m_bv.load(is);
- }
-
-
- private:
-
- uint64_t m_n;
- uint64_t m_hash_domain;
- BaseHasher m_hasher;
- ranked_bitpair_vector m_bv;
- };
-}
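
A minimal standalone sketch (not from the upstream sources) of the invariant behind the 2-bit assignment in the constructor above: for a peeled hyperedge (v0, v1, v2) with orientation t, storing (t - bv[v1] - bv[v2]) mod 3 at v0 (written as 3 when the remainder is 0, since ranked values must be nonzero) makes the sum of the three values congruent to t, which is exactly what lookup() computes, assuming bv[v1] and bv[v2] keep their values afterwards, as the peeling order guarantees.

#include <cassert>
#include <cstdint>

int main() {
    for (uint64_t t = 0; t < 3; ++t) {                 // every possible orientation
        for (uint64_t v1 = 0; v1 <= 3; ++v1) {         // 2-bit values; 0 and 3 are both 0 (mod 3)
            for (uint64_t v2 = 0; v2 <= 3; ++v2) {
                uint64_t assigned = v1 + v2;
                uint64_t v0 = (t - assigned + 9) % 3;  // same expression as in the constructor
                if (v0 == 0) v0 = 3;                   // nonzero so that it can be ranked
                assert((v0 + v1 + v2) % 3 == t);       // lookup() recovers the orientation
            }
        }
    }
    return 0;
}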
diff --git a/src/modules/data_structures/sequence/CMakeLists.txt b/src/modules/data_structures/sequence/CMakeLists.txt
deleted file mode 100644
index f465519..0000000
--- a/src/modules/data_structures/sequence/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(sequence CXX)
-
-add_library(sequence STATIC genome_storage.cpp)
diff --git a/src/modules/data_structures/sequence/genome_storage.cpp b/src/modules/data_structures/sequence/genome_storage.cpp
deleted file mode 100644
index f2f262e..0000000
--- a/src/modules/data_structures/sequence/genome_storage.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-//
-// Created by lab42 on 8/19/15.
-//
-
-#include "genome_storage.hpp"
-#include "data_structures/sequence/nucl.hpp"
-using namespace std;
-
-namespace debruijn_graph {
-//TODO exterminate this where possible
- Sequence GenomeStorage::GetSequence() const{
- stringstream ss;
- size_t l = 0, r = 0;
- for(size_t i = 0; i < s_.size(); i++) {
- if (! is_nucl(s_[i]) ) {
- if (r > l) {
- ss << s_.substr(l, r - l);
- }
- r = i + 1;
- l = i + 1;
- } else {
- r++;
- }
- }
- if (r > l) {
- ss << s_.substr(l, r - l);
- }
- return Sequence(ss.str());
- }
- void GenomeStorage::SetSequence(const Sequence &s) {
- s_ = s.str();
- }
- string GenomeStorage::str() const{
- return s_;
- }
- size_t GenomeStorage::size() const {
- return s_.size();
- }
-}
\ No newline at end of file
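
A hypothetical usage sketch of the class above (the include path is an assumption): non-ACGT characters such as 'N' are dropped by GetSequence() and the remaining ACGT runs are concatenated, while str() and size() still expose the raw stored string.

#include "data_structures/sequence/genome_storage.hpp"
#include <cassert>

int main() {
    debruijn_graph::GenomeStorage gs("ACGTNNACG");
    assert(gs.GetSequence().str() == "ACGTACG");  // the two ACGT runs, joined
    assert(gs.str() == "ACGTNNACG");              // raw stored string is unchanged
    assert(gs.size() == 9);                       // length of the raw string
    return 0;
}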
diff --git a/src/modules/data_structures/sequence/genome_storage.hpp b/src/modules/data_structures/sequence/genome_storage.hpp
deleted file mode 100644
index 401576d..0000000
--- a/src/modules/data_structures/sequence/genome_storage.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-//
-// Created by lab42 on 8/19/15.
-//
-
-#ifndef GENOME_STORAGE_HPP_
-#define GENOME_STORAGE_HPP_
-
-#include <string>
-#include "data_structures/sequence/sequence.hpp"
-namespace debruijn_graph {
- class GenomeStorage {
- private:
- std::string s_;
- public:
- GenomeStorage():s_(""){
- }
-
- GenomeStorage(const std::string &s): s_(s){
- }
-
- Sequence GetSequence() const;
- void SetSequence(const Sequence &s);
- std::string str() const;
- size_t size() const;
- };
-}
-#endif //GENOME_STORAGE_HPP_
diff --git a/src/modules/data_structures/sequence/nucl.hpp b/src/modules/data_structures/sequence/nucl.hpp
deleted file mode 100755
index 905d8c2..0000000
--- a/src/modules/data_structures/sequence/nucl.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file nucl.hpp
- * @author vyahhi
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * Simple operations and checks for nucleotide-letters
- *
- */
-
-
-#ifndef NUCL_HPP_
-#define NUCL_HPP_
-
-#include "dev_support/verify.hpp"
-#include <iostream>
-
-const char dignucl_map['T' + 1] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3};
-
-const bool isnucl_map[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
-
-const char nucl_map[4] = {'A', 'C', 'G', 'T'};
-
-const char nucl_complement_map['T' + 1] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 'T', 0, 'G', 0, 0, 0, 'C', 0, 0, 0, 0, 0, 0, 'N', 0, 0, 0, 0, 0, 'A'};
-
-/**
- * ACGT -> true
- * @param char c
- * @return true if c is 'A', 'C', 'G' or 'T'.
- */
-inline bool is_nucl(char c) { // is ACGT
- return isnucl_map[(unsigned)c];
-}
-
-/**
- * 0123 -> true
- * @param char c
- * @return true if c is 0, 1, 2 or 3.
- */
-inline bool is_dignucl(char c) { // is 0123
- return (c < 4);
-}
-
-/**
- * 0123 -> 3210
- * @param char c
- * @return c ^ 3
- */
-inline char complement(char c) {
- // VERIFY(is_dignucl(c));
- return c ^ 3;
-}
-
-/**
- * ACGT -> TGCA
- * @param char c is 'A', 'C', 'G', 'T' or 'N'
- * @return complement symbol, i.e. 'A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A', 'N' => 'N'
- */
-
-struct nucl_complement_functor { // still unused
-    inline char operator() (char c) const {
-        char cc = nucl_complement_map[(unsigned)c];
-        return cc ? cc : 'N';
-    }
-};
-
-inline char nucl_complement(char c){
- // TODO: deal with 'N' case
- //VERIFY(is_nucl(c));
- char cc = nucl_complement_map[(unsigned)c];
- return cc ? cc : 'N';
-}
-
-/**
- * 0123 -> ACGT
- * @param char c is 0, 1, 2 or 3
- * @return 0 => 'A', 1 => 'C', 2 => 'G', 3 => 'T'
- */
-inline char nucl(char c) {
- return nucl_map[(unsigned)c];
-}
-
-/**
- * ACGT -> 0123
- * @param char c is 'A', 'C', 'G' or 'T'
- * @return A => 0, C => 1, G => 2, T => 3
- */
-
-/*
-struct dignucl : public unary_function<int,bool> {
- bool operator()(signed char c) const {
- return dignucl_map[c];
- }
-};*/
-
-inline char dignucl(char c) {
- // VERIFY(is_nucl(c));
- return dignucl_map[(unsigned)c];
-}
-
-
-#endif /* NUCL_HPP_ */
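
A short hypothetical usage sketch of the ACGT <-> 0123 helpers above (the include path is an assumption):

#include "data_structures/sequence/nucl.hpp"
#include <cassert>

int main() {
    assert(is_nucl('G') && !is_nucl('N'));
    assert(dignucl('A') == 0 && dignucl('C') == 1 && dignucl('G') == 2 && dignucl('T') == 3);
    assert(nucl(dignucl('T')) == 'T');                 // 0123 -> ACGT round trip
    assert(complement(dignucl('A')) == dignucl('T'));  // 0 ^ 3 == 3
    assert(nucl_complement('C') == 'G' && nucl_complement('N') == 'N');
    return 0;
}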
diff --git a/src/modules/data_structures/sequence/rtseq.hpp b/src/modules/data_structures/sequence/rtseq.hpp
deleted file mode 100644
index ea1e279..0000000
--- a/src/modules/data_structures/sequence/rtseq.hpp
+++ /dev/null
@@ -1,740 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * rtseq.hpp
- *
- * Created on: Jun 28, 2012
- * Author: andrey
- */
-
-#ifndef RTSEQ_HPP_
-#define RTSEQ_HPP_
-
-#include <string>
-#include "dev_support/verify.hpp"
-#include <array>
-#include <algorithm>
-#include "data_structures/sequence/nucl.hpp"
-#include "dev_support/log.hpp"
-#include "seq_common.hpp"
-#include "seq.hpp"
-#include "simple_seq.hpp"
-
-#include <cstring>
-#include <iostream>
-
-template<size_t max_size_, typename T = seq_element_type>
-class RuntimeSeq {
-public:
- /**
- * @variable Number of bits in type T (e.g. 8 for char)
-     * @example 8 (for char) or 16 (for a two-byte T)
- */
- const static size_t TBits = sizeof(T) << 3;
-
- /**
- * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
- * TNucl MUST be a power of two
- * @example 4: 8/2 = 4 or 16/2 = 8
- */
- const static size_t TNucl = TBits >> 1;
-
- /**
- * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
- */
- const static size_t TNuclBits = log_<TNucl, 2>::value;
-
- const static size_t Iterations = log_<TBits, 2>::value;
-
- static const std::array<T, Iterations> ConstructLeftMasks() {
- std::array<T, Iterations> result;
- for (size_t i = 0; i < Iterations; i++) {
- size_t shift = 1 << i;
- T mask = T(T(1) << shift) - T(1);
- result[i] = T(mask << shift);
- for (size_t j = 0; j < i; j++) {
- result[j] += T(result[j] << shift);
- }
- }
- return result;
- }
-
- static const std::array<T, Iterations> ConstructRightMasks() {
- std::array<T, Iterations> result(ConstructLeftMasks());
- for (size_t i = 0; i < Iterations; i++) {
- result[i] = T(~result[i]);
- }
- return result;
- }
-
-
- RuntimeSeq<max_size_, T> FastRC() const {
- const static std::array<T, Iterations> LeftMasks(ConstructLeftMasks());
- const static std::array<T, Iterations> RightMasks(ConstructRightMasks());
- const static size_t LogTSize = log_<sizeof(T), 2>::value + 3;
-
- RuntimeSeq<max_size_, T> res(this->size());
-
- const size_t bit_size = size_ << 1;
- const size_t extra = bit_size & ((1 << LogTSize) - 1);
- const size_t to_extra = TBits - extra;
- const size_t filled = bit_size >> LogTSize;
- size_t real_length = filled;
- if (extra == 0) {
- for (size_t i = 0, j = filled - 1; i < filled; i++, j--) {
- res.data_[i] = data_[j];
- }
- } else {
- for (size_t i = 0, j = filled; i < filled && j > 0; i++, j--) {
- res.data_[i] = (data_[j] << to_extra) + (data_[j - 1] >> extra);
- }
- res.data_[filled] = (data_[0] << to_extra);
- real_length++;
- }
-
- for (size_t i = 0; i < real_length; i++) {
- res.data_[i] = res.data_[i] ^ T(-1);
- for (size_t it = 1; it < Iterations; it++) {
- size_t shift = 1 << it;
- res.data_[i] = T((res.data_[i] & LeftMasks[it]) >> shift) ^ T((res.data_[i] & RightMasks[it]) << shift);
- }
- }
-
- if (extra != 0) {
- res.data_[real_length - 1] = (res.data_[real_length - 1] & ((T(1) << extra) - 1));
- }
- return res;
- }
-
- /**
- * @variable Number of Ts which required to store all sequence.
- */
- const static size_t DataSize = (max_size_ + TNucl - 1) >> TNuclBits;
-
- /**
-     * @variable Number of meaningful bytes in which the seq is stored
- */
- const static size_t TotalBytes = sizeof(T) * DataSize;
-
- typedef T DataType;
-
- static size_t GetDataSize(size_t size) {
- return (size + TNucl - 1) >> TNuclBits;
- }
-
-private:
-    /**
-     * @variable Just some prime number for computing the hash of the kmer
-     */
- const static size_t PrimeNum = 239;
-
-
- // number of nucleotides in the last data_ bucket
- static size_t NuclsRemain(size_t size) {
- return size & (TNucl - 1);
- }
-
- // useful mask to fill the last element of the data_ array
- static size_t MaskForLastBucket(size_t size) {
- size_t nr = NuclsRemain(size);
- return nr != 0 ? (((T) 1) << (nr << 1)) - 1 : -1ul;
- }
-
-
- /**
- * @variable Inner representation of sequence: array of Ts with length = DataSize.
- *
- * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
- */
- std::array<T, DataSize> data_;
-
- size_t size_;
-
- /**
- * Initialize data_ array of this object with C-string
- *
- * @param s C-string (ACGT chars only), strlen(s) = size_
- */
- void init(const char *s) {
- T data = 0;
- size_t cnt = 0;
- size_t cur = 0;
- for (size_t pos = 0; pos < size_; ++pos, ++s) { // unsafe!
- // VERIFY(is_nucl(*s)); // for performance
- data = data | ((T) dignucl(*s) << cnt);
- cnt += 2;
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
-
- for (; cur < DataSize; ++cur)
- this->data_[cur] = 0;
-
- VERIFY(*s == 0); // C-string always ends on 0
- }
-
- /**
- * Sets i-th symbol of Seq with 0123-char
- */
- inline void set(const size_t i, char c) {
- data_[i >> TNuclBits] =
- (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
- }
-
-    // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
- template<class S>
- size_t size(const S &t,
- typename std::enable_if<std::is_class<S>::value, T>::type * = 0) {
- return t.size();
- }
-
- template<class S>
- size_t size(const S &t,
- typename std::enable_if<std::is_same<S, const char *>::value, T>::type * = 0) {
- return strlen(t);
- }
-
-
-public:
-
- const static size_t max_size = max_size_;
-
- RuntimeSeq() : size_(0) {
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- /**
- * Default constructor, fills Seq with A's
- */
-
- explicit RuntimeSeq(size_t k) : size_(k) {
- VERIFY(k <= max_size_);
- //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- RuntimeSeq(size_t k, const char *s) : size_(k) {
- VERIFY(k <= max_size_);
- //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
- init(s);
- }
-
-
- explicit RuntimeSeq(size_t k, const T *data_array) : size_(k) {
- VERIFY(k <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
-
- size_t data_size = GetDataSize(size_);
- memcpy(data_.data(), data_array, data_size * sizeof(T));
-
- if (NuclsRemain(size_)) {
- data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
- }
- }
-
- explicit RuntimeSeq(size_t k, T *data_array) : size_(k) {
- VERIFY(k <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
-
- size_t data_size = GetDataSize(size_);
- memcpy(data_.data(), data_array, data_size * sizeof(T));
-
- if (NuclsRemain(size_)) {
- data_[data_size - 1] = data_[data_size - 1] & MaskForLastBucket(size_);
- }
- }
-
- template<size_t size2_, typename T2 = T>
- explicit RuntimeSeq(const Seq<size2_, T2> &seq, bool) : size_(size2_) {
- VERIFY(size_ <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
- seq.copy_data(data_.data());
- }
-
- template<size_t size2_, typename T2 = T>
- explicit RuntimeSeq(const SimpleSeq<size2_, T2> &seq, size_t k) : size_(k) {
- VERIFY(size_ <= max_size_);
- VERIFY(size2_ <= max_size_);
- std::fill(data_.begin(), data_.end(), 0);
- seq.copy_data(data_.data());
- }
-
-
- /**
- * Ultimate constructor from ACGT0123-string.
- *
- * @param s Any object with operator[], which returns 0123 chars
- * @param offset Offset when this sequence starts
-     * @param number_to_read Number of nucleotides to fetch from this string
- * @warning assuming that s is a correct string, filled with ACGT _OR_ 0123
- * no init method, filling right here
- */
- template<typename S>
- explicit RuntimeSeq(size_t k, const S &s, size_t offset = 0) : size_(k) {
- VERIFY(size_ <= max_size_);
- //TRACE("New Constructor for seq " << s[0] << " is first symbol");
- VERIFY(size_ == 0 || is_dignucl(s[0]) || is_nucl(s[0]));
- VERIFY(offset + size_ <= this->size(s));
-
- // which symbols does our string contain : 0123 or ACGT?
- bool digit_str = size_ == 0 || is_dignucl(s[0]);
-
- // we fill everything with zeros (As) by default.
- std::fill(data_.begin(), data_.end(), 0);
-
- // data -- one temporary variable corresponding to the i-th array element
- // and some counters
- T data = 0;
- size_t cnt = 0;
- size_t cur = 0;
-
- for (size_t i = 0; i < size_; ++i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
-
-            // convert the symbol to a 0123 value if the string is in ACGT
- char c = (char) (digit_str ? s[offset + i] : dignucl(s[offset + i]));
-
- data = data | (T(c) << cnt);
- cnt += 2;
-
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
-
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
-
- for (; cur < DataSize; ++cur)
- this->data_[cur] = 0;
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
-     * and returns false if an error occurred, true otherwise.
- */
- bool BinRead(std::istream &file) {
- file.read((char *) data_.data(), sizeof(T) * GetDataSize(size_));
- return !file.fail();
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
-     * and returns false if an error occurred, true otherwise.
- */
- bool BinWrite(std::ostream &file) const {
- file.write((const char *) data_.data(), sizeof(T) * GetDataSize(size_));
- return !file.fail();
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
-     * and returns false if an error occurred, true otherwise.
- */
- static bool BinRead(std::istream &file, RuntimeSeq<max_size_, T> *seq) {
- return seq->BinRead(file);
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
-     * and returns false if an error occurred, true otherwise.
- */
- static bool BinWrite(std::ostream &file, const RuntimeSeq<max_size_, T> &seq) {
- return seq.BinWrite(file);
- }
-
-
- /**
- * Get i-th symbol of Seq.
- *
- * @param i Index of the symbol (0 <= i < size_)
- * @return 0123-char on position i
- */
- char operator[](const size_t i) const {
- VERIFY(i < size_);
- return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
- }
-
-    /**
- * Reverse complement.
- *
- * @return Reverse complement Seq.
- */
- RuntimeSeq<max_size_, T> operator!() const {
-// RuntimeSeq<max_size_, T> res(*this);
-// for (size_t i = 0; i < (size_ >> 1); ++i) {
-// auto front = complement(res[i]);
-// auto end = complement(res[size_ - 1 - i]);
-// res.set(i, end);
-// res.set(size_ - 1 - i, front);
-// }
-// if ((size_ & 1) == 1) {
-// res.set(size_ >> 1, complement(res[size_ >> 1]));
-// }
- return FastRC();
-// return res;
- }
-
- /**
- * Is the kmer minimal among this and !this.
- *
- * @return True if kmer < !kmer and false otherwise.
- */
- bool IsMinimal() const {
- for (size_t i = 0; (i << 1) + 1 <= size_; ++i) {
- auto front = this->operator[](i);
- auto end = complement(this->operator[](size_ - 1 - i));
- if (front != end)
- return front < end;
- }
- return true;
- }
-
- /**
- * Shift left
- *
- * @param c New 0123 char which should be added to the right.
- * @return Shifted (to the left) sequence with 'c' char on the right.
- */
- RuntimeSeq<max_size_, T> operator<<(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- RuntimeSeq<max_size_, T> res(*this);
- std::array<T, DataSize> &data = res.data_;
-
- size_t data_size = GetDataSize(size_);
-
- if (data_size != 0) { // unless empty sequence
- T rm = data[data_size - 1] & 3;
- T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
- data[data_size - 1] = (data[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
-
- if (data_size >= 2) { // if we have at least 2 elements in data
- for (int i = (int) data_size - 2; i >= 0; --i) {
- T new_rm = data[i] & 3;
- data[i] = (data[i] >> 2) |
-                              (rm << (TBits - 2)); // we need & here because if we shift negative, it fills with ones :(
- rm = new_rm;
- }
- }
- }
- return res;
- }
-
- void operator<<=(char c) {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- size_t data_size = GetDataSize(size_);
-
- if (data_size == 0) {
- return;
- }
-
- for (size_t i = 0; i < data_size - 1; ++i) {
- data_[i] = (data_[i] >> 2) | (((T) data_[i + 1] & 3) << (TBits - 2));
- }
-
- T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
- data_[data_size - 1] = (data_[data_size - 1] >> 2) | ((T) c << lastnuclshift_);
- }
-
-//todo naming convention violation!
- RuntimeSeq<max_size_, T> pushBack(char c) const {
- //VERIFY(size_ + 1 <= max_size_);
-
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- //VERIFY(is_dignucl(c));
- RuntimeSeq<max_size_, T> s(size_ + 1);
- copy(this->data_.begin(), this->data_.end(), s.data_.begin());
-
- size_t data_size = GetDataSize(size_ + 1);
-
- s.data_[data_size - 1] |= ((T) c << ((size_ & (TNucl - 1)) << 1));
-
- return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
- }
-
-
-//todo naming convention violation!
- void pushBackThis(char c) {
- VERIFY(size_ + 1 <= max_size_);
-
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- size_ += 1;
- size_t data_size = GetDataSize(size_);
-
- data_[data_size - 1] |= ((T) c << (((size_ - 1) & (TNucl - 1)) << 1));
- }
-
- // /**
- // * @todo optimize!!!
- // */
- // RuntimeSeq<max_size_, T> pushFront(char c) const {
- // VERIFY(size_ + 1 < max_size_);
- // if (is_nucl(c)) {
- // c = dignucl(c);
- // }
- // VERIFY(is_dignucl(c));
- // return RuntimeSeq<max_size_, T> (size_ + 1, nucl(c) + str());
- // }
-
- //todo naming convention violation!
- RuntimeSeq<max_size_, T> pushFront(char c) const {
- VERIFY(size_ + 1 <= max_size_);
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
- RuntimeSeq<max_size_, T> res(size_ + 1);
-
- size_t data_size = GetDataSize(size_ + 1);
-
- T rm = c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- return res;
- }
-
-//todo naming convention violation!
- void pushFrontThis(char c) {
- VERIFY(size_ + 1 <= max_size_);
-
- if (is_nucl(c)) {
- c = dignucl(c);
- }
-
- size_ += 1;
- size_t data_size = GetDataSize(size_);
-
- T rm = c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
- }
-
- /**
- * Shift right
- *
- * @param c New 0123 char which should be added to the left.
- * @return Shifted (to the right) sequence with 'c' char on the left.
- */
- RuntimeSeq<max_size_, T> operator>>(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
-
- RuntimeSeq<max_size_, T> res(*this);
- size_t data_size = GetDataSize(size_);
-
- T rm = c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (res.data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- res.data_[data_size - 1] &= MaskForLastBucket(size_);
-
- return res;
- }
-
- //todo remove code duplication!
- void operator>>=(char c) {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
-
- size_t data_size = GetDataSize(size_);
-
- T rm = (T) c;
- for (size_t i = 0; i < data_size; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- data_[data_size - 1] &= MaskForLastBucket(size_);
- }
-
- bool operator==(const RuntimeSeq<max_size_, T> &s) const {
- VERIFY(size_ == s.size_);
-
- size_t data_size = GetDataSize(size_);
- for (size_t i = 0; i < data_size; ++i)
- if (data_[i] != s.data_[i])
- return false;
-
- return true;
- }
-
- /**
- * @see operator ==()
- */
- bool operator!=(const RuntimeSeq<max_size_, T> &s) const {
- return !operator==(s);
- }
-
- /**
- * String representation of this Seq
- *
- * @return ACGT-string of length size_
- * @see nucl()
- */
- std::string str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i < size_; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
-
- std::string err() const {
- return "";
- }
-
-
- std::string full_str() const {
- std::string res(max_size, '-');
- for (size_t i = 0; i < max_size; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
-
- size_t size() const {
- return size_;
- }
-
- size_t data_size() const {
- return GetDataSize(size_);
- }
-
- const T *data() const {
- return data_.data();
- }
-
- template<size_t size2_, typename T2 = T>
- Seq<size2_, T2> get_seq() const {
- VERIFY(size2_ == size_);
- return Seq<size2_, T2>((T2 *) data_.data());
- }
-
- template<size_t size2_, typename T2 = T>
- SimpleSeq<size2_, T2> get_sseq() const {
- VERIFY(size2_ <= max_size_);
- return SimpleSeq<size2_, T2>((T2 *) data_.data());
- }
-
- void copy_data(void *dst) const {
- memcpy(dst, (const void *) data_.data(), GetDataSize(size_) * sizeof(T));
- }
-
- char last() const {
- return operator[](size_ - 1);
- }
-
- char first() const {
- return operator[](0);
- }
-
- static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
- return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
- }
-
- size_t GetHash(unsigned seed = 0) const {
- return GetHash(data_.data(), GetDataSize(size_), seed);
- }
-
- struct hash {
- size_t operator()(const RuntimeSeq<max_size_, T> &seq, uint32_t seed = 0) const {
- return seq.GetHash(seed);
- }
-
- size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
- return GetHash(data, sz, seed);
- }
- };
-
- struct less2 {
- int operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
- for (size_t i = 0; i < l.size(); ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
- return l.size() < r.size();
- }
- };
-
- /**
- * Denotes some (weird) order on k-mers. Works fast.
- */
- struct less2_fast {
- bool operator()(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) const {
- return 0 > memcmp(l.data(), r.data(), sizeof(T) * l.data_size());
- }
- };
-
-};
-
-template<size_t max_size_, typename T = seq_element_type>
-bool operator<(const RuntimeSeq<max_size_, T> &l, const RuntimeSeq<max_size_, T> &r) {
- for (size_t i = 0; i < l.size(); ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
-
- return l.size() < r.size();
-}
-
-
-template<size_t max_size_, typename T>
-std::ostream &operator<<(std::ostream &os, RuntimeSeq<max_size_, T> seq) {
- os << seq.str();
- return os;
-}
-
-namespace std {
-template<size_t max_size, typename T>
-struct hash<RuntimeSeq<max_size, T>> {
- size_t operator()(const RuntimeSeq<max_size, T> &seq) const {
- return seq.GetHash();
- }
-};
-
-};
-
-
-#endif /* RTSEQ_HPP_ */
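
A hypothetical usage sketch of RuntimeSeq (a 2-bit-packed k-mer whose length is fixed at run time), assuming the include path below and that a capacity of 55 is acceptable for the template parameter:

#include "data_structures/sequence/rtseq.hpp"
#include <cassert>
#include <iostream>

int main() {
    RuntimeSeq<55> kmer(5, "ACGTA");
    assert(kmer.str() == "ACGTA" && kmer.size() == 5);

    RuntimeSeq<55> shifted = kmer << 'C';   // drop the leftmost nucleotide, append 'C' on the right
    assert(shifted.str() == "CGTAC");

    RuntimeSeq<55> rc = !kmer;              // reverse complement via FastRC()
    assert(rc.str() == "TACGT");

    std::cout << kmer << " rc: " << rc << std::endl;
    return 0;
}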
diff --git a/src/modules/data_structures/sequence/runtime_k.hpp b/src/modules/data_structures/sequence/runtime_k.hpp
deleted file mode 100644
index bbb28b7..0000000
--- a/src/modules/data_structures/sequence/runtime_k.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef RUNTIME_K_HPP_
-#define RUNTIME_K_HPP_
-
-#include "data_structures/sequence/sequence.hpp"
-#include "data_structures/sequence/seq.hpp"
-#include "data_structures/sequence/simple_seq.hpp"
-#include "data_structures/sequence/rtseq.hpp"
-
-#include "k_range.hpp"
-
-namespace runtime_k {
-
-constexpr size_t t_size(void) {
- return sizeof(seq_element_type);
-}
-
-constexpr size_t get_t_elements_number(size_t value) {
- return ((value - 1) / (t_size() << 2) + 1);
-}
-
-constexpr size_t get_k_by_ts(size_t value) {
- return (value * (t_size() << 2));
-}
-
-constexpr size_t get_upper_bound(size_t value) {
- return get_k_by_ts(get_t_elements_number(value));
-}
-
-const size_t UPPER_BOUND = get_upper_bound(MAX_K); //((MAX_K - 1) / (sizeof(seq_element_type) << 2) + 1) * (sizeof(seq_element_type) << 2);
-
-const size_t MAX_TS = get_t_elements_number(MAX_K);
-
-const size_t MIN_TS = get_t_elements_number(MIN_K);
-
-
-typedef RuntimeSeq<UPPER_BOUND> RtSeq;
-
-} /* namespace runtime_k */
-
-#endif /* RUNTIME_K_HPP_ */
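
A standalone sketch (it assumes seq_element_type is a 64-bit word, i.e. 32 nucleotides per word, as in seq_common.hpp) of what the constexpr helpers above compute: UPPER_BOUND is simply MAX_K rounded up to a whole number of words.

#include <cstddef>
#include <cstdint>

typedef uint64_t seq_element_type;  // stand-in for the typedef from seq_common.hpp

constexpr size_t t_size() { return sizeof(seq_element_type); }                            // 8 bytes
constexpr size_t get_t_elements_number(size_t v) { return (v - 1) / (t_size() << 2) + 1; }
constexpr size_t get_upper_bound(size_t v) { return get_t_elements_number(v) * (t_size() << 2); }

static_assert(get_t_elements_number(55) == 2, "a 55-mer needs two 64-bit words");
static_assert(get_upper_bound(55) == 64, "and is stored with capacity for 64 nucleotides");
static_assert(get_upper_bound(128) == 128, "multiples of 32 are already aligned");

int main() { return 0; }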
diff --git a/src/modules/data_structures/sequence/seq.hpp b/src/modules/data_structures/sequence/seq.hpp
deleted file mode 100755
index 3753b74..0000000
--- a/src/modules/data_structures/sequence/seq.hpp
+++ /dev/null
@@ -1,529 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file seq.hpp
- * @author vyahhi
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * Immutable ACGT-sequence with compile-time size.
- * It compresses the sequence into an array of Ts (default: seq_element_type).
- */
-
-#ifndef SEQ_HPP_
-#define SEQ_HPP_
-
-#include <string>
-#include <array>
-#include <algorithm>
-#include <cstring>
-#include <iostream>
-
-#include <city/city.h>
-
-#include "dev_support/verify.hpp"
-#include "data_structures/sequence/nucl.hpp"
-#include "dev_support/log.hpp"
-#include "seq_common.hpp"
-
-
-/**
- * @param size_ max number of nucleotides
- * @param T type used for storage
- */
-template<size_t size_, typename T = seq_element_type>
-class Seq {
-public:
- /**
- * @variable Number of bits in type T (e.g. 8 for char)
- * @example 8 (for char) or 16 (for a two-byte T)
- */
- const static size_t TBits = sizeof(T) << 3;
-
- /**
- * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
- * TNucl MUST be a power of two
- * @example 4: 8/2 = 4 or 16/2 = 8
- */
- const static size_t TNucl = TBits >> 1;
-
- /**
- * @variable Number of bits in TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
- */
- const static size_t TNuclBits = log_<TNucl, 2>::value;
-
- /**
- * @variable Number of Ts which required to store all sequence.
- */
- const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
-
- typedef T DataType;
-
- /**
-     * @variable Number of meaningful bytes in which the seq is stored
- */
- const static size_t TotalBytes = sizeof(T) * DataSize;
-
- static size_t GetDataSize(size_t size) {
- VERIFY(size == size_);
- return (size_ + TNucl - 1) >> TNuclBits;
- }
-
-private:
-    /**
-     * @variable Just some prime number for computing the hash of the kmer
-     */
- const static size_t PrimeNum = 239;
-
- // number of nucleotides in the last data_ bucket
- const static size_t NuclsRemain = size_ & (TNucl - 1);
-
- // useful mask to fill the last element of the data_ array
- const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1)) - 1;
-
-
- /**
- * @variable Inner representation of sequence: array of Ts with length = DataSize.
- *
- * @invariant Invariant: all nucleotides >= size_ are 'A's (useful for comparison)
- */
- std::array<T, DataSize> data_;
-
- friend class Seq<size_ - 1, T>;
-
- /**
- * Initialize data_ array of this object with C-string
- *
- * @param s C-string (ACGT chars only), strlen(s) = size_
- */
- void init(const char *s) {
- T data = 0;
- size_t cnt = 0;
- int cur = 0;
- for (size_t pos = 0; pos != size_; ++pos, ++s) { // unsafe!
- // VERIFY(is_nucl(*s)); // for performance
- data = data | (T) ((T) dignucl(*s) << cnt);
- cnt += 2;
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
- VERIFY(*s == 0); // C-string always ends on 0
- }
-
-    // Template voodoo to calculate the length of the string regardless of whether it is std::string or const char*
- template<class S>
- size_t size(const S &t,
- typename std::enable_if<std::is_class<S>::value, T>::type * = 0) {
- return t.size();
- }
-
- template<class S>
- size_t size(const S &t,
- typename std::enable_if<std::is_same<S, const char *>::value, T>::type * = 0) {
- return strlen(t);
- }
-
-public:
- /**
- * Default constructor, fills Seq with A's
- */
- Seq() {
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- Seq(const char *s) {
- init(s);
- }
-
- explicit Seq(T *data_array) {
- memcpy(data_.data(), data_array, TotalBytes);
- }
-
- explicit Seq(unsigned, const T *data_array) {
- memcpy(data_.data(), data_array, TotalBytes);
- }
-
-
- /**
- * Ultimate constructor from ACGT0123-string.
- *
- * @param s Any object with operator[], which returns 0123 chars
- * @param offset Offset when this sequence starts
-     * @param number_to_read Number of nucleotides to fetch from this string
-     * @param raw If true, skip the check that offset + number_to_read fits into s
- * @warning assuming that s is a correct string, filled with ACGT _OR_ 0123
- * no init method, filling right here
- */
- template<typename S>
- explicit Seq(const S &s, size_t offset = 0, size_t number_to_read = size_,
- bool raw = false) {
- if (this->size(s) == 0) {
- return;
- }
- VERIFY(offset < this->size(s));
- VERIFY(is_dignucl(s[offset]) || is_nucl(s[offset]));
- if (!raw)
- VERIFY(offset + number_to_read <= this->size(s));
-
- // which symbols does our string contain : 0123 or ACGT?
- bool digit_str = is_dignucl(s[offset]);
-
- // data -- one temporary variable corresponding to the i-th array element
- // and some counters
- T data = 0;
- size_t cnt = 0;
- size_t cur = 0;
-
- for (size_t i = 0; i < number_to_read; ++i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
-
-            // convert the symbol to a 0123 value if the string is in ACGT
- char c = digit_str ? s[offset + i] : (char) dignucl(s[offset + i]);
-
- data = data | (T(c) << cnt);
- cnt += 2;
-
- if (cnt == TBits) {
- this->data_[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
-
- if (cnt != 0) {
- this->data_[cur++] = data;
- }
-
- for (; cur != DataSize; ++cur)
- this->data_[cur] = 0;
- }
-
-
- /**
- * Get i-th symbol of Seq.
- *
- * @param i Index of the symbol (0 <= i < size_)
- * @return 0123-char on position i
- */
- char operator[](const size_t i) const {
- return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
- }
-
- /**
- * Reverse complement.
- *
- * @return Reverse complement Seq.
- */
- Seq<size_, T> operator!() const {
- Seq<size_, T> res(*this);
- for (size_t i = 0; i < (size_ >> 1); ++i) {
- T front = complement(res[i]);
- T end = complement(res[size_ - 1 - i]);
- res.set(i, (char) end);
- res.set(size_ - 1 - i, (char) front);
- }
- if ((size_ & 1) == 1) {
- res.set(size_ >> 1, complement(res[size_ >> 1]));
- }
- // can be made without complement calls, but with xor on all bytes afterwards.
- return res;
- }
-
- /**
- * Shift left
- *
- * @param c New 0123 char which should be added to the right.
- * @return Shifted (to the left) sequence with 'c' char on the right.
- */
- Seq<size_, T> operator<<(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- Seq<size_, T> res(*this);
- std::array<T, DataSize> &data = res.data_;
- if (DataSize != 0) { // unless empty sequence
- T rm = data[DataSize - 1] & 3;
- T lastnuclshift_ = ((size_ + TNucl - 1) & (TNucl - 1)) << 1;
- data[DataSize - 1] = (data[DataSize - 1] >> 2) | ((T) c << lastnuclshift_);
-
- if (DataSize >= 2) { // if we have at least 2 elements in data
- int data_size = DataSize;
- for (int i = data_size - 2; i >= 0; --i) {
- T new_rm = data[i] & 3;
- data[i] = (data[i] >> 2) |
-                              (rm << (TBits - 2)); // we need & here because if we shift negative, it fills with ones :(
- rm = new_rm;
- }
- }
- }
- return res;
- }
-
- Seq<size_ + 1, T> pushBack(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- //VERIFY(is_dignucl(c));
- Seq<size_ + 1, T> s;
- copy(this->data_.begin(), this->data_.end(), s.data_.begin());
- s.data_[s.DataSize - 1] = s.data_[s.DataSize - 1] | ((T) c << ((size_ & (TNucl - 1)) << 1));
-
- return s; //was: Seq<size_ + 1, T>(str() + nucl(c));
-
- }
-
- // /**
- // * @todo optimize!!!
- // */
- // Seq<size_ + 1, T> pushFront(char c) const {
- // if (is_nucl(c)) {
- // c = dignucl(c);
- // }
- // VERIFY(is_dignucl(c));
- // return Seq<size_ + 1, T> (nucl(c) + str());
- // }
-
- Seq<size_ + 1, T> pushFront(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
- Seq<size_ + 1, T> res;
-
- //if new kmer has more Ts
- if (Seq<size_ + 1, T>::DataSize > DataSize) {
- res.data_[DataSize] = (data_[DataSize - 1] >> (TBits - 2)) & 3;
- }
-
- T rm = c;
- for (size_t i = 0; i < DataSize; ++i) {
- T new_rm = (data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (data_[i] << 2) | rm;
- rm = new_rm;
- }
-
- return res;
- }
-
- /**
- * Shift right
- *
- * @param c New 0123 char which should be added to the left.
- * @return Shifted (to the right) sequence with 'c' char on the left.
- */
- Seq<size_, T> operator>>(char c) const {
- if (is_nucl(c)) {
- c = dignucl(c);
- }
- VERIFY(is_dignucl(c));
- Seq<size_, T> res(*this);
- T rm = c;
- for (size_t i = 0; i < DataSize; ++i) {
- T new_rm = (res.data_[i] >> (TBits - 2)) & 3;
- res.data_[i] = (res.data_[i] << 2) | rm;
- rm = new_rm;
- }
- if ((size_ & (TNucl - 1)) != 0) {
- T lastnuclshift_ = (size_ & (TNucl - 1)) << 1;
- res.data_[DataSize - 1] = res.data_[DataSize - 1] & (((T) 1
- << lastnuclshift_) - 1);
- }
- return res;
- }
-
- /**
- * Sets i-th symbol of Seq with 0123-char
- */
- inline void set(const size_t i, char c) {
- data_[i >> TNuclBits] =
- (data_[i >> TNuclBits] & ~((T) 3 << ((i & (TNucl - 1)) << 1))) | ((T) c << ((i & (TNucl - 1)) << 1));
- }
-
- bool operator==(const Seq<size_, T> &s) const {
- for (size_t i = 0; i < DataSize; ++i)
- if (data_[i] != s.data_[i])
- return false;
- return true;
- }
-
- /**
- * @see operator ==()
- */
-
- bool operator!=(const Seq<size_, T> &s) const {
- return !operator==(s);
- }
-
- /**
- * String representation of this Seq
- *
- * @return ACGT-string of length size_
- * @see nucl()
- */
- std::string str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i != size_; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
-
- static size_t size() {
- return size_;
- }
-
-
- void copy_data(void *dst) const {
- memcpy(dst, (const void *) data_.data(), TotalBytes);
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
-     * and returns false if an error occurred, true otherwise.
- */
- static bool BinRead(std::istream &file, Seq<size_> *seq) {
- file.read((char *) seq->data_.data(), sizeof(T) * DataSize);
- return !file.fail();
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
-     * and returns false if an error occurred, true otherwise.
- */
- static bool BinWrite(std::ostream &file, const Seq<size_> &seq) {
- file.write((const char *) seq.data_.data(), sizeof(T) * DataSize);
- return !file.fail();
- }
-
- /**
- * Reads sequence from the file (in the same format as BinWrite writes it)
-     * and returns false if an error occurred, true otherwise.
- */
- bool BinRead(std::istream &file) {
- return BinRead(file, this);
- }
-
- /**
- * Writes sequence to the file (in the same format as BinRead reads it)
-     * and returns false if an error occurred, true otherwise.
- */
- bool BinWrite(std::ostream &file) const {
- return BinWrite(file, *this);
- }
-
- /**
- * @see Seq
- */
- template<size_t size2_, typename T2 = T>
- Seq<size2_, T2> start() const {
- VERIFY(size2_ <= size_);
- return Seq<size2_, T2>(*this);
- }
-
- template<size_t size2_/* = size_ - 1*/, typename T2 = T>
- Seq<size2_, T2> end() const {
- VERIFY(size2_ <= size_);
- return Seq<size2_, T2>(*this, size_ - size2_);
- }
-
- const T *data() const {
- return data_.data();
- }
-
- size_t data_size() const {
- return DataSize;
- }
-
-
- char last() const {
- return operator[](size_ - 1);
- }
-
- char first() const {
- return operator[](0);
- }
-
- static size_t GetHash(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
- return CityHash64WithSeed((const char *) data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
- }
-
- size_t GetHash(uint32_t seed = 0) const {
- return GetHash(data_.data(), DataSize, seed);
- }
-
- struct hash {
- size_t operator()(const Seq<size_, T> &seq, uint32_t seed = 0) const {
- return seq.GetHash(seed);
- }
-
- size_t operator()(const DataType *data, size_t sz = DataSize, uint32_t seed = 0) {
- return GetHash(data, sz, seed);
- }
- };
-
- struct equal_to {
- bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
- return r == l;
- }
- };
-
- struct less2 {
- bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
- for (size_t i = 0; i < size_; ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
- return false;
- }
- };
-
- /**
- * Denotes some (weird) order on k-mers. Works fast.
- */
- struct less2_fast {
- bool operator()(const Seq<size_, T> &l, const Seq<size_, T> &r) const {
- return 0 > memcmp(l.data_.data(), r.data_.data(), sizeof(T) * DataSize);
- }
- };
-};
-
-template<size_t size_, typename T>
-std::ostream &operator<<(std::ostream &os, Seq<size_, T> seq) {
- os << seq.str();
- return os;
-}
-
-//namespace std {
-//
-//template<size_t size_, typename T = seq_element_type>
-//struct hash<Seq<size_, T> {
-// typedef size_t result_type;
-// typedef Seq<size_, T> argument_type;
-//
-// result_type operator() (const argument_type& arg) {
-// return Seq<size_, T>::hash()(arg);
-// }
-//};
-//
-//}
-
-#endif /* SEQ_HPP_ */
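
A hypothetical usage sketch of the compile-time-sized Seq above (the include path is an assumption):

#include "data_structures/sequence/seq.hpp"
#include <cassert>

int main() {
    Seq<5> kmer("ACGTA");
    assert(kmer.str() == "ACGTA");
    assert(kmer[0] == dignucl('A') && kmer[4] == dignucl('A'));

    Seq<5> shifted = kmer << 'G';        // shift left, 'G' appended on the right
    assert(shifted.str() == "CGTAG");

    Seq<6> longer = kmer.pushBack('T');  // the size grows at compile time
    assert(longer.str() == "ACGTAT");

    assert((!kmer).str() == "TACGT");    // reverse complement
    return 0;
}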
diff --git a/src/modules/data_structures/sequence/seq_common.hpp b/src/modules/data_structures/sequence/seq_common.hpp
deleted file mode 100644
index eb987d5..0000000
--- a/src/modules/data_structures/sequence/seq_common.hpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * seq_common.hpp
- *
- * Created on: Jun 25, 2012
- * Author: andrey
- */
-
-#ifndef SEQ_COMMON_HPP_
-#define SEQ_COMMON_HPP_
-
-typedef u_int64_t seq_element_type;
-
-#endif /* SEQ_COMMON_HPP_ */
diff --git a/src/modules/data_structures/sequence/sequence.hpp b/src/modules/data_structures/sequence/sequence.hpp
deleted file mode 100755
index b25d217..0000000
--- a/src/modules/data_structures/sequence/sequence.hpp
+++ /dev/null
@@ -1,553 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SEQUENCE_HPP_
-#define SEQUENCE_HPP_
-
-#include <vector>
-#include <string>
-#include <memory>
-#include <cstring>
-
-#include "data_structures/sequence/seq.hpp"
-#include "data_structures/sequence/rtseq.hpp"
-
-class Sequence {
- // Type to store Seq in Sequences
- typedef seq_element_type ST;
- // Number of bits in ST
- const static size_t STBits = sizeof(ST) << 3;
- // Number of nucleotides in ST
- const static size_t STN = (STBits >> 1);
- // Number of bits in STN (for faster div and mod)
- const static size_t STNBits = log_<STN, 2>::value;
-
- template<typename T>
- struct array_deleter {
- void operator()(const T *p) { delete[] p; }
- };
-
-private:
- size_t from_;
- size_t size_;
-    bool rtl_; // Right to left + complementary (reverse-complement view of the shared data)
- std::shared_ptr<ST> data_;
-
- static size_t DataSize(size_t size) {
- return (size + STN - 1) >> STNBits;
- }
-
- template<typename S>
- void InitFromNucls(const S &s, bool rc = false) {
- size_t bytes_size = DataSize(size_);
- ST *bytes = data_.get();
-
- VERIFY(is_dignucl(s[0]) || is_nucl(s[0]));
-
- // Which symbols does our string contain : 0123 or ACGT?
- bool digit_str = is_dignucl(s[0]);
-
- // data -- one temporary variable corresponding to the i-th array element
- // and some counters
- ST data = 0;
- size_t cnt = 0;
- size_t cur = 0;
-
- if (rc) {
- for (int i = (int) size_ - 1; i >= 0; --i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
- char c = complement(digit_str ? s[(unsigned) i] : dignucl(s[(unsigned) i]));
-
- data = data | (ST(c) << cnt);
- cnt += 2;
-
- if (cnt == STBits) {
- bytes[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- } else {
- for (size_t i = 0; i < size_; ++i) {
- //VERIFY(is_dignucl(s[i]) || is_nucl(s[i]));
- char c = digit_str ? s[i] : dignucl(s[i]);
-
- data = data | (ST(c) << cnt);
- cnt += 2;
-
- if (cnt == STBits) {
- bytes[cur++] = data;
- cnt = 0;
- data = 0;
- }
- }
- }
-
- if (cnt != 0)
- bytes[cur++] = data;
-
- for (; cur < bytes_size; ++cur)
- bytes[cur] = 0;
- }
-
-
-public:
- /**
- * Sequence initialization (arbitrary size string)
- *
- * @param s ACGT or 0123-string
- */
- explicit Sequence(const char *s, bool rc = false) :
- from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- InitFromNucls(s, rc);
- }
-
- explicit Sequence(char *s, bool rc = false) :
- from_(0), size_(strlen(s)), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- InitFromNucls(s, rc);
- }
-
- template<typename S>
- explicit Sequence(const S &s, bool rc = false) :
- from_(0), size_(s.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- InitFromNucls(s, rc);
- }
-
- Sequence() :
- from_(0), size_(0), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
- memset(data_.get(), 0, DataSize(size_));
- }
-
- template<size_t size2_>
- explicit Sequence(const Seq<size2_> &kmer, size_t) :
- from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
-
- kmer.copy_data(data_.get());
- }
-
- template<size_t size2_>
- explicit Sequence(const RuntimeSeq<size2_> &kmer, size_t) :
- from_(0), size_(kmer.size()), rtl_(false), data_(new ST[DataSize(size_)], array_deleter<ST>()) {
-
- kmer.copy_data(data_.get());
- }
-
- Sequence(const Sequence &seq, size_t from, size_t size, bool rtl) :
- from_(from), size_(size), rtl_(rtl), data_(seq.data_) {
- }
-
- Sequence(const Sequence &s) :
- from_(s.from_), size_(s.size_), rtl_(s.rtl_), data_(s.data_) {
- }
-
- ~Sequence() { }
-
- const Sequence &operator=(const Sequence &rhs) {
- if (&rhs != this) {
- from_ = rhs.from_;
- size_ = rhs.size_;
- rtl_ = rhs.rtl_;
- data_ = rhs.data_;
- }
-
- return *this;
- }
-
- char operator[](const size_t index) const {
- //todo can be put back after switching to distributing release without asserts
- //VERIFY(index < size_);
- const ST *bytes = data_.get();
- if (rtl_) {
- size_t i = from_ + size_ - 1 - index;
- return complement((bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3);
- } else {
- size_t i = from_ + index;
- return (bytes[i >> STNBits] >> ((i & (STN - 1)) << 1)) & 3;
- }
- }
-
- bool operator==(const Sequence &that) const {
- if (size_ != that.size_) {
- return false;
- }
-
- if (data_ == that.data_ && from_ == that.from_ && rtl_ == that.rtl_) {
- return true;
- }
-
- for (size_t i = 0; i < size_; ++i) {
- if (this->operator[](i) != that[i]) {
- return false;
- }
- }
- return true;
- }
-
- bool operator!=(const Sequence &that) const {
- return !(operator==(that));
- }
-
- /**
- * @todo Might be optimized via int comparison (not so easy)
- */
- bool operator<(const Sequence &that) const {
- size_t s = std::min(size_, that.size_);
- for (size_t i = 0; i < s; ++i) {
- if (this->operator[](i) != that[i]) {
- return (this->operator[](i) < that[i]);
- }
- }
- return (size_ < that.size_);
- }
-
- Sequence operator!() const {
- return Sequence(*this, from_, size_, !rtl_);
- }
-
- inline Sequence operator<<(char c) const;
-
- /**
- * @param from inclusive
- * @param to exclusive;
- */
- inline Sequence Subseq(size_t from, size_t to) const;
-
- inline Sequence Subseq(size_t from) const; // up to size_ by default
- inline Sequence First(size_t count) const;
-
- inline Sequence Last(size_t count) const;
-
- inline Sequence operator+(const Sequence &s) const;
-
- /////todo what are these methods???
- inline size_t find(const Sequence &t, size_t from = 0) const;
-
- inline size_t similar(const Sequence &t, size_t k, char directed = 0) const;
-
- inline size_t leftSimilar(const Sequence &t, size_t k) const;
-
- inline size_t rightSimilar(const Sequence &t, size_t k) const;
-
-    /**
-     * @return true if the two sequences intersect, i.e. agree at some position
-     */
- inline bool intersects(const Sequence &t) const;
-
- template<size_t size2_>
- Seq<size2_> start() const;
-
- template<size_t size2_>
- Seq<size2_> fast_start() const;
-
- template<size_t size2_>
- Seq<size2_> end() const;
-
- template<class Seq>
- Seq start(size_t k) const;
-
- template<class Seq>
- Seq end(size_t k) const;
-
- inline std::string str() const;
-
- inline std::string err() const;
-
- size_t size() const {
- return size_;
- }
-
- template<class Seq>
- bool contains(const Seq& s, size_t offset = 0) const {
- VERIFY(offset + s.size() <= size());
-
- for (size_t i = 0, e = s.size(); i != e; ++i)
- if (operator[](offset + i) != s[i])
- return false;
-
- return true;
- }
-
-private:
- inline bool ReadHeader(std::istream &file);
-
- inline bool WriteHeader(std::ostream &file) const;
-
-public:
- inline bool BinRead(std::istream &file);
-
- inline bool BinWrite(std::ostream &file) const;
-};
-
-inline std::ostream &operator<<(std::ostream &os, const Sequence &s);
-
-/**
- * Start of the Sequence as a Seq of the requested (compile-time) size
- */
-template<size_t size2_>
-Seq<size2_> Sequence::start() const {
- //VERIFY(size2_ <= size_);
- return Seq<size2_>(*this);
-}
-
-template<size_t size2_>
-Seq<size2_> Sequence::fast_start() const {
- ST result[(size2_ + STN - 1) >> STNBits] = {0};
-
- size_t start = from_ >> STNBits;
- size_t end = (from_ + size_ - 1) >> STNBits;
- size_t shift = (from_ & (STN - 1)) << 1;
- const ST *bytes = data_.get();
-
- for (size_t i = start; i <= end; ++i) {
- result[i - start] = bytes[i] >> shift;
- }
-
- if (shift != 0) {
- shift = STBits - shift;
-
- for (size_t i = start + 1; i <= end; ++i) {
- result[i - start - 1] |= bytes[i] << shift;
- }
- }
-
- return (rtl_ ? !Seq<size2_>(result) : Seq<size2_>(result));
-}
-
-template<size_t size2_>
-Seq<size2_> Sequence::end() const {
- return Seq<size2_>(*this, size_ - size2_);
-}
-
-
-template<class Seq>
-Seq Sequence::start(size_t k) const {
- return Seq(unsigned(k), *this);
-}
-
-template<class Seq>
-Seq Sequence::end(size_t k) const {
- return Seq(unsigned(k), *this, size_ - k);
-}
-
-
-Sequence Sequence::First(size_t count) const {
- return Subseq(0, count);
-}
-
-Sequence Sequence::Last(size_t count) const {
- return Subseq(size_ - count);
-}
-
-bool Sequence::intersects(const Sequence &t) const {
- for (size_t i = 0; i < std::min(size_, t.size_); ++i) {
- if (this->operator[](i) == t[i]) {
- return true;
- }
- }
- return false;
-}
-
-// O(1)
-//including from, excluding to
-//safe (bounds-checked) unless NDEBUG is defined
-Sequence Sequence::Subseq(size_t from, size_t to) const {
- // cerr << endl<<"subseq:" << from <<" " << to << " " << this->str() << endl;
- VERIFY(to >= from);
- VERIFY(to <= size_);
- //VERIFY(to - from <= size_);
- if (rtl_) {
- return Sequence(*this, from_ + size_ - to, to - from, true);
- } else {
- return Sequence(*this, from_ + from, to - from, false);
- }
-}
-
-//including from, excluding to
-Sequence Sequence::Subseq(size_t from) const {
- return Subseq(from, size_);
-}
-
-/**
- * @todo : must be KMP or hashing instead of this
- */
-size_t Sequence::find(const Sequence &t, size_t from) const {
- for (size_t i = from; i <= size() - t.size(); i++) {
- if (Subseq(i, i + t.size()) == t) {
- return i;
- }
- }
- return -1ULL;
-}
-
-/**
- *
- *@param k minimal length of the overlap between the sequences
- *@param directed 0: undirected similarity, 1: t may only extend this to the right, -1: this may only extend t; the overlapping parts must match perfectly
- *@return 1 if the sequences overlap by at least k in an allowed direction, 0 otherwise
- *
- */
-size_t Sequence::similar(const Sequence &t, size_t k, char directed) const {
- size_t result = 0;
- if (directed != -1)
- result |= rightSimilar(t, k);
- if (directed != 1)
- result |= leftSimilar(t, k);
- return result;
-}
-
-size_t Sequence::leftSimilar(const Sequence &t, size_t k) const {
- return t.rightSimilar(*this, k);
-}
-
-size_t Sequence::rightSimilar(const Sequence &t, size_t k) const {
- size_t tsz = t.size();
- size_t sz = size();
- Sequence d(t.Subseq(0, k));
- for (size_t res = find(d, 0); res != -1ULL; res = find(d, res + 1)) {
- if (res + tsz < sz)
- continue;
- size_t i;
- for (i = k; i + res < sz; i++) {
- if (t[i] != this->operator[](i + res)) {
- break;
- };
- }
- if (i == sz - res)
- return 1;
- }
- return 0;
-}
-
-/**
- * @todo optimize
- */
-Sequence Sequence::operator+(const Sequence &s) const {
- return Sequence(str() + s.str());
- // TODO might be opposite to correct
- // int total = size_ + s.size_;
- // std::vector<Seq<4> > bytes((total + 3) >> 2);
- // for (size_t i = 0; i < size_; ++i) {
- // bytes[i / 4] = (bytes[i / 4] << operator [](i)); // TODO :-) use <<=
- // }
- // for (size_t i = 0, j = size_; i < s.size_; ++i, ++j) {
- // bytes[j / 4] = (bytes[j / 4]) << s[i];
- // }
- // return Sequence(new Data(bytes), 0, total, false);
-}
-
-std::string Sequence::str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i < size_; ++i) {
- res[i] = nucl(this->operator[](i));
- }
- return res;
-}
-
-std::string Sequence::err() const {
- std::ostringstream oss;
- oss << "{ *data=" << data_ <<
- ", from_=" << from_ <<
- ", size_=" << size_ <<
- ", rtl_=" << int(rtl_) << " }";
- return oss.str();
-}
-
-std::ostream &operator<<(std::ostream &os, const Sequence &s) {
- os << s.str();
- return os;
-}
-
-bool Sequence::ReadHeader(std::istream &file) {
- file.read((char *) &size_, sizeof(size_));
-
- from_ = 0;
- rtl_ = false;
-
- return !file.fail();
-}
-
-bool Sequence::WriteHeader(std::ostream &file) const {
- VERIFY(from_ == 0);
- VERIFY(!rtl_);
-
- file.write((const char *) &size_, sizeof(size_));
-
- return !file.fail();
-}
-
-
-bool Sequence::BinRead(std::istream &file) {
- ReadHeader(file);
-
- data_ = std::shared_ptr<ST>(new ST[DataSize(size_)], array_deleter<ST>());
- file.read((char *) data_.get(), DataSize(size_) * sizeof(ST));
-
- return !file.fail();
-}
-
-
-bool Sequence::BinWrite(std::ostream &file) const {
- if (from_ != 0 || rtl_) {
- Sequence clear(this->str());
- return clear.BinWrite(file);
- }
-
- WriteHeader(file);
-
- file.write((const char *) data_.get(), DataSize(size_) * sizeof(ST));
-
- return !file.fail();
-}
-
-/**
- * @class SequenceBuilder
- * @section DESCRIPTION
- *
- * Helper class for building a Sequence incrementally; it provides size(), append() and BuildSequence().
- */
-
-class SequenceBuilder {
- std::vector<char> buf_;
-public:
- template<typename S>
- SequenceBuilder &append(const S &s) {
- for (size_t i = 0; i < s.size(); ++i) {
- buf_.push_back(s[i]);
- }
- return *this;
- }
-
- SequenceBuilder &append(char c) {
- buf_.push_back(c);
- return *this;
- }
-
- Sequence BuildSequence() {
- return Sequence(buf_);
- }
-
- size_t size() const {
- return buf_.size();
- }
-
- char operator[](const size_t index) const {
- VERIFY(index < buf_.size());
- return buf_[index];
- }
-
- std::string str() const {
- std::string s(buf_.size(), '-');
- for (size_t i = 0; i < s.size(); ++i) {
- s[i] = nucl(buf_[i]);
- }
- return s;
- }
-};
-
-#endif /* SEQUENCE_HPP_ */
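
A hypothetical usage sketch of Sequence (arbitrary-length, 2-bit-packed, shared storage) and SequenceBuilder, assuming the include path below:

#include "data_structures/sequence/sequence.hpp"
#include <cassert>

int main() {
    Sequence s("ACGTACGT");
    assert(s.size() == 8);
    assert(s.Subseq(2, 6).str() == "GTAC");  // [from, to): O(1), shares the packed data
    assert((!s).str() == "ACGTACGT");        // this palindrome equals its own reverse complement

    SequenceBuilder sb;
    sb.append(s.Subseq(0, 4)).append(s.Subseq(4));
    assert(sb.BuildSequence() == s);
    return 0;
}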
diff --git a/src/modules/data_structures/sequence/sequence_tools.hpp b/src/modules/data_structures/sequence/sequence_tools.hpp
deleted file mode 100644
index eea0e65..0000000
--- a/src/modules/data_structures/sequence/sequence_tools.hpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SEQUENCE_TOOLS_HPP_
-#define SEQUENCE_TOOLS_HPP_
-
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "data_structures/sequence/nucl.hpp"
-#include "data_structures/sequence/sequence.hpp"
-#include "utils/levenshtein.hpp"
-
-inline const std::string Reverse(const std::string &s) {
- return std::string(s.rbegin(), s.rend());
-}
-
-inline const std::string Complement(const std::string &s) {
- std::string res(s.size(), 0);
- transform(s.begin(), s.end(), res.begin(), nucl_complement);
- return res;
-}
-
-inline const Sequence MergeOverlappingSequences(std::vector<Sequence>& ss,
- size_t overlap, bool safe_merging = true) {
- if (ss.empty()) {
- return Sequence();
- }
- SequenceBuilder sb;
- Sequence prev_end = ss.front().Subseq(0, overlap);
- sb.append(prev_end);
- for (auto it = ss.begin(); it != ss.end(); ++it) {
- if(safe_merging)
- VERIFY(prev_end == it->Subseq(0, overlap));
- sb.append(it->Subseq(overlap));
- prev_end = it->Subseq(it->size() - overlap);
- }
- return sb.BuildSequence();
-}
-
-inline size_t EditDistance(const Sequence& s1, const Sequence& s2) {
- return edit_distance(s1.str(), s2.str());
-}
-
-inline bool Relax(int& val, int new_val) {
- if (new_val > val) {
- val = new_val;
- return true;
- }
- return false;
-}
-
-inline std::pair<size_t, size_t> LocalSimilarity(const Sequence& s1, const Sequence& s2) {
- size_t m = s1.size();
- size_t n = s2.size();
- std::vector<std::vector<int>> a(m + 1);
- for (size_t i = 0; i <= m; ++i) {
- a[i].resize(n + 1);
- }
- for (size_t i = 0; i <= m; ++i) {
- for (size_t j = 0; j <= n; ++j) {
- a[i][j] = 0;
- }
- }
- for (size_t i = 1; i <= m; ++i) {
- for (size_t j = 1; j <= n; ++j) {
- Relax(a[i][j], a[i - 1][j] - 1);
- Relax(a[i][j], a[i][j - 1] - 1);
- if (s1[i - 1] == s2[j - 1]) {
- Relax(a[i][j], a[i - 1][j - 1] + 1);
- } else {
- Relax(a[i][j], a[i - 1][j - 1] - 1);
- }
- }
- }
-
- //finding local alignment
- int answer = 0;
- size_t i_m = 0;
- size_t j_m = 0;
- for (size_t i = 0; i <= m; ++i) {
- for (size_t j = 0; j <= n; ++j) {
- if (Relax(answer, a[i][j])) {
- i_m = i;
- j_m = j;
- }
- }
- }
-
- //finding alignment lengths
- size_t i = i_m;
- size_t j = j_m;
- while (a[i][j] > 0) {
- if (a[i][j] == a[i][j - 1] - 1) {
- j--;
- } else if (a[i][j] == a[i-1][j] - 1) {
- i--;
- } else if (a[i][j] == a[i-1][j-1] + 1) {
- VERIFY(s1[i-1] == s2[j-1]);
- i--;
- j--;
- } else {
- VERIFY(a[i-1][j-1] - 1 == a[i][j] && s1[i-1] != s2[j-1]);
- i--;
- j--;
- }
- }
- return std::make_pair(size_t(answer), std::min(i_m - i, j_m - j));
-}
-
-inline const std::string ReverseComplement(const std::string &s) {
- std::string res(s.size(), 0);
- transform(s.begin(), s.end(), res.rbegin(), nucl_complement); // only difference with reverse is rbegin() instead of begin()
- return res;
-}
-
-class UniformPositionAligner {
-private:
- size_t upper_length_;
- size_t lower_length_;
-public:
- UniformPositionAligner(size_t upper_length, size_t lower_length) :
- upper_length_(upper_length), lower_length_(lower_length) {
- }
-
- size_t GetPosition(size_t upper_position) {
- if (upper_position * 2 + 1 >= upper_length_)
- return (2 * upper_position + 1) * lower_length_
- / (2 * upper_length_);
- else
- return lower_length_ - 1
- - GetPosition(upper_length_ - 1 - upper_position);
- }
-};
-
-class EnsureEndsPositionAligner {
-private:
- size_t upper_length_;
- size_t lower_length_;
-public:
- EnsureEndsPositionAligner(size_t upper_length, size_t lower_length) :
- upper_length_(upper_length), lower_length_(lower_length) {
- }
-
- size_t GetPosition(size_t upper_position) {
- VERIFY(upper_position > 0);
- if (lower_length_ == 1)
- return 1;
- return (2 * upper_position * lower_length_ + upper_length_)
- / (2 * upper_length_);
- }
-};
-
-#endif /* SEQUENCE_TOOLS_HPP_ */
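To make the contract of MergeOverlappingSequences concrete (with safe_merging on, every sequence must start with the last `overlap` nucleotides of its predecessor), a hypothetical sketch; the 6-mers below are illustrative:

#include "data_structures/sequence/sequence_tools.hpp"
#include <iostream>
#include <vector>

int main() {
    // Three 6-mers, each sharing a 3-nucleotide overlap with the previous one.
    std::vector<Sequence> parts = {
        Sequence("ACGTAC"), Sequence("TACGGA"), Sequence("GGATTT")
    };

    Sequence merged = MergeOverlappingSequences(parts, 3);
    std::cout << merged.str() << std::endl;  // ACGTACGGATTT

    // Levenshtein distance between the string forms of two sequences.
    std::cout << EditDistance(parts[0], parts[1]) << std::endl;
    return 0;
}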
diff --git a/src/modules/data_structures/sequence/simple_seq.hpp b/src/modules/data_structures/sequence/simple_seq.hpp
deleted file mode 100644
index 77d0fe3..0000000
--- a/src/modules/data_structures/sequence/simple_seq.hpp
+++ /dev/null
@@ -1,157 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * simple_seq.hpp
- *
- * Created on: Jul 23, 2012
- * Author: andrey
- */
-
-#ifndef SIMPLE_SEQ_HPP_
-#define SIMPLE_SEQ_HPP_
-
-#include <string>
-#include <array>
-#include <algorithm>
-#include <cstring>
-#include <iostream>
-
-#include "dev_support/verify.hpp"
-#include "data_structures/sequence/nucl.hpp"
-#include "dev_support/log.hpp"
-#include "seq_common.hpp"
-/**
- * @param T is max number of nucleotides, type for storage
- */
-template<size_t size_, typename T = seq_element_type>
-class SimpleSeq {
-public:
- /**
- * @variable Number of bits in type T (e.g. 8 for char)
- * @example 8 (for char) or 16 (for uint16_t)
- */
- const static size_t TBits = sizeof(T) << 3;
-
- /**
- * @variable Number of nucleotides that can be stored in one type T (e.g. 4 for char)
- * TNucl MUST be a power of two
- * @example 4: 8/2 = 4 or 16/2 = 8
- */
- const static size_t TNucl = TBits >> 1;
-
- /**
- * @variable log2 of TNucl (e.g. 2 for char). Useful for shifts instead of divisions.
- */
- const static size_t TNuclBits = log_<TNucl, 2>::value;
-
- /**
- * @variable Number of Ts required to store the whole sequence.
- */
- const static size_t DataSize = (size_ + TNucl - 1) >> TNuclBits;
-
- typedef T DataType;
-
- /**
- * @variable Number of meaningful bytes in which the sequence is stored.
- */
- const static size_t TotalBytes = sizeof(T) * DataSize;
-
-private:
- // number of nucleotides in the last data_ bucket
- const static size_t NuclsRemain = size_ & (TNucl - 1);
-
- // useful mask to fill the last element of the data_ array
- const static size_t MaskForLastBucket = (((T) 1) << (NuclsRemain << 1) ) - 1;
-
-
- /**
- * @variable Inner representation of sequence: array of Ts with length = DataSize.
- *
- * @invariant All nucleotides at positions >= size_ are 'A's (useful for comparison).
- */
- std::array<T, DataSize> data_;
-
-
-public:
-
- SimpleSeq() {
- //VERIFY((T)(-1) >= (T)0);//be sure to use unsigned types
- std::fill(data_.begin(), data_.end(), 0);
- }
-
- explicit SimpleSeq(T * data_array) {
- memcpy(data_.data(), data_array, TotalBytes);
- }
-
-
- char operator[](const size_t i) const {
- //VERIFY(i >= 0);
- //VERIFY(i < size_);
- return (data_[i >> TNuclBits] >> ((i & (TNucl - 1)) << 1)) & 3;
- }
-
- std::string str() const {
- std::string res(size_, '-');
- for (size_t i = 0; i < size_; ++i) {
- res[i] = nucl(operator[](i));
- }
- return res;
- }
-
- void copy_data(void * dst) const {
- memcpy(dst, (const void *) data_.data(), TotalBytes);
- }
-
- static size_t GetHash(const DataType *data, size_t sz, uint32_t seed = 0) {
- return CityHash64WithSeed((const char*)data, sz * sizeof(DataType), 0x9E3779B9 ^ seed);
- }
-
- size_t GetHash(uint32_t seed = 0) const {
- return GetHash(data_.data(), DataSize, seed);
- }
-
- struct hash {
- size_t operator()(const SimpleSeq<size_, T>& seq, uint32_t seed = 0) const {
- return seq.GetHash(seed);
- }
-
- size_t operator()(const DataType *data, size_t sz, unsigned seed = 0) {
- return GetHash(data, sz, seed);
- }
- };
-
- struct equal_to {
- bool operator()(const SimpleSeq<size_, T>& l, const SimpleSeq<size_, T>& r) const {
- for (size_t i = 0; i < DataSize; ++i)
- if (l.data_[i] != r.data_[i])
- return false;
- return true;
- }
- };
-
- struct less2 {
- int operator()(const SimpleSeq<size_, T> &l, const SimpleSeq<size_, T> &r) const {
- for (size_t i = 0; i < size_; ++i) {
- if (l[i] != r[i]) {
- return (l[i] < r[i]);
- }
- }
- return false;
- }
- };
-
-};
-
-template<size_t size_, typename T>
-std::ostream& operator<<(std::ostream& os, SimpleSeq<size_, T> seq) {
- os << seq.str();
- return os;
-}
-
-
-#endif /* SIMPLE_SEQ_HPP_ */
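A small sketch of the packed layout used by SimpleSeq above, assuming the usual nucl() mapping 0=A, 1=C, 2=G, 3=T; the 2-bit codes occupy the low bits first within every element:

#include "data_structures/sequence/simple_seq.hpp"
#include <iostream>

int main() {
    // 8 nucleotides in unsigned chars: TNucl == 4 per byte, so DataSize == 2.
    typedef SimpleSeq<8, unsigned char> Seq8;

    // "ACGT" packed into one byte, stored twice.
    unsigned char byte = static_cast<unsigned char>(0 | (1 << 2) | (2 << 4) | (3 << 6));
    unsigned char raw[2] = { byte, byte };

    Seq8 s(raw);
    std::cout << s.str() << std::endl;  // ACGTACGT
    return 0;
}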
diff --git a/src/modules/dev_support/CMakeLists.txt b/src/modules/dev_support/CMakeLists.txt
deleted file mode 100644
index d719227..0000000
--- a/src/modules/dev_support/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(dev_support CXX)
-
-add_library(dev_support STATIC
- copy_file.cpp
- path_helper.cpp
- logger/logger_impl.cpp)
diff --git a/src/modules/dev_support/copy_file.cpp b/src/modules/dev_support/copy_file.cpp
deleted file mode 100644
index f68d9d2..0000000
--- a/src/modules/dev_support/copy_file.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "copy_file.hpp"
-
-#include "dev_support/path_helper.hpp"
-#include "dev_support/logger/logger.hpp"
-
-#include <boost/algorithm/string.hpp>
-
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <vector>
-
-#include <unistd.h>
-#include <dirent.h>
-
-#include <sys/stat.h>
-#include <sys/types.h>
-
-namespace path {
-
-namespace details {
-
-using namespace path;
-
-void copy_file(std::string from_path, std::string to_path) {
- using namespace std;
-
- make_full_path(from_path);
- make_full_path(to_path );
-
- if (from_path == to_path)
- return;
-
- std::ifstream source(from_path, ios::binary);
- std::ofstream dest (to_path.c_str() , ios::binary);
-
- dest << source.rdbuf();
-}
-
-
-void hard_link(std::string from_path, std::string to_path) {
- make_full_path(from_path);
- make_full_path(to_path );
-
- if (from_path == to_path)
- return;
-
- if (link(from_path.c_str(), to_path.c_str()) == -1) {
- WARN("Failed to create link. Reason: " << strerror(errno) << ". Error code: " << errno << ". Copying instead");
- copy_file(from_path, to_path);
- }
-}
-
-files_t files_in_folder(std::string const& path) {
- DIR *dp;
- if ((dp = opendir(path.c_str())) == NULL)
- throw std::runtime_error("can not open folder " + path);
-
- files_t files;
-
- struct dirent *dirp;
- while ((dirp = readdir(dp)) != NULL)
- if (dirp->d_type == DT_REG)
- files.push_back(append_path(path, dirp->d_name));
-
- closedir(dp);
- return files;
-}
-
-files_t folders_in_folder(std::string const& path) {
- DIR *dp;
- if ((dp = opendir(path.c_str())) == NULL)
- throw std::runtime_error("can not open folder " + path);
-
- files_t folders;
-
- struct dirent *dirp;
- while ((dirp = readdir(dp)) != NULL)
- if (dirp->d_type == DT_DIR) {
- std::string folder = dirp->d_name;
-
- if (folder != "." && folder != "..")
- folders.push_back(append_path(path, folder));
- }
-
- closedir(dp);
- return folders;
-}
-
-} // details
-
-path::files_t files_by_prefix(std::string const& path) {
- using namespace details;
- files_t files;
-
- std::string folder(parent_path(path));
- std::string prefix = filename(path);
-
- files_t out_files;
- const files_t all_files = files_in_folder(folder);
-
- for (auto it = all_files.begin(); it != all_files.end(); ++it) // no std::copy_if before C++11
- if (boost::starts_with(filename(*it), prefix))
- out_files.push_back(*it);
-
- return out_files;
-}
-
-void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
- using namespace details;
-
- for (auto it = files.begin(); it != files.end(); ++it) {
- files_t files_to_copy = files_by_prefix(*it);
-
- for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
- copy_file(*it, append_path(to_folder, filename(*it)));
- }
-}
-
-void link_files_by_prefix(path::files_t const& files, std::string const& to_folder) {
- using namespace details;
-
- for (auto it = files.begin(); it != files.end(); ++it) {
- files_t files_to_copy = files_by_prefix(*it);
-
- for (auto it = files_to_copy.begin(); it != files_to_copy.end(); ++it)
- hard_link(*it, append_path(to_folder, filename(*it)));
- }
-}
-
-void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive) {
- using namespace details;
-
- files_t files = files_in_folder(from_folder);
-
- for (auto it = files.begin(); it != files.end(); ++it)
- if (boost::ends_with(*it, ext))
- copy_file(*it, append_path(to_folder, filename(*it)));
-
- if (recursive) {
- files_t folders = folders_in_folder(from_folder);
-
- for (auto it = folders.begin(); it != folders.end(); ++it) {
- std::string subdir = append_path(to_folder, filename(*it));
- path:: make_dir(subdir);
- copy_files_by_ext(*it, subdir, ext, recursive);
- }
- }
-}
-
-}
diff --git a/src/modules/dev_support/copy_file.hpp b/src/modules/dev_support/copy_file.hpp
deleted file mode 100644
index f402772..0000000
--- a/src/modules/dev_support/copy_file.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dev_support/path_helper.hpp"
-#include <string>
-
-namespace path {
-
-path::files_t files_by_prefix(std::string const& path);
-void copy_files_by_prefix(path::files_t const& files, std::string const& to_folder);
-void link_files_by_prefix(path::files_t const& files, std::string const& to_folder);
-void copy_files_by_ext(std::string const& from_folder, std::string const& to_folder, std::string const& ext, bool recursive);
-
-}
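A brief sketch of how these helpers compose; the /data paths below are purely illustrative:

#include "dev_support/copy_file.hpp"

int main() {
    // Every file in /data/run1 whose name starts with "contigs".
    path::files_t inputs = path::files_by_prefix("/data/run1/contigs");

    // Hard-link them into /data/backup, falling back to copying if linking fails.
    path::link_files_by_prefix(inputs, "/data/backup");

    // Recursively copy all *.info files into the same backup folder.
    path::copy_files_by_ext("/data/run1", "/data/backup", ".info", true);
    return 0;
}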
diff --git a/src/modules/dev_support/file_limit.hpp b/src/modules/dev_support/file_limit.hpp
deleted file mode 100644
index 6990b6f..0000000
--- a/src/modules/dev_support/file_limit.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "dev_support/verify.hpp"
-
-inline rlim_t limit_file(size_t limit) {
- struct rlimit rl;
-
- int res = getrlimit(RLIMIT_NOFILE, &rl);
- VERIFY_MSG(res == 0,
- "getrlimit(2) call failed, errno = " << errno);
-
- // We cannot go beyond hard limit and we might not have enough privileges to
- // increase the hard limit
- limit = std::max<size_t>(limit, rl.rlim_cur);
- rl.rlim_cur = std::min<size_t>(limit, rl.rlim_max);
- res = setrlimit(RLIMIT_NOFILE, &rl);
- VERIFY_MSG(res == 0,
- "setrlimit(2) call failed, errno = " << errno);
- INFO("Open file limit set to " << rl.rlim_cur);
-
- return rl.rlim_cur;
-}
diff --git a/src/modules/dev_support/func.hpp b/src/modules/dev_support/func.hpp
deleted file mode 100644
index 5a8343c..0000000
--- a/src/modules/dev_support/func.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <functional>
-
-namespace func {
-
-//to use with std::function-s
-template<class T>
-void Compose(T t, std::function<void(T)> f1,
- std::function<void(T)> f2) {
- if (f1)
- f1(t);
- if (f2)
- f2(t);
-}
-
-template<class T>
-std::function<void(T)> Composition(std::function<void(T)> f1,
- std::function<void(T)> f2) {
- return std::bind(func::Compose<T>, std::placeholders::_1, f1, f2);
-}
-
-template<class A, class B>
-class Func {
-public:
- typedef std::function<B(A)> function_t;
-
- virtual B Apply(A a) const = 0;
-
- virtual ~Func() {
- }
-};
-
-template<class T>
-class AndOperator;
-
-template<class T>
-class OrOperator;
-
-template<class T>
-class NotOperator;
-
-template<class T>
-class Predicate: public Func<T, bool> {
-public:
- typedef T checked_type;
-
- bool Apply(T t) const {
- return Check(t);
- }
-
- virtual bool Check(T t) const = 0;
-
- bool operator()(T t) const { return Check(t); }
-
-
- virtual ~Predicate() {
- }
-};
-
-
-}
diff --git a/src/modules/dev_support/logger/log_writers.hpp b/src/modules/dev_support/logger/log_writers.hpp
deleted file mode 100644
index 12330f3..0000000
--- a/src/modules/dev_support/logger/log_writers.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/path_helper.hpp"
-#include "logger.hpp"
-
-#include <iostream>
-
-#include "config.hpp"
-
-#include <iostream>
-
-namespace logging {
-
-struct console_writer : public writer {
-#ifdef SPADES_USE_JEMALLOC
-
- void write_msg(double time, size_t cmem, size_t max_rss, level l, const char *file, size_t line_num,
- const char *source, const char *msg) {
- std::cout << fmt::format("{:14s} {:>5s} / {:<5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
- human_readable_time(time), human_readable_memory(cmem),
- human_readable_memory(max_rss), logging::level_name(l),
- source, path::filename(file), int(line_num), msg)
- << std::endl;
- }
-
-#else
-void write_msg(double time, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) {
- std::cout << fmt::format("{:14s} {:^5s} {:6.6s} {:24.24s} ({:26.26s}:{:4d}) {:s}",
- human_readable_time(time), human_readable_memory(max_rss), logging::level_name(l),
- source, path::filename(file), int(line_num), msg)
- << std::endl;
-}
-#endif
-};
-
-} // logging
diff --git a/src/modules/dev_support/logger/logger.hpp b/src/modules/dev_support/logger/logger.hpp
deleted file mode 100644
index e72329a..0000000
--- a/src/modules/dev_support/logger/logger.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "dev_support/perfcounter.hpp"
-
-#include <vector>
-#include <unordered_map>
-#include <string>
-#include <sstream>
-#include <memory>
-
-#include "config.hpp"
-
-namespace logging
-{
-
-/////////////////////////////////////////////////////
-enum level
-{
- L_TRACE,
- L_DEBUG,
- L_INFO,
- L_WARN,
- L_ERROR
-};
-
-inline std::string level_name(level l)
-{
- static std::string names [] =
- {
- "TRACE",
- "DEBUG",
- "INFO" ,
- "WARN" ,
- "ERROR"
- };
-
- return names[l];
-}
-
-
-/////////////////////////////////////////////////////
-struct writer
-{
-#ifdef SPADES_USE_JEMALLOC
- virtual void write_msg(double time_in_sec, size_t cmem, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
-#else
- virtual void write_msg(double time_in_sec, size_t max_rss, level l, const char* file, size_t line_num, const char* source, const char* msg) = 0;
-#endif
- virtual ~writer(){}
-};
-
-typedef std::shared_ptr<writer> writer_ptr;
-
-/////////////////////////////////////////////////////
-struct properties
-{
- /* Reads logger properties from a file.
- *
- * The file should contain lines like the ones below.
- * Use a leading # for comments.
- * The file may contain a line describing the default behavior; if no 'default' entry is found, the default level is set to INFO.
- * Valid levels: TRACE, DEBUG, INFO, WARN, ERROR
- *
- * default=INFO
- * AbraCaDabra=TRACE
- * #BubaZuba=WARN
- * HariKrishna=INFO
- *
- */
-
- properties(std::string filename = "", level default_level = L_INFO);
- properties(level default_level = L_INFO);
-
- std::unordered_map<std::string, level> levels;
- level def_level;
- bool all_default;
-};
-
-////////////////////////////////////////////////////
-struct logger
-{
- logger(properties const& props);
-
- //
- bool need_log(level desired_level, const char* source) const;
- void log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg);
-
- //
- void add_writer(writer_ptr ptr);
-
-private:
- properties props_ ;
- std::vector<writer_ptr> writers_;
- perf_counter timer_ ;
-};
-
-std::shared_ptr<logger>& __logger();
-logger* create_logger(std::string filename = "", level default_level = L_INFO);
-
-void attach_logger(logger *lg);
-void detach_logger();
-
-} // logging
-
-inline const char* __scope_source_name() {
- return " General ";
-}
-
-#define DECL_LOGGER(source) \
- static const char* __scope_source_name() { \
- return source; \
- }
-
-#define LOG_MSG(l, msg) \
- do { \
- std::shared_ptr<logging::logger> &__lg__ = logging::__logger(); \
- if (__lg__.get() == NULL) \
- break; \
- \
- if (__lg__->need_log((l), __scope_source_name())) { \
- std::stringstream __logger__str__; \
- __logger__str__ << msg; /* don't use brackets here! */ \
- __lg__->log((l), __FILE__, __LINE__, __scope_source_name(), __logger__str__.str().c_str()); \
- } \
- } while(0);
-
-#ifdef SPADES_DEBUG_LOGGING
-# define DEBUG(message) LOG_MSG(logging::L_DEBUG, message)
-# define TRACE(message) LOG_MSG(logging::L_TRACE, message)
-#else
-# define DEBUG(message) /* No trace */
-# define TRACE(message) /* No trace */
-#endif
-#define INFO(message) LOG_MSG(logging::L_INFO , message)
-#define VERBOSE_T(n, T, message) {size_t n_copy = (n); if (n_copy % (T) == 0 && n_copy > 0) INFO(n_copy << message)}
-#define VERBOSE(n, message) VERBOSE_T((n), 10000, message)
-#define VERBOSE_POWER_T(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(n_copy << message)}
-#define VERBOSE_POWER(n, message) VERBOSE_POWER_T((n), 10000, message)
-#define VERBOSE_POWER_T2(n, T, message) {size_t n_copy = (n); if ((n_copy & (n_copy - 1)) == 0 && (n_copy > T)) INFO(message)}
-#define VERBOSE_POWER2(n, message) VERBOSE_POWER_T2((n), 10000, message)
-#define WARN(message) LOG_MSG(logging::L_WARN, message)
-#define ERROR(message) LOG_MSG(logging::L_ERROR, message)
-#define FATAL_ERROR(message) {ERROR(message); exit(-1);}
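For orientation, a sketch of how the logging macros above are wired up in practice; the "example" source name and the messages are illustrative, and console_writer comes from log_writers.hpp:

#include "dev_support/logger/logger.hpp"
#include "dev_support/logger/log_writers.hpp"
#include <memory>

namespace example {
// Messages logged from this scope are tagged with the "example" source.
DECL_LOGGER("example")

void run() {
    INFO("Pipeline started");
    WARN("Low coverage in region " << 42);
}
}

int main() {
    // Create a logger with default level INFO and attach a console writer.
    logging::logger *lg = logging::create_logger("", logging::L_INFO);
    lg->add_writer(std::make_shared<logging::console_writer>());
    logging::attach_logger(lg);

    example::run();

    logging::detach_logger();
    return 0;
}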
diff --git a/src/modules/dev_support/logger/logger_impl.cpp b/src/modules/dev_support/logger/logger_impl.cpp
deleted file mode 100644
index c9d8570..0000000
--- a/src/modules/dev_support/logger/logger_impl.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <boost/algorithm/string.hpp>
-#include <cppformat/format.h>
-
-#include <string>
-#include <map>
-#include <fstream>
-#include <vector>
-
-#include "dev_support/logger/logger.hpp"
-
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-# include <jemalloc/jemalloc.h>
-#endif
-
-namespace logging {
-
-properties::properties(level default_level)
- : def_level(default_level), all_default(true) {}
-
-properties::properties(std::string filename, level default_level)
- : def_level(default_level), all_default(true) {
- if (filename.empty())
- return;
-
- std::ifstream in(filename.c_str());
-
- std::map<std::string, level> remap = {
- {"TRACE", L_TRACE},
- {"DEBUG", L_DEBUG},
- {"INFO" , L_INFO },
- {"WARN" , L_WARN },
- {"ERROR", L_ERROR}
- };
-
- while (!in.eof()) {
- using namespace boost;
-
- char buf [0x400] = {};
- in.getline(buf, sizeof buf);
-
- std::string str(buf);
- trim(str);
-
- if (str.empty() || boost::starts_with(str, "#"))
- continue;
-
- std::vector<std::string> entry;
- split(entry, str, is_any_of("="));
-
- if(entry.size() != 2)
- throw std::runtime_error("invalid log file property entry: " + str);
-
- trim (entry[0]);
- trim (entry[1]);
- to_upper(entry[1]);
-
- auto it = remap.find(entry[1]);
- if(it == remap.end())
- throw std::runtime_error("invalid log file level description: " + entry[1]);
-
- levels[entry[0]] = it->second;
- }
-
- auto def = levels.find("default");
- if (def != levels.end())
- def_level = def->second;
-
- for (auto I = levels.begin(), E = levels.end(); I != E; ++I) {
- if (I->second != def_level) {
- all_default = false;
- break;
- }
- }
-}
-
-
-logger::logger(properties const& props)
- : props_(props) { }
-
-bool logger::need_log(level desired_level, const char* source) const {
- level source_level = props_.def_level;
-
- if (!props_.all_default) {
- auto it = props_.levels.find(source);
- if (it != props_.levels.end())
- source_level = it->second;
- }
-
- return desired_level >= source_level;
-}
-
-#ifdef SPADES_USE_JEMALLOC
-
-void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
- double time = timer_.time();
- const size_t *cmem = 0, *cmem_max = 0;
- size_t clen = sizeof(cmem);
-
- je_mallctl("stats.cactive", &cmem, &clen, NULL, 0);
- je_mallctl("stats.cactive_max", &cmem_max, &clen, NULL, 0);
-
- for (auto it = writers_.begin(); it != writers_.end(); ++it)
- (*it)->write_msg(time, (*cmem) / 1024, (*cmem_max) / 1024, desired_level, file, line_num, source, msg);
-}
-#else
-void logger::log(level desired_level, const char* file, size_t line_num, const char* source, const char* msg) {
- double time = timer_.time();
- size_t max_rss = get_max_rss();
-
- for (auto it = writers_.begin(); it != writers_.end(); ++it)
- (*it)->write_msg(time, max_rss, desired_level, file, line_num, source, msg);
-}
-#endif
-
-//
-void logger::add_writer(writer_ptr ptr)
-{
- writers_.push_back(ptr);
-}
-
-////////////////////////////////////////////////////
-std::shared_ptr<logger> &__logger() {
- static std::shared_ptr<logger> l;
- return l;
-}
-
-logger *create_logger(std::string filename, level default_level) {
- return new logger(properties(filename, default_level));
-}
-
-void attach_logger(logger *lg) {
- __logger().reset(lg);
-}
-
-void detach_logger() {
- __logger().reset();
-}
-
-
-} // logging
diff --git a/src/modules/dev_support/path_helper.cpp b/src/modules/dev_support/path_helper.cpp
deleted file mode 100644
index 534d459..0000000
--- a/src/modules/dev_support/path_helper.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dev_support/path_helper.hpp"
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <dirent.h>
-#include <unistd.h>
-
-#include <boost/tokenizer.hpp>
-#include <boost/algorithm/string.hpp>
-
-#include <string>
-#include <vector>
-
-namespace path {
-
-bool make_dir(std::string const& folder) {
- return mkdir(folder.c_str(), 0755) == 0;
-}
-
-std::string make_temp_dir(std::string const& prefix,
- std::string const& suffix) {
- std::string name = append_path(prefix, suffix + "_XXXXXX");
- char* actual;
- if ((actual = ::mkdtemp(strcpy(new char[name.length() + 1], name.c_str())))
- == NULL)
- throw std::runtime_error("Cannot create temporary dir " + name);
-
- std::string result(actual);
- if (result == name)
- throw std::runtime_error("Cannot create temporary dir " + name);
-
- delete[] actual;
-
- return result;
-}
-
-void remove_dir(std::string const& folder) {
- DIR *dp;
- if ((dp = opendir(folder.c_str())) == NULL)
- throw std::runtime_error("can not open folder " + folder);
-
- struct dirent *dirp;
- while ((dirp = readdir(dp)) != NULL) {
- std::string full_path = folder + "/" + dirp->d_name;
-
- if (dirp->d_type == DT_DIR) {
- if (std::string(".") != dirp->d_name
- && std::string("..") != dirp->d_name) {
- remove_dir(full_path);
- }
- } else
- remove(full_path.c_str());
- }
-
- closedir(dp);
- remove(folder.c_str());
-}
-
-bool is_regular_file(std::string const& path) {
- struct stat st;
- return (stat(path.c_str(), &st) == 0) && (S_ISREG(st.st_mode));
-}
-
-std::string append_path(std::string const& prefix, std::string const& suffix) {
- std::string delimiter = "";
-
- if (!boost::ends_with(prefix, "/") && !boost::starts_with(suffix, "/")
- && !prefix.empty()) {
- delimiter = "/";
- }
-
- return prefix + delimiter + suffix;
-}
-
-std::string current_dir() {
- char* cwd = getcwd(NULL, 0);
- std::string result = cwd;
-
- free(cwd);
- return result;
-}
-
-void make_full_path(std::string& path) {
- if (!boost::starts_with(path, "/")) // relative path
- path = append_path(current_dir(), path);
-}
-
-std::string filename(std::string const& path) {
- size_t pos = path.find_last_of('/');
- return pos != std::string::npos ? path.substr(pos + 1) : path;
-}
-
-std::string basename(std::string const& path) {
- size_t slash = path.find_last_of('/');
- size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
-
- size_t dot = path.find_last_of('.');
- if (dot < after_slash)
- dot = std::string::npos;
-
- return path.substr(after_slash, dot - after_slash);
-}
-
-std::string extension(std::string const& path) {
- size_t slash = path.find_last_of('/');
- size_t after_slash = slash == std::string::npos ? 0 : slash + 1;
- size_t dot = path.find_last_of('.');
-
- if (dot < after_slash || dot == std::string::npos || dot + 1 == path.size())
- return std::string();
-
- return path.substr(dot);
-}
-
-std::string parent_path(std::string const& path) {
- std::string cpath(path);
-
- make_full_path(cpath);
- size_t slash_pos = cpath.find_last_of('/');
-
- return (slash_pos == 0 ? std::string("/") : cpath.substr(0, slash_pos));
-}
-
-bool check_existence(std::string const& path) {
- struct stat st_buf;
- return stat(path.c_str(), &st_buf) == 0
- && (S_ISREG(st_buf.st_mode) || S_ISDIR(st_buf.st_mode)); // exists and (file or dir)
-}
-
-void remove_if_exists(std::string const& path) {
- if (check_existence(path)) {
- if (is_regular_file(path)) // file
- remove(path.c_str());
- else // dir
- remove_dir(path);
- }
-}
-
-//TODO do we need to screen anything but whitespaces?
-std::string screen_whitespaces(std::string const &path) {
- std::string to_search = " ";
- std::string res = "";
- for (size_t i = 0; i < path.size(); i++) {
- if ((i == 0) || (path[i] != ' ') || (path[i - 1] == '\\')) {
- res += path[i];
- } else {
- res +='\\';
- res +=' ';
- }
- }
-// res += "'";
- return res;
-}
-
-//todo reduce code duplication!!!
-bool FileExists(std::string const &filename) {
- struct stat st_buf;
- return stat(filename.c_str(), &st_buf) == 0 && S_ISREG(st_buf.st_mode);
-}
-
-void CheckFileExistenceFATAL(std::string const &filename) {
- if (!FileExists(filename)) FATAL_ERROR("File " << filename << " doesn't exist or can't be read!");
-}
-
-void make_dirs(std::string const &path) {
- VERIFY(!path.empty());
-
- size_t slash_pos = 0;
- while ((slash_pos = path.find_first_of('/', slash_pos + 1)) != std::string::npos) {
- make_dir(path.substr(0, slash_pos));
- }
- if (path[path.size() - 1] != '/') {
- make_dir(path);
- }
-}
-
-// doesn't support symlinks
-std::string resolve(std::string const& path) {
- typedef boost::char_delimiters_separator<char> separator_t;
- typedef boost::tokenizer<separator_t> tokenizer_t;
-
- tokenizer_t tok(path, separator_t(false, "", "/"));
-
- std::string result = "/";
- for (auto it = tok.begin(); it != tok.end(); ++it) {
- if (*it == "..")
- result = parent_path(result);
-
- else if (*it == ".")
- ; // Ignore
-
- else
- // Just cat other path entries
- result = append_path(result, *it);
- }
-
- return result;
-}
-
-std::string make_relative_path(std::string p, std::string base) {
- p = resolve(p);
- base = resolve(base);
-
- std::string pp = parent_path(p);
-
- typedef boost::char_delimiters_separator<char> separator_t;
- typedef boost::tokenizer<separator_t> tokenizer_t;
-
- tokenizer_t pp_tok(pp, separator_t(false, "", "/"));
- tokenizer_t base_tok(base, separator_t(false, "", "/"));
-
- auto i = pp_tok.begin();
- auto j = base_tok.begin();
-
- while (i != pp_tok.end() && j != base_tok.end() && *i == *j) {
- ++i;
- ++j;
- }
-
- std::string result;
- for (; j != base_tok.end(); ++j)
- result = append_path("..", result);
-
- for (; i != pp_tok.end(); ++i)
- result = append_path(result, *i);
-
- return append_path(result, filename(p));
-}
-
-std::string MakeLaunchTimeDirName() {
- time_t rawtime;
- struct tm * timeinfo;
- char buffer[80];
-
- time(&rawtime);
- timeinfo = localtime(&rawtime);
-
- strftime(buffer, 80, "%m.%d_%H.%M.%S", timeinfo);
- return std::string(buffer);
-}
-
-}
diff --git a/src/modules/dev_support/path_helper.hpp b/src/modules/dev_support/path_helper.hpp
deleted file mode 100644
index 372c6f4..0000000
--- a/src/modules/dev_support/path_helper.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <dirent.h>
-#include <unistd.h>
-
-#include <string>
-#include <vector>
-#include "dev_support/logger/logger.hpp"
-#include "dev_support/verify.hpp"
-
-namespace path {
-//todo review and make names consistent!
-
-typedef std::vector<std::string> files_t;
-
-bool make_dir(std::string const &folder);
-
-std::string make_temp_dir(std::string const &prefix, std::string const &suffix);
-
-void remove_dir(std::string const &folder);
-
-bool is_regular_file(std::string const &path);
-
-std::string append_path(std::string const &prefix, std::string const &suffix);
-
-std::string current_dir();
-
-//todo why non-const argument?!
-void make_full_path(std::string &path);
-
-std::string filename(std::string const &path);
-
-std::string basename(std::string const &path);
-
-std::string extension(std::string const &path);
-
-std::string parent_path(std::string const &path);
-
-bool check_existence(std::string const &path);
-
-void remove_if_exists(std::string const &path);
-
-std::string screen_whitespaces(std::string const &path);
-
-/**
-* Checks if file exists.
-* Analogs: http://www.techbytes.ca/techbyte103.html , http://www.gamedev.net/topic/211918-determining-if-a-file-exists-c/
-*/
-bool FileExists(std::string const &filename);
-
-/**
-* Exits with an error if the file doesn't exist, writing a FATAL log message.
-*/
-void CheckFileExistenceFATAL(std::string const &filename);
-
-void make_dirs(std::string const &path);
-
-// doesn't support symlinks
-std::string resolve(std::string const &path);
-
-std::string make_relative_path(std::string p, std::string base = current_dir());
-
-std::string MakeLaunchTimeDirName();
-
-}
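A short sketch of typical calls into these helpers; the example paths are illustrative:

#include "dev_support/path_helper.hpp"
#include <iostream>

int main() {
    // Compose an output path relative to the current directory and create it.
    std::string out = path::append_path(path::current_dir(), "spades_output/tmp");
    path::make_dirs(out);

    std::cout << path::filename("/data/reads/sample_1.fastq.gz") << std::endl;   // sample_1.fastq.gz
    std::cout << path::basename("/data/reads/sample_1.fastq.gz") << std::endl;   // sample_1.fastq
    std::cout << path::extension("/data/reads/sample_1.fastq.gz") << std::endl;  // .gz

    // Remove the tmp directory again (handles files and directories alike).
    path::remove_if_exists(out);
    return 0;
}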
diff --git a/src/modules/dev_support/range.hpp b/src/modules/dev_support/range.hpp
deleted file mode 100644
index bf2595d..0000000
--- a/src/modules/dev_support/range.hpp
+++ /dev/null
@@ -1,92 +0,0 @@
-#pragma once
-
-#include "dev_support/verify.hpp"
-
-namespace omnigraph {
-
-struct Range {
-private:
- bool inside(size_t left, size_t right, size_t point) const {
- return left <= point && point <= right;
- }
-
-public:
- //inclusive
- size_t start_pos;
- //exclusive
- size_t end_pos;
-
- size_t size() const {
- VERIFY(end_pos >= start_pos);
- return end_pos - start_pos;
- }
-
- void shift(int shift) {
- VERIFY(shift > 0 || size_t(-shift) <= start_pos);
- start_pos += shift;
- end_pos += shift;
- }
-
- Range(): start_pos(0), end_pos(0) {
- VERIFY(end_pos >= start_pos);
- }
-
- Range(size_t start_pos, size_t end_pos)
- : start_pos(start_pos),
- end_pos(end_pos) {
- VERIFY(end_pos >= start_pos);
- }
-
- bool operator<(const Range &other) const {
- if (start_pos != other.start_pos)
- return start_pos < other.start_pos;
- return end_pos < other.end_pos;
- }
-
- bool contains(const Range& that) const {
- return start_pos <= that.start_pos && end_pos >= that.end_pos;
- }
-
- Range Merge(const Range &other) const {
- return Range(this->start_pos, other.end_pos);
- }
-
- Range Invert(size_t base_length) const {
- VERIFY(base_length >= end_pos);
- return Range(base_length - end_pos, base_length - start_pos);
- }
-
- Range& operator=(const Range& other) {
- start_pos = other.start_pos;
- end_pos = other.end_pos;
- return *this;
- }
-
- bool empty() const {
- return start_pos == end_pos;
- }
-
- bool Intersect(const Range &other) const {
- return inside(start_pos, end_pos, other.start_pos) || inside(start_pos, end_pos, other.end_pos) ||
- inside(other.start_pos, other.end_pos, start_pos);
- }
-
- bool IntersectLeftOf(const Range &other) const {
- return inside(start_pos, end_pos, other.start_pos) && inside(other.start_pos, other.end_pos, end_pos);
- }
-
- bool operator==(const Range &that) const {
- return start_pos == that.start_pos && end_pos == that.end_pos;
- }
-
- bool operator!=(const Range &that) const {
- return !(*this == that);
- }
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Range& range) {
- os << "[" << (range.start_pos + 1) << " - " << range.end_pos << "]";
- return os;
-}
-
-}
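A compact sketch of the Range semantics above (start_pos inclusive, end_pos exclusive; the stream output is 1-based for readability):

#include "dev_support/range.hpp"
#include <iostream>

int main() {
    using omnigraph::Range;

    Range a(10, 20);                            // covers positions [10, 20), size 10
    Range b(15, 25);

    std::cout << a.size() << std::endl;         // 10
    std::cout << a.Intersect(b) << std::endl;   // 1: the ranges overlap
    std::cout << a.contains(Range(12, 18)) << std::endl;  // 1

    Range inv = a.Invert(100);                  // mirror on a sequence of length 100
    std::cout << inv << std::endl;              // prints [81 - 90]
    return 0;
}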
diff --git a/src/modules/dev_support/segfault_handler.hpp b/src/modules/dev_support/segfault_handler.hpp
deleted file mode 100644
index 836e2f2..0000000
--- a/src/modules/dev_support/segfault_handler.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#pragma once
-
-#include "dev_support/stacktrace.hpp"
-#include "boost/noncopyable.hpp"
-
-#include <signal.h>
-
-struct segfault_handler : boost::noncopyable {
- typedef std::function<void()> callback_t;
-
- typedef void (*seg_handler_t)(int);
-
- segfault_handler(callback_t const &cb = 0) {
- if (callback() != 0)
- throw std::runtime_error("failed to initialize segfault_handler, it has been already initialized");
-
- callback() = cb;
- old_func_ = signal(SIGSEGV, &segfault_handler::handler);
- }
-
- ~segfault_handler() {
- callback() = 0;
- signal(SIGSEGV, old_func_);
- }
-
-private:
- static callback_t &callback() {
- static callback_t cb = 0;
- return cb;
- }
-
- static void handler(int signum) {
- if (signum == SIGSEGV) {
- std::cerr << "The program was terminated by segmentation fault" << std::endl;
- print_stacktrace();
-
- if (callback())
- callback()();
- }
-
- //TEST!!
- exit(1);
-
- signal(signum, SIG_DFL);
- kill(getpid(), signum);
- }
-
-private:
- seg_handler_t old_func_;
-};
diff --git a/src/modules/dev_support/simple_tools.hpp b/src/modules/dev_support/simple_tools.hpp
deleted file mode 100644
index 00690a5..0000000
--- a/src/modules/dev_support/simple_tools.hpp
+++ /dev/null
@@ -1,184 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * simple_tools.hpp
- *
- * Created on: 27.05.2011
- * Author: vyahhi
- */
-
-#ifndef SIMPLE_TOOLS_HPP_
-#define SIMPLE_TOOLS_HPP_
-
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include "dev_support/verify.hpp"
-#include "io/reads_io/ireader.hpp"
-#include "dev_support/path_helper.hpp"
-#include <memory>
-#include <string>
-#include <set>
-#include <vector>
-
-/**
- * Converts anything to string (using ostringstream).
- */
-template <typename T>
-std::string ToString(const T& t) {
- std::ostringstream ss;
- ss << t;
- return ss.str();
-}
-
-template <typename T>
-std::string ToString(const T& t, size_t length) {
- std::ostringstream ss;
- ss << t;
- std::string result = ss.str();
- while(result.size() < length)
- result = "0" + result;
- return result;
-}
-
-template <typename T>
-std::string ToString(std::vector<T>& t) {
- std::ostringstream ss;
- ss << "Size "<<t.size()<<": [";
- for (auto it = t.begin(); it != t.end(); ++it)
- ss<<*it<<", ";
- ss<<"]";
- return ss.str();
-}
-
-template <typename T>
-std::string ToString(std::set<T>& t) {
- std::ostringstream ss;
- ss << "Size "<<t.size()<<": [";
- for (auto it = t.begin(); it != t.end(); ++it)
- ss<<*it<<", ";
- ss<<"]";
- return ss.str();
-}
-
-template<typename T>
-inline const std::pair<T, T> ReversePair(std::pair<T, T> ep) {
- return std::pair<T, T>(ep.second, ep.first);
-}
-
-template <class ContainerT1, class ContainerT2>
-void push_back_all(ContainerT1& target, const ContainerT2& to_insert) {
- target.insert(target.end(), to_insert.begin(), to_insert.end());
-}
-
-template <class ContainerT1, class ContainerT2>
-void insert_all(ContainerT1& target, const ContainerT2& to_insert) {
- target.insert(to_insert.begin(), to_insert.end());
-}
-
-template<class MapT>
-std::set<typename MapT::key_type> key_set(const MapT& m) {
- std::set<typename MapT::key_type> answer;
- for (auto it = m.begin(); it != m.end(); ++it) {
- answer.insert(it->first);
- }
- return answer;
-}
-
-template<class MapT>
-std::set<typename MapT::mapped_type> value_set(const MapT& m) {
- std::set<typename MapT::mapped_type> answer;
- for (auto it = m.begin(); it != m.end(); ++it) {
- answer.insert(it->second);
- }
- return answer;
-}
-
-template <class MapT>
-const typename MapT::mapped_type& get(const MapT& from, const typename MapT::key_type& key) {
- auto it = from.find(key);
- VERIFY(it != from.end());
- return it->second;
-}
-
-template <class MapT>
-typename MapT::mapped_type& get(MapT& from, const typename MapT::key_type& key) {
- auto it = from.find(key);
- VERIFY(it != from.end());
- return it->second;
-}
-
-template <class MMapT>
-const std::vector<typename MMapT::mapped_type> get_all(const MMapT& from, const typename MMapT::key_type& key) {
- std::vector<typename MMapT::mapped_type> answer;
- for (auto it = from.lower_bound(key); it != from.upper_bound(key); ++it) {
- answer.push_back(it->second);
- }
- return answer;
-}
-
-class TmpFolderFixture
-{
- std::string tmp_folder_;
-
-public:
- TmpFolderFixture(std::string tmp_folder = "tmp") :
- tmp_folder_(tmp_folder)
- {
- path::make_dir(tmp_folder_);
- }
-
- ~TmpFolderFixture()
- {
- path::remove_dir(tmp_folder_);
- }
-};
-
-namespace std
-{
-template<class T1, class T2>
-std::ostream& operator<< (std::ostream& os, std::pair<T1, T2> const& pair)
-{
- return os << "(" << pair.first << ", " << pair.second << ")";
-}
-//}
-
-//namespace omnigraph
-//{
-template<class T>
-std::ostream& operator<< (std::ostream& os, const std::vector<T>& v)
-{
- os << "[";
- std::string delim = "";
- for (auto it = v.begin(); it != v.end(); ++it) {
- os << delim << *it;
- delim = ", ";
- }
-// std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, ", "));
- os << "]";
- return os;
-}
-
-template<class T>
-std::ostream& operator<< (std::ostream& os, const std::set<T>& set)
-{
- os << "{";
- bool delim = false;
- for (const auto& i : set) {
- if (delim) os << ", ";
- os << i;
- delim = true;
- }
- os << "}";
- return os;
-}
-
-}
-
-#endif /* SIMPLE_TOOLS_HPP_ */
diff --git a/src/modules/dev_support/standard_base.hpp b/src/modules/dev_support/standard_base.hpp
deleted file mode 100644
index 9adc83b..0000000
--- a/src/modules/dev_support/standard_base.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * standard_base.hpp
- *
- * Created on: 1 Sep 2011
- * Author: valery
- */
-
-#pragma once
-
-//==crt and stl
-#include <memory>
-#include <cstdlib>
-#include <cstdio>
-#include <time.h>
-#include <signal.h>
-#include <execinfo.h>
-
-#include <iostream>
-#include <iterator>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <set>
-#include <string>
-#include <sstream>
-#include <utility>
-#include <array>
-#include <unordered_map>
-#include <unordered_set>
-#include <deque>
-#include <cmath>
-#include <limits>
-
-using std::cin;
-using std::cout;
-using std::cerr;
-using std::endl;
-using std::map;
-using std::multimap;
-using std::unordered_map;
-using std::unordered_set;
-using std::vector;
-using std::array;
-using std::set;
-using std::string;
-using std::pair;
-using std::make_pair;
-using std::ifstream;
-using std::istream;
-using std::ofstream;
-using std::ostream;
-using std::min;
-using std::max;
-using std::abs;
-using std::stringstream;
-using std::numeric_limits;
-using std::ostream_iterator;
-using std::copy;
-
-using std::shared_ptr;
-using std::make_shared;
-
-//==boost
-
-#ifndef NDEBUG
-#define BOOST_ENABLE_ASSERT_HANDLER
-#endif
-
-#include <boost/optional.hpp>
-
-#include <boost/noncopyable.hpp>
-
-using boost::optional;
-using boost::make_optional;
-using boost::none;
-
-using boost::noncopyable;
-
-// err handling
-#include "dev_support/stacktrace.hpp"
-
-// path manipulation instead of boost filesystem
-#include "dev_support/path_helper.hpp"
-using path::make_dir;
-using path::remove_dir;
-
-#ifndef NDEBUG
-namespace boost {
-inline void assertion_failed(char const * expr, char const * function,
- char const * file, long line) {
- std::cerr << "Aborted by assert: " << std::endl;
- print_stacktrace();
-#if __DARWIN_UNIX03
- __assert_rtn (expr, file, (int)line, function);
-#elif __DARWIN
- __assert (expr, file, (int)line, function);
-#else
- __assert_fail (expr, file, (unsigned)line, function);
-#endif
-}
-
-inline void assertion_failed_msg(char const * expr, char const * msg,
- char const * function, char const * file,
- long line) {
- std::cerr << "Aborted by assert: " << msg << std::endl;
- print_stacktrace();
-#if __DARWIN_UNIX03
- __assert_rtn (expr, file, (int)line, function);
-#elif __DARWIN
- __assert (expr, file, (int)line, function);
-#else
- __assert_fail (expr, file, (unsigned)line, function);
-#endif
-}
-
-} // namespace boost
-
-#endif // NDEBUG
-
-//==sys
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/time.h>
-
-//our
-//math
-#include "math/xmath.h"
-#include "dev_support/func.hpp"
-#include "dev_support/verify.hpp"
-// log
-#include "dev_support/logger/logger.hpp"
-
-
diff --git a/src/modules/dev_support/verify.hpp b/src/modules/dev_support/verify.hpp
deleted file mode 100644
index 337828e..0000000
--- a/src/modules/dev_support/verify.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "dev_support/stacktrace.hpp"
-#include "boost/current_function.hpp"
-#include <sstream>
-#include <iostream>
-#include <cassert>
-
-#define VERIFY(expr) \
- do { \
- if(!(expr))\
- print_stacktrace();\
- assert(expr); \
- } while(0);
-
-#define VERIFY_MSG(expr, msg) \
- if (!(expr)) { \
- std::stringstream ss; \
- print_stacktrace();\
- ss << "Verification of expression '" << #expr << "' failed in function '" << BOOST_CURRENT_FUNCTION << \
- "'. In file '" << __FILE__ << "' on line " << __LINE__ << ". Message '" << msg << "'." ; \
- std::cout << ss.str() << std::endl; \
- std::cerr << ss.str() << std::endl; \
- fflush(stdout); \
- fflush(stderr); \
- assert(expr); \
- }
diff --git a/src/modules/io/CMakeLists.txt b/src/modules/io/CMakeLists.txt
deleted file mode 100644
index 5c0fd41..0000000
--- a/src/modules/io/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(input CXX)
-
-add_library(input STATIC
- reads_io/parser.cpp
- sam_io/read.cpp
- sam_io/sam_reader.cpp)
-
-target_link_libraries(input BamTools samtools)
-
diff --git a/src/modules/io/dataset_support/dataset_readers.hpp b/src/modules/io/dataset_support/dataset_readers.hpp
deleted file mode 100644
index 5d56151..0000000
--- a/src/modules/io/dataset_support/dataset_readers.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/logger/logger.hpp"
-#include "dev_support/simple_tools.hpp"
-#include "io/reads_io/io_helper.hpp"
-#include "pipeline/library.hpp"
-
-#include "pipeline/config_struct.hpp"
-
-namespace debruijn_graph {
-
-inline
-io::PairedStreamPtr paired_easy_reader(const io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size,
- bool change_read_order = false,
- bool use_orientation = true,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::PairedRead> streams;
- for (auto read_pair : lib.paired_reads()) {
- streams.push_back(io::PairedEasyStream(read_pair.first, read_pair.second, followed_by_rc, insert_size, change_read_order,
- use_orientation, lib.orientation(), offset_type));
- }
- return io::MultifileWrap<io::PairedRead>(streams);
-}
-
-inline
-io::ReadStreamList<io::SingleRead> single_easy_readers(const io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads,
- bool handle_Ns = true,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::SingleRead> streams;
- if (including_paired_reads) {
- for (const auto& read : lib.reads()) {
- //do we need input_file function here?
- streams.push_back(io::EasyStream(read, followed_by_rc, handle_Ns, offset_type));
- }
- } else {
- for (const auto& read : lib.single_reads()) {
- streams.push_back(io::EasyStream(read, followed_by_rc, handle_Ns, offset_type));
- }
- }
- return streams;
-}
-
-inline
-io::SingleStreamPtr single_easy_reader(const io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads,
- bool handle_Ns = true,
- io::OffsetType offset_type = io::PhredOffset) {
- return io::MultifileWrap<io::SingleRead>(
- single_easy_readers(lib, followed_by_rc, including_paired_reads, handle_Ns, offset_type));
-}
-
-inline
-io::PairedStreamPtr paired_easy_reader_for_libs(std::vector<size_t> libs,
- bool followed_by_rc,
- size_t insert_size,
- bool change_read_order = false,
- bool use_orientation = true,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::PairedRead> streams;
- for (size_t i = 0; i < libs.size(); ++i) {
- streams.push_back(paired_easy_reader(cfg::get().ds.reads[libs[i]],
- followed_by_rc, insert_size, change_read_order, use_orientation, offset_type));
- }
- return io::MultifileWrap<io::PairedRead>(streams);
-}
-
-
-inline
-io::PairedStreamPtr paired_easy_reader(bool followed_by_rc,
- size_t insert_size,
- bool change_read_order = false,
- bool use_orientation = true,
- io::OffsetType offset_type = io::PhredOffset) {
-
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
- all_libs[i] = i;
-
- // FIXME: Should we use only first library?
- // No, this one is for all libs together
- return paired_easy_reader_for_libs(all_libs, followed_by_rc, insert_size, change_read_order, use_orientation, offset_type);
-}
-
-
-inline
-io::SingleStreamPtr single_easy_reader_for_libs(vector<size_t> libs,
- bool followed_by_rc,
- bool including_paired_reads,
- io::OffsetType offset_type = io::PhredOffset) {
- io::ReadStreamList<io::SingleRead> streams;
- for (size_t i = 0; i < libs.size(); ++i) {
- streams.push_back(single_easy_reader(cfg::get().ds.reads[libs[i]],
- followed_by_rc, including_paired_reads, offset_type));
- }
- return io::MultifileWrap<io::SingleRead>(streams);
-}
-
-inline
-io::SingleStreamPtr single_easy_reader(bool followed_by_rc,
- bool including_paired_reads,
- io::OffsetType offset_type = io::PhredOffset) {
-
- std::vector<size_t> all_libs(cfg::get().ds.reads.lib_count());
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i)
- all_libs[i] = i;
-
- return single_easy_reader_for_libs(all_libs, followed_by_rc, including_paired_reads, offset_type);
-}
-
-}
diff --git a/src/modules/io/dataset_support/read_converter.hpp b/src/modules/io/dataset_support/read_converter.hpp
deleted file mode 100644
index 1182e7e..0000000
--- a/src/modules/io/dataset_support/read_converter.hpp
+++ /dev/null
@@ -1,273 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * read_converter.hpp
- *
- * Created on: Apr 13, 2012
- * Author: andrey
- */
-
-#pragma once
-
-#include "io/reads_io/binary_converter.hpp"
-#include "io/reads_io/io_helper.hpp"
-#include "dataset_readers.hpp"
-#include "dev_support/simple_tools.hpp"
-
-#include <fstream>
-
-namespace debruijn_graph {
-
-typedef io::SequencingLibrary<config::DataSetData> SequencingLibrary;
-
-class ReadConverter {
-
-private:
- const static size_t current_binary_format_version = 11;
-
- static bool LoadLibIfExists(SequencingLibrary& lib) {
- auto& data = lib.data();
-
- if (!path::FileExists(data.binary_reads_info.bin_reads_info_file))
- return false;
-
- std::ifstream info;
- info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::in);
- DEBUG("Reading binary information file " << data.binary_reads_info.bin_reads_info_file);
-
- size_t chunk_num = 0;
- size_t format = 0;
- size_t lib_index = 0;
-
- info >> format;
- if (!info.eof()) {
- info >> chunk_num;
- }
- if (!info.eof()) {
- info >> lib_index;
- }
-
- if (chunk_num != data.binary_reads_info.chunk_num ||
- format != current_binary_format_version ||
- lib_index != data.lib_index) {
- return false;
- }
-
- INFO("Binary reads detected");
- info >> data.read_length;
- info >> data.read_count;
- info >> data.total_nucls;
- data.binary_reads_info.binary_coverted = true;
-
- info.close();
- return true;
- }
-
- static void ConvertToBinary(SequencingLibrary& lib) {
- auto& data = lib.data();
- std::ofstream info;
- info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::out);
- info << "0 0 0";
- info.close();
-
- INFO("Converting reads to binary format for library #" << data.lib_index << " (takes a while)");
- INFO("Converting paired reads");
- io::PairedStreamPtr paired_reader = paired_easy_reader(lib, false, 0, false, false);
- io::BinaryWriter paired_converter(data.binary_reads_info.paired_read_prefix,
- data.binary_reads_info.chunk_num,
- data.binary_reads_info.buffer_size);
-
- io::ReadStreamStat paired_stat = paired_converter.ToBinary(*paired_reader, lib.orientation());
- paired_stat.read_count_ *= 2;
-
- INFO("Converting single reads");
-
- io::SingleStreamPtr single_reader = single_easy_reader(lib, false, false);
- io::BinaryWriter single_converter(data.binary_reads_info.single_read_prefix,
- data.binary_reads_info.chunk_num,
- data.binary_reads_info.buffer_size);
- io::ReadStreamStat single_stat = single_converter.ToBinary(*single_reader);
-
- paired_stat.merge(single_stat);
- data.read_length = paired_stat.max_len_;
- data.read_count = paired_stat.read_count_;
- data.total_nucls = paired_stat.total_len_;
-
- info.open(data.binary_reads_info.bin_reads_info_file.c_str(), std::ios_base::out);
- info << current_binary_format_version << " " <<
- data.binary_reads_info.chunk_num << " " <<
- data.lib_index << " " <<
- data.read_length << " " <<
- data.read_count << " " <<
- data.total_nucls << "\n";
-
- info.close();
- data.binary_reads_info.binary_coverted = true;
- }
-
-public:
- static void ConvertToBinaryIfNeeded(SequencingLibrary& lib) {
- if (lib.data().binary_reads_info.binary_coverted)
- return;
-
- if (LoadLibIfExists(lib)) {
- return;
- }
-
- ConvertToBinary(lib);
- }
-};
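
For reference, the info file written by ConvertToBinary() above holds six whitespace-separated integers. A minimal sketch (the path is hypothetical) of reading it back outside ReadConverter:

#include <fstream>

std::ifstream info("lib0.bin_reads_info");   // hypothetical path
size_t format, chunk_num, lib_index, read_length, read_count, total_nucls;
info >> format >> chunk_num >> lib_index >> read_length >> read_count >> total_nucls;
// LoadLibIfExists() reuses the cached binary reads only when format equals
// current_binary_format_version (11 above) and chunk_num/lib_index match.
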
-
-
-inline
-io::BinaryPairedStreams raw_paired_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size = 0) {
- ReadConverter::ConvertToBinaryIfNeeded(lib);
- const auto& data = lib.data();
- VERIFY_MSG(data.binary_reads_info.binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
-
- io::ReadStreamList<io::PairedReadSeq> paired_streams;
- for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
- paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(data.binary_reads_info.paired_read_prefix,
- i, insert_size));
- }
- return io::apply_paired_wrappers(followed_by_rc, paired_streams);
-}
-
-inline
-io::BinarySingleStreams raw_single_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads) {
- const auto& data = lib.data();
- ReadConverter::ConvertToBinaryIfNeeded(lib);
- VERIFY_MSG(data.binary_reads_info.binary_coverted, "Lib was not converted to binary, cannot produce binary stream");
-
- io::BinarySingleStreams single_streams;
- for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
- single_streams.push_back(make_shared<io::BinaryFileSingleStream>(data.binary_reads_info.single_read_prefix, i));
- }
- if (including_paired_reads) {
- io::BinaryPairedStreams paired_streams;
- for (size_t i = 0; i < data.binary_reads_info.chunk_num; ++i) {
- paired_streams.push_back(make_shared<io::BinaryFilePairedStream>(data.binary_reads_info.paired_read_prefix,
- i, 0));
- }
-
- return io::apply_single_wrappers(followed_by_rc, single_streams, &paired_streams);
- }
- else {
- return io::apply_single_wrappers(followed_by_rc, single_streams);
- }
-}
-
-
-inline
-io::BinaryPairedStreams paired_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- size_t insert_size = 0) {
- return raw_paired_binary_readers(lib, followed_by_rc, insert_size);
-}
-
-
-inline
-io::BinarySingleStreams single_binary_readers(io::SequencingLibrary<config::DataSetData> &lib,
- bool followed_by_rc,
- bool including_paired_reads) {
- return raw_single_binary_readers(lib, followed_by_rc, including_paired_reads);
-}
-
-
-inline
-//todo simplify
-io::BinaryPairedStreams paired_binary_readers_for_libs(config::dataset& dataset_info,
- const std::vector<size_t>& libs,
- bool followed_by_rc,
- size_t insert_size = 0) {
-
- VERIFY(!libs.empty())
- size_t chunk_num = dataset_info.reads[libs.front()].data().binary_reads_info.chunk_num;
-
- std::vector<io::BinaryPairedStreams> streams(chunk_num);
- for (size_t i = 0; i < libs.size(); ++i) {
- VERIFY_MSG(chunk_num == dataset_info.reads[libs[i]].data().binary_reads_info.chunk_num,
- "Cannot create stream for multiple libraries with different chunk_num")
- io::BinaryPairedStreams lib_streams = raw_paired_binary_readers(dataset_info.reads[libs[i]], followed_by_rc, insert_size);
- for (size_t j = 0; j < chunk_num; ++j) {
- streams[j].push_back(lib_streams.ptr_at(j));
- }
- }
-
- io::BinaryPairedStreams joint_streams;
- for (size_t j = 0; j < chunk_num; ++j) {
- joint_streams.push_back(io::MultifileWrap<io::PairedReadSeq>(streams[j]));
- }
- return joint_streams;
-}
-
-inline
-io::BinarySingleStreams single_binary_readers_for_libs(config::dataset& dataset_info,
- const std::vector<size_t>& libs,
- bool followed_by_rc,
- bool including_paired_reads) {
- VERIFY(!libs.empty())
- size_t chunk_num = dataset_info.reads[libs.front()].data().binary_reads_info.chunk_num;
-
- std::vector<io::BinarySingleStreams> streams(chunk_num);
- for (size_t i = 0; i < libs.size(); ++i) {
- VERIFY_MSG(chunk_num == dataset_info.reads[libs[i]].data().binary_reads_info.chunk_num,
- "Cannot create stream for multiple libraries with different chunk_num")
- io::BinarySingleStreams lib_streams = raw_single_binary_readers(dataset_info.reads[libs[i]], followed_by_rc, including_paired_reads);
-
- for (size_t j = 0; j < chunk_num; ++j) {
- streams[j].push_back(lib_streams.ptr_at(j));
- }
- }
-
- io::BinarySingleStreams joint_streams;
- for (size_t j = 0; j < chunk_num; ++j) {
- joint_streams.push_back(io::MultifileWrap<io::SingleReadSeq>(streams[j]));
- }
- return joint_streams;
-}
-
-inline
-io::BinaryPairedStreams paired_binary_readers(config::dataset& dataset_info,
- bool followed_by_rc,
- size_t insert_size = 0) {
-
- std::vector<size_t> all_libs(dataset_info.reads.lib_count());
- for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
- all_libs[i] = i;
- }
- return paired_binary_readers_for_libs(dataset_info, all_libs, followed_by_rc, insert_size);
-}
-
-inline
-io::BinarySingleStreams single_binary_readers(config::dataset& dataset_info,
- bool followed_by_rc,
- bool including_paired_reads) {
- std::vector<size_t> all_libs(dataset_info.reads.lib_count());
- for (size_t i = 0; i < dataset_info.reads.lib_count(); ++i) {
- all_libs[i] = i;
- }
- return single_binary_readers_for_libs(dataset_info, all_libs, followed_by_rc, including_paired_reads);
-}
-
-inline
-io::BinarySingleStreamPtr single_binary_multireader(config::dataset& dataset_info, bool followed_by_rc, bool including_paired_reads) {
- return io::MultifileWrap<io::SingleReadSeq>(single_binary_readers(dataset_info, followed_by_rc, including_paired_reads));
-}
-
-inline
-io::BinaryPairedStreamPtr paired_binary_multireader(config::dataset& dataset_info, bool followed_by_rc, size_t insert_size = 0) {
- return io::MultifileWrap<io::PairedReadSeq>(paired_binary_readers(dataset_info, followed_by_rc, insert_size));
-}
-
-
-}
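
A hedged caller-side sketch of the stream factories above; ds stands for an already-populated config::dataset obtained elsewhere, and the flag values are arbitrary:

// Sketch: one merged stream over every chunk of every library.
// Conversion to binary is triggered lazily via ConvertToBinaryIfNeeded().
io::BinarySingleStreamPtr reads =
        single_binary_multireader(ds, /*followed_by_rc*/ false,
                                  /*including_paired_reads*/ true);
io::SingleReadSeq r;
while (!reads->eof())
    *reads >> r;
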
diff --git a/src/modules/io/graph_io/graph_print_utils.hpp b/src/modules/io/graph_io/graph_print_utils.hpp
deleted file mode 100755
index abed05f..0000000
--- a/src/modules/io/graph_io/graph_print_utils.hpp
+++ /dev/null
@@ -1,328 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef GRAPH_PRINTER_HPP_
-#define GRAPH_PRINTER_HPP_
-
-#include "dev_support/standard_base.hpp"
-
-namespace gvis {
-
-template<class VertexId>
-struct BaseVertex {
- VertexId id_;
- string label_;
- string href_;
- string fill_color_;
- BaseVertex(VertexId id, string label, string reference, string fill_color) :id_(id), label_(label), href_(reference), fill_color_(fill_color) {
- }
-};
-
-template<class VertexId>
-struct BaseEdge {
- VertexId from;
- VertexId to;
- string label;
- string color;
- BaseEdge(VertexId _from, VertexId _to, string _label, string _color) {
- from = _from;
- to = _to;
- label = _label;
- color = _color;
- }
-};
-
-class StreamRecorder {
-private:
- ostream &os_;
-protected:
- virtual ostream &os() {
- return os_;
- }
-public:
- StreamRecorder(ostream &os) : os_(os) {
- }
-
- virtual ~StreamRecorder() {
- }
-};
-
-template<class Vertex, class Edge>
-class GraphRecorder {
-public:
- virtual void recordVertex(Vertex vertex) = 0;
-
- virtual void recordEdge(Edge edge) = 0;
-
- virtual inline void startGraphRecord(const string &name) = 0;
-
- virtual inline void endGraphRecord() = 0;
-
- virtual ~GraphRecorder(){
- }
-};
-
-template<class VertexId>
-class SingleGraphRecorder : public GraphRecorder<BaseVertex<VertexId>, BaseEdge<VertexId>> {
-protected:
- typedef BaseVertex<VertexId> Vertex;
- typedef BaseEdge<VertexId> Edge;
-};
-
-template<class VertexId>
-class PairedGraphRecorder : public GraphRecorder<pair<BaseVertex<VertexId>, BaseVertex<VertexId>>, BaseEdge<pair<VertexId, VertexId>>> {
-protected:
- typedef pair<BaseVertex<VertexId>, BaseVertex<VertexId>> Vertex;
- typedef BaseEdge<pair<VertexId, VertexId>> Edge;
-};
-
-template<class VertexId>
-class DotGraphRecorder : public StreamRecorder {
-public:
- DotGraphRecorder(ostream &os) : StreamRecorder(os) {
- }
-
-protected:
- template<class vid>
- void recordVertexId(vid id) {
- this->os() << "vertex_" << id;
- }
-
- string IdToStr(VertexId u) {
- stringstream ss;
- ss << u;
- return ss.str();
- }
-
- string constructNodeId(VertexId v) {
- return constructNodePairId(v, v);
- }
-
- inline void recordParameter(ostream &os, const string &name, const string &value) {
- os << name << "=" << "<" << value << "> ";
- }
-
- inline void recordParameter(const string &name, const string &value) {
- recordParameter(this->os(), name, value);
- }
-
- inline void recordParameterInQuotes(ostream &os, const string &name, const string &value) {
- os << name << "=" << "\"" << value << "\" ";
- }
-
- inline void recordParameterInQuotes(const string &name, const string &value) {
- recordParameterInQuotes(this->os(), name, value);
- }
-
- inline double getColorParameter(int l, int r, double perc) {
- return l * perc + r * (1 - perc);
- }
-
- inline string getColor(int currentLength, int approximateLength) {
- currentLength %= approximateLength;
- int points[8][3] = {{0, 0, 1}, {0, 1, 1}, {1, 1, 1}, {0, 1, 0}, {1, 1, 0}, {1, 0, 1}, {0, 0, 1}};
- stringstream ss;
- int bound = approximateLength / 6;
- int num = currentLength / bound;
- double perc = (currentLength % bound) * 1. / bound;
- for(int i = 0; i < 3; i++) {
- ss << getColorParameter(points[num][i], points[num + 1][i], perc);
- if(i != 2)
- ss << ",";
- }
- return ss.str();
- }
-
-};
-
-
-template<class SingleVertexId>
-class DotSingleGraphRecorder: public SingleGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
-private:
- typedef BaseVertex<SingleVertexId> Vertex;
- typedef BaseEdge<SingleVertexId> Edge;
-
-public:
- DotSingleGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
- }
-
- void recordVertex(Vertex vertex) {
- this->recordVertexId(vertex.id_);
- this->os() << "[";
- this->recordParameterInQuotes("label", vertex.label_);
- this->os() << ",";
- this->recordParameter("style", "filled");
- this->os() << ",";
- this->recordParameter("color", "black");
- this->os() << ",";
- if(vertex.href_ != "") {
- this->recordParameterInQuotes("href", vertex.href_);
- this->os() << ",";
- }
- this->recordParameter("fillcolor", vertex.fill_color_);
- this->os() << "]" << endl;
- }
-
- void recordEdge(Edge edge) {
- this->recordVertexId(edge.from);
- this->os() << "->";
- this->recordVertexId(edge.to);
- this->os() << "[";
- this->recordParameterInQuotes("label", edge.label);
- this->os() << ",";
- this->recordParameter("color", edge.color);
- this->os() << "]" << endl;
- }
-
- inline void startGraphRecord(const string &name) {
- this->os() << "digraph " << name << " {" << endl;
- this->os() << "node" << "[";
- this->recordParameter("fontname", "Courier");
- this->recordParameter("penwidth", "1.8");
- this->os() << "]" << endl;
- }
-
- inline void endGraphRecord() {
- this->os() << "}" << endl;
- }
-};
-
-template<class SingleVertexId>
-class DotPairedGraphRecorder: public PairedGraphRecorder<SingleVertexId>, public DotGraphRecorder<SingleVertexId> {
-private:
- typedef BaseVertex<SingleVertexId> SingleVertex;
- typedef BaseEdge<SingleVertexId> SingleEdge;
- typedef typename PairedGraphRecorder<SingleVertexId>::Vertex Vertex;
- typedef typename PairedGraphRecorder<SingleVertexId>::Edge Edge;
-
-
- string constructNodePairId(SingleVertexId u, SingleVertexId v) {
- stringstream ss;
- string u_str = this->IdToStr(u);
- string v_str = this->IdToStr(v);
- if (u == v)
- ss << u;
- else if (u_str > v_str)
- ss << v_str << "_" << u_str;
- else
- ss << u_str << "_" << v_str;
- return ss.str();
- }
-
- inline string constructPortCell(const string &port, string href, const string &color) {
- stringstream ss;
- ss << "<TD BORDER=\"0\" PORT = \"port_" << port << "\" ";
- this->recordParameterInQuotes(ss, "color", color);
- this->recordParameterInQuotes(ss, "bgcolor", color);
- if(href != "") {
- ss <<"href=\"" << href << "\"";
- }
- ss << "></TD>";
- return ss.str();
- }
-
- inline string constructLabelCell(const string &label, const string &href, const string &color) {
- stringstream ss;
- ss << "<TD BORDER=\"0\" ";
- this->recordParameterInQuotes(ss, "color", color);
- this->recordParameterInQuotes(ss, "bgcolor", color);
- if(href != "") {
- ss <<"href=\"" << href << "\"";
- }
- ss << ">"
- << label << "</TD>";
- return ss.str();
- }
-
- string constructComplexNodeId(string pairId, SingleVertexId v) {
- stringstream ss;
- ss << pairId << ":port_" << v;
- return ss.str();
- }
-
- string constructTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
- stringstream ss;
- ss << "<TR>";
- ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
- ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
- ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
- ss << "</TR>\n";
- return ss.str();
- }
-
- string constructReverseTableEntry(SingleVertex v/*, const string &label, const string &href*/) {
- stringstream ss;
- ss << "<TR>";
- ss << constructPortCell(ToString(v.id_) + "_out", v.href_, v.fill_color_);
- ss << constructLabelCell(v.label_, v.href_, v.fill_color_);
- ss << constructPortCell(ToString(v.id_) + "_in", v.href_, v.fill_color_);
- ss << "</TR>\n";
- return ss.str();
- }
-
- string constructComplexNodeLabel(Vertex v) {
- return "<TABLE BORDER=\"1\" CELLSPACING=\"0\" >\n" + constructTableEntry(v.first)
- + constructReverseTableEntry(v.second) + "</TABLE>";
- }
-
- string constructVertexInPairId(SingleVertexId v, SingleVertexId rc) {
- return constructComplexNodeId(constructNodePairId(v, rc), v);
- }
-
-
-public:
- DotPairedGraphRecorder(ostream &os) : DotGraphRecorder<SingleVertexId>(os) {
- }
-
- void recordPairedVertexId(SingleVertexId id1, SingleVertexId id2) {
- this->os() << "vertex_" << constructNodePairId(id1, id2);
- }
-
- void recordVertex(Vertex vertex) {
- string pairLabel = constructComplexNodeLabel(vertex);
- recordPairedVertexId(vertex.first.id_, vertex.second.id_);
- this->os() << "[";
- this->recordParameter("label", constructComplexNodeLabel(vertex));
- this->os() << ",";
- this->recordParameter("color", "black");
- this->os() << ",";
- this->recordParameter("URL", "/vertex/" + std::to_string(vertex.first.id_) + ".svg");
- this->os() << "]" << endl;
- }
-
- void recordEdge(Edge edge) {
- this->recordVertexId(constructVertexInPairId(edge.from.first, edge.from.second));
- this->os() << "_out";
- this->os() << "->";
- this->recordVertexId(constructVertexInPairId(edge.to.first, edge.to.second));
- this->os() << "_in";
- this->os() << "[";
- this->recordParameterInQuotes("label", edge.label);
- this->os() << ",";
- this->recordParameter("color", edge.color);
- this->os() << "]" << endl;
- }
-
- inline void startGraphRecord(const string &name) {
- this->os() << "digraph " << name << " {" << endl;
- this->os() << "node" << "[";
- this->recordParameter("fontname", "Courier");
- this->os() << ",";
- this->recordParameter("penwidth", "1.8");
- this->os() << ",";
- this->recordParameter("shape", "plaintext");
- this->os() << "]" << endl;
- }
-
- inline void endGraphRecord() {
- this->os() << "}" << endl;
- }
-};
-
-
-}
-#endif //GRAPH_PRINTER_HPP_//
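
A small usage sketch of the DOT recorder above (the output file name and the toy vertices and edge are made up, and the removed header is assumed to be available):

#include <fstream>

std::ofstream out("toy_graph.dot");                          // hypothetical file
gvis::DotSingleGraphRecorder<size_t> recorder(out);
recorder.startGraphRecord("toy");
recorder.recordVertex(gvis::BaseVertex<size_t>(1, "A", "", "white"));
recorder.recordVertex(gvis::BaseVertex<size_t>(2, "B", "", "grey"));
recorder.recordEdge(gvis::BaseEdge<size_t>(1, 2, "len=42", "black"));
recorder.endGraphRecord();                                   // closes the digraph
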
diff --git a/src/modules/io/kmers_io/kmer_iterator.hpp b/src/modules/io/kmers_io/kmer_iterator.hpp
deleted file mode 100644
index 0e7a38e..0000000
--- a/src/modules/io/kmers_io/kmer_iterator.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __IO_KMER_ITERATOR_HPP__
-#define __IO_KMER_ITERATOR_HPP__
-
-#include "io/kmers_io/mmapped_reader.hpp"
-#include <string>
-
-namespace io {
-
-template<class Seq>
-using raw_kmer_iterator = MMappedFileRecordArrayIterator<typename Seq::DataType>;
-
-template<class Seq>
-raw_kmer_iterator<Seq> make_kmer_iterator(const std::string &FileName,
- unsigned K) {
- return raw_kmer_iterator<Seq>(FileName, Seq::GetDataSize(K));
-}
-
-template<class Seq>
-std::vector<raw_kmer_iterator<Seq>> make_kmer_iterator(const std::string &FileName,
- size_t K, size_t amount) {
- std::vector<raw_kmer_iterator<Seq>> res;
- if (amount == 1) {
- res.emplace_back(FileName, Seq::GetDataSize(K));
- return res;
- }
-
- // Determine the file size
- struct stat buf;
- VERIFY_MSG(stat(FileName.c_str(), &buf) != -1,
- "stat(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- size_t file_size = buf.st_size;
-
- // Now start creating the iterators keeping in mind, that offset should be
- // multiple of page size.
- size_t chunk = round_up(file_size / amount,
- getpagesize() * Seq::GetDataSize(K) * sizeof(typename Seq::DataType));
- size_t offset = 0;
- if (chunk > file_size)
- chunk = file_size;
-
- while (offset < file_size) {
- res.emplace_back(FileName, Seq::GetDataSize(K),
- offset,
- offset + chunk > file_size ? file_size - offset : chunk);
- offset += chunk;
- }
-
- return res;
-}
-
-
-};
-
-#endif
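
The chunk size above is rounded up so that every iterator starts on both a record and a page boundary. The same arithmetic with purely illustrative numbers:

// Illustrative values only: 1 GiB of 16-byte k-mer records split 8 ways.
size_t file_size = size_t(1) << 30;
size_t amount    = 8;
size_t page      = 4096;                  // typical getpagesize()
size_t record    = 16;                    // Seq::GetDataSize(K) * sizeof(DataType)
size_t boundary  = page * record;         // granularity used by make_kmer_iterator
size_t chunk     = (file_size / amount + boundary - 1) / boundary * boundary;
// Every chunk except possibly the last is 'chunk' bytes long and begins at a
// page- and record-aligned offset, matching round_up() in mmapped_reader.hpp.
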
diff --git a/src/modules/io/kmers_io/mmapped_reader.hpp b/src/modules/io/kmers_io/mmapped_reader.hpp
deleted file mode 100644
index 0fbe335..0000000
--- a/src/modules/io/kmers_io/mmapped_reader.hpp
+++ /dev/null
@@ -1,396 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_MMAPPED_READER_HPP
-#define HAMMER_MMAPPED_READER_HPP
-
-#include "utils/adt/pointer_iterator.hpp"
-#include "utils/adt/array_vector.hpp"
-
-#include "dev_support/verify.hpp"
-
-#include <boost/iterator/iterator_facade.hpp>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <cstring>
-#include <cerrno>
-
-#include <string>
-#include <algorithm>
-
-class MMappedReader {
- int StreamFile;
- bool Unlink;
- std::string FileName;
-
- void remap() {
- VERIFY(BlockSize != FileSize);
-
- if (MappedRegion)
- munmap(MappedRegion, BlockSize);
-
- BlockOffset += BlockSize;
-
- if (BlockOffset + BlockSize > FileSize)
- BlockSize = FileSize - BlockOffset;
-
- // We intentionally do not add PROT_WRITE here: combining remapping with
- // write access is pretty error-prone.
- if (BlockSize)
- MappedRegion =
- (uint8_t *) mmap(NULL, BlockSize,
- PROT_READ, MAP_FILE | MAP_PRIVATE,
- StreamFile, InitialOffset + BlockOffset);
- else
- MappedRegion = NULL;
- VERIFY_MSG((intptr_t) MappedRegion != -1L,
- "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- }
-
- void read_internal(void *buf, size_t amount) {
- memcpy(buf, MappedRegion + BytesRead - BlockOffset, amount);
- BytesRead += amount;
- }
-
-protected:
- uint8_t *MappedRegion;
- size_t FileSize, BlockOffset, BytesRead, BlockSize;
- off_t InitialOffset;
-
-public:
- MMappedReader()
- : StreamFile(-1), Unlink(false), FileName(""), MappedRegion(0), FileSize(0),
- BlockOffset(0), BytesRead(0), BlockSize(0), InitialOffset(0) { }
-
- MMappedReader(const std::string &filename, bool unlink = false,
- size_t blocksize = 64 * 1024 * 1024, off_t off = 0, size_t sz = 0)
- : Unlink(unlink), FileName(filename), BlockSize(blocksize) {
- struct stat buf;
-
- InitialOffset = off;
- FileSize = (sz ? sz : (stat(FileName.c_str(), &buf) != 0 ? 0 : buf.st_size - InitialOffset));
-
- StreamFile = open(FileName.c_str(), O_RDONLY);
- VERIFY_MSG(StreamFile != -1,
- "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno << ". File: " <<
- FileName);
-
- if (BlockSize != -1ULL) {
- size_t PageSize = getpagesize();
- BlockSize = BlockSize / PageSize * PageSize;
- } else
- BlockSize = FileSize;
-
- if (BlockSize) {
- MappedRegion =
- (uint8_t *) mmap(NULL, BlockSize, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE,
- StreamFile, InitialOffset);
- VERIFY_MSG((intptr_t) MappedRegion != -1L,
- "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- } else
- MappedRegion = NULL;
-
- BlockOffset = BytesRead = 0;
- }
-
- MMappedReader(MMappedReader &&other) {
- // First, copy out the stuff
- MappedRegion = other.MappedRegion;
- FileSize = other.FileSize;
- BlockOffset = other.BlockOffset;
- BytesRead = other.BytesRead;
- BlockSize = other.BlockSize;
- FileName = std::move(other.FileName);
- Unlink = other.Unlink;
- StreamFile = other.StreamFile;
- InitialOffset = other.InitialOffset;
-
- // Now zero out the handles inside other so its destructor will not touch the moved resources
- other.StreamFile = -1;
- other.Unlink = false;
- other.MappedRegion = 0;
- }
-
- MMappedReader &operator=(MMappedReader &&other) {
- if (this != &other) {
- *this = std::move(other);
- }
- return *this;
- }
-
- virtual ~MMappedReader() {
- if (StreamFile != -1)
- close(StreamFile);
- if (MappedRegion)
- munmap(MappedRegion, BlockSize);
-
- if (Unlink) {
- int res = unlink(FileName.c_str());
- VERIFY_MSG(res == 0,
- "unlink(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- }
- }
-
- void read(void *buf, size_t amount) {
- if (BytesRead + amount < BlockOffset + BlockSize) {
- // Easy case, no remap is necessary
- read_internal(buf, amount);
- return;
- }
-
- // Hard case - remapping is necessary. First - finish the current block.
- size_t ToRead = BlockSize - (BytesRead - BlockOffset);
- uint8_t *cbuf = (uint8_t *) buf;
-
- read_internal(cbuf, ToRead);
- amount -= ToRead;
- cbuf += ToRead;
-
- // Next, read as many whole BlockSize-sized blocks as possible.
- while (amount >= BlockSize) {
- remap();
- read_internal(cbuf, BlockSize);
- amount -= BlockSize;
- cbuf += BlockSize;
- }
-
- // Finally, remap and read remaining.
- remap();
- read_internal(cbuf, amount);
- }
-
- void *skip(size_t amount) {
- // Easy case, no remapping is needed
- if (BytesRead + amount <= BlockOffset + BlockSize) {
- void *out = MappedRegion + BytesRead - BlockOffset;
- BytesRead += amount;
-
- return out;
- }
-
- // Make sure data does not cross the block boundary
- VERIFY(BytesRead == BlockOffset + BlockSize);
-
- // Now, remap and read from the beginning of the block
- remap();
-
- return skip(amount);
- }
-
- bool good() const {
- return BytesRead < FileSize;
- }
-
- size_t size() const { return FileSize; }
-
- size_t data_size() const { return FileSize; }
-
- void *data() const { return MappedRegion; }
-};
-
-template<typename T>
-class MMappedRecordReader : public MMappedReader {
-public:
- typedef pointer_iterator<T> iterator;
- typedef const pointer_iterator<T> const_iterator;
-
- MMappedRecordReader(const std::string &FileName, bool unlink = true,
- size_t blocksize = 64 * 1024 * 1024 / (sizeof(T) * (unsigned) getpagesize()) *
- (sizeof(T) * (unsigned) getpagesize()),
- off_t off = 0, size_t sz = 0) :
- MMappedReader(FileName, unlink, blocksize, off, sz) {
- VERIFY(FileSize % sizeof(T) == 0);
- }
-
- void read(T *el, size_t amount) {
- MMappedReader::read(el, amount * sizeof(T));
- }
-
- size_t size() const { return FileSize / sizeof(T); }
-
- size_t data_size() const { return FileSize; }
-
- T *data() { return (T *) MappedRegion; }
-
- const T *data() const { return (const T *) MappedRegion; }
-
- T &operator[](size_t idx) { return data()[idx]; }
-
- const T &operator[](size_t idx) const { return data()[idx]; }
-
- iterator begin() { return iterator(data()); }
-
- const_iterator begin() const { return const_iterator(data()); }
-
- iterator end() { return iterator(data() + size()); }
-
- const_iterator end() const { return const_iterator(data() + size()); }
-};
-
-template<class T>
-class MMappedFileRecordIterator :
- public boost::iterator_facade<MMappedFileRecordIterator<T>,
- const T,
- std::input_iterator_tag> {
-public:
- // Default ctor, used to implement "end" iterator
- MMappedFileRecordIterator() : good_(false) { }
-
- MMappedFileRecordIterator(const std::string &FileName)
- : reader_(FileName, false), good_(true) {
- reader_.read(&value_, sizeof(value_));
- }
-
- MMappedFileRecordIterator(MMappedRecordReader<T> &&reader)
- : reader_(std::move(reader)), good_(true) {
- reader_.read(&value_, sizeof(value_));
- }
-
- bool good() const {
- return good_;
- }
-
-private:
- friend class boost::iterator_core_access;
-
- void increment() {
- good_ = reader_.good();
- if (good_)
- reader_.read(&value_, sizeof(value_));
- }
-
- bool equal(const MMappedFileRecordIterator &other) {
- // Iterators are equal iff:
- // 1) They both are not good (at the end of the stream),
- // or
- // 2) They refer to the same mapped region
- return ((!reader_.good() && !other.reader_.good()) ||
- reader_.data() == other.reader_.data());
- }
-
- const T dereference() const { return value_; }
-
- T value_;
- MMappedRecordReader<T> reader_;
- bool good_;
-};
-
-template<typename T>
-class MMappedRecordArrayReader : public MMappedReader {
- size_t elcnt_;
-
-public:
- typedef typename array_vector<T>::iterator iterator;
- typedef typename array_vector<T>::const_iterator const_iterator;
-
- MMappedRecordArrayReader(const std::string &FileName,
- size_t elcnt = 1,
- bool unlink = true,
- off_t off = 0, size_t sz = 0) :
- MMappedReader(FileName, unlink, -1ULL, off, sz), elcnt_(elcnt) {
- VERIFY(FileSize % (sizeof(T) * elcnt_) == 0);
- }
-
- void read(T *el, size_t amount) {
- MMappedReader::read(el, amount * sizeof(T) * elcnt_);
- }
-
- size_t size() const { return FileSize / sizeof(T) / elcnt_; }
-
- size_t data_size() const { return FileSize; }
-
- size_t elcnt() const { return elcnt_; }
-
- T *data() { return (T *) MappedRegion; }
-
- const T *data() const { return (const T *) MappedRegion; }
-
- T &operator[](size_t idx) { return data()[idx * elcnt_]; }
-
- const T &operator[](size_t idx) const { return data()[idx * elcnt_]; }
-
- iterator begin() { return iterator(data(), /* size */ elcnt_); }
-
- const_iterator begin() const { return const_iterator(data(), /* size */ elcnt_); }
-
- const_iterator cbegin() const { return const_iterator(data(), /* size */ elcnt_); }
-
- iterator end() { return iterator(data() + size() * elcnt_, elcnt_); }
-
- const_iterator end() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
-
- const_iterator cend() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
-};
-
-static inline size_t round_up(size_t value, size_t boundary) {
- return (value + boundary - 1) / boundary * boundary;
-}
-
-template<class T>
-class MMappedFileRecordArrayIterator :
- public boost::iterator_facade<MMappedFileRecordArrayIterator<T>,
- const T *,
- std::input_iterator_tag,
- const T *> {
-public:
- // Default ctor, used to implement "end" iterator
- MMappedFileRecordArrayIterator() : value_(NULL), array_size_(0), reader_(), good_(false) { }
-
- MMappedFileRecordArrayIterator(const std::string &FileName,
- size_t elcnt,
- off_t offset = 0, size_t filesize = 0)
- : value_(NULL),
- array_size_(sizeof(T) * elcnt),
- reader_(FileName, false,
- round_up(filesize > 0 ? std::min(size_t(64 * 1024 * 1024), filesize) : 64 * 1024 * 1024,
- array_size_ * (unsigned) getpagesize()),
- offset, filesize),
- good_(false) {
- increment();
- }
-
- MMappedFileRecordArrayIterator(MMappedRecordReader<T> &&reader, size_t elcnt)
- : value_(NULL), array_size_(sizeof(T) * elcnt), reader_(std::move(reader)), good_(false) {
- increment();
- }
-
- MMappedFileRecordArrayIterator(const MMappedFileRecordArrayIterator &) = delete;
-
- MMappedFileRecordArrayIterator(MMappedFileRecordArrayIterator &&other)
- : value_(other.value_), array_size_(other.array_size_),
- reader_(std::move(other.reader_)), good_(other.good_) { }
-
- bool good() const { return good_; }
-
- const MMappedRecordReader<T> &reader() const { return reader_; }
-
-private:
- friend class boost::iterator_core_access;
-
- void increment() {
- good_ = reader_.good();
- value_ = (good_ ? (T *) reader_.skip(array_size_) : NULL);
- }
-
- bool equal(const MMappedFileRecordArrayIterator &other) const {
- return value_ == other.value_;
- }
-
- const T *dereference() const { return value_; }
-
- T *value_;
- size_t array_size_;
- MMappedRecordReader<T> reader_;
- bool good_;
-};
-
-#endif // HAMMER_MMAPPED_READER_HPP
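
A minimal usage sketch of MMappedRecordReader, assuming a hypothetical file of raw uint64_t values that fits into a single mapped block:

// Sketch: sum fixed-size records straight out of the mapping.
// The file size must be a multiple of sizeof(uint64_t) (VERIFY in the ctor).
MMappedRecordReader<uint64_t> reader("counts.bin", /*unlink*/ false);
uint64_t total = 0;
for (size_t i = 0; i < reader.size(); ++i)
    total += reader[i];                   // operator[] indexes the mapped block
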
diff --git a/src/modules/io/kmers_io/mmapped_writer.hpp b/src/modules/io/kmers_io/mmapped_writer.hpp
deleted file mode 100644
index 1f90a42..0000000
--- a/src/modules/io/kmers_io/mmapped_writer.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef HAMMER_MMAPPED_WRITER_HPP
-#define HAMMER_MMAPPED_WRITER_HPP
-
-#include "utils/adt/pointer_iterator.hpp"
-#include "utils/adt/array_vector.hpp"
-
-#include <string>
-
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <strings.h>
-
-class MMappedWriter {
- int StreamFile;
-
- MMappedWriter(const MMappedWriter &) = delete;
-
-protected:
- uint8_t *MappedRegion;
- size_t BytesWritten, BytesReserved, FileOffset, BufOffset;
-public:
- MMappedWriter() : StreamFile(-1), MappedRegion(NULL), BytesWritten(0), BytesReserved(0), FileOffset(0), BufOffset(0) { }
-
- MMappedWriter(const std::string &FileName) {
- open(FileName);
- }
-
- void open(const std::string &FileName) {
- StreamFile = ::open(FileName.c_str(), O_RDWR | O_CREAT | O_TRUNC, (mode_t) 0660);
- VERIFY_MSG(StreamFile != -1,
- "open(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
-
- FileOffset = BytesWritten = 0;
- MappedRegion = NULL;
- }
-
- virtual ~MMappedWriter() {
- if (MappedRegion)
- munmap(MappedRegion, BytesReserved);
- close(StreamFile);
- }
-
- void write(void *buf, size_t amount) {
- memcpy(MappedRegion + BufOffset + BytesWritten, buf, amount);
- BytesWritten += amount;
- }
-
- bool good() const {
- return BytesWritten < BytesReserved;
- }
-
- void reserve(size_t amount) {
- if (MappedRegion) {
- munmap(MappedRegion, BytesReserved);
- FileOffset += BytesWritten;
- MappedRegion = NULL;
- }
-
- if (amount == 0)
- return;
-
- int res = (int) lseek(StreamFile, amount - 1, SEEK_CUR);
- VERIFY_MSG(res != -1,
- "lseek(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- res = (int) ::write(StreamFile, "", 1);
- VERIFY_MSG(res != -1,
- "write(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
-
- // mmap(2) needs a page-aligned offset, so map from the aligned offset just below FileOffset and keep the residual in BufOffset.
- int PageSize = getpagesize();
- size_t FileOffsetAligned = FileOffset / PageSize * PageSize;
- size_t Residual = FileOffset - FileOffsetAligned;
-
- BytesReserved = amount + Residual;
- BytesWritten = 0;
- BufOffset = Residual;
- MappedRegion =
- (uint8_t *) mmap(NULL, BytesReserved,
- PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED,
- StreamFile, FileOffsetAligned);
- VERIFY_MSG((intptr_t) MappedRegion != -1L,
- "mmap(2) failed. Reason: " << strerror(errno) << ". Error code: " << errno);
- }
-
- size_t size() const { return BytesReserved; }
-};
-
-template<typename T>
-class MMappedRecordWriter : public MMappedWriter {
-public:
- typedef pointer_iterator<T> iterator;
- typedef const pointer_iterator<T> const_iterator;
-
- MMappedRecordWriter() = default;
-
- MMappedRecordWriter(const std::string &FileName) :
- MMappedWriter(FileName) {
- }
-
- void write(const T *el, size_t amount) {
- MMappedWriter::write((void *) el, amount * sizeof(T));
- }
-
- void reserve(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T));
- }
-
- void resize(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T));
- }
-
- size_t size() const { return BytesReserved / sizeof(T); }
-
- T *data() { return (T *) MappedRegion; }
-
- const T *data() const { return (const T *) MappedRegion; }
-
- T &operator[](size_t idx) { return data()[idx]; }
-
- const T &operator[](size_t idx) const { return data()[idx]; }
-
- iterator begin() { return iterator(data()); }
-
- const_iterator begin() const { return const_iterator(data()); }
-
- iterator end() { return iterator(data() + size()); }
-
- const_iterator end() const { return const_iterator(data() + size()); }
-};
-
-template<typename T>
-class MMappedRecordArrayWriter : public MMappedWriter {
- size_t elcnt_;
-public:
- typedef typename array_vector<T>::iterator iterator;
- typedef typename array_vector<T>::const_iterator const_iterator;
-
- MMappedRecordArrayWriter() = default;
-
- MMappedRecordArrayWriter(const std::string &FileName,
- size_t elcnt = 1) :
- MMappedWriter(FileName), elcnt_(elcnt) { }
-
- void open(const std::string &FileName,
- size_t elcnt = 1) {
- elcnt_ = elcnt;
- MMappedWriter::open(FileName);
- }
-
- void write(const T *el, size_t amount) {
- MMappedWriter::write((void *) el, amount * sizeof(T) * elcnt_);
- }
-
- void reserve(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
- }
-
- void resize(size_t amount) {
- MMappedWriter::reserve(amount * sizeof(T) * elcnt_);
- }
-
- size_t size() const { return BytesReserved / sizeof(T) / elcnt_; }
-
- T *data() { return (T *) MappedRegion; }
-
- const T *data() const { return (const T *) MappedRegion; }
-
- T &operator[](size_t idx) { return data()[idx * elcnt_]; }
-
- const T &operator[](size_t idx) const { return data()[idx * elcnt_]; }
-
- iterator begin() { return iterator(data(), elcnt_); }
-
- const_iterator begin() const { return const_iterator(data(), elcnt_); }
-
- iterator end() { return iterator(data() + size() * elcnt_, elcnt_); }
-
- const_iterator end() const { return const_iterator(data() + size() * elcnt_, elcnt_); }
-};
-
-#endif // HAMMER_MMAPPED_WRITER_HPP
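
The matching writer is used reserve-then-write; a sketch with a hypothetical output path:

#include <vector>

// Sketch: map space for five records up front, then copy them into the mapping.
MMappedRecordWriter<uint64_t> writer("counts.bin");
std::vector<uint64_t> values = {1, 2, 3, 5, 8};
writer.reserve(values.size());            // sizes the file and maps the region
writer.write(values.data(), values.size());
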
diff --git a/src/modules/io/reads/read.hpp b/src/modules/io/reads/read.hpp
deleted file mode 100644
index 02f4c74..0000000
--- a/src/modules/io/reads/read.hpp
+++ /dev/null
@@ -1,244 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * read.hpp
- *
- * Created on: 29.03.2011
- * Author: vyahhi
- */
-
-#ifndef READ_HPP_
-#define READ_HPP_
-
-#include <string>
-#include <iostream>
-#include <fstream>
-#include "dev_support/verify.hpp"
-#include "data_structures/sequence/quality.hpp"
-#include "data_structures/sequence/sequence.hpp"
-#include "data_structures/sequence/nucl.hpp"
-#include "data_structures/sequence/sequence_tools.hpp"
-#include "dev_support/simple_tools.hpp"
-
-//fixme deprecated!!! used in hammer!
-class Read {
-public:
- static const int PHRED_OFFSET = 33;
-
- bool isValid() const {
- return valid_;
- }
-
- Sequence getSequence() const {
- VERIFY(valid_);
- return Sequence(seq_);
- }
-
- Sequence getSubSequence(size_t start, size_t length) const __attribute__ ((deprecated)) {
- VERIFY(length > 0 && start + length <= seq_.size());
- return Sequence(seq_.substr(start, length));
- }
-
- Quality getQuality() const {
- VERIFY(valid_);
- return Quality(qual_);
- }
-
- const std::string &getSequenceString() const {
- return seq_;
- }
-
- const std::string &getQualityString() const {
- return qual_;
- }
-
- std::string getPhredQualityString(int offset = PHRED_OFFSET) const {
- std::string res = qual_;
- for (size_t i = 0; i < res.size(); ++i) {
- res[i] = (char) (res[i] + offset);
- }
- return res;
- }
-
- const std::string &getName() const {
- return name_;
- }
-
- size_t size() const {
- return seq_.size();
- }
-
- char operator[](size_t i) const {
- VERIFY(is_nucl(seq_[i]));
- return dignucl(seq_[i]);
- }
-
- /**
- * trim read
- * @param ltrim first good base
- * @param rtrim last good base
- * @return whether there is anything left
- */
- bool trimLeftRight(int ltrim, int rtrim) {
- if (ltrim >= (int) seq_.size() || rtrim < 0 || rtrim < ltrim) {
- seq_ = "";
- qual_ = "";
- valid_ = false;
- return false;
- }
- bool donesomething = false;
- if (ltrim > 0) {
- ltrim_ += ltrim;
- seq_.erase(0, ltrim);
- qual_.erase(0, ltrim);
- donesomething = true;
- }
- if (rtrim - ltrim + 1 < (int) seq_.size() && rtrim < (int) seq_.size() - ltrim - 1) {
- rtrim_ -= ((int) seq_.size() - (rtrim - ltrim + 1));
- seq_.erase(rtrim - ltrim + 1, std::string::npos);
- qual_.erase(rtrim - ltrim + 1, std::string::npos);
- donesomething = true;
- }
- if (donesomething) valid_ = updateValid();
- return true;
- }
-
- size_t trimNsAndBadQuality(int threshold) {
- int start = 0;
- for (; start < (int) seq_.size(); ++start) {
- if (seq_[start] != 'N' && (int) qual_[start] > threshold) break;
- }
- int end = 0;
- for (end = (int) seq_.size() - 1; end > -1; --end) {
- if (seq_[end] != 'N' && (int) qual_[end] > threshold) break;
- }
- if (!trimLeftRight(start, end)) return 0;
- else return seq_.size();
- }
-
- /**
- * @param k k as in k-mer
- * @param start start point
- * @return the first starting point of a valid k-mer >=start; return -1 if no such place exists
- */
- size_t firstValidKmer(size_t start, size_t k) const __attribute__ ((deprecated)) {
- size_t curHypothesis = start;
- size_t i = start;
- for (; i < seq_.size(); ++i) {
- if (i >= k + curHypothesis)
- return curHypothesis;
- if (!is_nucl(seq_[i])) {
- curHypothesis = i + 1;
- }
- }
- if (i >= k + curHypothesis) {
- return curHypothesis;
- }
- return -1ULL;
- }
-
- void setSequence(const char *s, bool preserve_trimming = false) {
- seq_ = s;
- if (!preserve_trimming) {
- ltrim_ = 0;
- rtrim_ = initial_size_ = (int) seq_.size();
- }
- valid_ = updateValid();
- }
-
- void setQuality(const char *s, int offset = PHRED_OFFSET) {
- qual_ = s;
- for (size_t i = 0; i < qual_.size(); ++i) {
- qual_[i] = (char) (qual_[i] - offset);
- }
- }
-
- void setName(const char *s) {
- name_ = s;
- }
-
- Read()
- : valid_(false), ltrim_(0), rtrim_(0), initial_size_(0) {
- ;
- }
-
- Read(const std::string &name, const std::string &seq, const std::string &qual) :
- name_(name), seq_(seq), qual_(qual) { // for test only!
- ltrim_ = 0;
- initial_size_ = rtrim_ = (int) seq_.size();
- valid_ = updateValid();
- }
-
- int ltrim() const { return ltrim_; }
-
- void set_ltrim(unsigned val) { ltrim_ = val; };
-
- int rtrim() const { return rtrim_; }
-
- int initial_size() const { return initial_size_; }
-
-private:
- std::string name_;
- std::string seq_;
- std::string qual_;
- bool valid_;
- int ltrim_;
- int rtrim_;
- int initial_size_;
-
- friend class ireadstream;
-
- friend uint32_t TrimBadQuality(Read *, int);
-
- bool updateValid() const {
- if (seq_.size() == 0) {
- return false;
- }
- for (size_t i = 0; i < seq_.size(); ++i) {
- if (!is_nucl(seq_[i])) {
- return false;
- }
- }
- return true;
- }
-
-public:
- Read operator!() const {
- std::string newName;
- if (name_ == "" || name_[0] != '!') {
- newName = '!' + name_;
- } else {
- newName = name_.substr(1, name_.length());
- }
- return Read(newName, ReverseComplement(seq_), Reverse(qual_));
- }
-
- void print(std::ostream &outf, int offset) const {
- outf << "@" << name_.c_str() << "\n";
- for (int i = 0; i < ltrim_; ++i) outf << "N";
- outf << seq_.c_str();
- for (int i = 0; i < initial_size_ - rtrim_; ++i) outf << "N";
- outf << "\n" << "+" << name_.c_str();
- if (ltrim_ > 0) outf << " ltrim=" << ltrim_;
- if (rtrim_ < initial_size_)
- outf << " rtrim=" << (initial_size_ - rtrim_);
- outf << "\n";
- char badq = (char) (offset + 2);
- for (int i = 0; i < ltrim_; ++i) outf << badq;
- outf << getPhredQualityString(offset).c_str();
- for (int i = 0; i < initial_size_ - rtrim_; ++i) outf << badq;
- outf << "\n";
- }
-};
-
-// todo: put this to *.cpp
-//ostream& operator<<(ostream& os, const Read& read) {
-// return os << read.getSequenceString();
-//}
-
-#endif /* READ_HPP_ */
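
A tiny sketch of the deprecated Read class above, built with its test-only constructor; the read name, bases and qualities are arbitrary:

// Sketch: reverse complement via operator!; the name gains a '!' prefix.
Read r("frag", "ACCGT", "IIIII");
Read rc = !r;                              // sequence becomes "ACGGT"
bool ok = rc.isValid() && rc.size() == 5;  // still a valid all-ACGT read
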
diff --git a/src/modules/io/reads/single_read.hpp b/src/modules/io/reads/single_read.hpp
deleted file mode 100644
index c307eaa..0000000
--- a/src/modules/io/reads/single_read.hpp
+++ /dev/null
@@ -1,334 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/verify.hpp"
-#include "data_structures/sequence/quality.hpp"
-#include "data_structures/sequence/sequence.hpp"
-#include "data_structures/sequence/nucl.hpp"
-#include "data_structures/sequence/sequence_tools.hpp"
-#include "dev_support/simple_tools.hpp"
-
-#include <string>
-
-namespace io {
-
-/*
-* This enumeration lists the supported quality score offset types.
-* UnknownOffset is equal to "offset = 0".
-* PhredOffset is equal to "offset = 33".
-* SolexaOffset is equal to "offset = 64".
-*/
-enum OffsetType {
- UnknownOffset = 0,
- PhredOffset = 33,
- SolexaOffset = 64
-};
-
-//todo extract code about offset from here
-
-typedef uint16_t SequenceOffsetT;
-
-
-class SingleRead {
-public:
-
- static std::string EmptyQuality(const std::string &seq) {
- return std::string(seq.size(), (char) 33);
- }
-
- static const int BAD_QUALITY_THRESHOLD = 2;
-
- SingleRead() :
- name_(""), seq_(""), qual_(""), left_offset_(0), right_offset_(0), valid_(false) {
- DEBUG(name_ << " created");
- }
-
- SingleRead(const std::string &name, const std::string &seq,
- const std::string &qual, OffsetType offset,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
- name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
- Init();
- DEBUG(name_ << " created");
- for (size_t i = 0; i < qual_.size(); ++i) {
- qual_[i] = (char) (qual_[i] - offset);
- }
- }
-
- SingleRead(const std::string &name, const std::string &seq,
- const std::string &qual,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
- name_(name), seq_(seq), qual_(qual), left_offset_(left_offset), right_offset_(right_offset) {
- DEBUG(name_ << " created");
- Init();
- }
-
- SingleRead(const std::string &name, const std::string &seq,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
- name_(name), seq_(seq), qual_(EmptyQuality(seq_)), left_offset_(left_offset),
- right_offset_(right_offset) {
- DEBUG(name_ << " created");
- Init();
- }
-
- bool IsValid() const {
- return valid_;
- }
-
- Sequence sequence(bool rc = false) const {
- VERIFY(valid_);
- return Sequence(seq_, rc);
- }
-
- Quality quality() const {
- VERIFY(valid_);
- return Quality(qual_);
- }
-
- const std::string &name() const {
- return name_;
- }
-
- size_t size() const {
- return seq_.size();
- }
-
- size_t nucl_count() const {
- return size();
- }
-
- const std::string &GetSequenceString() const {
- return seq_;
- }
-
- const std::string &GetQualityString() const {
- return qual_;
- }
-
- std::string GetPhredQualityString() const {
- int offset = PhredOffset;
- std::string res = qual_;
- for (size_t i = 0; i < res.size(); ++i) {
- res[i] = (char) (res[i] + offset);
- }
- return res;
- }
-
- /*
- * Return the ith nucleotide of the SingleRead sequence in numeric form
- * (0, 1, 2 or 3).
- *
- * @param i Nucleotide index.
- * @return Nucleotide on ith position of SingleRead sequence.
- */
- char operator[](size_t i) const {
- VERIFY(is_nucl(seq_[i]));
- return dignucl(seq_[i]);
- }
-
- SingleRead operator!() const {
- std::string new_name;
- if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
- new_name = name_.substr(0, name_.length() - 3);
- } else {
- new_name = name_ + "_RC";
- }
- // TODO make naming nicer
- // if (name_ == "" || name_[0] != '!') {
- // new_name = '!' + name_;
- // } else {
- // new_name = name_.substr(1, name_.length());
- // }
- return SingleRead(new_name, ReverseComplement(seq_), Reverse(qual_), right_offset_, left_offset_);
- }
-
- SingleRead SubstrStrict(size_t from, size_t to) const {
- size_t len = to - from;
- // return SingleRead(name_, seq_.substr(from, len), qual_.substr(from, len));
- // TODO remove naming?
- std::string new_name;
- if (name_.length() >= 3 && name_.substr(name_.length() - 3) == "_RC") {
- new_name = name_.substr(0, name_.length() - 3) + "_SUBSTR(" + ToString(size() - to) + "," +
- ToString(size() - from) + ")" + "_RC";
- } else {
- new_name = name_ + "_SUBSTR(" + ToString(from) + "," + ToString(to) + ")";
- }
- return SingleRead(new_name, seq_.substr(from, len), qual_.substr(from, len),
- SequenceOffsetT(from + (size_t) left_offset_),
- SequenceOffsetT(size() - to + (size_t) right_offset_));
- }
-
- SingleRead Substr(size_t from, size_t to) const {
- size_t len = to - from;
- if (len == size()) {
- return *this;
- }
- if (len == 0) {
- return SingleRead();
- }
- return SubstrStrict(from, to);
- }
-
- bool operator==(const SingleRead &singleread) const {
- return seq_ == singleread.seq_;
- }
-
- void ChangeName(const std::string &new_name) {
- name_ = new_name;
- }
-
- static bool IsValid(const std::string &seq) {
- for (size_t i = 0; i < seq.size(); ++i) {
- if (!is_nucl(seq[i])) {
- return false;
- }
- }
- return true;
- }
-
- SequenceOffsetT GetLeftOffset() const {
- return left_offset_;
- }
-
- SequenceOffsetT GetRightOffset() const {
- return right_offset_;
- }
-
- bool BinWrite(std::ostream &file, bool rc = false) const {
- sequence(rc).BinWrite(file);
- if (rc) {
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- } else {
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- }
- return !file.fail();
- }
-
-
- void print_size() const {
- std::cerr << size() << std::endl;
- }
-
-
-private:
- /*
- * @variable The name of SingleRead in input file.
- */
- std::string name_;
- /*
- * @variable The sequence of nucleotides.
- */
- std::string seq_;
- /*
- * @variable The quality of SingleRead.
- */
- std::string qual_;
- /*
- * @variable The flag of SingleRead correctness.
- */
-
- //Left and right offsets with respect to original sequence
- SequenceOffsetT left_offset_;
-
- SequenceOffsetT right_offset_;
-
- bool valid_;
-
- void Init() {
- VERIFY(seq_.size() == qual_.size());
- valid_ = SingleRead::IsValid(seq_);
- }
-
-};
-
-inline std::ostream &operator<<(std::ostream &os, const SingleRead &read) {
- os << "Single read name=" << read.name() << " sequence=" << read.GetSequenceString() << std::endl;
- return os;
-}
-
-class SingleReadSeq {
-
-public:
- SingleReadSeq(const Sequence &s,
- SequenceOffsetT left_offset = 0, SequenceOffsetT right_offset = 0) :
- seq_(s), left_offset_(left_offset), right_offset_(right_offset) {
- }
-
- SingleReadSeq() : seq_(), left_offset_(0), right_offset_(0) {
- }
-
- bool BinRead(std::istream &file) {
- seq_.BinRead(file);
- file.read((char *) &left_offset_, sizeof(left_offset_));
- file.read((char *) &right_offset_, sizeof(right_offset_));
- return !file.fail();
- }
-
- bool BinWrite(std::ostream &file, bool rc = false) const {
- if (rc)
- (!seq_).BinWrite(file);
- else
- seq_.BinWrite(file);
- if (rc) {
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- } else {
- file.write((const char *) &left_offset_, sizeof(left_offset_));
- file.write((const char *) &right_offset_, sizeof(right_offset_));
- }
- return !file.fail();
- }
-
- // SingleReadSeq(std::istream& file): seq_(file, true) {
- // }
-
- bool operator==(const SingleReadSeq &singleread) const {
- return seq_ == singleread.seq_;
- }
-
- const Sequence sequence() const {
- return seq_;
- }
-
- size_t size() const {
- return seq_.size();
- }
-
- size_t nucl_count() const {
- return size();
- }
-
- SingleReadSeq operator!() const {
- return SingleReadSeq(!seq_);
- }
-
- SequenceOffsetT GetLeftOffset() const {
- return left_offset_;
- }
-
- SequenceOffsetT GetRightOffset() const {
- return right_offset_;
- }
-
-private:
- Sequence seq_;
-
- //Left and right offsets with respect to original sequence
- SequenceOffsetT left_offset_;
-
- SequenceOffsetT right_offset_;
-};
-
-inline std::ostream &operator<<(std::ostream &os, const SingleReadSeq &read) {
- os << "Single read sequence=" << read.sequence() << std::endl;
- return os;
-}
-
-}
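
A short sketch of SingleRead construction and slicing; the read name, bases and qualities are made up:

// Sketch: Phred-offset construction, reverse complement, and offset tracking.
io::SingleRead read("r1", "ACGTACGT", "IIIIIIII", io::PhredOffset);
io::SingleRead rc  = !read;                // renamed to "r1_RC"
io::SingleRead mid = read.Substr(2, 6);    // 4 bp slice named "r1_SUBSTR(2,6)"
// mid.GetLeftOffset() == 2 and mid.GetRightOffset() == 2 relative to read.
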
diff --git a/src/modules/io/reads_io/binary_converter.hpp b/src/modules/io/reads_io/binary_converter.hpp
deleted file mode 100644
index 7da965f..0000000
--- a/src/modules/io/reads_io/binary_converter.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * binary_io.hpp
- *
- * Created on: Apr 12, 2012
- * Author: andrey
- */
-
-#ifndef BINARY_IO_HPP_
-#define BINARY_IO_HPP_
-
-#include <fstream>
-
-#include "dev_support/verify.hpp"
-#include "ireader.hpp"
-#include "io/reads/single_read.hpp"
-#include "io/reads/paired_read.hpp"
-#include "pipeline/library.hpp"
-
-namespace io {
-
-template<class Read>
-class ReadBinaryWriter {
-
-public:
-
- ReadBinaryWriter(LibraryOrientation /*orientation*/ = LibraryOrientation::Undefined) {
- }
-
- bool Write(std::ostream& file, const Read& r) const {
- return r.BinWrite(file);
- }
-};
-
-template<>
-class ReadBinaryWriter<PairedRead> {
-
-private:
-
- bool rc1_;
-
- bool rc2_;
-
-public:
-
- ReadBinaryWriter(LibraryOrientation orientation) {
- switch (orientation) {
- case LibraryOrientation::FF: {
- rc1_ = false;
- rc2_ = false;
- break;
- }
- case LibraryOrientation::RR: {
- rc1_ = true;
- rc2_ = true;
- break;
- }
- case LibraryOrientation::FR: {
- rc1_ = false;
- rc2_ = true;
- break;
- }
- case LibraryOrientation::RF: {
- rc1_ = true;
- rc2_ = false;
- break;
- }
- default: {
- rc1_ = false;
- rc2_ = false;
- break;
- }
- }
-
- }
-
- bool Write(std::ostream& file, const PairedRead& r) const {
- return r.BinWrite(file, rc1_, rc2_);
- }
-};
-
-
-class BinaryWriter {
-
-private:
- const std::string file_name_prefix_;
-
- size_t file_num_;
-
- std::vector<std::ofstream*> file_ds_;
-
- size_t buf_size_;
-
- template<class Read>
- void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file, size_t from, size_t to) {
- for (size_t i = from; i < to; ++i) {
- read_writer.Write(file, buffer[i]);
- }
- }
-
- template<class Read>
- void FlushBuffer(const std::vector<Read>& buffer, const ReadBinaryWriter<Read>& read_writer, std::ostream& file) {
- FlushBuffer(buffer, read_writer, file, 0, buffer.size());
- }
-
- template<class Read>
- ReadStreamStat ToBinary(io::ReadStream<Read>& stream, size_t buf_size,
- LibraryOrientation orientation) {
-
- ReadBinaryWriter<Read> read_writer(orientation);
- size_t buffer_reads = buf_size / (sizeof (Read) * 4);
- size_t reads_to_flush = buffer_reads * file_num_;
-
- std::vector< std::vector<Read> > buf(file_num_, std::vector<Read>(buffer_reads) );
- std::vector< ReadStreamStat > read_stats(file_num_);
- std::vector< size_t > current_buf_sizes(file_num_, 0);
- size_t read_count = 0;
-
- for (size_t i = 0; i < file_num_; ++i) {
- file_ds_[i]->seekp(0);
- read_stats[i].write(*file_ds_[i]);
- }
-
- size_t buf_index;
- while (!stream.eof()) {
- buf_index = read_count % file_num_;
-
- Read& r = buf[buf_index][current_buf_sizes[buf_index]];
- stream >> r;
- read_stats[buf_index].increase(r);
-
- ++current_buf_sizes[buf_index];
- VERBOSE_POWER(++read_count, " reads processed");
-
- if (read_count % reads_to_flush == 0) {
- for (size_t i = 0; i < file_num_; ++i) {
- FlushBuffer(buf[i], read_writer, *file_ds_[i]);
- current_buf_sizes[i] = 0;
- }
- }
- }
-
- ReadStreamStat result;
- for (size_t i = 0; i < file_num_; ++i) {
- buf[i].resize(current_buf_sizes[i]);
- FlushBuffer(buf[i], read_writer, *file_ds_[i]);
-
- file_ds_[i]->seekp(0);
- read_stats[i].write(*file_ds_[i]);
- result.merge(read_stats[i]);
- }
-
- INFO(read_count << " reads written");
- return result;
- }
-
-
- template<class Read>
- ReadStreamStat ToBinaryForThread(io::ReadStream<Read>& stream, size_t buf_size,
- size_t thread_num, LibraryOrientation orientation) {
-
- ReadBinaryWriter<Read> read_writer(orientation);
- size_t buffer_reads = buf_size / (sizeof (Read) * 4);
- std::vector<Read> buf(buffer_reads);
-
- ReadStreamStat stat;
- file_ds_[thread_num]->seekp(0);
- stat.write(*file_ds_[thread_num]);
-
- size_t current = 0;
-
- while (!stream.eof()) {
- Read& r = buf[current];
- stream >> r;
- stat.increase(r);
- ++current;
-
- if (stat.read_count_ % buffer_reads == 0) {
- FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
- current = 0;
- }
- }
-
- buf.resize(current);
- FlushBuffer(buf, read_writer, *file_ds_[thread_num]);
-
- file_ds_[thread_num]->seekp(0);
- stat.write(*file_ds_[thread_num]);
-
- return stat;
- }
-
-
-public:
-
- BinaryWriter(const std::string& file_name_prefix, size_t file_num,
- size_t buf_size):
- file_name_prefix_(file_name_prefix), file_num_(file_num),
- file_ds_(), buf_size_(buf_size) {
-
- std::string fname;
- for (size_t i = 0; i < file_num_; ++i) {
- fname = file_name_prefix_ + "_" + ToString(i) + ".seq";
- file_ds_.push_back(new std::ofstream(fname, std::ios_base::binary));
- }
- }
-
- ~BinaryWriter() {
- for (size_t i = 0; i < file_num_; ++i) {
- if (file_ds_[i]->is_open()) {
- file_ds_[i]->close();
- }
- delete file_ds_[i];
- }
- }
-
-
- ReadStreamStat ToBinary(io::ReadStream<io::SingleReadSeq>& stream) {
- return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinary(io::ReadStream<io::SingleRead>& stream) {
- return ToBinary(stream, buf_size_ / file_num_, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinary(io::ReadStream<io::PairedReadSeq>& stream) {
- return ToBinary(stream, buf_size_ / (2 * file_num_), LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinary(io::ReadStream<io::PairedRead>& stream, LibraryOrientation orientation) {
- return ToBinary(stream, buf_size_ / (2 * file_num_), orientation);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleReadSeq>& stream, size_t thread_num) {
- return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::SingleRead>& stream, size_t thread_num) {
- return ToBinaryForThread(stream, buf_size_ / file_num_, thread_num, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedReadSeq>& stream, size_t thread_num) {
- return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, LibraryOrientation::Undefined);
- }
-
- ReadStreamStat ToBinaryForThread(io::ReadStream<io::PairedRead>& stream, size_t thread_num, LibraryOrientation orientation) {
- return ToBinaryForThread(stream, buf_size_ / (2 * file_num_), thread_num, orientation);
- }
-
-// template<class Read>
-// void WriteReads(std::vector<Read>& data) {
-// size_t chunk_size = data.size() / file_num_;
-// size_t last_chunk_size = chunk_size + data.size() % file_num_;
-//
-// for (size_t i = 0; i < file_num_ - 1; ++i) {
-// file_ds_[i]->write((const char *) &chunk_size, sizeof(chunk_size));
-// }
-// file_ds_.back()->write((const char *) &last_chunk_size, sizeof(last_chunk_size));
-//
-// size_t start_pos = 0;
-// for (size_t i = 0; i < file_num_ - 1; ++i, start_pos += chunk_size) {
-// FlushBuffer(data, *file_ds_[i], start_pos, start_pos + chunk_size);
-// }
-// FlushBuffer(data, file_ds_.back(), start_pos, data.size());
-// }
-//
-// template<class Read>
-// void WriteSeparatedReads(std::vector< std::vector<Read> >& data) {
-// if (data.size() != file_num_) {
-// WARN("Cannot write reads, number of vectors is not equal to thread number");
-// return;
-// }
-//
-// for (size_t i = 0; i < file_num_; ++i) {
-// size_t size = data[i].size();
-// file_ds_[i]->write((const char *) &size, sizeof(size));
-// }
-//
-// for (size_t i = 0; i < file_num_; ++i) {
-// FlushBuffer(data[i], *file_ds_[i]);
-// }
-// }
-};
-
-
-}
-
-
-#endif /* BINARY_IO_HPP_ */
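For context, a minimal usage sketch of the BinaryWriter removed above (illustrative only: the "reads.fastq"/"binary_reads" names and the buffer size are made up, and it assumes the removed headers, including file_reader.hpp deleted later in this commit, are still on the include path):

#include <iostream>
#include "io/reads_io/binary_io.hpp"
#include "io/reads_io/file_reader.hpp"

int main() {
    // Plain-text reads in, binary chunks out. The BinaryWriter constructor
    // creates binary_reads_0.seq .. binary_reads_3.seq; ToBinary converts the
    // stream and returns aggregate statistics for the converted reads.
    io::FileReadStream reads("reads.fastq");
    io::BinaryWriter writer("binary_reads", /*file_num*/ 4, /*buf_size*/ 1 << 26);
    io::ReadStreamStat stat = writer.ToBinary(reads);
    std::cout << stat.read_count_ << " reads converted" << std::endl;
    return 0;
}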
diff --git a/src/modules/io/reads_io/binary_streams.hpp b/src/modules/io/reads_io/binary_streams.hpp
deleted file mode 100644
index d7679f2..0000000
--- a/src/modules/io/reads_io/binary_streams.hpp
+++ /dev/null
@@ -1,357 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <fstream>
-
-#include "dev_support/verify.hpp"
-#include "ireader.hpp"
-#include "io/reads/single_read.hpp"
-#include "io/reads/paired_read.hpp"
-
-namespace io {
-
-// == Deprecated classes ==
-// Use FileReadStream and InsertSizeModifyingWrapper instead
-
-class BinaryFileSingleStream: public PredictableReadStream<SingleReadSeq> {
-private:
- std::ifstream stream_;
- ReadStreamStat read_stat_;
- size_t current_;
-
-public:
-
- BinaryFileSingleStream(const std::string& file_name_prefix, size_t file_num) {
- std::string fname;
- fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
- stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
-
- reset();
- }
-
- virtual bool is_open() {
- return stream_.is_open();
- }
-
- virtual bool eof() {
- return current_ == read_stat_.read_count_;
- }
-
- virtual BinaryFileSingleStream& operator>>(SingleReadSeq& read) {
- read.BinRead(stream_);
- VERIFY(current_ < read_stat_.read_count_);
-
- ++current_;
- return *this;
- }
-
- virtual void close() {
- current_ = 0;
- stream_.close();
- }
-
- virtual void reset() {
- stream_.clear();
- stream_.seekg(0);
- VERIFY(stream_.good());
- read_stat_.read(stream_);
- current_ = 0;
- }
-
- virtual size_t size() const {
- return read_stat_.read_count_;
- }
-
- virtual ReadStreamStat get_stat() const {
- return read_stat_;
- }
-
-};
-
-class BinaryFilePairedStream: public PredictableReadStream<PairedReadSeq> {
-
-private:
- std::ifstream stream_;
-
- size_t insert_size_;
-
- ReadStreamStat read_stat_;
-
- size_t current_;
-
-
-public:
-
-    BinaryFilePairedStream(const std::string& file_name_prefix, size_t file_num, size_t insert_size): stream_(), insert_size_(insert_size) {
- std::string fname;
- fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
- stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
-
- reset();
- }
-
- virtual bool is_open() {
- return stream_.is_open();
- }
-
- virtual bool eof() {
- return current_ >= read_stat_.read_count_;
- }
-
- virtual BinaryFilePairedStream& operator>>(PairedReadSeq& read) {
- read.BinRead(stream_, insert_size_);
- VERIFY(current_ < read_stat_.read_count_);
-
- ++current_;
- return *this;
- }
-
- virtual void close() {
- current_ = 0;
- stream_.close();
- }
-
-
- virtual void reset() {
- stream_.clear();
- stream_.seekg(0);
- VERIFY(stream_.good());
- read_stat_.read(stream_);
- current_ = 0;
- }
-
- virtual size_t size() const {
- return read_stat_.read_count_;
- }
-
- ReadStreamStat get_stat() const {
- ReadStreamStat stat = read_stat_;
- stat.read_count_ *= 2;
- return stat;
- }
-};
-
-
-//template <class Read>
-//class FileReadStream: public io::PredictableIReader<Read> {
-//
-//private:
-// std::ifstream stream_;
-//
-// ReadStat read_stat_;
-//
-// size_t current_;
-//
-//public:
-//
-// FileReadStream(const std::string& file_name_prefix, size_t file_num) {
-// std::string fname;
-// fname = file_name_prefix + "_" + ToString(file_num) + ".seq";
-// stream_.open(fname.c_str(), std::ios_base::binary | std::ios_base::in);
-//
-// reset();
-// }
-//
-// virtual ~FileReadStream() {
-// if (stream_.is_open()) {
-// stream_.close();
-// }
-// }
-//
-// virtual bool is_open() {
-// return stream_.is_open();
-// }
-//
-// virtual bool eof() {
-// return current_ == read_stat_.read_count_;
-// }
-//
-// virtual FileReadStream& operator>>(Read& read) {
-// read.BinRead(stream_);
-// VERIFY(current_ < read_stat_.read_count_);
-//
-// ++current_;
-// return *this;
-// }
-//
-// virtual void close() {
-// current_ = 0;
-// stream_.close();
-// }
-//
-// virtual void reset() {
-// stream_.clear();
-// stream_.seekg(0);
-// VERIFY(stream_.good());
-// read_stat_.read(stream_);
-// current_ = 0;
-// }
-//
-// virtual size_t size() const {
-// return read_stat_.read_count_;
-// }
-//
-// virtual ReadStat get_stat() const {
-// return read_stat_;
-// }
-//};
-
-//template <class Read>
-//class ReadBufferedStream: public io::PredictableIReader<Read> {
-//
-//private:
-// std::vector<Read> * data_;
-//
-// ReadStat read_stat_;
-//
-// size_t current_;
-//
-//public:
-//
-// ReadBufferedStream(io::PredictableIReader<Read>& stream) {
-// read_stat_ = stream.get_stat();
-// data_ = new std::vector<Read>(read_stat_.read_count_);
-//
-// size_t i = 0;
-// while (!stream.eof()) {
-// stream >> (*data_)[i++];
-// }
-//
-// reset();
-// }
-//
-// virtual ~ReadBufferedStream() {
-// delete data_;
-// }
-//
-// virtual bool is_open() {
-// return true;
-// }
-//
-// virtual bool eof() {
-// return current_ == read_stat_.read_count_;
-// }
-//
-// virtual ReadBufferedStream& operator>>(Read& read) {
-// read = (*data_)[current_];
-// VERIFY(current_ < read_stat_.read_count_);
-//
-// ++current_;
-// return *this;
-// }
-//
-// virtual void close() {
-// current_ = 0;
-// }
-//
-// virtual void reset() {
-// current_ = 0;
-// }
-//
-// virtual size_t size() const {
-// return read_stat_.read_count_;
-// }
-//
-// virtual ReadStat get_stat() const {
-// return read_stat_;
-// }
-//};
-
-//class SeqSingleReadStreamWrapper: public Reader<SingleReadSeq> {
-//
-//private:
-// io::IReader<io::PairedReadSeq>& stream_;
-//
-// PairedReadSeq current_read_;
-//
-// bool is_read_;
-//
-//public:
-//
-// SeqSingleReadStreamWrapper(io::IReader<io::PairedReadSeq>& stream): stream_(stream), current_read_(), is_read_(false) {
-// }
-//
-// virtual ~SeqSingleReadStreamWrapper() {}
-//
-// virtual bool is_open() {
-// return stream_.is_open();
-// }
-//
-// virtual bool eof() {
-// return stream_.eof() && !is_read_;
-// }
-//
-// virtual SeqSingleReadStreamWrapper& operator>>(io::SingleReadSeq& read) {
-// if (!is_read_) {
-// stream_ >> current_read_;
-// read = current_read_.first();
-// } else {
-// read = current_read_.second();
-// }
-// is_read_ = !is_read_;
-// return *this;
-// }
-//
-// virtual void close() {
-// stream_.close();
-// }
-//
-// virtual void reset() {
-// stream_.reset();
-// is_read_ = false;
-// }
-//
-// virtual ReadStat get_stat() const {
-// return stream_.get_stat();
-// }
-//};
-
-//class InsertSizeModifyingWrapper: public io::IReader<io::PairedReadSeq> {
-//
-//private:
-// io::IReader<io::PairedReadSeq>& stream_;
-//
-// size_t insert_size_;
-//
-//public:
-//
-// InsertSizeModifyingWrapper(io::IReader<io::PairedReadSeq>& stream, size_t insert_szie): stream_(stream), insert_size_ (insert_szie) {
-// }
-//
-// virtual ~InsertSizeModifyingWrapper() {
-// }
-//
-// virtual bool is_open() {
-// return stream_.is_open();
-// }
-//
-// virtual bool eof() {
-// return stream_.eof();
-// }
-//
-// virtual InsertSizeModifyingWrapper& operator>>(io::PairedReadSeq& read) {
-// stream_ >> read;
-// read.inc_insert_size(insert_size_);
-// return *this;
-// }
-//
-// virtual void close() {
-// stream_.close();
-// }
-//
-// virtual void reset() {
-// stream_.reset();
-// }
-//
-// virtual ReadStat get_stat() const {
-// return stream_.get_stat();
-// }
-//};
-
-}
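A matching sketch for reading one chunk back through the (deprecated) BinaryFileSingleStream defined above; the "binary_reads" prefix refers to the hypothetical output of the BinaryWriter sketch earlier, and the include path is assumed:

#include <cstddef>
#include <iostream>
#include "io/reads_io/binary_streams.hpp"

int main() {
    io::BinaryFileSingleStream chunk("binary_reads", 0);   // opens binary_reads_0.seq
    io::SingleReadSeq read;
    size_t count = 0;
    while (!chunk.eof()) {
        chunk >> read;
        ++count;
    }
    std::cout << count << " of " << chunk.size() << " reads restored" << std::endl;
    return 0;
}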
diff --git a/src/modules/io/reads_io/careful_filtering_reader_wrapper.hpp b/src/modules/io/reads_io/careful_filtering_reader_wrapper.hpp
deleted file mode 100644
index 188ba6b..0000000
--- a/src/modules/io/reads_io/careful_filtering_reader_wrapper.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-#pragma once
-//todo rename file
-#include "io/reads_io/delegating_reader_wrapper.hpp"
-#include "pipeline/library.hpp"
-
-namespace io {
-
-const size_t none = -1ul;
-
-inline std::pair<size_t, size_t> LongestValidCoords(const SingleRead& r) {
- size_t best_len = 0;
- size_t best_pos = none;
- size_t pos = none;
- std::string seq = r.GetSequenceString();
- for (size_t i = 0; i <= seq.size(); ++i) {
- if (i < seq.size() && is_nucl(seq[i])) {
- if (pos == none) {
- pos = i;
- }
- } else {
- if (pos != none) {
- size_t len = i - pos;
- if (len > best_len) {
- best_len = len;
- best_pos = pos;
- }
- }
- pos = none;
- }
- }
- if (best_len == 0) {
- return std::make_pair(0, 0);
- }
- return std::make_pair(best_pos, best_pos + best_len);
-}
-
-inline SingleRead LongestValid(const SingleRead& r,
- bool /*use_orientation*/ = false,
- LibraryOrientation /*orientation*/ = LibraryOrientation::FR) {
-
- std::pair<size_t, size_t> p = LongestValidCoords(r);
- return r.Substr(p.first, p.second);
-}
-
-inline PairedRead LongestValid(const PairedRead& r,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::FR) {
- std::pair<size_t, size_t> c1 = LongestValidCoords(r.first());
- std::pair<size_t, size_t> c2 = LongestValidCoords(r.second());
- size_t len1 = c1.second - c1.first;
- size_t len2 = c2.second - c2.first;
- if (len1 == 0 || len2 == 0) {
- return PairedRead();
- }
- if (len1 == r.first().size() && len2 == r.second().size()) {
- return r;
- }
-
- size_t is;
- if (!use_orientation) {
- is = r.insert_size() - c1.first - r.second().size() + c2.second;
- }
- else {
- switch (orientation) {
- case LibraryOrientation::FF: {
- is = r.insert_size() - c1.first - r.second().size() + c2.second;
- break;
- }
- case LibraryOrientation::RR: {
- is = r.insert_size() - r.first().size() + c1.second - c2.first;
- break;
- }
- case LibraryOrientation::FR: {
- is = r.insert_size() - c1.first - c2.first;
- break;
- }
- case LibraryOrientation::RF: {
- is = r.insert_size() - r.first().size() + c1.second - r.second().size() + c2.second;
- break;
- }
- default: {
- is = r.insert_size() - c1.first - r.second().size() + c2.second;
- break;
- }
- }
- }
-
- return PairedRead(r.first().Substr(c1.first, c1.second), r.second().Substr(c2.first, c2.second), is);
-}
-
-
-//todo rewrite without eof
-template<typename ReadType>
-class CarefulFilteringWrapper : public DelegatingWrapper<ReadType> {
- typedef DelegatingWrapper<ReadType> base;
-public:
- /*
- * Default constructor.
- *
-     * @param reader_ptr Pointer to any other read stream (a child of ReadStream).
- */
- CarefulFilteringWrapper(typename base::ReadStreamPtrT reader_ptr,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) :
- base(reader_ptr),
- eof_(false),
- use_orientation_(use_orientation),
- orientation_(orientation) {
- StepForward();
- }
-
- /* virtual */ bool eof() {
- return eof_;
- }
-
- /*
- * Read SingleRead from stream.
- *
-     * @param read The SingleRead that will store the read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ CarefulFilteringWrapper& operator>>(ReadType& read) {
- read = next_read_;
- StepForward();
- return *this;
- }
-
- /* virtual */
- void reset() {
- base::reset();
- eof_ = false;
- StepForward();
- }
-
-private:
- bool eof_;
- bool use_orientation_;
- LibraryOrientation orientation_;
- ReadType next_read_;
-
- /*
- * Read next valid read in the stream.
- */
- void StepForward() {
- while (!base::eof()) {
- base::operator >>(next_read_);
- next_read_ = LongestValid(next_read_, use_orientation_, orientation_);
- if (next_read_.IsValid()) {
- return;
- }
- }
- eof_ = true;
- }
-};
-
-template<class ReadType>
-std::shared_ptr<ReadStream<ReadType>> CarefulFilteringWrap(std::shared_ptr<ReadStream<ReadType>> reader_ptr,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) {
- //return reader_ptr = make_shared<CarefulFilteringWrapper<ReadType>>(reader_ptr, false, LibraryOrientation::Undefined);
- return std::shared_ptr<CarefulFilteringWrapper<ReadType> >(
- new CarefulFilteringWrapper<ReadType>(reader_ptr, use_orientation, orientation));
-}
-
-template<class ReadType>
-ReadStreamList<ReadType> CarefulFilteringWrap(const ReadStreamList<ReadType>& readers,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) {
- ReadStreamList<ReadType> answer;
- for (size_t i = 0; i < readers.size(); ++i) {
- answer.push_back(CarefulFilteringWrap<ReadType>(readers.ptr_at(i), use_orientation, orientation));
- }
- return answer;
-}
-
-}
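A short sketch of how the CarefulFilteringWrap helper above is typically applied (illustrative only: the file name is made up, and file_reader.hpp, also removed in this commit, is assumed to be on the include path). Each read coming out of the wrapper is trimmed to its longest run of valid nucleotides, and reads with no valid stretch are skipped:

#include <memory>
#include "io/reads_io/careful_filtering_reader_wrapper.hpp"
#include "io/reads_io/file_reader.hpp"

int main() {
    auto raw = std::make_shared<io::FileReadStream>("reads_with_Ns.fastq");
    auto filtered = io::CarefulFilteringWrap<io::SingleRead>(raw);

    io::SingleRead read;
    while (!filtered->eof()) {
        *filtered >> read;   // longest N-free substring of the original read
    }
    return 0;
}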
diff --git a/src/modules/io/reads_io/cutting_reader_wrapper.hpp b/src/modules/io/reads_io/cutting_reader_wrapper.hpp
deleted file mode 100644
index 596329a..0000000
--- a/src/modules/io/reads_io/cutting_reader_wrapper.hpp
+++ /dev/null
@@ -1,135 +0,0 @@
-////***************************************************************************
-////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
-////* All Rights Reserved
-////* See file LICENSE for details.
-////****************************************************************************
-// todo remove!!!
-///**
-// * @file cutting_reader_wrapper.hpp
-// * @author Mariya Fomkina
-// * @version 1.0
-// *
-// * @section LICENSE
-// *
-// * This program is free software; you can redistribute it and/or
-// * modify it under the terms of the GNU General Public License as
-// * published by the Free Software Foundation; either version 2 of
-// * the License, or (at your option) any later version.
-// *
-// * @section DESCRIPTION
-// *
-// * CuttingReaderWrapper is the class-wrapper that reads only set
-// * number of reads from another reader.
-// */
-//
-//#ifndef COMMON_IO_CUTTINGREADERWRAPPER_HPP_
-//#define COMMON_IO_CUTTINGREADERWRAPPER_HPP_
-//
-//#include "io/ireader.hpp"
-//
-//namespace io {
-//
-//template<typename ReadType>
-//class CuttingReaderWrapper : public IReader<ReadType> {
-// public:
-// /*
-// * Default constructor.
-// *
-// * @param reader Reference to any other reader (child of IReader).
-// * @param cut Number of reads to be read (-1 by default, i.e. all).
-// */
-// explicit CuttingReaderWrapper(IReader<ReadType>& reader,
-// size_t cut = -1)
-// : reader_(reader), cut_(cut), read_(0) {
-// }
-//
-// /*
-// * Default destructor.
-// */
-// /* virtual */ ~CuttingReaderWrapper() {
-// close();
-// }
-//
-// /*
-// * Check whether the stream is opened.
-// *
-// * @return true of the stream is opened and false otherwise.
-// */
-// /* virtual */ bool is_open() {
-// return reader_.is_open();
-// }
-//
-// /*
-// * Check whether we've reached the end of stream.
-// *
-// * @return true if the end of stream is reached and false
-// * otherwise.
-// */
-// /* virtual */ bool eof() {
-// return (read_ == cut_) || (reader_.eof());
-// }
-//
-// /*
-// * Read SingleRead or PairedRead from stream (according to ReadType).
-// *
-// * @param read The SingleRead or PairedRead that will store read
-// * data.
-// *
-// * @return Reference to this stream.
-// */
-// /* virtual */ CuttingReaderWrapper& operator>>(ReadType& read) {
-// if (read_ < cut_) {
-// reader_ >> read;
-// ++read_;
-// }
-// return (*this);
-// }
-//
-// /*
-// * Close the stream.
-// */
-// /* virtual */ void close() {
-// reader_.close();
-// }
-//
-// /*
-// * Close the stream and open it again.
-// */
-// /* virtual */ void reset() {
-// read_ = 0;
-// reader_.reset();
-// }
-//
-// ReadStat get_stat() const {
-// return reader_.get_stat();
-// }
-//
-// private:
-// /*
-// * @variable Internal stream readers.
-// */
-// IReader<ReadType>& reader_;
-// /*
-// * @variable Number of reads that are allowed to read (if it is less
-// * than 0, all the reads in stream are allowed to be read).
-// */
-// size_t cut_;
-// /*
-// * @variable Number of reads that are read till the moment.
-// */
-// size_t read_;
-//
-// /*
-// * Hidden copy constructor.
-// */
-// explicit CuttingReaderWrapper(const CuttingReaderWrapper<ReadType>&
-// reader);
-// /*
-// * Hidden assign operator.
-// */
-// void operator=(const CuttingReaderWrapper<ReadType>& reader);
-//};
-//
-//}
-//
-//#endif /* COMMON_IO_CUTTINGREADERWRAPPER_HPP_ */
diff --git a/src/modules/io/reads_io/easy_reader.hpp b/src/modules/io/reads_io/easy_reader.hpp
deleted file mode 100644
index 98df7fb..0000000
--- a/src/modules/io/reads_io/easy_reader.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-////***************************************************************************
-////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
-////* All Rights Reserved
-////* See file LICENSE for details.
-////****************************************************************************
-//
-//#pragma once
-//
-//#include "ireader.hpp"
-//#include "paired_readers.hpp"
-//#include "delegating_reader_wrapper.hpp"
-//#include "splitting_wrapper.hpp"
-//#include "rc_reader_wrapper.hpp"
-//#include "filtering_reader_wrapper.hpp"
-//#include "careful_filtering_reader_wrapper.hpp"
-//#include "single_read.hpp"
-//#include "io_helper.hpp"
-//
-//#include <memory>
-//
-//namespace io {
-//
-//////todo refactor, and maybe merge them once again
-////class EasyReader: public DelegatingReaderWrapper<SingleRead> {
-//// explicit EasyReader(const EasyReader& reader);
-//// void operator=(const EasyReader& reader);
-////
-//// Reader raw_reader_;
-////// FilteringReaderWrapper<ReadType> filtered_reader_;
-//// CarefulFilteringReaderWrapper<SingleRead> filtered_reader_;
-//// RCReaderWrapper<SingleRead> rc_reader_;
-////
-////public:
-//// explicit EasyReader(const string& filename,
-//// bool followed_by_rc, OffsetType offset_type = PhredOffset) :
-//// raw_reader_(filename, offset_type), filtered_reader_(raw_reader_), rc_reader_(
-//// filtered_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(filtered_reader_);
-//// }
-//// }
-////
-//// /*
-//// * Default destructor.
-//// */
-//// /* virtual */
-//// ~EasyReader() {
-//// }
-////
-////};
-////
-//////todo refactor, and maybe merge them once again
-////class EasySplittingReader: public DelegatingReaderWrapper<io::SingleRead> {
-//// explicit EasySplittingReader(const EasySplittingReader& reader);
-//// void operator=(const EasySplittingReader& reader);
-////
-//// Reader raw_reader_;
-////// FilteringReaderWrapper<ReadType> filtered_reader_;
-//// SplittingWrapper splitting_reader_;
-//// RCReaderWrapper<io::SingleRead> rc_reader_;
-////
-////public:
-//// explicit EasySplittingReader(const io::SingleRead::FilenameType& filename,
-//// bool followed_by_rc, OffsetType offset_type = PhredOffset) :
-//// raw_reader_(filename, offset_type), splitting_reader_(raw_reader_), rc_reader_(
-//// splitting_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(splitting_reader_);
-//// }
-//// }
-////
-//// /*
-//// * Default destructor.
-//// */
-//// /* virtual */
-//// ~EasySplittingReader() {
-//// }
-////
-////};
-//
-////class PairedEasyReader: public DelegatingReaderWrapper<io::PairedRead> {
-//// std::unique_ptr<IReader<io::PairedRead>> raw_reader_;
-//// CarefulFilteringReaderWrapper<io::PairedRead> filtered_reader_;
-//// RCReaderWrapper<io::PairedRead> rc_reader_;
-////
-////public:
-//// PairedEasyReader(const io::PairedRead::FilenamesType& filenames,
-//// bool followed_by_rc, size_t insert_size, bool change_read_order =
-//// false, bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
-//// OffsetType offset_type = PhredOffset) :
-//// raw_reader_(
-//// new SeparateReader(filenames, insert_size,
-//// change_read_order, use_orientation, orientation, offset_type)), filtered_reader_(
-//// *raw_reader_), rc_reader_(filtered_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(filtered_reader_);
-//// }
-//// }
-////
-//// PairedEasyReader(const std::string& filename, bool followed_by_rc,
-//// size_t insert_size, bool change_read_order = false,
-//// bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
-//// OffsetType offset_type = PhredOffset) :
-//// raw_reader_(
-//// new MixedReader(filename, insert_size, change_read_order,
-//// use_orientation, orientation, offset_type)), filtered_reader_(
-//// *raw_reader_), rc_reader_(filtered_reader_) {
-//// if (followed_by_rc) {
-//// Init(rc_reader_);
-//// } else {
-//// Init(filtered_reader_);
-//// }
-//// }
-////};
-//
-//}
diff --git a/src/modules/io/reads_io/fasta_fastq_gz_parser.hpp b/src/modules/io/reads_io/fasta_fastq_gz_parser.hpp
deleted file mode 100644
index 7cb42c0..0000000
--- a/src/modules/io/reads_io/fasta_fastq_gz_parser.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file fasta_fastq_gz_parser.hpp
- * @author Mariya Fomkina
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * FastaFastqGzParser is the parser stream that reads data from FASTA and
- * FASTQ files, either plain or gzip-compressed.
- */
-
-#ifndef COMMON_IO_FASTAFASTQGZPARSER_HPP
-#define COMMON_IO_FASTAFASTQGZPARSER_HPP
-
-#include <zlib.h>
-#include <string>
-#include "kseq/kseq.h"
-#include "dev_support/verify.hpp"
-#include "io/reads/single_read.hpp"
-#include "io/reads_io/parser.hpp"
-#include "data_structures/sequence/quality.hpp"
-#include "data_structures/sequence/nucl.hpp"
-
-namespace io {
-
-namespace fastafastqgz {
-// STEP 1: declare the type of file handler and the read() function
-KSEQ_INIT(gzFile, gzread)
-}
-
-class FastaFastqGzParser: public Parser {
-public:
- /*
- * Default constructor.
- *
- * @param filename The name of the file to be opened.
- * @param offset The offset of the read quality.
- */
- FastaFastqGzParser(const std::string& filename, OffsetType offset_type =
- PhredOffset) :
- Parser(filename, offset_type), fp_(), seq_(NULL) {
- open();
- }
-
- /*
- * Default destructor.
- */
- /* virtual */
- ~FastaFastqGzParser() {
- close();
- }
-
- /*
- * Read SingleRead from stream.
- *
- * @param read The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */
- FastaFastqGzParser& operator>>(SingleRead& read) {
- if (!is_open_ || eof_) {
- return *this;
- }
- //todo offset_type_ should be used in future
- if (seq_->qual.s) {
- read = SingleRead(seq_->name.s, seq_->seq.s, seq_->qual.s, offset_type_);
- } else {
- read = SingleRead(seq_->name.s, seq_->seq.s);
-// size_t len = strlen(seq_->seq.s);
-// char* qual = (char*) malloc(len + 1);
-// char q = '\2' + 64;
-// for (size_t i = 0; i < len; ++i) {
-// qual[i] = q;
-// }
-// qual[len] = '\0';
-// read.SetAll(seq_->name.s, seq_->seq.s, qual, SolexaOffset);
-// free(qual);
- }
- ReadAhead();
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */
- void close() {
- if (is_open_) {
- // STEP 5: destroy seq
- fastafastqgz::kseq_destroy(seq_);
- // STEP 6: close the file handler
- gzclose(fp_);
- is_open_ = false;
- eof_ = true;
- }
- }
-
-private:
- /*
- * @variable File that is associated with gzipped data file.
- */
- gzFile fp_;
- /*
- * @variable Data element that stores last SingleRead got from
- * stream.
- */
- fastafastqgz::kseq_t* seq_;
-
- /*
- * Open a stream.
- */
- /* virtual */
- void open() {
- // STEP 2: open the file handler
- fp_ = gzopen(filename_.c_str(), "r");
- if (!fp_) {
- is_open_ = false;
- return;
- }
- // STEP 3: initialize seq
- seq_ = fastafastqgz::kseq_init(fp_);
- eof_ = false;
- is_open_ = true;
- ReadAhead();
- }
-
- /*
- * Read next SingleRead from file.
- */
- void ReadAhead() {
- VERIFY(is_open_);
- VERIFY(!eof_);
- if (fastafastqgz::kseq_read(seq_) < 0) {
- eof_ = true;
- }
- }
-
- /*
- * Hidden copy constructor.
- */
- FastaFastqGzParser(const FastaFastqGzParser& parser);
- /*
- * Hidden assign operator.
- */
- void operator=(const FastaFastqGzParser& parser);
-};
-
-}
-
-#endif /* COMMON_IO_FASTAFASTQGZPARSER_HPP */
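A sketch of driving the parser above directly (most callers go through FileReadStream instead, which selects a parser automatically). The file name and include path are assumptions, and the eof() call presumes the Parser base class exposes it, as file_reader.hpp relies on:

#include <cstddef>
#include <iostream>
#include "io/reads_io/fasta_fastq_gz_parser.hpp"

int main() {
    io::FastaFastqGzParser parser("reads.fastq.gz");   // PhredOffset by default
    io::SingleRead read;
    size_t records = 0;
    while (!parser.eof()) {
        parser >> read;
        ++records;
    }
    std::cout << records << " records parsed" << std::endl;
    return 0;
}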
diff --git a/src/modules/io/reads_io/file_reader.hpp b/src/modules/io/reads_io/file_reader.hpp
deleted file mode 100644
index c9152d0..0000000
--- a/src/modules/io/reads_io/file_reader.hpp
+++ /dev/null
@@ -1,129 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
-
-* FileReadStream is the basic stream that reads SingleReads from one file
-* through a Parser object.
-* Paired reads are produced by the streams in paired_readers.hpp, which
-* combine two such readers with insert-size information.
-*/
-
-#pragma once
-
-#include "ireader.hpp"
-#include "io/reads/single_read.hpp"
-#include "parser.hpp"
-#include "dev_support/path_helper.hpp"
-
-namespace io {
-
-class FileReadStream : public ReadStream<SingleRead> {
-public:
- /*
- * Default constructor.
- *
- * @param filename The name of the file to be opened.
-     * @param offset_type The offset type of the read quality.
- */
- explicit FileReadStream(const std::string &filename,
- OffsetType offset_type = PhredOffset)
- : filename_(filename), offset_type_(offset_type), parser_(NULL) {
- path::CheckFileExistenceFATAL(filename_);
- parser_ = SelectParser(filename_, offset_type_);
- }
-
- /*
- * Default destructor.
- */
- /* virtual */ ~FileReadStream() {
- close();
- delete parser_;
- }
-
- /*
- * Check whether the stream is opened.
- *
-     * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- if (parser_ != NULL) {
- return parser_->is_open();
- } else {
- return false;
- }
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
- if (parser_ != NULL) {
- return parser_->eof();
- } else {
- return true;
- }
- }
-
- /*
- * Read SingleRead from stream.
- *
- * @param singleread The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ FileReadStream &operator>>(SingleRead &singleread) {
- if (parser_ != NULL) {
- (*parser_) >> singleread;
- }
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */ void close() {
- if (parser_ != NULL) {
- parser_->close();
- }
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */ void reset() {
- if (parser_ != NULL) {
- parser_->reset();
- }
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
-private:
- /*
- * @variable The name of the file which stream reads from.
- */
- std::string filename_;
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
- /*
- * @variable Internal stream that reads from file.
- */
- Parser *parser_;
-
-};
-
-}
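The canonical loop over FileReadStream, for reference (illustrative only; the file name is an assumption). The constructor aborts via CheckFileExistenceFATAL if the file is missing, and the underlying parser is chosen by SelectParser, presumably from the file extension:

#include <cstddef>
#include <iostream>
#include "io/reads_io/file_reader.hpp"

int main() {
    io::FileReadStream stream("sample.fasta");
    if (!stream.is_open())
        return 1;

    io::SingleRead read;
    size_t count = 0;
    while (!stream.eof()) {
        stream >> read;
        ++count;
    }
    std::cout << count << " reads" << std::endl;
    return 0;
}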
diff --git a/src/modules/io/reads_io/io_helper.hpp b/src/modules/io/reads_io/io_helper.hpp
deleted file mode 100644
index 2f42348..0000000
--- a/src/modules/io/reads_io/io_helper.hpp
+++ /dev/null
@@ -1,118 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "read_stream_vector.hpp"
-#include "io/reads/single_read.hpp"
-#include "io/reads/paired_read.hpp"
-#include "file_reader.hpp"
-#include "paired_readers.hpp"
-#include "binary_streams.hpp"
-#include "multifile_reader.hpp"
-#include "converting_reader_wrapper.hpp"
-#include "careful_filtering_reader_wrapper.hpp"
-#include "rc_reader_wrapper.hpp"
-
-namespace io {
- typedef ReadStream<SingleRead> SingleStream;
- typedef std::shared_ptr<SingleStream> SingleStreamPtr;
- typedef ReadStreamList<SingleRead> SingleStreams;
-
- typedef ReadStream<PairedRead> PairedStream;
- typedef std::shared_ptr<PairedStream> PairedStreamPtr;
- typedef ReadStreamList<PairedRead> PairedStreams;
-
- typedef ReadStream<SingleReadSeq> BinarySingleStream;
- typedef std::shared_ptr<BinarySingleStream> BinarySingleStreamPtr;
- typedef ReadStreamList<SingleReadSeq> BinarySingleStreams;
-
- typedef ReadStream<PairedReadSeq> BinaryPairedStream;
- typedef std::shared_ptr<BinaryPairedStream> BinaryPairedStreamPtr;
- typedef ReadStreamList<PairedReadSeq> BinaryPairedStreams;
-
- //old
-// typedef io::IReader<io::SingleReadSeq> SequenceSingleReadStream;
-// typedef io::IReader<io::PairedReadSeq> SequencePairedReadStream;
-// typedef io::MultifileReader<io::PairedRead> MultiPairedStream;
-// typedef io::MultifileReader<io::SingleRead> MultiSingleStream;
-
- inline BinarySingleStreams apply_single_wrappers(bool followed_by_rc,
- BinarySingleStreams& single_readers,
- BinaryPairedStreams* paired_readers = 0) {
- VERIFY(single_readers.size() != 0);
- BinarySingleStreams readers = single_readers;
-
- if (paired_readers != 0) {
- VERIFY(single_readers.size() == paired_readers->size());
- BinarySingleStreams squashed_paired = SquashingWrap<PairedReadSeq>(*paired_readers);
- readers = WrapPairsInMultifiles<SingleReadSeq>(squashed_paired, readers);
- }
-
- if (followed_by_rc) {
- readers = RCWrap<SingleReadSeq>(readers);
- }
- return readers;
- }
-
- //todo make deprecated
- inline BinaryPairedStreams apply_paired_wrappers(bool followed_by_rc,
- BinaryPairedStreams& readers) {
- VERIFY(readers.size() != 0);
- if (followed_by_rc) {
- return RCWrap<PairedReadSeq>(readers);
- } else {
- return readers;
- }
- }
-
- inline SingleStreamPtr EasyStream(const std::string& filename, bool followed_by_rc,
- bool handle_Ns = true, OffsetType offset_type = PhredOffset) {
- SingleStreamPtr reader = make_shared<FileReadStream>(filename, offset_type);
- if (handle_Ns) {
- reader = CarefulFilteringWrap<SingleRead>(reader);
- }
- if (followed_by_rc) {
- reader = RCWrap<SingleRead>(reader);
- }
- return reader;
- }
-
- inline PairedStreamPtr WrapPairedStream(PairedStreamPtr reader,
- bool followed_by_rc,
- bool use_orientation = false,
- LibraryOrientation orientation = LibraryOrientation::Undefined) {
- PairedStreamPtr answer = reader;
- answer = CarefulFilteringWrap<PairedRead>(answer, use_orientation, orientation);
- if (followed_by_rc) {
- answer = RCWrap<PairedRead>(answer);
- }
- return answer;
-
- }
-
- inline PairedStreamPtr PairedEasyStream(const std::string& filename1, const std::string& filename2,
- bool followed_by_rc, size_t insert_size, bool change_read_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset) {
- PairedStreamPtr reader = make_shared<SeparatePairedReadStream>(filename1, filename2, insert_size,
- change_read_order, use_orientation,
- orientation, offset_type);
- //Use orientation for IS calculation if it's not done by changer
- return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
- }
-
- inline PairedStreamPtr PairedEasyStream(const std::string& filename, bool followed_by_rc,
- size_t insert_size, bool change_read_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset) {
- PairedStreamPtr reader = make_shared<InterleavingPairedReadStream>(filename, insert_size, change_read_order,
- use_orientation, orientation, offset_type);
- //Use orientation for IS calculation if it's not done by changer
- return WrapPairedStream(reader, followed_by_rc, !use_orientation, orientation);
- }
-}
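The two helpers above are the usual entry points for plain-text input; a sketch with made-up file names and insert size (the wrappers they add are the careful N-filtering and, optionally, the reverse-complement repetition shown elsewhere in this commit):

#include "io/reads_io/io_helper.hpp"

int main() {
    // Single-end reads, each followed by its reverse complement.
    io::SingleStreamPtr single = io::EasyStream("reads.fastq", /*followed_by_rc*/ true);

    // Paired-end FR library with a nominal insert size of 250.
    io::PairedStreamPtr paired = io::PairedEasyStream("left.fastq", "right.fastq",
                                                      /*followed_by_rc*/ false,
                                                      /*insert_size*/ 250);
    io::SingleRead sr;
    while (!single->eof()) *single >> sr;

    io::PairedRead pr;
    while (!paired->eof()) *paired >> pr;
    return 0;
}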
diff --git a/src/modules/io/reads_io/ireader.hpp b/src/modules/io/reads_io/ireader.hpp
deleted file mode 100644
index e3e286d..0000000
--- a/src/modules/io/reads_io/ireader.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-//todo rename to reader
-#pragma once
-
-#include <boost/noncopyable.hpp>
-#include "dev_support/standard_base.hpp"
-
-namespace io {
-
-struct ReadStreamStat {
- size_t read_count_;
- size_t max_len_;
- uint64_t total_len_;
-
-
- ReadStreamStat(): read_count_(0), max_len_(0), total_len_(0) { }
-
- void write(std::ostream& stream) const {
- stream.write((const char *) &read_count_, sizeof(read_count_));
- stream.write((const char *) &max_len_, sizeof(max_len_));
- stream.write((const char *) &total_len_, sizeof(total_len_));
- }
-
- void read(std::istream& stream) {
- stream.read((char *) &read_count_, sizeof(read_count_));
- stream.read((char *) &max_len_, sizeof(max_len_));
- stream.read((char *) &total_len_, sizeof(total_len_));
- }
-
- template<class Read>
- void increase(const Read& read) {
- size_t len = read.size();
-
- ++read_count_;
- if (max_len_ < len) {
- max_len_ = len;
- }
- total_len_ += read.nucl_count();
- }
-
- void merge(const ReadStreamStat& stat) {
- read_count_ += stat.read_count_;
- if (max_len_ < stat.max_len_) {
- max_len_ = stat.max_len_;
- }
- total_len_ += stat.total_len_;
- }
-
- bool valid() const {
- return read_count_ != 0;
- }
-
-};
-
-/**
- * Reader is the interface for all other readers and reader wrappers.
- */
-template<typename ReadType>
-class ReadStream: boost::noncopyable {
- public:
- typedef ReadType ReadT;
-
- /*
- * Default destructor.
- */
- virtual ~ReadStream() {}
-
- /*
- * Check whether the stream is opened.
- *
- * @return true if the stream is opened and false otherwise.
- */
- virtual bool is_open() = 0;
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of the stream is reached and false
- * otherwise.
- */
- virtual bool eof() = 0;
-
- /*
- * Read SingleRead or PairedRead from stream (according to ReadType).
- *
- * @param read The SingleRead or PairedRead that will store read data.
- *
- * @return Reference to this stream.
- */
- virtual ReadStream& operator>>(ReadType& read) = 0;
-
- /*
- * Close the stream.
- */
- virtual void close() = 0;
-
- /*
- * Close the stream and open it again.
- */
- virtual void reset() = 0;
-
- virtual ReadStreamStat get_stat() const = 0;
-
-};
-
-template<class Read>
-class PredictableReadStream: public ReadStream<Read> {
-public:
- virtual size_t size() const = 0;
-};
-
-}
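To make the contract above concrete, a minimal in-memory implementation of ReadStream<SingleRead> (a sketch, not part of the original code; it only uses members visible in this header and in single_read.hpp):

#include <cstddef>
#include <utility>
#include <vector>
#include "io/reads_io/ireader.hpp"
#include "io/reads/single_read.hpp"

// Serves a fixed vector of reads through the ReadStream interface.
class VectorReadStream : public io::ReadStream<io::SingleRead> {
public:
    explicit VectorReadStream(std::vector<io::SingleRead> reads)
            : reads_(std::move(reads)), pos_(0) {}

    bool is_open() override { return true; }
    bool eof() override { return pos_ == reads_.size(); }

    VectorReadStream& operator>>(io::SingleRead& read) override {
        read = reads_[pos_++];
        return *this;
    }

    void close() override { pos_ = 0; }
    void reset() override { pos_ = 0; }

    io::ReadStreamStat get_stat() const override {
        io::ReadStreamStat stat;
        for (const auto& r : reads_)
            stat.increase(r);   // same helper the binary writer uses
        return stat;
    }

private:
    std::vector<io::SingleRead> reads_;
    size_t pos_;
};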
diff --git a/src/modules/io/reads_io/ireadstream.hpp b/src/modules/io/reads_io/ireadstream.hpp
deleted file mode 100644
index 3cc34d0..0000000
--- a/src/modules/io/reads_io/ireadstream.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
-* ireadstream.hpp
-*
-* Created on: 03.03.2011
-* Author: vyahhi
-*/
-
-#ifndef IREADSTREAM_HPP_
-#define IREADSTREAM_HPP_
-
-#include "kseq/kseq.h"
-#include <zlib.h>
-#include "dev_support/verify.hpp"
-#include "io/reads/read.hpp"
-#include "data_structures/sequence/nucl.hpp"
-
-// STEP 1: declare the type of file handler and the read() function
-KSEQ_INIT(gzFile, gzread)
-
-/*
-* Read name, seq and qual strings from FASTQ data (one by one)
-*/
-//fixme deprecated!!! remove usages!
-class ireadstream {
-
-public:
-typedef Read ReadT;
-
-ireadstream(const std::string &filename) : offset_(Read::PHRED_OFFSET) {
- filename_ = filename;
- is_open_ = open(filename);
-}
-
-ireadstream(const std::string &filename, int offset) : offset_(offset) {
- filename_ = filename;
- is_open_ = open(filename);
-}
-
-virtual ~ireadstream() {
- close();
-}
-
-bool is_open() const {
- return is_open_;
-}
-
-bool eof() const {
- return eof_;
-}
-
-static std::vector <Read> *readAll(std::string filename, int cnt = -1) {
- ireadstream irs(filename);
- VERIFY(irs.is_open());
- std::vector <Read> *res = new std::vector<Read>();
- Read r;
- while (cnt-- && irs.is_open() && !irs.eof()) {
- irs >> r;
- if (!r.isValid()) {
- cnt++;
- continue;
- }
- res->push_back(r);
- }
- irs.close();
- return res;
-}
-
-static void readAllNoValidation(std::vector <Read> *res, std::string filename, uint64_t *totalsize,
- int qvoffset = Read::PHRED_OFFSET, int trim_quality = -1, int cnt = -1) {
- ireadstream irs(filename, qvoffset);
- VERIFY(irs.is_open());
- *totalsize = 0;
- Read r;
- while (cnt-- && irs.is_open() && !irs.eof()) {
- irs >> r;
- size_t read_size = r.trimNsAndBadQuality(trim_quality);
- res->push_back(r);
- *totalsize += read_size;
- }
- irs.close();
-}
-
-ireadstream &operator>>(Read &r) {
- VERIFY(is_open());
- VERIFY(!eof());
- if (!is_open() || eof()) {
- return *this;
- }
- r.setName(seq_->name.s);
- if (seq_->qual.s) {
- r.setQuality(seq_->qual.s, offset_);
- }
- r.setSequence(seq_->seq.s);
- read_ahead(); // make actual read for the next result
- return *this;
-}
-
-void close() {
- if (is_open()) {
- kseq_destroy(seq_); // STEP 5: destroy seq
- gzclose(fp_); // STEP 6: close the file handler
- is_open_ = false;
- }
-}
-
-void reset() {
- close();
- open(filename_);
-}
-
-private:
-std::string filename_;
-gzFile fp_;
-kseq_t *seq_;
-bool is_open_;
-bool eof_;
-int offset_;
-
-/*
- * open i's file with FASTQ reads,
- * return true if it opened file, false otherwise
- */
-bool open(std::string filename) {
- fp_ = gzopen(filename.c_str(), "r"); // STEP 2: open the file handler
- if (!fp_) {
- return false;
- }
- is_open_ = true;
- seq_ = kseq_init(fp_); // STEP 3: initialize seq
- eof_ = false;
- read_ahead();
- return true;
-}
-
-void read_ahead() {
- VERIFY(is_open());
- VERIFY(!eof());
- if (kseq_read(seq_) < 0) {
- eof_ = true;
- }
-}
-};
-
-//return -1 if failed to determine offset
-inline int determine_offset(const std::string &filename) {
-ireadstream stream(filename, 0);
-size_t count = 0;
-Read r;
-while (!stream.eof() && count++ < 10000) {
- stream >> r;
- std::string q_str = r.getQualityString();
- for (size_t i = 0; i < q_str.size(); ++i) {
- int q_val = q_str[i];
- if (q_val < 59)
- return 33;
- if (q_val > 74)
- return 64;
- }
-}
-return -1;
-}
-
-#endif /* IREADSTREAM_HPP_ */
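The free function above is still handy on its own; a tiny sketch (the file name is an assumption):

#include <iostream>
#include "io/reads_io/ireadstream.hpp"

int main() {
    int offset = determine_offset("reads.fastq");
    if (offset == -1)
        std::cerr << "could not determine quality offset" << std::endl;
    else
        std::cout << "quality offset: " << offset << std::endl;   // 33 or 64
    return 0;
}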
diff --git a/src/modules/io/reads_io/is_corrupting_wrapper.hpp b/src/modules/io/reads_io/is_corrupting_wrapper.hpp
deleted file mode 100644
index f2993f3..0000000
--- a/src/modules/io/reads_io/is_corrupting_wrapper.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-////***************************************************************************
-////* Copyright (c) 2011-2014 Saint-Petersburg Academic University
-////* All Rights Reserved
-////* See file LICENSE for details.
-////****************************************************************************
-// todo remove!!!
-//#ifndef IS_CORRUPTING_WRAPPER_HPP_
-//#define IS_CORRUPTING_WRAPPER_HPP_
-//
-//namespace io {
-//
-//class ISCorruptingWrapper: public DelegatingReaderWrapper<PairedRead> {
-//private:
-// const size_t is_;
-//public:
-// typedef PairedRead ReadType;
-//
-// explicit ISCorruptingWrapper(IReader<ReadType>& reader, size_t is) :
-// DelegatingReaderWrapper<PairedRead>(reader), is_(is) {
-// }
-//
-// /* virtual */
-// ISCorruptingWrapper& operator>>(ReadType& read) {
-// (this->reader()) >> read;
-// read = PairedRead(read.first(), read.second(), is_);
-// return *this;
-// }
-//
-//};
-//
-//}
-//
-//#endif /* IS_CORRUPTING_WRAPPER_HPP_ */
diff --git a/src/modules/io/reads_io/modifying_reader_wrapper.hpp b/src/modules/io/reads_io/modifying_reader_wrapper.hpp
deleted file mode 100644
index 5575e92..0000000
--- a/src/modules/io/reads_io/modifying_reader_wrapper.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/verify.hpp"
-#include "io/reads_io/delegating_reader_wrapper.hpp"
-#include "paired_readers.hpp"
-
-#include <memory>
-#include <io/reads/single_read.hpp>
-
-namespace io {
-
-class SequenceModifier {
-public:
- virtual ~SequenceModifier() {}
-
- SingleRead Modify(const SingleRead& read) {
- return SingleRead(read.name(), Modify(read.sequence()).str());
- }
-
- SingleReadSeq Modify(const SingleReadSeq& read) {
- return SingleReadSeq(Modify(read.sequence()));
- }
-
- virtual Sequence Modify(const Sequence& s) = 0;
-};
-
-class TrivialModifier : public SequenceModifier {
-public:
-
- virtual Sequence Modify(const Sequence& s) {
- return s;
- }
-};
-
-/**
- * Attention!!! this class clears quality!!!
- */
-template<class ReadType>
-class ModifyingWrapper;
-
-template<>
-class ModifyingWrapper<SingleRead>: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- read = modifier_->Modify(read);
- return *this;
- }
-};
-
-template<>
-class ModifyingWrapper<PairedRead>: public DelegatingWrapper<PairedRead> {
- typedef DelegatingWrapper<PairedRead> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(PairedRead& read) {
- this->reader() >> read;
- read = PairedRead(modifier_->Modify(read.first()),
- modifier_->Modify(read.second()),
- read.insert_size());
- return *this;
- }
-};
-
-template<>
-class ModifyingWrapper<SingleReadSeq>: public DelegatingWrapper<SingleReadSeq> {
- typedef DelegatingWrapper<SingleReadSeq> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(SingleReadSeq& read) {
- this->reader() >> read;
- read = modifier_->Modify(read.sequence());
- return *this;
- }
-};
-
-template<>
-class ModifyingWrapper<PairedReadSeq>: public DelegatingWrapper<PairedReadSeq> {
- typedef DelegatingWrapper<PairedReadSeq> base;
- std::shared_ptr<SequenceModifier> modifier_;
-
-public:
- ModifyingWrapper(base::ReadStreamPtrT reader, std::shared_ptr<SequenceModifier> modifier) :
- base(reader), modifier_(modifier) {}
-
- ModifyingWrapper& operator>>(PairedReadSeq& read) {
- this->reader() >> read;
- read = PairedReadSeq(modifier_->Modify(read.first().sequence())
- , SingleReadSeq(modifier_->Modify(read.second())), read.insert_size());
- return *this;
- }
-};
-
-}
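A sketch of wiring the wrapper above around a file stream, using the TrivialModifier already defined in this header (illustrative only: the file name is made up, and eof() is assumed to be forwarded by DelegatingWrapper, as careful_filtering_reader_wrapper.hpp relies on). Note the warning above: the wrapper rebuilds reads from sequence only, so qualities are dropped:

#include <memory>
#include "io/reads_io/modifying_reader_wrapper.hpp"
#include "io/reads_io/file_reader.hpp"

int main() {
    auto reader = std::make_shared<io::FileReadStream>("reads.fastq");
    auto modifier = std::make_shared<io::TrivialModifier>();   // identity transform
    io::ModifyingWrapper<io::SingleRead> wrapped(reader, modifier);

    io::SingleRead read;
    while (!wrapped.eof()) {
        wrapped >> read;   // same sequence, quality string cleared
    }
    return 0;
}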
diff --git a/src/modules/io/reads_io/osequencestream.hpp b/src/modules/io/reads_io/osequencestream.hpp
deleted file mode 100644
index 6124aef..0000000
--- a/src/modules/io/reads_io/osequencestream.hpp
+++ /dev/null
@@ -1,374 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * osequencestream.hpp
- *
- * Created on: 23.06.2011
- * Author: vyahhi
- */
-
-#pragma once
-
-#include <fstream>
-#include <string>
-#include <vector>
-#include "io/reads/single_read.hpp"
-#include "io/reads/paired_read.hpp"
-
-namespace io {
-
-inline std::string MakeContigId(int number, size_t length, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length);
-}
-
-inline std::string MakeContigId(int number, size_t length, double coverage, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage);
-}
-
-inline std::string MakeContigId(int number, size_t length, double coverage, size_t id, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage) + "_ID_" + ToString(id);
-}
-inline std::string MakeContigComponentId(int number, size_t length, double coverage, size_t id, size_t component_id, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage) + "_ID_" + ToString(id) + "_component_" + ToString(component_id);
-}
-inline std::string MakeContigComponentId(int number, size_t length, double coverage, size_t component_id, const std::string& prefix = "NODE") {
- return prefix + "_" + ToString(number) + "_length_" + ToString(length) + "_cov_" + ToString(coverage) + "_component_" + ToString(component_id);
-}
-
-
-class osequencestream {
-protected:
- std::ofstream ofstream_;
-
- int id_;
-
- void write_str(const std::string& s) {
- size_t cur = 0;
- while (cur < s.size()) {
- ofstream_ << s.substr(cur, 60) << std::endl;
- cur += 60;
- }
- }
-
- virtual void write_header(const std::string& s) {
- // Velvet format: NODE_1_length_24705_cov_358.255249
- ofstream_ << ">" << MakeContigId(id_++, s.size()) << std::endl;
- }
-
-public:
- osequencestream(const std::string& filename): id_(1) {
- ofstream_.open(filename.c_str());
- }
-
- virtual ~osequencestream() {
- ofstream_.close();
- }
-
- virtual osequencestream& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- virtual osequencestream& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
- /**
-     * Uses a different way of building headers and does not increase
-     * the contig counter; do not mix with the other output methods!
- */
- virtual osequencestream& operator<<(const SingleRead& read) {
- ofstream_ << ">" << read.name() << std::endl;
- size_t cur = 0;
- std::string s = read.GetSequenceString();
- while (cur < s.size()) {
- ofstream_ << s.substr(cur, 60) << std::endl;
- cur += 60;
- }
- return *this;
- }
-};
-
-class PairedOutputSequenceStream {
-protected:
- std::ofstream ofstreaml_;
- std::ofstream ofstreamr_;
-
- static void write(const SingleRead& read, std::ofstream& stream) {
- stream << ">" << read.name() << std::endl;
- size_t cur = 0;
- std::string s = read.GetSequenceString();
- while (cur < s.size()) {
- stream << s.substr(cur, 60) << std::endl;
- cur += 60;
- }
- }
-
-public:
- PairedOutputSequenceStream(const std::string& filename1, const std::string &filename2) {
- ofstreaml_.open(filename1);
- ofstreamr_.open(filename2);
- }
-
- virtual ~PairedOutputSequenceStream() {
- ofstreaml_.close();
- ofstreamr_.close();
- }
-
- PairedOutputSequenceStream& operator<<(const PairedRead& read) {
- write(read.first(), ofstreaml_);
- write(read.second(), ofstreamr_);
- return *this;
- }
-};
-
-
-class osequencestream_cov: public osequencestream {
-protected:
- double coverage_;
-
- virtual void write_header(const std::string& s) {
- // Velvet format: NODE_1_length_24705_cov_358.255249
- ofstream_ << ">" << MakeContigId(id_++, s.size(), coverage_) << std::endl;
- }
-
-
-public:
- osequencestream_cov(const std::string& filename)
- : osequencestream(filename), coverage_(0.) { }
-
- virtual ~osequencestream_cov() {
- ofstream_.close();
- }
-
- osequencestream_cov& operator<<(double coverage) {
- coverage_ = coverage;
- return *this;
- }
-
- osequencestream_cov& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_cov& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-
-class osequencestream_simple: public osequencestream {
-protected:
- std::string header_;
-
- double cov_;
-
- virtual void write_header(const std::string& /*s*/) {
- ofstream_ << ">" << header_ << std::endl;
- }
-
-public:
- osequencestream_simple(const std::string& filename)
- : osequencestream(filename), header_("") { }
-
- virtual ~osequencestream_simple() {
- ofstream_.close();
- }
-
- void set_header(const std::string &header) {
- header_ = header;
- }
-
- osequencestream_simple& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_simple& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-class osequencestream_with_id: public osequencestream {
-protected:
- size_t uid_;
-
- double cov_;
-
- virtual void write_header(const std::string& s) {
- ofstream_ << ">" << GetId(s) << std::endl;
- id_++;
- }
-
-public:
- osequencestream_with_id(const std::string& filename)
- : osequencestream(filename), uid_(0), cov_(0.0) { }
-
- virtual ~osequencestream_with_id() {
- ofstream_.close();
- }
-
- std::string GetId(const std::string& s) const {
- return MakeContigId(id_, s.size(), cov_, uid_);
- }
-
- void setCoverage(double c) {
- cov_ = c;
- }
-
- void setID(size_t uid) {
- uid_ = uid;
- }
-
- osequencestream_with_id& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_with_id& operator<<(double coverage) {
- cov_ = coverage;
- return *this;
- }
-
- osequencestream_with_id& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-class osequencestream_with_manual_node_id: public osequencestream_with_id {
- bool is_id_set_;
- virtual void write_header(const std::string& s) {
-        //for manual NODE ID setting osequencestream needs to check that the node ID was really set manually
- if (!is_id_set_) {
- WARN ("NODE ID is not set manually, setting to 0");
- id_ = 0;
- }
- ofstream_ << ">" << MakeContigId(id_, s.size(), cov_, uid_) << std::endl;
- is_id_set_ = false;
- }
-
-public:
-//unfortunately constructor inheritance is supported only since g++4.8
- osequencestream_with_manual_node_id(const std::string& filename): osequencestream_with_id(filename) {
- is_id_set_ = false;
- }
-
- void setNodeID(int id) {
- id_ = id;
- is_id_set_ = true;
- }
-
- osequencestream_with_manual_node_id& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_with_manual_node_id& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-
-};
-
-
-class osequencestream_with_data_for_scaffold: public osequencestream_with_id {
-protected:
- std::ofstream scstream_;
-
- virtual void write_header(const std::string& s) {
- scstream_ << id_ << "\tNODE_" << id_ << "\t" << s.size() << "\t" << (int) round(cov_) << std::endl;
- ofstream_ << ">" << MakeContigId(id_++, s.size(), cov_, uid_) << std::endl;
- }
-
-public:
- osequencestream_with_data_for_scaffold(const std::string& filename): osequencestream_with_id(filename) {
- id_ = 1;
- std::string sc_filename = filename + ".info";
- scstream_.open(sc_filename.c_str());
- }
-
- virtual ~osequencestream_with_data_for_scaffold() {
- ofstream_.close();
- scstream_.close();
- }
-
- osequencestream_with_data_for_scaffold& operator<<(const std::string& s) {
- write_header(s);
- write_str(s);
- return *this;
- }
-
- osequencestream_with_data_for_scaffold& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-};
-
-class osequencestream_for_fastg: public osequencestream_with_id {
-protected:
- std::string header_;
-
- virtual void write_header(const std::string& s) {
- ofstream_ << ">" << s;
- }
-
-public:
- osequencestream_for_fastg(const std::string& filename):
- osequencestream_with_id(filename) {
- id_ = 1;
- }
-
- virtual ~osequencestream_for_fastg() {
- ofstream_.close();
- }
-
- void set_header(const std::string& h) {
- header_= h;
- }
-
- osequencestream_for_fastg& operator<<(const std::set<std::string>& s) {
- write_header(header_);
- if (s.size() > 0) {
- auto iter = s.begin();
- ofstream_ << ":" << *iter;
- ++iter;
- while (iter != s.end()) {
- ofstream_ << "," << *iter;
- ++iter;
- }
- }
- ofstream_ << ";" << std::endl;
- return *this;
- }
-
- osequencestream_for_fastg& operator<<(const std::string& s) {
- write_str(s);
- return *this;
- }
-
- osequencestream_for_fastg& operator<<(const Sequence& seq) {
- std::string s = seq.str();
- return operator <<(s);
- }
-
-};
-
-}
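For reference, a minimal sketch of emitting contigs with osequencestream_cov from above (file name, sequences and coverage values are made up); the coverage fed in via operator<< ends up in the next Velvet-style header:

#include <string>
#include "io/reads_io/osequencestream.hpp"

int main() {
    io::osequencestream_cov out("contigs.fasta");
    out << 42.7 << std::string("ACGTACGTACGT");   // >NODE_1_length_12_cov_...
    out << 13.1 << std::string("TTTTGGGGCCCC");   // >NODE_2_length_12_cov_...
    return 0;
}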
diff --git a/src/modules/io/reads_io/paired_readers.hpp b/src/modules/io/reads_io/paired_readers.hpp
deleted file mode 100644
index 14e84a7..0000000
--- a/src/modules/io/reads_io/paired_readers.hpp
+++ /dev/null
@@ -1,252 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <string>
-#include "ireader.hpp"
-#include "io/reads/paired_read.hpp"
-#include "file_reader.hpp"
-#include "orientation.hpp"
-
-namespace io {
-
-class SeparatePairedReadStream : public ReadStream<PairedRead> {
- public:
- /*
- * Default constructor.
- *
-   * @param filename1, filename2 The names of the two files to be opened.
-   * @param insert_size Expected insert size of the read pairs.
-   * @param offset_type The offset type of the read quality.
- */
- explicit SeparatePairedReadStream(const std::string& filename1, const std::string& filename2,
- size_t insert_size, bool change_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset)
- : insert_size_(insert_size),
- change_order_(change_order),
- use_orientation_(use_orientation),
- changer_(GetOrientationChanger<PairedRead>(orientation)),
- offset_type_(offset_type),
- first_(new FileReadStream(filename1, offset_type_)),
- second_(new FileReadStream(filename2, offset_type_)),
- filename1_(filename1),
- filename2_(filename2){}
-
- /*
- * Check whether the stream is opened.
- *
-   * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- return first_->is_open() && second_->is_open();
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
-
- if (first_->eof() != second_->eof()) {
- if (first_->eof()) {
- ERROR("The number of right read-pairs is larger than the number of left read-pairs");
- } else {
- ERROR("The number of left read-pairs is larger than the number of right read-pairs");
- }
- FATAL_ERROR("Unequal number of read-pairs detected in the following files: " << filename1_ << " " << filename2_ << "");
- }
- return first_->eof();
- }
-
- /*
- * Read PairedRead from stream.
- *
- * @param pairedread The PairedRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ SeparatePairedReadStream& operator>>(PairedRead& pairedread) {
- SingleRead sr1, sr2;
- (*first_) >> sr1;
- (*second_) >> sr2;
-
- if (use_orientation_) {
- pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
- }
- else {
- pairedread = PairedRead(sr1, sr2, insert_size_);
- }
-
- if (change_order_) {
- pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
- }
-
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */ void close() {
- first_->close();
- second_->close();
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */ void reset() {
- first_->reset();
- second_->reset();
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
- private:
-
- size_t insert_size_;
-
- bool change_order_;
-
- bool use_orientation_;
-
- std::unique_ptr<OrientationChanger<PairedRead>> changer_;
-
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
-
- /*
- * @variable The first stream (reads from first file).
- */
- std::unique_ptr<ReadStream<SingleRead>> first_;
- /*
- * @variable The second stream (reads from second file).
- */
- std::unique_ptr<ReadStream<SingleRead>> second_;
-
- //Only kept for reporting errors to the user
- std::string filename1_;
- std::string filename2_;
-};
-
-class InterleavingPairedReadStream : public ReadStream<PairedRead> {
- public:
- /*
- * Default constructor.
- *
- * @param filename The name of the file with interleaved paired reads.
- * @param insert_size Expected insert size of the paired library.
- * @param offset The offset of the read quality.
- */
- explicit InterleavingPairedReadStream(const std::string& filename, size_t insert_size, bool change_order = false,
- bool use_orientation = true, LibraryOrientation orientation = LibraryOrientation::FR,
- OffsetType offset_type = PhredOffset)
- : filename_(filename), insert_size_(insert_size),
- change_order_(change_order),
- use_orientation_(use_orientation),
- changer_(GetOrientationChanger<PairedRead>(orientation)),
- offset_type_(offset_type),
- single_(new FileReadStream(filename_, offset_type_)) {}
-
- /*
- * Check whether the stream is opened.
- *
- * @return true if the stream is opened and false otherwise.
- */
- /* virtual */ bool is_open() {
- return single_->is_open();
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- /* virtual */ bool eof() {
- return single_->eof();
- }
-
- /*
- * Read PairedRead from stream.
- *
- * @param pairedread The PairedRead that will store read data.
- *
- * @return Reference to this stream.
- */
- /* virtual */ InterleavingPairedReadStream& operator>>(PairedRead& pairedread) {
- SingleRead sr1, sr2;
- (*single_) >> sr1;
- (*single_) >> sr2;
-
- if (use_orientation_) {
- pairedread = changer_->Perform(PairedRead(sr1, sr2, insert_size_));
- }
- else {
- pairedread = PairedRead(sr1, sr2, insert_size_);
- }
-
- if (change_order_) {
- pairedread = PairedRead(pairedread.second(), pairedread.first(), insert_size_);
- }
-
- return *this;
- }
-
- /*
- * Close the stream.
- */
- /* virtual */ void close() {
- single_->close();
- }
-
- /*
- * Close the stream and open it again.
- */
- /* virtual */ void reset() {
- single_->reset();
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
- private:
- /*
- * @variable The name of the file which the stream reads from.
- */
- std::string filename_;
-
- size_t insert_size_;
-
- bool change_order_;
-
- bool use_orientation_;
-
- std::unique_ptr<OrientationChanger<PairedRead>> changer_;
-
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
-
- /*
- * @variable The single read stream.
- */
- std::unique_ptr<ReadStream<SingleRead>> single_;
-
-};
-}
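
For reference, a minimal sketch of how the SeparatePairedReadStream removed above was typically driven; the file names and insert size are illustrative, and the include path follows the old pre-3.10 layout:

#include "io/reads_io/paired_readers.hpp"
#include <cstddef>

// Count the read pairs of a library stored in two separate FASTQ files.
size_t CountPairs() {
    io::SeparatePairedReadStream stream("left.fastq", "right.fastq",
                                        /*insert_size*/ 250);  // FR orientation by default
    io::PairedRead pair;
    size_t n = 0;
    while (!stream.eof()) {   // eof() also verifies that both files hold the same number of reads
        stream >> pair;
        ++n;
    }
    stream.close();
    return n;
}
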
diff --git a/src/modules/io/reads_io/parser.cpp b/src/modules/io/reads_io/parser.cpp
deleted file mode 100644
index f750810..0000000
--- a/src/modules/io/reads_io/parser.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
- * @file parser.cpp
- * @author Mariya Fomkina
- * @version 1.0
- *
- * @section LICENSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * @section DESCRIPTION
- *
- * Parser is the parent class for all streams that read data from
- * different file types (fastq, fasta, sam, etc.).
- * This file contains functions that are used to select the exact
- * parser according to the file extension.
- */
-
-#include <io/reads/single_read.hpp>
-#include "io/reads_io/fasta_fastq_gz_parser.hpp"
-#include "io/reads_io/parser.hpp"
-#include "io/sam_io/bam_parser.hpp"
-#include "dev_support/standard_base.hpp"
-
-
-namespace io {
-
-/*
- * Get extension from filename.
- *
- * @param filename The name of the file to read from.
- *
- * @return File extension (e.g. "fastq", "fastq.gz").
- */
-std::string GetExtension(const std::string& filename) {
- std::string name = filename;
- size_t pos = name.find_last_of(".");
- std::string ext = "";
- if (pos != std::string::npos) {
- ext = name.substr(name.find_last_of(".") + 1);
- if (ext == "gz") {
- ext = name.substr(name.find_last_of
- (".", name.find_last_of(".") - 1) + 1);
- }
- }
- return ext;
-}
-
-/*
- * Select parser type according to file extension.
- *
- * @param filename The name of the file to be opened.
- * @param offset The offset of the read quality.
- *
- * @return Pointer to the new parser object with this filename and
- * offset.
- */
-Parser* SelectParser(const std::string& filename,
- OffsetType offset_type /*= PhredOffset*/) {
- std::string ext = GetExtension(filename);
- if (ext == "bam")
- return new BAMParser(filename, offset_type);
-
- return new FastaFastqGzParser(filename, offset_type);
- /*
- if ((ext == "fastq") || (ext == "fastq.gz") ||
- (ext == "fasta") || (ext == "fasta.gz") ||
- (ext == "fa") || (ext == "fq.gz") ||
- (ext == "fq") || (ext == "fa.gz") ||
- (ext == "seq") || (ext == "seq.gz")) {
- return new FastaFastqGzParser(filename, offset_type);
- }
-
- ERROR("Unknown file extension in input!");
- return NULL; */
-}
-
-void first_fun(int) {
-}
-
-}
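
A hedged usage sketch of the extension-based dispatch implemented above; "reads.bam" is a placeholder path and the include path follows the old layout:

#include "io/reads_io/parser.hpp"
#include <memory>

void ParseFirstRead() {
    // GetExtension("sample.fastq.gz") returns "fastq.gz"; a "bam" extension
    // selects BAMParser, anything else falls through to FastaFastqGzParser.
    std::unique_ptr<io::Parser> parser(io::SelectParser("reads.bam"));
    io::SingleRead read;
    if (parser->is_open() && !parser->eof())
        *parser >> read;
    parser->close();
}
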
diff --git a/src/modules/io/reads_io/parser.hpp b/src/modules/io/reads_io/parser.hpp
deleted file mode 100644
index f384446..0000000
--- a/src/modules/io/reads_io/parser.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/**
-* @file parser.hpp
-* @author Mariya Fomkina
-* @version 1.0
-*
-* @section LICENSE
-*
-* This program is free software; you can redistribute it and/or
-* modify it under the terms of the GNU General Public License as
-* published by the Free Software Foundation; either version 2 of
-* the License, or (at your option) any later version.
-*
-* @section DESCRIPTION
-*
-* Parser is the parent class for all streams that read data from
-* different file types (fastq, fasta, sam, etc.).
-*/
-
-#ifndef COMMON_IO_PARSER_HPP
-#define COMMON_IO_PARSER_HPP
-
-#include <string>
-#include "io/reads/single_read.hpp"
-
-namespace io {
-
-class Parser {
-public:
- /*
- * Default constructor.
- *
- * @param filename The name of the file to be opened.
- * @param offset The offset of the read quality.
- */
- Parser(const std::string &filename,
- OffsetType offset_type = PhredOffset)
- : filename_(filename), offset_type_(offset_type),
- is_open_(false), eof_(true) { }
-
- /*
- * Default destructor.
- */
- virtual ~Parser() { }
-
- /*
- * Check whether the stream is opened.
- *
- * @return true if the stream is opened and false otherwise.
- */
- virtual bool is_open() const {
- return is_open_;
- }
-
- /*
- * Check whether we've reached the end of stream.
- *
- * @return true if the end of stream is reached and false
- * otherwise.
- */
- virtual bool eof() const {
- return eof_;
- }
-
- /*
- * Read SingleRead from stream.
- *
- * @param read The SingleRead that will store read data.
- *
- * @return Reference to this stream.
- */
- virtual Parser &operator>>(SingleRead &read) = 0;
-
- /*
- * Close the stream.
- */
- virtual void close() = 0;
-
- /*
- * Close the stream and open it again.
- */
- void reset() {
- close();
- open();
- }
-
-protected:
- /*
- * @variable The name of the file which the stream reads from.
- */
- std::string filename_;
- /*
- * @variable Quality offset type.
- */
- OffsetType offset_type_;
- /*
- * @variable Flag that shows whether the stream is opened.
- */
- bool is_open_;
- /*
- * @variable Flag that shows whether the end of the stream is
- * reached.
- */
- bool eof_;
-
-private:
- /*
- * Open a stream.
- */
- virtual void open() = 0;
-};
-
-/*
-* Get extension from filename.
-*
-* @param filename The name of the file to read from.
-*
-* @return File extension (e.g. "fastq", "fastq.gz").
-*/
-std::string GetExtension(const std::string &filename);
-
-/*
-* Select parser type according to file extension.
-*
-* @param filename The name of the file to be opened.
-* @param offset The offset of the read quality.
-*
-* @return Pointer to the new parser object with this filename and
-* offset.
-*/
-Parser *SelectParser(const std::string &filename,
- OffsetType offset_type = PhredOffset);
-
-//todo delete???
-void first_fun(int);
-
-}
-
-#endif /* COMMON_IO_PARSER_HPP */
diff --git a/src/modules/io/reads_io/read_processor.hpp b/src/modules/io/reads_io/read_processor.hpp
deleted file mode 100644
index 1da18de..0000000
--- a/src/modules/io/reads_io/read_processor.hpp
+++ /dev/null
@@ -1,209 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __HAMMER_READ_PROCESSOR_HPP__
-#define __HAMMER_READ_PROCESSOR_HPP__
-
-#include "io/reads_io/mpmc_bounded.hpp"
-
-#include "dev_support/openmp_wrapper.h"
-
-#pragma GCC diagnostic push
-#ifdef __clang__
-#pragma clang diagnostic ignored "-Wunused-private-field"
-#endif
-namespace hammer {
-class ReadProcessor {
- static size_t constexpr cacheline_size = 64;
- typedef char cacheline_pad_t[cacheline_size];
-
- unsigned nthreads_;
- cacheline_pad_t pad0;
- size_t read_;
- cacheline_pad_t pad1;
- size_t processed_;
- cacheline_pad_t pad2;
-
-private:
- template<class Reader, class Op>
- bool RunSingle(Reader &irs, Op &op) {
- using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
-
- while (!irs.eof()) {
- ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
- irs >> *r;
- read_ += 1;
-
- processed_ += 1;
- if (op(std::move(r))) // Pass ownership of read down to processor
- return true;
- }
-
- return false;
- }
-
- template<class Reader, class Op, class Writer>
- void RunSingle(Reader &irs, Op &op, Writer &writer) {
- using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
-
- while (!irs.eof()) {
- ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
- irs >> *r;
- read_ += 1;
-
- auto res = op(std::move(r)); // Pass ownership of read down to processor
- processed_ += 1;
-
- if (res)
- writer << *res;
- }
- }
-
-public:
- ReadProcessor(unsigned nthreads)
- : nthreads_(nthreads), read_(0), processed_(0) { }
-
- size_t read() const { return read_; }
-
- size_t processed() const { return processed_; }
-
- template<class Reader, class Op>
- bool Run(Reader &irs, Op &op) {
- using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
-
- if (nthreads_ < 2)
- return RunSingle(irs, op);
-
- // Round nthreads to next power of two
- unsigned bufsize = nthreads_ - 1;
- bufsize = (bufsize >> 1) | bufsize;
- bufsize = (bufsize >> 2) | bufsize;
- bufsize = (bufsize >> 4) | bufsize;
- bufsize = (bufsize >> 8) | bufsize;
- bufsize = (bufsize >> 16) | bufsize;
- bufsize += 1;
-
- mpmc_bounded_queue<ReadPtr> in_queue(2 * bufsize);
-
- bool stop = false;
-# pragma omp parallel shared(in_queue, irs, op, stop) num_threads(nthreads_)
- {
-# pragma omp master
- {
- while (!irs.eof()) {
- ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
- irs >> *r;
-# pragma omp atomic
- read_ += 1;
-
- while (!in_queue.enqueue(std::move(r)))
- sched_yield();
-
-# pragma omp flush (stop)
- if (stop)
- break;
- }
-
- in_queue.close();
- }
-
- while (1) {
- ReadPtr r;
-
- if (!in_queue.wait_dequeue(r))
- break;
-
-# pragma omp atomic
- processed_ += 1;
-
- bool res = op(std::move(r));
- if (res) {
-# pragma omp atomic
- stop |= res;
- }
- }
- }
-
-# pragma omp flush(stop)
- return stop;
- }
-
- template<class Reader, class Op, class Writer>
- void Run(Reader &irs, Op &op, Writer &writer) {
- using ReadPtr = std::unique_ptr<typename Reader::ReadT>;
-
- if (nthreads_ < 2) {
- RunSingle(irs, op, writer);
- return;
- }
-
- // Round nthreads to next power of two
- unsigned bufsize = nthreads_ - 1;
- bufsize = (bufsize >> 1) | bufsize;
- bufsize = (bufsize >> 2) | bufsize;
- bufsize = (bufsize >> 4) | bufsize;
- bufsize = (bufsize >> 8) | bufsize;
- bufsize = (bufsize >> 16) | bufsize;
- bufsize += 1;
-
- mpmc_bounded_queue<ReadPtr> in_queue(bufsize), out_queue(2 * bufsize);
-# pragma omp parallel shared(in_queue, out_queue, irs, op, writer) num_threads(nthreads_)
- {
-# pragma omp master
- {
- while (!irs.eof()) {
- ReadPtr r = ReadPtr(new typename Reader::ReadT) ;
- irs >> *r;
-
- // First, try to provide read to the queue. If it's full, never mind.
- bool status = in_queue.enqueue(std::move(r));
-
- // Flush down the output queue
- ReadPtr outr;
- while (out_queue.dequeue(outr))
- writer << *outr;
-
- // If the input queue was originally full, wait until we can insert
- // the read once again.
- if (!status)
- while (!in_queue.enqueue(std::move(r)))
- sched_yield();
- }
-
- in_queue.close();
-
- // Flush down the output queue while in the master thread.
- ReadPtr outr;
- while (out_queue.dequeue(outr))
- writer << *outr;
- }
-
- while (1) {
- ReadPtr r;
-
- if (!in_queue.wait_dequeue(r))
- break;
-
- auto res = op(std::move(r));
- if (res)
- while (!out_queue.enqueue(std::move(res)))
- sched_yield();
- }
- }
-
- // Flush down the output queue
- ReadPtr outr;
- while (out_queue.dequeue(outr))
- writer << *outr;
- }
-};
-
-#pragma GCC diagnostic pop
-
-}
-
-#endif // __HAMMER_READ_PROCESSOR_HPP__
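
A minimal sketch of driving hammer::ReadProcessor; it assumes a reader type R that exposes R::ReadT together with eof() and operator>>, which is exactly what the templates above require:

#include "io/reads_io/read_processor.hpp"
#include <cstddef>
#include <memory>

template<class R>
size_t CountReadsInParallel(R &reader, unsigned nthreads) {
    hammer::ReadProcessor rp(nthreads);
    // The functor takes ownership of each read; returning true would request an early stop.
    auto op = [](std::unique_ptr<typename R::ReadT>) { return false; };
    rp.Run(reader, op);
    return rp.processed();
}
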
diff --git a/src/modules/io/reads_io/read_stream_vector.hpp b/src/modules/io/reads_io/read_stream_vector.hpp
deleted file mode 100644
index 632e8db..0000000
--- a/src/modules/io/reads_io/read_stream_vector.hpp
+++ /dev/null
@@ -1,183 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "ireader.hpp"
-#include <vector>
-
-namespace io {
-//todo rename file
-
-//todo check destroy_readers logic and usages
-template<class ReadType>
-class ReadStreamList {
-public:
- typedef ReadType ReadT;
- typedef ReadStream<ReadType> ReaderT;
- typedef std::shared_ptr<ReaderT> ReaderPtrT;
-
-private:
- std::vector<ReaderPtrT> readers_;
-
-public:
-
- explicit ReadStreamList(const std::vector<ReaderPtrT> &readers) : readers_(readers) {
- }
-
- ReadStreamList() {
- }
-
- explicit ReadStreamList(ReaderT *reader_ptr) : readers_(1, ReaderPtrT(reader_ptr)) {
- }
-
- explicit ReadStreamList(ReaderPtrT reader_ptr) : readers_(1, reader_ptr) {
- }
-
- explicit ReadStreamList(size_t size) : readers_(size) {
- }
-
-// std::vector<Reader*>& get() {
-// destroy_readers_ = false;
-// return streams_;
-// }
-
- //todo use boost iterator facade
- class iterator : public std::iterator<std::input_iterator_tag, ReaderT> {
- typedef typename std::vector<ReaderPtrT>::iterator vec_it;
- vec_it it_;
- public:
-
- iterator(vec_it it) : it_(it) {
- }
-
- void operator++() {
- ++it_;
- }
-
- bool operator==(const iterator &that) {
- return it_ == that.it_;
- }
-
- bool operator!=(const iterator &that) {
- return it_ != that.it_;
- }
-
- ReaderT &operator*() {
- return *(*it_);
- }
- };
-
-// class const_iterator: public std::iterator<std::input_iterator_tag, Reader> {
-// typedef typename std::vector<Reader*>::iterator vec_it;
-// vec_it it_;
-// public:
-//
-// const_iterator(vec_it it) : it_(it) {
-// }
-//
-// void operator++ () {
-// ++it_;
-// }
-//
-// bool operator== (const const_iterator& that) {
-// return it_ == that.it_;
-// }
-//
-// bool operator!= (const const_iterator& that) {
-// return it_ != that.it_;
-// }
-//
-// ReaderT& operator*() {
-// return *(*it_);
-// }
-// };
-
- ReaderT &operator[](size_t i) {
- return *readers_.at(i);
- }
-
- ReaderPtrT &ptr_at(size_t i) {
- return readers_.at(i);
- }
-
- ReaderT &back() {
- return *readers_.back();
- }
-
- size_t size() const {
- return readers_.size();
- }
-
- bool eof() const {
- for (size_t i = 0; i < readers_.size(); ++i) {
- if (!readers_[i]->eof()) {
- return false;
- }
- }
- return true;
- }
-
- iterator begin() {
- return iterator(readers_.begin());
- }
-
- iterator end() {
- return iterator(readers_.end());
- }
-
-// const_iterator begin() const {
-// return iterator(streams_.begin());
-// }
-//
-// const_iterator end() const {
-// return iterator(streams_.end());
-// }
-
- void push_back(ReaderT *reader_ptr) {
- readers_.push_back(ReaderPtrT(reader_ptr));
- }
-
- void push_back(ReaderPtrT reader_ptr) {
- readers_.push_back(reader_ptr);
- }
-
- void reset() {
- for (size_t i = 0; i < readers_.size(); ++i) {
- readers_[i]->reset();
- }
- }
-
- void close() {
- for (size_t i = 0; i < readers_.size(); ++i) {
- readers_[i]->close();
- }
- }
-
- void clear() {
- readers_.clear();
- }
-
- ReadStreamStat get_stat() const {
- ReadStreamStat stat;
- for (size_t i = 0; i < readers_.size(); ++i) {
- stat.merge(readers_[i]->get_stat());
- }
- return stat;
- }
-
-// void release() {
-// destroy_readers_ = false;
-// }
-
-// const std::vector< Reader * >& get() const {
-// return streams_;
-// }
-
-};
-
-}
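
An illustrative sketch of ReadStreamList as a container of read streams; it uses the VectorReadStream defined further down in this diff so that no external files are needed, and the read contents are made up:

#include "io/reads_io/read_stream_vector.hpp"
#include "io/reads_io/vector_reader.hpp"
#include "io/reads/single_read.hpp"

void DrainAllStreams() {
    io::ReadStreamList<io::SingleRead> streams;
    streams.push_back(new io::VectorReadStream<io::SingleRead>(io::SingleRead("r1", "ACGT")));
    streams.push_back(new io::VectorReadStream<io::SingleRead>(io::SingleRead("r2", "TTGA")));

    io::SingleRead r;
    for (auto &s : streams) {    // uses the input iterator defined above
        while (!s.eof())
            s >> r;
    }
    streams.close();
}
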
diff --git a/src/modules/io/reads_io/sequence_reader.hpp b/src/modules/io/reads_io/sequence_reader.hpp
deleted file mode 100644
index 515cc9e..0000000
--- a/src/modules/io/reads_io/sequence_reader.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "io/reads_io/ireader.hpp"
-#include "io/reads/single_read.hpp"
-
-namespace io {
-
-//todo merge with VectorReader
-template<class ReadType>
-class SequenceReadStream : public ReadStream<ReadType> {
-public:
- explicit SequenceReadStream(const Sequence &sequence, const std::string &name = "")
- : sequence_(sequence),
- name_(name),
- opened_(true),
- eof_(false) {
- }
-
- virtual ~SequenceReadStream() {
- }
-
- virtual bool is_open() {
- return opened_;
- }
-
- virtual bool eof() {
- return eof_;
- }
-
- virtual void close() {
- opened_ = false;
- }
-
- void reset() {
- eof_ = false;
- opened_ = true;
- }
-
- ReadStreamStat get_stat() const {
- return ReadStreamStat();
- }
-
- SequenceReadStream &operator>>(ReadType &read);
-
-private:
- Sequence sequence_;
- std::string name_;
- bool opened_;
- bool eof_;
-};
-
-template<>
-SequenceReadStream<SingleRead> &SequenceReadStream<SingleRead>::operator>>(SingleRead &read) {
- if (!eof_) {
- read = SingleRead(name_, sequence_.str());
- eof_ = true;
- }
- return *this;
-}
-
-template<>
-SequenceReadStream<SingleReadSeq> &SequenceReadStream<SingleReadSeq>::operator>>(SingleReadSeq &read) {
- if (!eof_) {
- read = SingleReadSeq(sequence_);
- eof_ = true;
- }
- return *this;
-}
-
-}
diff --git a/src/modules/io/reads_io/splitting_wrapper.hpp b/src/modules/io/reads_io/splitting_wrapper.hpp
deleted file mode 100644
index 95a4f23..0000000
--- a/src/modules/io/reads_io/splitting_wrapper.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "io/reads/single_read.hpp"
-#include "read_stream_vector.hpp"
-#include "delegating_reader_wrapper.hpp"
-
-namespace io {
-
-class SplittingWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
-private:
- std::vector<SingleRead> buffer_;
- size_t buffer_position_;
-
- void FillBuffer(SingleRead& tmp_read) {
- buffer_.clear();
- for(size_t i = 0; i < tmp_read.size(); i++) {
- size_t j = i;
- while(j < tmp_read.size() && is_nucl(tmp_read.GetSequenceString()[j])) {
- j++;
- }
- if(j > i) {
- buffer_.push_back(tmp_read.Substr(i, j));
- i = j - 1;
- }
- }
- buffer_position_ = 0;
- }
-
- bool Skip() {
- while(!this->reader().eof() && buffer_position_ == buffer_.size()) {
- SingleRead tmp_read;
- this->reader() >> tmp_read;
- FillBuffer(tmp_read);
- }
- return buffer_position_ != buffer_.size();
- }
-
-public:
-
- explicit SplittingWrapper(base::ReadStreamPtrT reader) :
- base(reader), buffer_position_(0) {
- }
-
- /* virtual */
- SplittingWrapper& operator>>(SingleRead& read) {
- Skip();
- read = buffer_[buffer_position_];
- buffer_position_++;
- return *this;
- }
-
- //todo fix needed!!! seems that eof can't be called multiple times in a row!!!
- /* virtual */ bool eof() {
- return !Skip();
- }
-};
-
-inline std::shared_ptr<ReadStream<SingleRead>> SplittingWrap(std::shared_ptr<ReadStream<SingleRead>> reader_ptr) {
- return std::make_shared<SplittingWrapper>(reader_ptr);
-}
-
-inline ReadStreamList<SingleRead> SplittingWrap(ReadStreamList<SingleRead>& readers) {
- ReadStreamList<SingleRead> answer;
- for (size_t i = 0; i < readers.size(); ++i) {
- answer.push_back(SplittingWrap(readers.ptr_at(i)));
- }
- return answer;
-}
-}
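
A behaviour sketch for SplittingWrapper: it cuts each read at non-ACGT symbols, so a read containing an N-run is emitted as several shorter reads. The input read is made up and VectorReadStream comes from vector_reader.hpp further down in this diff:

#include "io/reads_io/splitting_wrapper.hpp"
#include "io/reads_io/vector_reader.hpp"
#include <memory>

void SplitExample() {
    auto raw = std::make_shared<io::VectorReadStream<io::SingleRead>>(
            io::SingleRead("r1", "ACGTNNNACG"));
    auto split = io::SplittingWrap(raw);

    io::SingleRead piece;
    while (!split->eof())
        *split >> piece;    // first "ACGT", then "ACG"; the N-run is discarded
}
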
diff --git a/src/modules/io/reads_io/vector_reader.hpp b/src/modules/io/reads_io/vector_reader.hpp
deleted file mode 100644
index 9059c6e..0000000
--- a/src/modules/io/reads_io/vector_reader.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "io/reads_io/ireadstream.hpp"
-namespace io {
-
-/**
- * Use vector<T> as input-stream with operator>>(T& t)
- */
-template <typename T>
-class VectorReadStream : public ReadStream<T> {
- std::vector<T> data_;
- size_t pos_;
- bool closed_;
-public:
- VectorReadStream(const std::vector<T>& data) : data_(data), pos_(0), closed_(false) {
-
- }
-
- VectorReadStream(const T& item) : data_({item}), pos_(0), closed_(false) {
-
- }
-
- virtual bool eof() /*const */{
- return pos_ == data_.size();
- }
-
- VectorReadStream<T>& operator>>(T& t) {
- VERIFY(!eof());
- t = data_[pos_++];
- return *this;
- }
-
- void close() {
- closed_ = true;
- }
-
- virtual bool is_open() /*const */{
- return !closed_;
- }
-
- void reset() {
- pos_ = 0;
- }
-
- ReadStreamStat get_stat() const {
- //todo
- ReadStreamStat stat;
- stat.read_count_ = data_.size();
-
- return stat;
- }
-
-};
-
-}
diff --git a/src/modules/io/reads_io/wrapper_collection.hpp b/src/modules/io/reads_io/wrapper_collection.hpp
deleted file mode 100644
index 3b243bb..0000000
--- a/src/modules/io/reads_io/wrapper_collection.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "io/reads/single_read.hpp"
-#include "delegating_reader_wrapper.hpp"
-
-namespace io {
-
-//todo refactor!!!
-class IdSettingReaderWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
- size_t next_id_;
-public:
- IdSettingReaderWrapper(base::ReadStreamPtrT reader, size_t start_id = 0) :
- base(reader), next_id_(start_id) {
- }
-
- /* virtual */
- IdSettingReaderWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- read.ChangeName(ToString(next_id_++));
- return *this;
- }
-};
-
-class PrefixAddingReaderWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
- std::string prefix_;
-public:
- PrefixAddingReaderWrapper(base::ReadStreamPtrT reader,
- const std::string& prefix) :
- base(reader), prefix_(prefix) {
- }
-
- /* virtual */
- PrefixAddingReaderWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- read.ChangeName(prefix_ + read.name());
- return *this;
- }
-};
-
-//fixme currently leads to long stretches of ACGTACGT...
-class FixingWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
-
- io::SingleRead MakeValid(const io::SingleRead& read) const {
- std::string str = read.GetSequenceString();
- for (size_t i = 0; i < str.length(); ++i) {
- if (!is_nucl(str[i]))
- str[i] = nucl(char(i % 4));
- }
- return io::SingleRead(read.name(), str);
- }
-
-public:
- FixingWrapper(base::ReadStreamPtrT reader) :
- base(reader) {
- }
-
- /* virtual */
- FixingWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- if (!read.IsValid()) {
- TRACE("Read " << read.name() << " was invalid. Fixing");
- read = MakeValid(read);
- VERIFY(read.IsValid());
- }
- return *this;
- }
-
-private:
- DECL_LOGGER("FixingWrapper");
-};
-
-class NonNuclCollapsingWrapper: public DelegatingWrapper<SingleRead> {
- typedef DelegatingWrapper<SingleRead> base;
-
- io::SingleRead MakeValid(const io::SingleRead& read) const {
- std::string str = read.GetSequenceString();
- std::stringstream ss;
- for (size_t i = 0; i < read.size(); ++i) {
- if (is_nucl(str[i]))
- ss << str[i];
- }
- return io::SingleRead(read.name(), ss.str());
- }
-
-public:
- NonNuclCollapsingWrapper(base::ReadStreamPtrT reader) :
- base(reader) {
- }
-
- /* virtual */
- NonNuclCollapsingWrapper& operator>>(SingleRead& read) {
- this->reader() >> read;
- if (!read.IsValid()) {
- TRACE("Read " << read.name() << " was invalid. Collapsing non-nucls");
- read = MakeValid(read);
- VERIFY(read.IsValid());
- }
- return *this;
- }
-
-private:
- DECL_LOGGER("NonNuclCollapsingWrapper");
-};
-
-}
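
A behaviour sketch for the sanitizing wrappers above (the read is made up): FixingWrapper substitutes a position-dependent base for every non-ACGT symbol, which is why long N-runs become ACGT repeats as the fixme notes, while NonNuclCollapsingWrapper simply drops such symbols:

#include "io/reads_io/wrapper_collection.hpp"
#include "io/reads_io/vector_reader.hpp"
#include <memory>

void CollapseExample() {
    auto raw = std::make_shared<io::VectorReadStream<io::SingleRead>>(
            io::SingleRead("r1", "ACNNGT"));

    io::NonNuclCollapsingWrapper collapse(raw);
    io::SingleRead r;
    collapse >> r;    // r now holds "ACGT": the two Ns were removed
}
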
diff --git a/src/modules/io/sam_io/bam_parser.hpp b/src/modules/io/sam_io/bam_parser.hpp
deleted file mode 100644
index 3a22c0d..0000000
--- a/src/modules/io/sam_io/bam_parser.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef COMMON_IO_BAMPARSER_HPP
-#define COMMON_IO_BAMPARSER_HPP
-
-#include "io/reads/single_read.hpp"
-#include "io/reads_io/parser.hpp"
-#include "data_structures/sequence/quality.hpp"
-#include "data_structures/sequence/nucl.hpp"
-#include "dev_support/verify.hpp"
-
-#include "bamtools/api/BamReader.h"
-
-#include <string>
-
-namespace io {
-
-class BAMParser: public Parser {
-public:
- BAMParser(const std::string& filename, OffsetType offset_type = PhredOffset)
- : Parser(filename, offset_type) {
- open();
- }
-
- ~BAMParser() {
- close();
- }
-
- BAMParser& operator>>(SingleRead& read) {
- if (!is_open_ || eof_)
- return *this;
-
- read = SingleRead(seq_.Name, seq_.QueryBases, seq_.Qualities, offset_type_);
- eof_ = (false == reader_.GetNextAlignment(seq_));
-
- return *this;
- }
-
- void close() {
- reader_.Close();
- is_open_ = false;
- eof_ = true;
- }
-
-private:
- BamTools::BamReader reader_;
- BamTools::BamAlignment seq_;
-
- void open() {
- reader_.Open(filename_);
- is_open_ = true;
-
- eof_ = (false == reader_.GetNextAlignment(seq_));
- }
-
- BAMParser(const BAMParser& parser);
- void operator=(const BAMParser& parser);
-};
-
-}
-
-#endif /* COMMON_IO_BAMPARSER_HPP */
diff --git a/src/modules/io/sam_io/bam_reader.hpp b/src/modules/io/sam_io/bam_reader.hpp
deleted file mode 100644
index 57c2c64..0000000
--- a/src/modules/io/sam_io/bam_reader.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-//todo rename to reader
-#pragma once
-
-#include "io/reads_io/ireader.hpp"
-#include "io/reads/single_read.hpp"
-
-#include <bamtools/api/BamReader.h>
-
-namespace io {
-class BamRead : public BamTools::BamAlignment {
-public:
- BamRead() { }
-
- BamRead(const BamTools::BamAlignment &other)
- : BamTools::BamAlignment(other) { }
-
- const std::string &name() const {
- return Name;
- }
-
- size_t size() const {
- return Length;
- }
-
- size_t nucl_count() const {
- return size();
- }
-
- const std::string &GetSequenceString() const {
- return QueryBases;
- }
-
- std::string GetPhredQualityString() const {
- return Qualities;
- }
-
- operator io::SingleRead() {
- // not including quality is intentional:
- // during read correction bases might be inserted/deleted,
- // and base qualities for them are not calculated
- return io::SingleRead(name(), GetSequenceString());
- }
-
- char operator[](size_t i) const {
- VERIFY(is_nucl(QueryBases[i]));
- return dignucl(QueryBases[i]);
- }
-};
-
-class UnmappedBamStream : public ReadStream<BamRead> {
-public:
- UnmappedBamStream(const std::string &filename)
- : filename_(filename) {
- open();
- }
-
- virtual ~UnmappedBamStream() { }
-
- bool is_open() { return is_open_; }
-
- bool eof() { return eof_; }
-
- UnmappedBamStream &operator>>(BamRead &read) {
- if (!is_open_ || eof_)
- return *this;
-
- read = seq_;
- eof_ = (false == reader_.GetNextAlignment(seq_));
-
- return *this;
- }
-
- void close() {
- reader_.Close();
- is_open_ = false;
- eof_ = true;
- }
-
- void reset() {
- close();
- open();
- }
-
- ReadStreamStat get_stat() const { return ReadStreamStat(); }
-
-private:
- BamTools::BamReader reader_;
- BamTools::BamAlignment seq_;
- std::string filename_;
- bool is_open_;
- bool eof_;
-
- void open() {
- reader_.Open(filename_);
- is_open_ = true;
-
- eof_ = (false == reader_.GetNextAlignment(seq_));
- }
-
-};
-}
diff --git a/src/modules/io/sam_io/read.cpp b/src/modules/io/sam_io/read.cpp
deleted file mode 100644
index dc9a0e0..0000000
--- a/src/modules/io/sam_io/read.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <io/sam_io/read.hpp>
-
-using namespace std;
-
-namespace sam_reader {
-
-string SingleSamRead::cigar() const {
- uint32_t *cigar = bam1_cigar(data_);
- string res;
- res.reserve(data_->core.n_cigar);
- for (size_t k = 0; k < data_->core.n_cigar; ++k) {
- res += std::to_string(bam_cigar_oplen(cigar[k]));
- res += bam_cigar_opchr(cigar[k]);
-
- }
- return res;
-}
-
-string SingleSamRead::name() const {
- string res(bam1_qname(data_));
- return res;
-}
-
-string SingleSamRead::seq() const {
- string res = "";
- auto b = bam1_seq(data_);
- for (int k = 0; k < data_->core.l_qseq; ++k) {
- res += bam_nt16_rev_table[bam1_seqi(b, k)];
- }
- return res;
-}
-
-
-}
-;
diff --git a/src/modules/io/sam_io/sam_reader.cpp b/src/modules/io/sam_io/sam_reader.cpp
deleted file mode 100644
index 5d338fa..0000000
--- a/src/modules/io/sam_io/sam_reader.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include <io/sam_io/read.hpp>
-#include <io/sam_io/sam_reader.hpp>
-
-using namespace std;
-
-namespace sam_reader {
-
-bool MappedSamStream::eof() const {
- return eof_;
-}
-
-bool MappedSamStream::is_open() const {
- return is_open_;
-}
-
-MappedSamStream& MappedSamStream::operator>>(SingleSamRead& read) {
- if (!is_open_ || eof_)
- return *this;
- read.set_data(seq_);
- int tmp = samread(reader_, seq_);
- eof_ = (0 >= tmp);
- return *this;
-}
-
-MappedSamStream& MappedSamStream::operator >>(PairedSamRead& read) {
- TRACE("starting process paired read");
- SingleSamRead r1;
- MappedSamStream::operator >>(r1);
- SingleSamRead r2;
- MappedSamStream::operator >>(r2);
-
- read = PairedSamRead(r1, r2);
- TRACE(r1.seq());
- TRACE(r2.seq());
- TRACE(r1.name());
- return *this;
-}
-
-const char* MappedSamStream::get_contig_name(int i) const {
- VERIFY(i < reader_->header->n_targets);
- return (reader_->header->target_name[i]);
-}
-
-void MappedSamStream::close() {
- samclose(reader_);
- is_open_ = false;
- eof_ = true;
- bam_destroy1(seq_);
-}
-
-void MappedSamStream::reset() {
- close();
- open();
-}
-
-void MappedSamStream::open() {
- if ((reader_ = samopen(filename_.c_str(), "r", NULL)) == NULL) {
- WARN("Failed to open SAM file " << filename_);
- is_open_ = false;
- eof_ = true;
- } else {
- is_open_ = true;
- int tmp = samread(reader_, seq_);
- eof_ = (0 >= tmp);
- }
-}
-
-}
diff --git a/src/modules/io/sam_io/sam_reader.hpp b/src/modules/io/sam_io/sam_reader.hpp
deleted file mode 100644
index 55dc297..0000000
--- a/src/modules/io/sam_io/sam_reader.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-#pragma once
-
-#include "read.hpp"
-
-#include "dev_support/logger/log_writers.hpp"
-
-#include <samtools/sam.h>
-#include <samtools/bam.h>
-
-#include <string>
-
-namespace sam_reader {
-
-class MappedSamStream {
-public:
- MappedSamStream(const std::string &filename)
- : filename_(filename) {
- open();
- }
-
- virtual ~MappedSamStream() {
- }
-
- bool is_open() const;
- bool eof() const;
- MappedSamStream& operator >>(SingleSamRead& read);
- MappedSamStream& operator >>(PairedSamRead& read);
- const char* get_contig_name(int i) const;
- void close();
- void reset();
-
-private:
- samfile_t *reader_;
- bam1_t *seq_ = bam_init1();
- std::string filename_;
- bool is_open_;
- bool eof_;
-
- void open();
-};
-
-}
-;
diff --git a/src/modules/math/CMakeLists.txt b/src/modules/math/CMakeLists.txt
deleted file mode 100644
index 28cb6c6..0000000
--- a/src/modules/math/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(math_module CXX)
-
-add_library(math_module STATIC
- kmer_coverage_model.cpp)
-
-target_link_libraries(math_module nlopt)
-
diff --git a/src/modules/math/kmer_coverage_model.cpp b/src/modules/math/kmer_coverage_model.cpp
deleted file mode 100644
index db886d7..0000000
--- a/src/modules/math/kmer_coverage_model.cpp
+++ /dev/null
@@ -1,394 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "kmer_coverage_model.hpp"
-
-#include "math/xmath.h"
-#include "dev_support/logger/logger.hpp"
-#include "math/smooth.hpp"
-#include "dev_support/verify.hpp"
-
-#include <boost/math/special_functions/zeta.hpp>
-#include <boost/math/distributions/normal.hpp>
-#include <boost/math/distributions/skew_normal.hpp>
-#include <boost/math/distributions/geometric.hpp>
-#include <boost/math/distributions/pareto.hpp>
-
-#include <nlopt/nlopt.hpp>
-
-#include <vector>
-
-#include <cstring>
-#include <cstdint>
-#include <cstddef>
-#include <cmath>
-
-namespace cov_model {
-using std::isfinite;
-
-static const size_t MaxCopy = 10;
-
-static double dzeta(double x, double p) {
- return pow(x, -p - 1) / boost::math::zeta(p + 1);
-}
-
-static double perr(size_t i, double scale, double shape) {
- return pow((1 + shape * ((double) (i - 1)) / scale), -1.0 / shape) -
- pow((1 + shape * ((double) i) / scale), -1.0 / shape);
-}
-
-static double pgood(size_t i, double zp, double u, double sd, double shape,
- double *mixprobs = NULL) {
- double res = 0;
-
- for (unsigned copy = 0; copy < MaxCopy; ++copy) {
- boost::math::skew_normal snormal((copy + 1) * u, sd * sqrt(copy + 1), shape);
- // res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * (boost::math::cdf(snormal, i + 1) - boost::math::cdf(snormal, i));
- res += (mixprobs ? mixprobs[copy] : dzeta(copy + 1, zp)) * boost::math::pdf(snormal, i);
- }
-
- return res;
-}
-
-class CovModelLogLike {
- const std::vector <size_t> &cov;
-
-public:
- CovModelLogLike(const std::vector <size_t> &cov)
- : cov(cov) { }
-
- int getN() const { return 7; };
-
-private:
-
- double eval_(const double *x) const {
- double zp = x[0], p = x[1], shape = x[2], u = x[3], sd = x[4], scale = x[5], shape2 = x[6];
-
- if (zp <= 1 || shape <= 0 || sd <= 0 || p < 1e-9 || p > 1 - 1e-9 || u <= 0 || scale <= 0 ||
- !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(p) || !isfinite(u) ||
- !isfinite(scale) || !isfinite(shape2))
- return +std::numeric_limits<double>::infinity();
-
- std::vector <double> kmer_probs(cov.size());
-
- // Error
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- kmer_probs[i] += p * perr(i + 1, scale, shape);
-
- // Good
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- kmer_probs[i] += (1 - p) * pgood(i + 1, zp, u, sd, shape2);
-
- double res = 0;
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- res += (double) (cov[i]) * log(kmer_probs[i]);
-
- return -res;
- }
-};
-
-struct CovModelLogLikeEMData {
- const std::vector <size_t> &cov;
- const std::vector <double> &z;
-};
-
-static double CovModelLogLikeEM(unsigned, const double *x, double *, void *data) {
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
-
- // INFO("Entry: " << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4]);
-
- if (zp <= 1 || shape <= 0 || sd <= 0 || u <= 0 || scale <= 0 ||
- !isfinite(zp) || !isfinite(shape) || !isfinite(sd) || !isfinite(u) ||
- !isfinite(scale) || !isfinite(shape2))
- return -std::numeric_limits<double>::infinity();
-
- const std::vector <size_t> &cov = static_cast<CovModelLogLikeEMData *>(data)->cov;
- const std::vector <double> &z = static_cast<CovModelLogLikeEMData *>(data)->z;
-
- std::vector <double> kmer_probs(cov.size(), 0);
-
- // Error
- for (size_t i = 0; i < kmer_probs.size(); ++i) {
- if (cov[i] == 0)
- continue;
-
- kmer_probs[i] += z[i] * log(perr(i + 1, scale, shape));
- }
-
- // Good
- // Pre-compute mixing probabilities
- std::vector <double> mixprobs(MaxCopy, 0);
- for (unsigned copy = 0; copy < MaxCopy; ++copy)
- mixprobs[copy] = dzeta(copy + 1, zp);
-
- // Compute the density
- for (size_t i = 0; i < kmer_probs.size(); ++i) {
- if (cov[i] == 0)
- continue;
-
- double val = log(pgood(i + 1, zp, u, sd, shape2, &mixprobs[0]));
- if (!isfinite(val))
- val = -1000.0;
- kmer_probs[i] += (1 - z[i]) * val;
- }
-
- double res = 0;
- for (size_t i = 0; i < kmer_probs.size(); ++i)
- res += (double) (cov[i]) * kmer_probs[i];
-
- // INFO("f: " << res);
- return res;
-}
-
-
-static std::vector <double> EStep(const std::vector <double> &x,
- double p, size_t N) {
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
-
- std::vector <double> res(N);
- for (size_t i = 0; i < N; ++i) {
- double pe = p * perr(i + 1, scale, shape);
- res[i] = pe / (pe + (1 - p) * pgood(i + 1, zp, u, sd, shape2));
- if (!isfinite(res[i]))
- res[i] = 1.0;
- }
-
- return res;
-}
-
-// Estimate the coverage mean by finding the max past the
-// first valley.
-size_t KMerCoverageModel::EstimateValley() const {
- // Smooth the histogram
- std::vector <size_t> scov;
- math::Smooth3RS3R(scov, cov_);
-
- size_t Valley = scov[0];
-
- // Start finding the valley
- size_t Idx = 1;
- while (Idx < scov.size() && scov[Idx] < Valley) {
- Valley = scov[Idx];
- Idx += 1;
- }
- Idx -= 1;
-
- INFO("Kmer coverage valley at: " << Idx);
-
- return Idx;
-}
-
-void KMerCoverageModel::Fit() {
- VERIFY_MSG(cov_.size() > 10, "Invalid kmer coverage histogram, make sure that the coverage is indeed uniform");
-
- // Find the minimal coverage point using smoothed histogram.
- Valley_ = EstimateValley();
-
- // First estimate of coverage is the first maximum after the valley.
- MaxCov_ = Valley_ + 1;
- size_t MaxHist = cov_[MaxCov_];
- for (size_t i = Valley_ + 1; i < cov_.size(); ++i) {
- if (cov_[i] > MaxHist) {
- MaxHist = cov_[i];
- MaxCov_ = i;
- }
- }
- INFO("K-mer histogram maximum: " << MaxCov_);
-
- // Refine the estimate via median
- size_t AfterValley = 0, SecondValley = std::min(2 * MaxCov_ - Valley_, cov_.size());
- for (size_t i = Valley_ + 1; i < SecondValley; ++i)
- AfterValley += cov_[i];
-
- size_t ccov = 0;
- for (size_t i = Valley_ + 1; i < SecondValley; ++i) {
- if (ccov > AfterValley / 2) {
- MaxCov_ = std::max(i, MaxCov_);
- break;
- }
- ccov += cov_[i];
- }
-
- if (MaxCov_ - Valley_ < 3)
- WARN("Too many erroneous kmers, the estimates might be unreliable");
-
- std::vector <size_t> mvals(1 + MaxCov_ - Valley_);
- mvals[0] = cov_[MaxCov_];
- size_t tmadcov = mvals[0];
- for (size_t i = 1; i < std::min(MaxCov_ - Valley_, cov_.size() - MaxCov_); ++i) {
- mvals[i] = cov_[MaxCov_ + i] + cov_[MaxCov_ - i];
- tmadcov += mvals[i];
- }
- size_t madcov = 0;
- double CovSd = sqrt(5.0 * (double) MaxCov_);
- for (size_t i = 0; i < MaxCov_ - Valley_; ++i) {
- if (madcov > tmadcov / 2) {
- CovSd = i;
- break;
- }
- madcov += mvals[i];
- }
- CovSd *= 1.4826;
- INFO("Estimated median coverage: " << MaxCov_ << ". Coverage mad: " << CovSd);
-
- // Estimate error probability as ratio of kmers before the valley.
- size_t BeforeValley = 0, Total = 0;
- double ErrorProb = 0;
- for (size_t i = 0; i < cov_.size(); ++i) {
- if (i <= Valley_)
- BeforeValley += cov_[i];
- Total += cov_[i];
- }
- ErrorProb = (double) BeforeValley / (double) Total;
- // Clamp the error probability away from 0 and 1 so that both erroneous and good kmers remain possible.
- ErrorProb = std::min(1 - 1e-3, ErrorProb);
- ErrorProb = std::max(1e-3, ErrorProb);
-
- TRACE("Total: " << Total << ". Before: " << BeforeValley);
- TRACE("p: " << ErrorProb);
-
- std::vector <double> x(6), lb(6), ub(6);
-
- x[0] = 3;
- lb[0] = 0;
- ub[0] = 2000;
- x[1] = 3;
- lb[1] = 0;
- ub[1] = 2000;
- x[2] = MaxCov_;
- lb[2] = 0;
- ub[2] = 2 * MaxCov_;
- x[3] = CovSd;
- lb[3] = MaxCov_ - Valley_;
- ub[3] = SecondValley;
- x[4] = 1;
- lb[4] = 0;
- ub[4] = 2000;
- x[5] = 0;
- lb[5] = -6;
- ub[5] = 6;
-
- INFO("Fitting coverage model");
- // Ensure that there will be at least 2 iterations.
- double PrevErrProb = 2;
- const double ErrProbThr = 1e-8;
- auto GoodCov = cov_;
- GoodCov.resize(std::min(cov_.size(), 5 * MaxCopy * MaxCov_ / 4));
- converged_ = true;
- unsigned it = 1;
- while (fabs(PrevErrProb - ErrorProb) > ErrProbThr) {
- // Recalculate the vector of posterior error probabilities
- std::vector <double> z = EStep(x, ErrorProb, GoodCov.size());
-
- // Recalculate the probability of error
- PrevErrProb = ErrorProb;
- ErrorProb = 0;
- for (size_t i = 0; i < GoodCov.size(); ++i)
- ErrorProb += z[i] * (double) GoodCov[i];
- ErrorProb /= (double) Total;
-
- bool LastIter = fabs(PrevErrProb - ErrorProb) <= ErrProbThr;
-
- nlopt::opt opt(nlopt::LN_NELDERMEAD, 6);
- CovModelLogLikeEMData data = {GoodCov, z};
- opt.set_max_objective(CovModelLogLikeEM, &data);
- if (!LastIter)
- opt.set_maxeval(5 * 6 * it);
- opt.set_xtol_rel(1e-8);
- opt.set_ftol_rel(1e-8);
-
- double fMin;
- nlopt::result Results = nlopt::FAILURE;
- try {
- Results = opt.optimize(x, fMin);
- } catch (nlopt::roundoff_limited &) {
- }
-
- VERBOSE_POWER_T2(it, 1, "... iteration " << it);
- TRACE("Results: ");
- TRACE("Converged: " << Results << " " << "F: " << fMin);
-
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
- TRACE("zp: " << zp << " p: " << ErrorProb << " shape: " << shape << " u: " << u << " sd: " << sd <<
- " scale: " << scale << " shape2: " << shape2);
-
- it += 1;
- }
-
- double delta = x[5] / sqrt(1 + x[5] * x[5]);
- mean_coverage_ = x[2] + x[3] * delta * sqrt(2 / M_PI);
- sd_coverage_ = x[3] * sqrt(1 - 2 * delta * delta / M_PI);
- INFO("Fitted mean coverage: " << mean_coverage_ << ". Fitted coverage std. dev: " << sd_coverage_);
-
- // Now let us check whether we have sane results
- for (size_t i = 0; i < x.size(); ++i)
- if (!isfinite(x[i])) {
- converged_ = false;
- break;
- }
-
- if (!isfinite(ErrorProb))
- converged_ = false;
-
- // See if we can deduce a proper threshold
-
- // First, check whether initial estimate of Valley was sane.
- ErrorThreshold_ = 0;
- if (converged_ && Valley_ > x[2] && x[2] > 2) {
- Valley_ = (size_t) math::round(x[2] / 2.0);
- WARN("Valley value was estimated improperly, reset to " << Valley_);
- }
-
- // If the model converged, then use it to estimate the thresholds.
- if (converged_) {
- std::vector <double> z = EStep(x, ErrorProb, GoodCov.size());
-
- INFO("Probability of erroneous kmer at valley: " << z[Valley_]);
- converged_ = false;
- for (size_t i = 0; i < z.size(); ++i)
- if (z[i] > strong_probability_threshold_) //0.999
- LowThreshold_ = std::min(i + 1, Valley_);
- else if (z[i] < probability_threshold_) {//0.05?
- ErrorThreshold_ = std::max(i + 1, Valley_);
- converged_ = true;
- break;
- }
-
-#if 0
-for (size_t i = 0; i < z.size(); ++i) {
- double zp = x[0], shape = x[1], u = x[2], sd = x[3], scale = x[4], shape2 = x[5];
- double pe = ErrorProb * perr(i + 1, scale, shape);
- double pg = (1 - ErrorProb) * pgood(i + 1, zp, u, sd, shape2);
-
- fprintf(stderr, "%e %e %e %e\n", pe, pg, z[i], perr(i + 1, scale, shape));
-}
-#endif
- }
-
- // See if we have a sane ErrorThreshold_ and fall back to something conservative if not.
- if (converged_) {
- INFO("Preliminary threshold calculated as: " << ErrorThreshold_);
- ErrorThreshold_ = (Valley_ < mean_coverage_ ?
- std::min(Valley_ + (size_t) (mean_coverage_ - Valley_) / 2, ErrorThreshold_) :
- Valley_);
- INFO("Threshold adjusted to: " << ErrorThreshold_);
- } else {
- ErrorThreshold_ = Valley_;
- LowThreshold_ = 1;
- WARN("Failed to determine erroneous kmer threshold. Threshold set to: " << ErrorThreshold_);
- }
-
- // Now the bonus: estimate the genome size!
- GenomeSize_ = 0;
- for (size_t i = ErrorThreshold_ - 1; i < GoodCov.size(); ++i)
- GenomeSize_ += GoodCov[i];
- GenomeSize_ /= 2;
-
- INFO("Estimated genome size (ignoring repeats): " << GenomeSize_);
-}
-
-};
diff --git a/src/modules/math/kmer_coverage_model.hpp b/src/modules/math/kmer_coverage_model.hpp
deleted file mode 100644
index 1e7ec38..0000000
--- a/src/modules/math/kmer_coverage_model.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __KMER_COVERAGE_MODEL_HPP__
-#define __KMER_COVERAGE_MODEL_HPP__
-
-#include <vector>
-#include <cstddef>
-
-namespace cov_model {
-
-class KMerCoverageModel {
- const std::vector <size_t> &cov_;
- size_t MaxCov_, Valley_, ErrorThreshold_, LowThreshold_, GenomeSize_;
- double probability_threshold_, strong_probability_threshold_, mean_coverage_, sd_coverage_;
- bool converged_;
-
-public:
- KMerCoverageModel(const std::vector <size_t> &cov, double probability_threshold,
- double strong_probability_threshold)
- : cov_(cov), LowThreshold_(0), probability_threshold_(probability_threshold),
- strong_probability_threshold_(strong_probability_threshold),
- mean_coverage_(0.0), sd_coverage_(0.0), converged_(false) { }
-
- void Fit();
-
- size_t GetErrorThreshold() const { return ErrorThreshold_; }
-
- size_t GetLowThreshold() const { return LowThreshold_; }
-
- size_t GetGenomeSize() const { return GenomeSize_; }
-
- double GetMeanCoverage() const { return mean_coverage_; }
-
- double GetSdCoverage() const { return sd_coverage_; }
-
- bool converged() const { return converged_; }
-
-private:
- size_t EstimateValley() const;
-};
-
-};
-
-
-#endif
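
A usage sketch for the coverage model above. The histogram is assumed to be filled elsewhere (hist[c] = number of distinct k-mers seen c times, with more than 10 bins, as Fit() requires), and the two thresholds mirror the 0.05 / 0.999 cut-offs mentioned in kmer_coverage_model.cpp:

#include "math/kmer_coverage_model.hpp"
#include <cstddef>
#include <vector>

size_t EstimateGenomeSize(const std::vector<size_t> &hist) {
    cov_model::KMerCoverageModel model(hist, /*probability_threshold*/ 0.05,
                                       /*strong_probability_threshold*/ 0.999);
    model.Fit();
    // GetErrorThreshold()/GetMeanCoverage() become meaningful once converged() is true.
    return model.converged() ? model.GetGenomeSize() : 0;
}
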
diff --git a/src/modules/math/pred.hpp b/src/modules/math/pred.hpp
deleted file mode 100644
index 493626b..0000000
--- a/src/modules/math/pred.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-#ifndef __ADT_PRED_HPP__
-#define __ADT_PRED_HPP__
-
-#pragma once
-
-#include "utils/adt/function_traits.hpp"
-
-#include <memory>
-#include <functional>
-
-namespace pred {
-
-template<typename T>
-class TypedPredicate {
-public:
- typedef T checked_type;
-
- template<typename P>
- TypedPredicate(P p)
- : self_(std::make_shared<TypedPredicateModel < P> > (std::move(p))) { }
-
- bool operator()(T x) const {
- return self_->operator()(x);
- }
-
-private:
- struct TypedPredicateConcept {
- virtual ~TypedPredicateConcept() { };
-
- virtual bool operator()(T x) const = 0;
- };
-
- template<class P>
- struct TypedPredicateModel : TypedPredicateConcept {
- TypedPredicateModel(P p)
- : data_(std::move(p)) { }
-
- virtual bool operator()(T x) const override {
- return data_(x);
- }
-
- P data_;
- };
-
- std::shared_ptr<const TypedPredicateConcept> self_;
-};
-
-template<typename T>
-class AlwaysTrueOperator {
-public:
- typedef T checked_type;
-
- bool operator()(T) const {
- return true;
- }
-};
-
-template<typename T>
-class AlwaysFalseOperator {
- typedef T checked_type;
-
-public:
- bool operator()(T) const {
- return false;
- }
-};
-
-template<typename T>
-class AndOperator {
-public:
- typedef T checked_type;
-
- AndOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
- : lhs_(std::move(lhs)),
- rhs_(std::move(rhs)) { }
-
- bool operator()(T x) const {
- return lhs_(x) && rhs_(x);
- }
-
-private:
- const TypedPredicate<T> lhs_, rhs_;
-};
-
-template<typename T>
-class OrOperator {
-public:
- typedef T checked_type;
-
- OrOperator(TypedPredicate<T> lhs, TypedPredicate<T> rhs)
- : lhs_(std::move(lhs)), rhs_(std::move(rhs)) { }
-
- bool operator()(T x) const {
- return lhs_(x) || rhs_(x);
- }
-
-private:
- const TypedPredicate<T> lhs_, rhs_;
-};
-
-template<typename T>
-class NotOperator {
-public:
- typedef T checked_type;
-
- NotOperator(const TypedPredicate<T> p)
- : p_(std::move(p)) { }
-
- bool operator()(T x) const {
- return !p_(x);
- }
-
-private:
- const TypedPredicate<T> p_;
-};
-
-template<class P,
- bool = adt::function_traits<P>::arity == 1 &&
- std::is_same<typename adt::function_traits<P>::return_type, bool>::value>
-struct is_predicate : public std::true_type {
-};
-
-template<class P>
-struct is_predicate<P, false> : public std::false_type {
-};
-
-template<class TP1, class TP2,
- typename _T1 = typename adt::function_traits<TP1>::template arg<0>::type,
- typename _T2 = typename adt::function_traits<TP2>::template arg<0>::type,
- typename =
- typename std::enable_if<std::is_same<_T1, _T2>::value &&
- is_predicate<TP1>::value && is_predicate<TP2>::value
- >::type>
-TypedPredicate<_T1> And(TP1 lhs, TP2 rhs) {
- return AndOperator<_T1>(lhs, rhs);
-}
-
-template<class TP1, class TP2,
- typename _T1 = typename adt::function_traits<TP1>::template arg<0>::type,
- typename _T2 = typename adt::function_traits<TP2>::template arg<0>::type,
- typename =
- typename std::enable_if<std::is_same<_T1, _T2>::value &&
- is_predicate<TP1>::value && is_predicate<TP2>::value
- >::type>
-TypedPredicate<_T1> Or(TP1 lhs, TP2 rhs) {
- return OrOperator<_T1>(lhs, rhs);
-}
-
-template<class TP,
- typename _T = typename adt::function_traits<TP>::template arg<0>::type,
- typename =
- typename std::enable_if<is_predicate<TP>::value>::type>
-TypedPredicate<_T> Not(TP p) {
- return NotOperator<_T>(p);
-}
-
-template<class T>
-TypedPredicate<T> AlwaysTrue() {
- return AlwaysTrueOperator<T>();
-}
-
-template<class T>
-TypedPredicate<T> AlwaysFalse() {
- return AlwaysFalseOperator<T>();
-}
-
-} // namespace pred
-
-#endif // __ADT_PRED_HPP__
diff --git a/src/modules/math/smooth.hpp b/src/modules/math/smooth.hpp
deleted file mode 100644
index eb53dc9..0000000
--- a/src/modules/math/smooth.hpp
+++ /dev/null
@@ -1,195 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __SMOTH_HPP__
-#define __SMOTH_HPP__
-
-#include <cmath>
-
-namespace math {
-
-template<typename T>
-static T MedianOf3(T u, T v, T w) {
- /* Median(u,v,w): */
- if ((u <= v && v <= w) ||
- (u >= v && v >= w))
- return v;
- if ((u <= w && w <= v) ||
- (u >= w && w >= v))
- return w;
-
- /* else */ return u;
-}
-
-/* Return (Index-1) of median(u,v,w) , i.e.,
--1 : u
-0 : v
-1 : w
-*/
-template<typename T>
-static int IndexOfMedianOf3(T u, T v, T w) {
- if ((u <= v && v <= w) ||
- (u >= v && v >= w))
- return 0;
- if ((u <= w && w <= v) ||
- (u >= w && w >= v))
- return 1;
-
- /* else */ return -1;
-}
-
-enum {
- SmoothNoEndRule,
- SmoothCopyEndRule,
- SmoothTukeyEndRule
-};
-
-template<typename T>
-static bool SmoothEndStep(const T *x, T *y, size_t n, unsigned end_rule) {
- switch (end_rule) {
- default:
- case SmoothNoEndRule:
- return false;
- case SmoothCopyEndRule:
- y[0] = x[0];
- y[n - 1] = x[n - 1];
- return false;
- case SmoothTukeyEndRule: {
- bool chg = false;
- y[0] = MedianOf3(3 * y[1] - 2 * y[2], x[0], y[1]);
- chg = chg || (y[0] != x[0]);
- y[n - 1] = MedianOf3(y[n - 2], x[n - 1], 3 * y[n - 2] - 2 * y[n - 3]);
- chg = chg || (y[n - 1] != x[n - 1]);
- return chg;
- }
- }
-
- return false;
-}
-
-template<typename T>
-static bool Smooth3(const T *x, T *y, size_t n, unsigned end_rule) {
- // y[] := Running Median of three (x) = "3 (x[])" with "copy ends"
- // --- return chg := ( y != x )
- bool chg = false;
-
- for (size_t i = 1; i < n - 1; i++) {
- int j = IndexOfMedianOf3(x[i - 1], x[i], x[i + 1]);
- y[i] = x[(int) i + j];
- chg = chg || j;
- }
-
- chg |= SmoothEndStep(x, y, n, end_rule);
-
- return chg;
-}
-
-template<typename T>
-static size_t Smooth3R(const T *x, T *y, T *z, size_t n, unsigned end_rule) {
- // y[] := "3R"(x) ; 3R = Median of three, repeated until convergence
- size_t iter;
- bool chg;
-
- iter = chg = Smooth3(x, y, n, SmoothCopyEndRule);
-
- while (chg) {
- if ((chg = Smooth3(y, z, n, SmoothNoEndRule))) {
- iter += 1;
- for (size_t i = 1; i < n - 1; i++)
- y[i] = z[i];
- }
- }
-
- chg |= SmoothEndStep(x, y, n, end_rule);
-
- return (iter ? iter : chg);
- /* = 0 <==> only one "3" w/o any change
- = 1 <==> either ["3" w/o change + endchange]
- or [two "3"s, 2nd w/o change ] */
-}
-
-
-template<typename T>
-static bool SplitTest(const T *x, size_t i) {
- // Split test:
- // Are we at a /-\ or \_/ location => split should be made ?
-
- if (x[i] != x[i + 1])
- return false;
-
- if ((x[i - 1] <= x[i] && x[i + 1] <= x[i + 2]) ||
- (x[i - 1] >= x[i] && x[i + 1] >= x[i + 2]))
- return false;
-
- /* else */ return true;
-}
-
-template<typename T>
-static bool SmoothSplit3(const T *x, T *y, size_t n, bool do_ends) {
- // y[] := S(x[]) where S() = "sm_split3"
- bool chg = false;
-
- for (size_t i = 0; i < n; i++)
- y[i] = x[i];
-
- if (do_ends && SplitTest(x, 1)) {
- chg = true;
- y[1] = x[0];
- y[2] = MedianOf3(x[2], x[3], 3 * x[3] - 2 * x[4]);
- }
-
- for (size_t i = 2; i < n - 3; i++) {
- if (SplitTest(x, i)) {
- int j;
- // plateau at x[i] == x[i+1]
-
- // at left:
- if (-1 < (j = IndexOfMedianOf3(x[i], x[i - 1], 3 * x[i - 1] - 2 * x[i - 2]))) {
- y[i] = (j == 0 ? x[i - 1] : 3 * x[i - 1] - 2 * x[i - 2]);
- chg = (y[i] != x[i]);
- }
-
- // at right:
- if (-1 < (j = IndexOfMedianOf3(x[i + 1], x[i + 2], 3 * x[i + 2] - 2 * x[i + 3]))) {
- y[i + 1] = (j == 0 ? x[i + 2] : 3 * x[i + 2] - 2 * x[i + 3]);
- chg = (y[i + 1] != x[i + 1]);
- }
- }
- }
-
- if (do_ends && SplitTest(x, n - 3)) {
- chg = true;
- y[n - 2] = x[n - 1];
- y[n - 3] = MedianOf3(x[n - 3], x[n - 4], 3 * x[n - 4] - 2 * x[n - 5]);
- }
-
- return chg;
-}
-
-template<typename T>
-size_t Smooth3RS3R(std::vector <T> &y, const std::vector <T> &x,
- unsigned end_rule = SmoothTukeyEndRule, bool split_ends = false) {
- // y[1:n] := "3R S 3R"(x[1:n]); z = "work";
- size_t iter;
- bool chg;
- size_t n = x.size();
-
- y.resize(n);
- std::vector <T> z(n), w(n);
-
- iter = Smooth3R(&x[0], &y[0], &z[0], n, end_rule);
- chg = SmoothSplit3(&y[0], &z[0], n, split_ends);
- if (chg)
- iter += Smooth3R(&z[0], &y[0], &w[0], n, end_rule);
-
- /* else y == z already */
- return (iter + chg);
-}
-
-};
-
-#endif
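
The header above implements Tukey's "3RS3R" running-median smoother: median-of-3 repeated until convergence, a split step for flat two-point plateaus, then another 3R pass. A minimal usage sketch against that header alone; the include path and the sample values are assumptions, but it shows the characteristic behaviour of flattening an isolated spike while keeping the trend:

    #include <vector>
    #include <iostream>
    #include "math/smooth.hpp"   // the header shown above; include path is assumed

    int main() {
        // A coverage-like signal with one outlier spike at index 3.
        std::vector<double> x = {10, 11, 10, 95, 12, 11, 13, 12, 14, 13};
        std::vector<double> y;
        // "3R S 3R" smoothing with the default Tukey end rule.
        auto steps = math::Smooth3RS3R(y, x);
        for (double v : y) std::cout << v << ' ';
        std::cout << "\n(" << steps << " smoothing iterations reported)\n";
        return 0;
    }

Unlike a moving average, the repeated running median is insensitive to isolated outliers, so the spike does not leak into its neighbours.
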
diff --git a/src/modules/paired_info/CMakeLists.txt b/src/modules/paired_info/CMakeLists.txt
deleted file mode 100644
index 35d1605..0000000
--- a/src/modules/paired_info/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-############################################################################
-# Copyright (c) 2015 Saint Petersburg State University
-# Copyright (c) 2011-2014 Saint Petersburg Academic University
-# All Rights Reserved
-# See file LICENSE for details.
-############################################################################
-
-project(paired_info CXX)
-
-add_library(paired_info STATIC
- bwa_pair_info_filler.cpp)
-
-target_link_libraries(paired_info input)
-
diff --git a/src/modules/paired_info/bwa_pair_info_filler.cpp b/src/modules/paired_info/bwa_pair_info_filler.cpp
deleted file mode 100644
index 6855138..0000000
--- a/src/modules/paired_info/bwa_pair_info_filler.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "bwa_pair_info_filler.hpp"
-
-
-namespace bwa_pair_info {
-
-void MapperReadT::ParseCigar(const string& cigar) {
- string num = "";
- bool left_side = true;
- for (size_t i = 0; i < cigar.length(); ++i) {
- if (isdigit(cigar[i])) {
- num += cigar[i];
- }
- else {
- if (cigar[i] == 'H') {
- if (left_side)
- left_hard_clip_ = (uint16_t) std::stoi(num);
- else
- right_hard_clip_ = (uint16_t) std::stoi(num);
- num = "";
- }
- else if (cigar[i] == 'S') {
- if (left_side)
- left_soft_clip_ = (uint16_t) std::stoi(num);
- else
- right_soft_clip_ = (uint16_t) std::stoi(num);
- num = "";
- }
- else {
- left_side = false;
- num = "";
- }
- }
- }
-}
-
-//Correct read alignment according to orientation and clippings
-void BWACorrectingProcessor::ProcessPairedRead(const MapperReadT& l, const MapperReadT& r) {
- using io::LibraryOrientation;
-
- if (!l.IsValid() || !r.IsValid()) {
- return;
- }
- ++count_;
-
- MappedPositionT left_pos(edge_id_map_.at(stoi(l.get_contig_id())), l.pos());
- MappedPositionT right_pos(edge_id_map_.at(stoi(r.get_contig_id())), r.pos());
-
- //This function is overloaded in BWAISCounter and BWAIndexFiller
- if (!CheckAlignments(left_pos, right_pos)) {
- return;
- }
-
- int r_from_pos_to_right_end = r.len() + r.right_hard_clip() - r.left_soft_clip();
- int l_from_pos_to_left_end = l.left_soft_clip() + l.left_hard_clip();
-
- if ((!l.is_forward() && (lib_.orientation() == LibraryOrientation::FF || lib_.orientation() == LibraryOrientation::FR)) ||
- (l.is_forward() && (lib_.orientation() == LibraryOrientation::RF || lib_.orientation() == LibraryOrientation::RR))) {
- left_pos.e = g_.conjugate(left_pos.e);
- left_pos.pos = (int) g_.length(left_pos.e) - left_pos.pos - (l.len() - l.left_soft_clip() - l.right_soft_clip()) + (int) g_.k();
- l_from_pos_to_left_end = l.right_soft_clip() + l.right_hard_clip();
- }
- if ((!r.is_forward() && (lib_.orientation() == LibraryOrientation::FF || lib_.orientation() == LibraryOrientation::RF)) ||
- (r.is_forward() && (lib_.orientation() == LibraryOrientation::FR || lib_.orientation() == LibraryOrientation::RR))) {
- right_pos.e = g_.conjugate(right_pos.e);
- right_pos.pos = (int) g_.length(right_pos.e) - right_pos.pos - (r.len() - r.left_soft_clip() - r.right_soft_clip()) + (int) g_.k();
- r_from_pos_to_right_end = r.len() + r.left_hard_clip() - r.right_soft_clip();
- }
-
- right_pos.pos = right_pos.pos + r_from_pos_to_right_end;
- left_pos.pos = left_pos.pos - l_from_pos_to_left_end;
-
- //This function is overloaded in BWAISCounter and BWAIndexFiller
- ProcessAlignments(left_pos, right_pos);
-}
-
-// ==== insert size counter overloads ====
-bool BWAISCounter::CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- return l.e == r.e && g_.length(l.e) >= min_contig_len_;
-}
-
-void BWAISCounter::ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- ++mapped_count_;
-
- int is = r.pos - l.pos;
- if (is > 0 || !ignore_negative_) {
- hist_[is] += 1;
- } else {
- ++negative_count_;
- }
-}
-
-bool BWAISCounter::RefineInsertSize(SequencingLibraryT& reads) const {
- using namespace omnigraph;
- size_t correctly_mapped = mapped_count_ - negative_count_;
- INFO(correctly_mapped << " paired reads (" << ((double) correctly_mapped * 100.0 / (double) count_) << "% of all) aligned to long edges");
-
- if (negative_count_ > 3 * correctly_mapped)
- WARN("Too many reads aligned with negative insert size. Is the library orientation set properly?");
- if (mapped_count_ == 0)
- return false;
-
- std::map<size_t, size_t> percentiles;
- find_mean(hist_, reads.data().mean_insert_size, reads.data().insert_size_deviation, percentiles);
- find_median(hist_, reads.data().median_insert_size, reads.data().insert_size_mad, reads.data().insert_size_distribution);
- if (reads.data().median_insert_size < reads.data().read_length) {
- return false;
- }
-
- std::tie(reads.data().insert_size_left_quantile, reads.data().insert_size_right_quantile) =
- GetISInterval(0.8, reads.data().insert_size_distribution);
-
- return !reads.data().insert_size_distribution.empty();
-}
-
-// ==== pair info index filler overloads ====
-EdgePair BWAIndexFiller::ConjugatePair(EdgePair ep) const {
- return make_pair(g_.conjugate(ep.second), g_.conjugate(ep.first));
-}
-
-void BWAIndexFiller::ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- EdgePair ep{l.e, r.e};
- TRACE("Lpos " << l.pos << ", Rpos " << r.pos);
- int edge_distance = (int) lib_.data().mean_insert_size - r.pos + l.pos;
- TRACE("Distance " << edge_distance);
-
- paired_index_.Add(ep.first, ep.second, omnigraph::de::RawPoint(edge_distance, 1.0));
-}
-
-bool BWAIndexFiller::CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) {
- return g_.length(l.e) >= min_contig_len_ && g_.length(r.e) >= min_contig_len_;
-}
-
-
-//Main class implementation
-void BWAPairInfoFiller::OutputEdges(const string &filename) const {
- io::osequencestream_simple oss(filename);
- for (auto it = g_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- debruijn_graph::EdgeId e = *it;
- oss.set_header(ToString(g_.int_id(e)));
- oss << g_.EdgeNucls(e);
- }
-}
-void BWAPairInfoFiller::FillEdgeIdMap() {
- for (auto it = g_.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
- debruijn_graph::EdgeId e = *it;
- edge_id_map_.insert(make_pair(g_.int_id(e), e));
- }
-}
-
-bool BWAPairInfoFiller::CreateIndex(const string& contigs) {
- int run_res = 0;
- string err_log = path::append_path(work_dir_, "index.err");
- string index_line = bwa_path_ + string(" index ") + "-a is " + contigs + " 2>" + err_log;
- index_line = path::screen_whitespaces(index_line);
- INFO("Running bwa index ... ");
- INFO("Command line: " << index_line);
- run_res = system(index_line.c_str());
- if (run_res != 0) {
- ERROR("bwa index failed, cannot align reads");
- return false;
- }
- return true;
-}
-
-
-bool BWAPairInfoFiller::RunBWA(const string& reads_file, const string& out_sam_file) const {
- string run_command = bwa_path_ + " mem -t " + ToString(nthreads_) + " " + index_base_ + " " + reads_file + " > " + out_sam_file + " 2>"
- + out_sam_file + ".txt";
- run_command = path::screen_whitespaces(run_command);
- INFO("Running bwa mem ...");
- INFO("Command line: " << run_command);
-
- int run_res = system(run_command.c_str());
- if (run_res != 0) {
- ERROR("bwa mem failed, cannot align reads");
- return false;
- }
- return true;
-}
-
-bool BWAPairInfoFiller::AlignLib(const SequencingLibraryT& lib,
- const string& sam_file_base,
- vector<pair<string, string>>& resulting_sam_files) {
-
- VERIFY_MSG(Init(), "BWA index was not constructed properly");
- resulting_sam_files.clear();
- size_t file_index = 0;
- bool any_aligned = false;
-
- for (auto iter = lib.paired_begin(); iter != lib.paired_end(); iter++) {
- string left_reads = iter->first;
- string left_sam = sam_file_base + "_1_" + ToString(file_index) + ".sam";
- bool res = RunBWA(left_reads, left_sam);
- if (!res) {
- WARN("Failed to align left reads " << left_reads);
- continue;
- }
- string right_reads = iter->second;
- string right_sam = sam_file_base + "_2_" + ToString(file_index) + ".sam";
- res = RunBWA(right_reads, right_sam);
- if (!res) {
- WARN("Failed to align right reads " << right_reads);
- continue;
- }
-
- resulting_sam_files.push_back(make_pair(left_sam, right_sam));
- any_aligned = true;
- }
- return any_aligned;
-}
-
-
-void BWAPairInfoFiller::ProcessSAMFiles(const string &left_sam, const string &right_sam,
- BWAPairedReadProcessor& processor) {
-
- //Left and right reads are stored in maps until a pair is detected
- unordered_map<string, MapperReadT> left_reads;
- unordered_map<string, MapperReadT> right_reads;
- size_t counter = 0;
- //Check for duplicated read IDs
- bool left_duplicated = false;
- bool right_duplicated = false;
-
- INFO("Reading SAM files " << left_sam << " and " << right_sam);
- MappedSamStream lf(left_sam);
- MappedSamStream rf(right_sam);
- while (!lf.eof() || !rf.eof()) {
- SingleSamRead left_read;
- MapperReadT left_data;
- string l_name = "";
-
- SingleSamRead right_read;
- MapperReadT right_data;
- string r_name = "";
-
- if (!lf.eof()) {
- lf >> left_read;
- l_name = left_read.name();
- if (left_read.is_properly_aligned()) {
- TRACE("Left read " << l_name);
- left_data = MapperReadT(string(lf.get_contig_name(left_read.contig_id())),
- left_read.pos(),
- left_read.data_len(),
- left_read.strand(),
- left_read.cigar());
- }
- else if (!left_read.is_main_alignment()) {
- //Ignore the mapping if this is not the primary alignment
- TRACE("Ignoring left read");
- l_name = "";
- }
- }
- if (!rf.eof()) {
- rf >> right_read;
- r_name = right_read.name();
- if (right_read.is_properly_aligned()) {
- TRACE("Right read " << r_name);
- right_data = MapperReadT(string(rf.get_contig_name(right_read.contig_id())),
- right_read.pos(),
- right_read.data_len(),
- right_read.strand(),
- right_read.cigar());
- }
- else if (!right_read.is_main_alignment()) {
- //Ignore the mapping if this is not the primary alignment
- TRACE("Ignoring right read");
- r_name = "";
- }
- }
-
- //Think about custom read names
- if (l_name == r_name) {
- TRACE("Equal processing");
- //Process immediately if IDs are equal in both SAM entries
- processor.ProcessPairedRead(left_data, right_data);
- VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
- continue;
- }
-
- if (r_name != "") {
- auto it = left_reads.find(r_name);
- if (it != left_reads.end()) {
- //Right read's mate found in map
- TRACE("Right read's mate found, processing");
- processor.ProcessPairedRead(it->second, right_data);
- VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
- //Remove mate as used
- left_reads.erase(it);
- }
- else {
- TRACE("Right read's mate not found, adding to map");
- if (right_reads.count(r_name) == 0) {
- //Insert read without mate for further analysis
- //TODO inspect map size and performance
- right_reads.emplace(r_name, right_data);
- } else {
- DEBUG("Right read " << r_name << " is duplicated!");
- //Report duplication
- right_duplicated = true;
- }
- }
- }
-
- if (l_name != "") {
- auto it = right_reads.find(l_name);
- if (it != right_reads.end()) {
- //Left read's mate found in map
- TRACE("Left read's mate found, processing");
- processor.ProcessPairedRead(left_data, it->second);
- VERBOSE_POWER2(++counter, "Processed " << counter << " paired reads");
- //Remove mate as used
- right_reads.erase(it);
- }
- else {
- TRACE("Left read's mate not found, adding to map");
- if (left_reads.count(l_name) == 0) {
- //Insert read without mate for further analysis
- //TODO inspect map size and performance
- left_reads.emplace(l_name, left_data);
- } else {
- DEBUG("Left read " << l_name << " is duplicated!");
- //Report duplication
- left_duplicated = true;
- }
-
- }
- }
- }
-
- if (left_duplicated)
- WARN("SAM file " << left_sam << " contains duplicated read ids");
- if (right_duplicated)
- WARN("SAM file " << right_sam << " contains duplicated read ids");
-}
-
-bool BWAPairInfoFiller::Init() {
- if (!index_constructed_) {
- INFO("Initializing bwa pair info counter, working dir " << work_dir_);
- path::make_dir(base_dir_);
- work_dir_ = path::make_temp_dir(base_dir_, "");
- index_base_= path::append_path(work_dir_, "long_edges.fasta");
- INFO("Saving edges to " << index_base_);
- OutputEdges(index_base_);
- FillEdgeIdMap();
- index_constructed_ = CreateIndex(index_base_);
- }
- return index_constructed_;
-}
-
-bool BWAPairInfoFiller::ProcessLib(size_t lib_index,
- SequencingLibraryT& lib,
- PairedInfoIndexT& paired_index,
- size_t counter_edge_len,
- size_t index_filler_edge_len) {
- //Initialize if needed
- Init();
- string lib_dir = path::append_path(work_dir_, ToString(lib_index));
- path::make_dir(lib_dir);
- vector<pair<string, string>> sam_files;
- bool result = false;
-
- INFO("Mapping lib #" << lib_index << " using BWA");
- if (!AlignLib(lib, path::append_path(lib_dir, "single"), sam_files)) {
- WARN("Failed to align lib #" << lib_index);
- return false;
- }
-
- INFO("Estimating insert size for library #" << lib_index);
- BWAISCounter counter(lib, edge_id_map_, g_, counter_edge_len);
- for (const auto& sam_pair : sam_files) {
- ProcessSAMFiles(sam_pair.first, sam_pair.second, counter);
- }
-
- if (!counter.RefineInsertSize(lib)) {
- lib.data().mean_insert_size = 0.0;
- WARN("Unable to estimate insert size for paired library #" << lib_index);
- }
- else {
- INFO(" Estimated insert size for paired library #" << lib_index);
- INFO(" Insert size = " << lib.data().mean_insert_size <<
- ", deviation = " << lib.data().insert_size_deviation <<
- ", left quantile = " << lib.data().insert_size_left_quantile <<
- ", right quantile = " << lib.data().insert_size_right_quantile <<
- ", read length = " << lib.data().read_length);
-
- INFO("Collecting paired information for library #" << lib_index);
- paired_index.Init();
-
- BWAIndexFiller filler(lib, edge_id_map_, g_, paired_index, index_filler_edge_len);
- for (const auto& sam_pair : sam_files) {
- ProcessSAMFiles(sam_pair.first, sam_pair.second, filler);
- }
- result = true;
- }
- if (remove_tmp_files_)
- path::remove_dir(lib_dir);
- return result;
-}
-
-
-}
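
MapperReadT::ParseCigar above only extracts clip lengths from a CIGAR string: digits accumulate into a number, 'H'/'S' operations seen before any other operation are recorded as left hard/soft clips, and once some other operation (e.g. 'M') appears, later clips count on the right side. A standalone restatement of that logic, not the project's API, just to make the behaviour concrete:

    #include <cctype>
    #include <cstdint>
    #include <iostream>
    #include <string>

    struct Clips { uint16_t left_hard = 0, right_hard = 0, left_soft = 0, right_soft = 0; };

    // Mirrors the clip extraction performed by MapperReadT::ParseCigar above.
    Clips ParseClips(const std::string& cigar) {
        Clips c;
        std::string num;
        bool left_side = true;
        for (char ch : cigar) {
            if (std::isdigit(static_cast<unsigned char>(ch))) { num += ch; continue; }
            if (ch == 'H')
                (left_side ? c.left_hard : c.right_hard) = (uint16_t) std::stoi(num);
            else if (ch == 'S')
                (left_side ? c.left_soft : c.right_soft) = (uint16_t) std::stoi(num);
            else
                left_side = false;   // any aligned operation switches to the right side
            num.clear();
        }
        return c;
    }

    int main() {
        Clips c = ParseClips("5S90M3H");
        std::cout << "left soft " << c.left_soft << ", right hard " << c.right_hard << "\n";  // 5 and 3
    }
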
diff --git a/src/modules/paired_info/bwa_pair_info_filler.hpp b/src/modules/paired_info/bwa_pair_info_filler.hpp
deleted file mode 100644
index 438fafe..0000000
--- a/src/modules/paired_info/bwa_pair_info_filler.hpp
+++ /dev/null
@@ -1,253 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "assembly_graph/graph_core/graph.hpp"
-#include "pipeline/config_struct.hpp"
-
-#include <io/sam_io/sam_reader.hpp>
-#include <io/sam_io/read.hpp>
-
-#include <io/reads_io/osequencestream.hpp>
-#include <paired_info/paired_info.hpp>
-#include <paired_info/insert_size_refiner.hpp>
-
-#ifndef PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
-#define PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
-
-namespace bwa_pair_info {
-
-using namespace sam_reader;
-using debruijn_graph::EdgeId;
-
-typedef omnigraph::de::UnclusteredPairedInfoIndexT<debruijn_graph::Graph> PairedInfoIndexT;
-typedef io::SequencingLibrary<debruijn_graph::config::DataSetData> SequencingLibraryT;
-typedef std::pair<debruijn_graph::EdgeId, debruijn_graph::EdgeId> EdgePair;
-typedef unordered_map<size_t, debruijn_graph::EdgeId> EdgeIdMap;
-
-//More compact representation of an aligned read for storing in a map
-class MapperReadT {
-public:
- MapperReadT(): contig_id_(""), pos_(-1), len_(-1), is_forward_(true),
- left_hard_clip_(0), right_hard_clip_(0), left_soft_clip_(0), right_soft_clip_(0){}
-
- MapperReadT(const string& ctg_id, int32_t pos, int32_t len, bool is_forward, const string& cigar):
- contig_id_(ctg_id), pos_(pos), len_(len), is_forward_(is_forward),
- left_hard_clip_(0), right_hard_clip_(0), left_soft_clip_(0), right_soft_clip_(0) {
-
- ParseCigar(cigar);
- }
-
- bool IsValid() const {
- return contig_id_ != "";
- }
-
-private:
-
- void ParseCigar(const string& cigar);
-
-public:
- const string &get_contig_id() const {
- return contig_id_;
- }
- int32_t pos() const {
- return pos_;
- }
- int32_t len() const {
- return len_;
- }
- bool is_forward() const {
- return is_forward_;
- }
- uint32_t left_soft_clip() const {
- return left_soft_clip_;
- }
- uint32_t right_soft_clip() const {
- return right_soft_clip_;
- }
- uint32_t left_hard_clip() const {
- return left_hard_clip_;
- }
- uint32_t right_hard_clip() const {
- return right_hard_clip_;
- }
-
-private:
- string contig_id_;
- int32_t pos_;
- int32_t len_;
- bool is_forward_;
- uint32_t left_hard_clip_:16, right_hard_clip_:16;
- uint32_t left_soft_clip_:16, right_soft_clip_:16;
-};
-
-//Base class for aligned read processor (simple analog of SequenceMapperListener)
-class BWAPairedReadProcessor {
-public:
- virtual void ProcessPairedRead(const MapperReadT& l, const MapperReadT& r) = 0;
-
- virtual ~BWAPairedReadProcessor() {
-
- }
-};
-
-//Class that corrects mapping positions according to lib orientation and clippings
-class BWACorrectingProcessor: public BWAPairedReadProcessor {
-protected:
- const SequencingLibraryT& lib_;
-
- const EdgeIdMap& edge_id_map_;
-
- const debruijn_graph::Graph& g_;
-
- size_t count_;
-
-public:
-
- struct MappedPositionT {
- EdgeId e;
- int pos;
-
- MappedPositionT(EdgeId e_, int pos_): e(e_), pos(pos_) {
-
- }
- };
-
- BWACorrectingProcessor(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g):
- lib_(lib), edge_id_map_(edge_id_map), g_(g), count_(0) {
- }
-
- virtual bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) = 0;
-
- virtual void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) = 0;
-//Correct read alignment according to orientation and clippings
- virtual void ProcessPairedRead(const MapperReadT& l, const MapperReadT& r);
-};
-
-//Insert size counter
-class BWAISCounter: public BWACorrectingProcessor {
-private:
- HistType hist_;
- size_t min_contig_len_;
- bool ignore_negative_;
- size_t mapped_count_;
- size_t negative_count_;
-
-public:
- BWAISCounter(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g,
- size_t min_contig_len, bool ignore_negative = false):
- BWACorrectingProcessor(lib, edge_id_map, g), hist_(), min_contig_len_(min_contig_len),
- ignore_negative_(ignore_negative), mapped_count_(0), negative_count_(0) {
- }
-
- bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-
- void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-
- bool RefineInsertSize(SequencingLibraryT& reads) const ;
-
-};
-
-//Pair info filler
-class BWAIndexFiller: public BWACorrectingProcessor {
-
-private:
- PairedInfoIndexT& paired_index_;
-
- size_t min_contig_len_;
-
- EdgePair ConjugatePair(EdgePair ep) const;
-
-public:
- BWAIndexFiller(const SequencingLibraryT& lib, const EdgeIdMap& edge_id_map, const debruijn_graph::Graph& g,
- PairedInfoIndexT& paired_index, size_t min_contig_len = 0):
- BWACorrectingProcessor(lib, edge_id_map, g), paired_index_(paired_index), min_contig_len_(min_contig_len) {
- }
-
- bool CheckAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-
- void ProcessAlignments(const MappedPositionT& l, const MappedPositionT& r) override;
-};
-
-//Class for running BWA, managing and parsing SAM files
-class BWAPairInfoFiller {
-public:
- DECL_LOGGER("BWAPairInfo");
-
-private:
- const debruijn_graph::Graph& g_;
-
- string bwa_path_;
-
- string base_dir_;
-
- string work_dir_;
-
- size_t nthreads_;
-
- string index_base_;
-
- bool index_constructed_;
-
- bool remove_tmp_files_;
-
- unordered_map<size_t, debruijn_graph::EdgeId> edge_id_map_;
-
-private:
-
- //Save graph edges in FASTA format
- void OutputEdges(const string& filename) const;
-
- //Construct int_id -> EdgeId map
- void FillEdgeIdMap();
-
- //Run bwa index
- bool CreateIndex(const string& contigs);
-
- //Initialize for read alignment (includes all of the above)
- bool Init();
-
- //Run bwa mem on single file
- bool RunBWA(const string& reads_file, const string& out_sam_file) const;
-
- //Process single read library
- bool AlignLib(const SequencingLibraryT& lib,
- const string& sam_file_base,
- vector<pair<string, string>>& resulting_sam_files);
-
- //Parse a pair of SAM files and analyze alignments with the processor
- void ProcessSAMFiles(const string &left_sam, const string &right_sam,
- BWAPairedReadProcessor& processor);
-
-public:
-
- BWAPairInfoFiller(const debruijn_graph::Graph& g,
- const string& bwa_path,
- const string& work_dir,
- size_t nthreads = 1,
- bool remove_tmp = true):
- g_(g), bwa_path_(bwa_path), base_dir_(work_dir), work_dir_(""),
- nthreads_(nthreads), index_base_(""), index_constructed_(false),
- remove_tmp_files_(remove_tmp),
- edge_id_map_() {
- }
-
- ~BWAPairInfoFiller() {
- if (remove_tmp_files_)
- path::remove_if_exists(work_dir_);
- }
-
- //Count IS and fill pair info index for the given lib
- bool ProcessLib(size_t lib_index,
- SequencingLibraryT& lib,
- PairedInfoIndexT& paired_index,
- size_t counter_edge_len,
- size_t index_filler_edge_len);
-};
-
-}
-
-#endif //PROJECT_BWA_PAIR_INFO_FILLER_HPP_H
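
ProcessSAMFiles, declared above and implemented in the .cpp, pairs the two SAM streams by read name: records whose names match at the same position are processed immediately, everything else is parked in a per-side map keyed by name until its mate shows up. A self-contained sketch of that matching strategy, with plain strings standing in for SAM records (names and payloads are made up):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Pairs two (read_name, payload) streams by name, the way ProcessSAMFiles
    // pairs left/right SAM records.
    using Rec = std::pair<std::string, std::string>;

    void PairByName(const std::vector<Rec>& left, const std::vector<Rec>& right) {
        std::unordered_map<std::string, std::string> left_waiting, right_waiting;
        size_t n = std::max(left.size(), right.size());
        for (size_t i = 0; i < n; ++i) {
            std::string l_name = i < left.size() ? left[i].first : "";
            std::string r_name = i < right.size() ? right[i].first : "";
            if (!l_name.empty() && l_name == r_name) {          // same position, same name
                std::cout << "pair " << l_name << "\n";
                continue;
            }
            if (!r_name.empty()) {
                auto it = left_waiting.find(r_name);
                if (it != left_waiting.end()) { std::cout << "pair " << r_name << "\n"; left_waiting.erase(it); }
                else right_waiting.emplace(r_name, right[i].second);
            }
            if (!l_name.empty()) {
                auto it = right_waiting.find(l_name);
                if (it != right_waiting.end()) { std::cout << "pair " << l_name << "\n"; right_waiting.erase(it); }
                else left_waiting.emplace(l_name, left[i].second);
            }
        }
    }

    int main() {
        PairByName({{"r1", "L1"}, {"r2", "L2"}, {"r3", "L3"}},
                   {{"r2", "R2"}, {"r1", "R1"}, {"r3", "R3"}});   // prints pair r1, pair r2, pair r3
    }
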
diff --git a/src/modules/paired_info/data_divider.hpp b/src/modules/paired_info/data_divider.hpp
deleted file mode 100644
index 7bd2c7b..0000000
--- a/src/modules/paired_info/data_divider.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
-* data_divider.hpp
-*
-* Created on: Aug 16, 2011
-* Author: alexeyka
-*/
-
-
-#ifndef DATA_DIVIDER_HPP_
-#define DATA_DIVIDER_HPP_
-
-#include <iostream>
-#include <math.h>
-#include "dev_support/verify.hpp"
-#include <vector>
-#include <utility>
-#include <cstdlib>
-#include <cstdio>
-#include "index_point.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class EdgeId>
-class DataDivider {
- typedef pair<size_t, size_t> Interval;
- typedef vector<PairInfo<EdgeId> > PairInfos;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<Point> PointArray;
- typedef std::function<double(int)> WeightFunction;
-
- // double LeftDerivative(int index, vector<int> x, vector<int> y) {
- // return outf[dist - min_value_ + 1][0] - outf[dist - min][0];
- // }
- //
- // double RightDerivative(index, std::vector<int> x, std::vector<int> y) {
- // return outf[dist - min_value_][0] - outf[dist - min - 1][0];
- // }
- //
- // double MiddleDerivative(int index, std::vector<int> x, std::vector<int> y) {
- // return 0.5f * (outf[dist - min_value_ + 1][0] - outf[dist - min - 1][0]);
- // }
-
-public:
- DataDivider(size_t threshold, const PointArray &points) :
- threshold_(threshold), points_(points) {
- }
-
- vector<Interval> DivideData() {
- VERIFY(points_.size() > 0);
- vector<Interval> answer;
- min_value_ = rounded_d(points_.front());
- max_value_ = rounded_d(points_.back());
- size_t begin = 0;
- for (size_t i = 0; i < points_.size() - 1; ++i) {
- if (IsANewCluster(i, points_)) {
- answer.push_back(make_pair(begin, i + 1));
- begin = i + 1;
- }
- }
- answer.push_back(make_pair(begin, points_.size()));
-
- return answer;
- }
-
- vector<Interval> DivideAndSmoothData(const EdgePair &ep,
- PairInfos &new_data,
- WeightFunction weight_f) {
- VERIFY(points_.size() > 0);
- vector<Interval> answer;
-
- TRACE("Data");
- //Print();
- const Point &point = points_.front();
- min_value_ = rounded_d(point);
- max_value_ = rounded_d(points_.back());
- size_t begin = 0;
- for (size_t i = 0; i < points_.size(); ++i) {
- if (i == points_.size() - 1 || IsANewCluster(i)) {
- int low_val = rounded_d(points_[begin]);
- int high_val = rounded_d(points_[i]);
- size_t new_begin = new_data.size();
- VERIFY(low_val <= high_val);
- for (int j = low_val; j <= high_val; ++j) {
- double val = 0.;
- for (size_t k = begin; k <= i; ++k) {
- val += points_[k].weight * weight_f(j - rounded_d(points_[k]));
- }
- new_data.push_back(PairInfo<EdgeId>(ep.first, ep.second, j, val, 0.));
- }
- size_t new_end = new_data.size();
- answer.push_back(make_pair(new_begin, new_end));
-
- begin = i + 1;
- }
- }
- //answer.push_back(make_pair(beginc, new_data.size()));
- TRACE("New_data ");
- Print();
-
- return answer;
- }
-
-private:
- int min_value_;
- int max_value_;
- size_t threshold_;
- PointArray points_;
-
- void Print() const {
- for (size_t i = 0; i < points_.size(); ++i) {
- TRACE(points_[i].d << " " << points_[i].weight);
- }
- }
-
- bool IsANewCluster(size_t index) {
- VERIFY(index < points_.size() - 1);
- return (math::gr(abs(points_[index + 1].d - points_[index].d), (DEDistance) threshold_));
- }
-
- DECL_LOGGER("DataDivider");
-};
-
-}
-
-
-}
-
-#endif /* DATA_DIVIDER_HPP_ */
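
DataDivider::DivideData above clusters a sorted array of paired-info points by cutting wherever the distance gap between neighbours exceeds the threshold. A standalone sketch of the same rule on bare integer distances (weights and the PairInfo machinery omitted):

    #include <cstdlib>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Splits a sorted list of distances into [begin, end) index intervals whenever
    // the gap between neighbours exceeds `threshold`, the rule used by DivideData above.
    std::vector<std::pair<size_t, size_t>> Divide(const std::vector<int>& d, int threshold) {
        std::vector<std::pair<size_t, size_t>> intervals;
        size_t begin = 0;
        for (size_t i = 0; i + 1 < d.size(); ++i) {
            if (std::abs(d[i + 1] - d[i]) > threshold) {
                intervals.emplace_back(begin, i + 1);
                begin = i + 1;
            }
        }
        intervals.emplace_back(begin, d.size());
        return intervals;
    }

    int main() {
        // Distances 100..103 form one cluster, 250..252 another (threshold 10).
        for (auto iv : Divide({100, 101, 103, 250, 251, 252}, 10))
            std::cout << "[" << iv.first << ", " << iv.second << ") ";
        std::cout << "\n";   // prints [0, 3) [3, 6)
    }
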
diff --git a/src/modules/paired_info/distance_estimation.hpp b/src/modules/paired_info/distance_estimation.hpp
deleted file mode 100644
index 7143ef3..0000000
--- a/src/modules/paired_info/distance_estimation.hpp
+++ /dev/null
@@ -1,309 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef DISTANCE_ESTIMATION_HPP_
-#define DISTANCE_ESTIMATION_HPP_
-
-#include "math/xmath.h"
-#include "dev_support/openmp_wrapper.h"
-
-#include "paired_info.hpp"
-#include "assembly_graph/paths/path_processor.hpp"
-#include "paired_info/pair_info_bounds.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-//todo move to some more common place
-template<class Graph>
-class GraphDistanceFinder {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- typedef std::vector<EdgeId> Path;
- typedef std::vector<size_t> GraphLengths;
- typedef std::map<EdgeId, GraphLengths> LengthMap;
-
-public:
- GraphDistanceFinder(const Graph &graph, size_t insert_size, size_t read_length, size_t delta) :
- graph_(graph), insert_size_(insert_size), gap_((int) (insert_size - 2 * read_length)),
- delta_((double) delta) { }
-
- std::vector<size_t> GetGraphDistancesLengths(EdgeId e1, EdgeId e2) const {
- LengthMap m;
- m.insert({e2, {}});
-
- FillGraphDistancesLengths(e1, m);
-
- return m[e2];
- }
-
- // finds all distances from a current edge to a set of edges
- void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const {
- vector<VertexId> end_points;
- vector<size_t> path_lower_bounds;
- for (const auto &entry : second_edges) {
- EdgeId second_edge = entry.first;
- end_points.push_back(graph_.EdgeStart(second_edge));
- path_lower_bounds.push_back(PairInfoPathLengthLowerBound(graph_.k(), graph_.length(e1),
- graph_.length(second_edge), gap_, delta_));
- TRACE("Bounds for paths are " << path_lower_bounds.back());
- }
-
- size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), insert_size_, delta_);
-
- DistancesLengthsCallback<Graph> callback(graph_);
-
- PathProcessor<Graph> paths_proc(graph_, graph_.EdgeEnd(e1), path_upper_bound);
-
- for (size_t i = 0; i < end_points.size(); ++i) {
- //FIXME should max dist also depend on the point?
- paths_proc.Process(end_points[i], path_lower_bounds[i], path_upper_bound, callback);
- }
-
- vector<GraphLengths> result;
-
- size_t i = 0;
- for (auto &entry : second_edges) {
- GraphLengths lengths = callback.distances(i++);
- for (size_t j = 0; j < lengths.size(); ++j) {
- lengths[j] += graph_.length(e1);
- TRACE("Resulting distance set # " << i <<
- " edge " << graph_.int_id(entry.first) << " #" << j << " length " << lengths[j]);
- }
-
- if (e1 == entry.first)
- lengths.push_back(0);
-
- std::sort(lengths.begin(), lengths.end());
- entry.second = lengths;
- }
- }
-
-private:
- DECL_LOGGER("GraphDistanceFinder");
-
- const Graph &graph_;
- const size_t insert_size_;
- const int gap_;
- const double delta_;
-};
-
-template<class Graph>
-class AbstractDistanceEstimator {
-protected:
- typedef UnclusteredPairedInfoIndexT<Graph> InPairedIndex;
- typedef PairedInfoIndexT<Graph> OutPairedIndex;
- typedef typename InPairedIndex::HistProxy InHistogram;
- typedef typename OutPairedIndex::Histogram OutHistogram;
-
-public:
- AbstractDistanceEstimator(const Graph &graph,
- const InPairedIndex &index,
- const GraphDistanceFinder<Graph> &distance_finder,
- size_t linkage_distance = 0)
- : graph_(graph), index_(index),
- distance_finder_(distance_finder), linkage_distance_(linkage_distance) { }
-
- virtual void Estimate(PairedInfoIndexT<Graph> &result, size_t nthreads) const = 0;
-
- virtual ~AbstractDistanceEstimator() { }
-
-protected:
- typedef typename Graph::EdgeId EdgeId;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<pair<int, double> > EstimHist;
- typedef vector<size_t> GraphLengths;
- typedef std::map<EdgeId, GraphLengths> LengthMap;
-
- const Graph &graph() const { return graph_; }
-
- const InPairedIndex &index() const { return index_; }
-
- void FillGraphDistancesLengths(EdgeId e1, LengthMap &second_edges) const {
- distance_finder_.FillGraphDistancesLengths(e1, second_edges);
- }
-
- OutHistogram ClusterResult(EdgePair /*ep*/, const EstimHist &estimated) const {
- OutHistogram result;
- for (size_t i = 0; i < estimated.size(); ++i) {
- size_t left = i;
- double weight = estimated[i].second;
- while (i + 1 < estimated.size() &&
- (estimated[i + 1].first - estimated[i].first) <= (int) linkage_distance_) {
- ++i;
- weight += estimated[i].second;
- }
- double center = (estimated[left].first + estimated[i].first) * 0.5;
- double var = (estimated[i].first - estimated[left].first) * 0.5;
- result.insert(Point(center, weight, var));
- }
- return result;
- }
-
- void AddToResult(const OutHistogram &clustered, EdgePair ep, PairedInfoBuffer<Graph> &result) const {
- result.AddMany(ep.first, ep.second, clustered);
- }
-
-private:
- const Graph &graph_;
- const InPairedIndex &index_;
- const GraphDistanceFinder<Graph> &distance_finder_;
- const size_t linkage_distance_;
-
- virtual const string Name() const = 0;
-};
-
-template<class Graph>
-class DistanceEstimator : public AbstractDistanceEstimator<Graph> {
- typedef AbstractDistanceEstimator<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef vector<size_t> GraphLengths;
- typedef vector<pair<int, double> > EstimHist;
- typedef pair<EdgeId, EdgeId> EdgePair;
-
-protected:
- typedef typename base::InPairedIndex InPairedIndex;
- typedef typename base::OutPairedIndex OutPairedIndex;
- typedef typename base::InHistogram InHistogram;
- typedef typename base::OutHistogram OutHistogram;
-
-public:
- DistanceEstimator(const Graph &graph,
- const InPairedIndex &index,
- const GraphDistanceFinder<Graph> &distance_finder,
- size_t linkage_distance, size_t max_distance)
- : base(graph, index, distance_finder, linkage_distance), max_distance_(max_distance) { }
-
- virtual ~DistanceEstimator() { }
-
- void Init() const {
- INFO("Using " << this->Name() << " distance estimator");
- }
-
- virtual void Estimate(OutPairedIndex &result, size_t nthreads) const {
- this->Init();
- const auto &index = this->index();
-
- DEBUG("Collecting edge infos");
- std::vector<EdgeId> edges;
- for (auto it = this->graph().ConstEdgeBegin(); !it.IsEnd(); ++it)
- edges.push_back(*it);
-
- DEBUG("Processing");
- PairedInfoBuffersT<Graph> buffer(this->graph(), nthreads);
-# pragma omp parallel for num_threads(nthreads) schedule(guided, 10)
- for (size_t i = 0; i < edges.size(); ++i) {
- EdgeId edge = edges[i];
- ProcessEdge(edge, index, buffer[omp_get_thread_num()]);
- }
-
- for (size_t i = 0; i < nthreads; ++i) {
- result.Merge(buffer[i]);
- buffer[i].Clear();
- }
- }
-
-protected:
- const DEDistance max_distance_;
-
- virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
- const InHistogram &histogram,
- const GraphLengths &raw_forward) const {
- using std::abs;
- using namespace math;
- EdgeId e1 = ep.first, e2 = ep.second;
- size_t first_len = this->graph().length(e1), second_len = this->graph().length(e2);
- int minD = rounded_d(histogram.min()), maxD = rounded_d(histogram.max());
-
- TRACE("Bounds are " << minD << " " << maxD);
- EstimHist result;
- vector<int> forward;
- forward.reserve(raw_forward.size());
- for (auto raw_length : raw_forward) {
- int length = int(raw_length);
- if (minD - int(max_distance_) <= length && length <= maxD + int(max_distance_))
- forward.push_back(length);
- }
- if (forward.size() == 0)
- return result;
-
- size_t cur_dist = 0;
- vector<DEWeight> weights(forward.size(), 0);
- for (auto point : histogram) {
- if (ls(2 * point.d + second_len, DEDistance(first_len)))
- continue;
- while (cur_dist + 1 < forward.size() && forward[cur_dist + 1] < point.d)
- ++cur_dist;
-
- if (cur_dist + 1 < forward.size() &&
- ls(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
- ++cur_dist;
-
- if (le(abs(forward[cur_dist] - point.d), max_distance_))
- weights[cur_dist] += point.weight;
- } else if (cur_dist + 1 < forward.size() &&
- eq(forward[cur_dist + 1] - point.d, point.d - forward[cur_dist])) {
- if (le(abs(forward[cur_dist] - point.d), max_distance_))
- weights[cur_dist] += point.weight * 0.5;
- ++cur_dist;
- if (le(abs(forward[cur_dist] - point.d), max_distance_))
- weights[cur_dist] += point.weight * 0.5;
- } else {
- if (le(abs(forward[cur_dist] - point.d), max_distance_))
- weights[cur_dist] += point.weight;
- }
- }
-
- for (size_t i = 0; i < forward.size(); ++i)
- if (ge(weights[i], DEWeight(0)))
- result.push_back(make_pair(forward[i], weights[i]));
-
- VERIFY(result.size() == forward.size());
- return result;
- }
-
-private:
- virtual void ProcessEdge(EdgeId e1,
- const InPairedIndex &pi,
- PairedInfoBuffer<Graph> &result) const {
- typename base::LengthMap second_edges;
- auto inner_map = pi.GetHalf(e1);
- for (auto i : inner_map)
- second_edges[i.first];
-
- this->FillGraphDistancesLengths(e1, second_edges);
-
- for (const auto &entry: second_edges) {
- EdgeId e2 = entry.first;
- EdgePair ep(e1, e2);
-
- VERIFY(ep <= pi.ConjugatePair(ep));
-
- const GraphLengths &forward = entry.second;
- TRACE("Edge pair is " << this->graph().int_id(ep.first)
- << " " << this->graph().int_id(ep.second));
- auto hist = pi.Get(e1, e2);
- const EstimHist &estimated = this->EstimateEdgePairDistances(ep, hist, forward);
- OutHistogram res = this->ClusterResult(ep, estimated);
- this->AddToResult(res, ep, result);
- }
- }
-
- virtual const string Name() const {
- static const string my_name = "SIMPLE";
- return my_name;
- }
-
- DECL_LOGGER("DistanceEstimator");
-};
-
-}
-
-}
-
-#endif /* DISTANCE_ESTIMATION_HPP_ */
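
AbstractDistanceEstimator::ClusterResult above collapses runs of estimated distances whose consecutive gaps stay within linkage_distance into a single point: the centre is the midpoint of the run, the weight is the sum, and the variance is half the run width. A standalone sketch of that greedy clustering (the sample numbers are invented):

    #include <iostream>
    #include <utility>
    #include <vector>

    struct ClusteredPoint { double center, weight, var; };

    // Greedy left-to-right clustering of (distance, weight) estimates, as in
    // ClusterResult above: runs whose consecutive gaps are <= linkage_distance
    // collapse into one weighted point.
    std::vector<ClusteredPoint> Cluster(const std::vector<std::pair<int, double>>& est,
                                        int linkage_distance) {
        std::vector<ClusteredPoint> result;
        for (size_t i = 0; i < est.size(); ++i) {
            size_t left = i;
            double weight = est[i].second;
            while (i + 1 < est.size() && est[i + 1].first - est[i].first <= linkage_distance) {
                ++i;
                weight += est[i].second;
            }
            double center = (est[left].first + est[i].first) * 0.5;
            double var = (est[i].first - est[left].first) * 0.5;
            result.push_back({center, weight, var});
        }
        return result;
    }

    int main() {
        // 248/250/252 merge into one cluster centred at 250; 400 stays alone.
        for (auto p : Cluster({{248, 1.0}, {250, 2.0}, {252, 1.0}, {400, 0.5}}, 5))
            std::cout << "d=" << p.center << " w=" << p.weight << " var=" << p.var << "\n";
    }
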
diff --git a/src/modules/paired_info/histogram.hpp b/src/modules/paired_info/histogram.hpp
deleted file mode 100644
index c326f6e..0000000
--- a/src/modules/paired_info/histogram.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015-2016 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <btree/btree_set.h>
-#include "utils/adt/flat_set.hpp"
-#include "utils/adt/small_pod_vector.hpp"
-#include "index_point.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class Point>
-class Histogram {
- typedef Histogram<Point> self_type;
- typedef typename std::less<Point> key_compare;
- typedef typename std::allocator<Point> allocator_type;
- typedef typename adt::flat_set<Point, key_compare, adt::SmallPODVector> Tree;
-
-public:
- typedef typename Tree::key_type key_type;
- typedef typename Tree::value_type value_type;
- typedef typename Tree::pointer pointer;
- typedef typename Tree::const_pointer const_pointer;
- typedef typename Tree::reference reference;
- typedef typename Tree::const_reference const_reference;
- typedef typename Tree::size_type size_type;
- typedef typename Tree::difference_type difference_type;
- typedef typename Tree::iterator iterator;
- typedef typename Tree::const_iterator const_iterator;
- typedef typename Tree::reverse_iterator reverse_iterator;
- typedef typename Tree::const_reverse_iterator const_reverse_iterator;
-
- enum {
- kValueSize = sizeof(Point)
- };
-
-public:
- // Default constructor.
- Histogram() = default;
-
- // Copy constructor.
- Histogram(const self_type &x)
- : tree_(x.tree_) {}
-
- template <class InputIterator>
- Histogram(InputIterator b, InputIterator e) {
- insert(b, e);
- }
-
- // Iterator routines.
- iterator begin() { return tree_.begin(); }
- const_iterator begin() const { return tree_.begin(); }
- iterator end() { return tree_.end(); }
- const_iterator end() const { return tree_.end(); }
- reverse_iterator rbegin() { return tree_.rbegin(); }
- const_reverse_iterator rbegin() const { return tree_.rbegin(); }
- reverse_iterator rend() { return tree_.rend(); }
- const_reverse_iterator rend() const { return tree_.rend(); }
-
- // Lookup routines.
- iterator lower_bound(const key_type &key) { return tree_.lower_bound(key); }
- const_iterator lower_bound(const key_type &key) const { return tree_.lower_bound(key); }
- iterator upper_bound(const key_type &key) { return tree_.upper_bound(key); }
- const_iterator upper_bound(const key_type &key) const { return tree_.upper_bound(key); }
- std::pair<iterator,iterator> equal_range(const key_type &key) { return tree_.equal_range(key); }
- std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const { return tree_.equal_range(key); }
-
- // Utility routines.
- void clear() { tree_.clear(); }
- void swap(self_type &x) { tree_.swap(x.tree_); }
-
- // Size routines.
- size_type size() const { return tree_.size(); }
- size_type max_size() const { return tree_.max_size(); }
- bool empty() const { return tree_.empty(); }
- size_type bytes_used() const { return tree_.bytes_used(); }
-
- // Lookup routines.
- iterator find(const key_type &key) { return tree_.find(key); }
- const_iterator find(const key_type &key) const { return tree_.find(key); }
- size_type count(const key_type &key) const { return tree_.count(key); }
-
- // Insertion routines.
- std::pair<iterator,bool> insert(const value_type &x) { return tree_.insert(x); }
- iterator insert(iterator position, const value_type &x) { return tree_.insert(position, x); }
- template <typename InputIterator>
- void insert(InputIterator b, InputIterator e) { tree_.insert(b, e); }
-
- // Deletion routines.
- size_type erase(const key_type &key) { return tree_.erase(key); }
- // Erase the specified iterator from the btree. The iterator must be valid
- // (i.e. not equal to end()). Return an iterator pointing to the node after
- // the one that was erased (or end() if none exists).
- iterator erase(const iterator &iter) { return tree_.erase(iter); }
- void erase(const iterator &first, const iterator &last) { tree_.erase(first, last); }
-
- bool operator==(const self_type& x) const {
- if (size() != x.size())
- return false;
-
- for (const_iterator i = begin(), xi = x.begin(); i != end(); ++i, ++xi)
- if (*i != *xi)
- return false;
-
- return true;
- }
-
- bool operator!=(const self_type& other) const {
- return !operator==(other);
- }
-
-protected:
- Tree tree_;
-
-private:
- // This is template voodoo which creates function overload depending on
- // whether Point has const operator+= or not.
- template<class>
- struct true_helper : std::true_type {};
- template<class T = Point>
- static auto test_can_merge(int) -> true_helper<decltype(std::declval<const T>().operator+=(std::declval<const T>()))>;
- template<class>
- static auto test_can_merge(long) -> std::false_type;
- template<class T = Point>
- struct can_merge : decltype(test_can_merge<T>(0)) {};
-
-public:
- // This function overload is enabled only when Point has a const operator+= (e.g. RawPoint)
- // and can therefore be updated in place.
- template<class U = Point>
- typename std::enable_if<can_merge<U>::value, size_t>::type
- merge_point(const U &new_point) {
- // First, try to insert a point
- const auto &result = insert(new_point);
- if (result.second)
- return 1;
- // We already have something there. Try to merge the new point in.
- *result.first += new_point;
- return 0;
- }
-
- // Otherwise this overload is used, which removes the point from set,
- // updates it and re-inserts back.
- template<class U = Point>
- typename std::enable_if<!can_merge<U>::value, size_t>::type
- merge_point(const U &new_point) {
- auto result = insert(new_point);
- if (result.second)
- return 1;
- Point updated = *result.first + new_point;
- auto after_removed = erase(result.first);
- insert(after_removed, updated);
- return 0;
- }
-
- template<class OtherHist>
- size_t merge(const OtherHist &other) {
- size_t added = 0;
- for (const auto &new_point : other) {
- added += merge_point(new_point);
- }
- return added;
- }
-};
-
-template<typename T>
-inline std::ostream &operator<<(std::ostream &os, const Histogram<T> &b) {
- os << "{";
- for (const auto& e : b)
- os << e << "; ";
- os << "}";
- return os;
-}
-
-typedef Histogram<RawGapPoint> RawGapHistogram;
-typedef Histogram<GapPoint> GapHistogram;
-
-typedef Histogram<RawPoint> RawHistogram;
-typedef Histogram<Point> HistogramWithWeight;
-
-}
-
-}
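
The "template voodoo" in Histogram::merge_point above is an expression-SFINAE detector: if Point has an operator+= callable on a const object (the header notes RawPoint does), the point is merged in place; otherwise it is erased and re-inserted. A standalone reproduction of just the detection trick, with toy point types standing in for the project's ones:

    #include <iostream>
    #include <type_traits>

    // Minimal reproduction of the detection used by Histogram::merge_point above:
    // pick an overload depending on whether T has operator+= callable on a const object.
    template<class> struct true_helper : std::true_type {};

    template<class T>
    static auto test_can_merge(int)
        -> true_helper<decltype(std::declval<const T>().operator+=(std::declval<const T>()))>;
    template<class>
    static auto test_can_merge(long) -> std::false_type;

    template<class T>
    struct can_merge : decltype(test_can_merge<T>(0)) {};

    // Toy types for illustration only.
    struct InPlacePoint {
        mutable double weight = 0;
        const InPlacePoint& operator+=(const InPlacePoint& o) const { weight += o.weight; return *this; }
    };
    struct ImmutablePoint { double weight = 0; };

    int main() {
        std::cout << std::boolalpha
                  << can_merge<InPlacePoint>::value << "\n"    // true
                  << can_merge<ImmutablePoint>::value << "\n"; // false
    }
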
diff --git a/src/modules/paired_info/insert_size_refiner.hpp b/src/modules/paired_info/insert_size_refiner.hpp
deleted file mode 100644
index cbaf257..0000000
--- a/src/modules/paired_info/insert_size_refiner.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-#include "dev_support/cpp_utils.hpp"
-#include "assembly_graph/stats/picture_dump.hpp"
-//#include "sequence_mapper.hpp"
-
-namespace omnigraph {
-
-typedef std::map<int, size_t> HistType;
-
-inline double get_median(const HistType &hist) {
- double S = 0;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter)
- S += (double) iter->second;
-
- double sum = S;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- sum -= (double) iter->second;
- if (sum <= S / 2) {
- return iter->first;
- }
- }
- assert(false);
- return -1;
-}
-
-inline double get_mad(const HistType &hist, double median) { // median absolute deviation
- std::map<int, size_t> hist2;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- int x = abs(iter->first - math::round_to_zero(median));
- hist2[x] = iter->second;
- }
- return get_median(hist2);
-}
-
-inline void hist_crop(const HistType &hist, double low, double high, HistType &res) {
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first >= low && iter->first <= high) {
- DEBUG("Cropped histogram " << iter->first << " " << iter->second);
- res.insert(*iter);
- }
- }
-}
-
-inline
-std::pair<double, double> GetISInterval(double quantile,
- const HistType &is_hist) {
- // First, obtain the sum of the values
- double S = 0;
- for (auto iter : is_hist)
- S += (double) iter.second;
-
- double lval = S * (1 - quantile) / 2, rval = S * (1 + quantile) / 2;
- double is_min, is_max;
-
- // Now, find the quantiles
- double cS = 0;
- is_min = is_hist.begin()->first;
- is_max = is_hist.rbegin()->first;
- for (auto iter : is_hist) {
- if (cS <= lval)
- is_min = iter.first;
- else if (cS <= rval)
- is_max = iter.first;
- cS += (double) iter.second;
- }
-
- return std::make_pair(is_min, is_max);
-}
-
-inline void find_median(const HistType &hist, double &median, double &mad, HistType &cropped_hist) {
- DEBUG("Counting median and MAD");
- median = get_median(hist);
- mad = get_mad(hist, median);
- double low = median - 5. * 1.4826 * mad;
- double high = median + 5. * 1.4826 * mad;
- omnigraph::hist_crop(hist, low, high, cropped_hist);
- median = get_median(cropped_hist);
- mad = get_mad(cropped_hist, median);
-}
-
-//Moved from the insert size counter.
-//The factor 1.4826 ~= 1/Phi^-1(3/4) rescales the MAD into an estimate of the standard
-//deviation under normality, so the window median +/- 5 * 1.4826 * MAD is roughly +/- 5 sigma.
-inline void find_mean(const HistType &hist, double &mean, double &delta, std::map<size_t, size_t> &percentiles) {
- double median = get_median(hist);
- double mad = get_mad(hist, median);
- double low = median - 5. * 1.4826 * mad;
- double high = median + 5. * 1.4826 * mad;
-
- DEBUG("Median IS: " << median);
- DEBUG("MAD: " << mad);
- DEBUG("Thresholds set to: [" << low << ", " << high << "]");
-
- size_t n = 0;
- double sum = 0.;
- double sum2 = 0.;
- DEBUG("Counting average");
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first < low || iter->first > high) {
- continue;
- }
- n += iter->second;
- sum += (double) iter->second * 1. * (double) iter->first;
- sum2 += (double) iter->second * 1. * (double) iter->first * (double) iter->first;
- }
- mean = sum / (double) n;
- delta = sqrt(sum2 / (double) n - mean * mean);
-
- low = mean - 5 * delta;
- high = mean + 5 * delta;
-
- DEBUG("Mean IS: " << mean);
- DEBUG("sd: " << delta);
- DEBUG("Thresholds set to: [" << low << ", " << high << "]");
-
- n = 0;
- sum = 0.;
- sum2 = 0.;
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first < low || iter->first > high) {
- continue;
- }
- n += iter->second;
- sum += (double) iter->second * 1. * (double) iter->first;
- sum2 += (double) iter->second * 1. * (double) iter->first * (double) iter->first;
- }
- mean = sum / (double) n;
- delta = sqrt(sum2 / (double) n - mean * mean);
-
- DEBUG("Mean IS: " << mean);
- DEBUG("sd: " << delta);
-
- size_t m = 0;
-
- DEBUG("Counting percentiles");
- //todo optimize
- size_t q[19];
- for (size_t i = 1; i < 20; ++i) {
- q[i - 1] = 5 * i;
- }
- for (auto iter = hist.begin(); iter != hist.end(); ++iter) {
- if (iter->first < low || iter->first > high) {
- continue;
- }
- size_t mm = m + iter->second;
- for (size_t i = 0; i < utils::array_size(q); i++) {
- size_t scaled_q_i((size_t) ((double) q[i] / 100. * (double) n));
- if (m < scaled_q_i && mm >= scaled_q_i) {
- percentiles[q[i]] = (size_t) iter->first;
- }
- }
- m = mm;
- }
-}
-
-
-}
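
The insert-size statistics above are robust ones: a weighted median of the histogram, the median absolute deviation, and a cropping window of median +/- 5 * 1.4826 * MAD. A standalone sketch with an invented histogram; it accumulates counts for equal deviations, a minor difference from get_mad above, which overwrites them:

    #include <cmath>
    #include <iostream>
    #include <map>

    // Weighted median of a histogram value -> count, as in get_median above.
    double Median(const std::map<int, size_t>& hist) {
        double total = 0;
        for (const auto& kv : hist) total += (double) kv.second;
        double remaining = total;
        for (const auto& kv : hist) {
            remaining -= (double) kv.second;
            if (remaining <= total / 2) return kv.first;
        }
        return -1;
    }

    int main() {
        std::map<int, size_t> insert_sizes = {{240, 3}, {248, 10}, {250, 20}, {252, 10}, {600, 1}};
        double median = Median(insert_sizes);
        // Histogram of absolute deviations from the median, then its median (the MAD).
        std::map<int, size_t> deviations;
        for (const auto& kv : insert_sizes)
            deviations[(int) std::lround(std::fabs(kv.first - median))] += kv.second;
        double mad = Median(deviations);
        std::cout << "median " << median << ", MAD " << mad
                  << ", robust sigma ~ " << 1.4826 * mad << "\n";
    }

Median and MAD are used first because a handful of chimeric pairs would otherwise dominate the estimate; only the cropped histogram is then fed into the ordinary mean and standard deviation in find_mean.
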
diff --git a/src/modules/paired_info/is_counter.hpp b/src/modules/paired_info/is_counter.hpp
deleted file mode 100644
index 678387c..0000000
--- a/src/modules/paired_info/is_counter.hpp
+++ /dev/null
@@ -1,167 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * is_counter.hpp
- *
- * Created on: May 25, 2014
- * Author: andrey
- */
-
-#ifndef IS_COUNTER_HPP_
-#define IS_COUNTER_HPP_
-
-
-#include "paired_info/insert_size_refiner.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
-
-namespace debruijn_graph {
-
-using namespace omnigraph;
-
-class InsertSizeCounter: public SequenceMapperListener {
-
-public:
-
- InsertSizeCounter(const conj_graph_pack& gp,
- size_t edge_length_threshold,
- bool ignore_negative = false)
- : gp_(gp),
- edge_length_threshold_(edge_length_threshold),
- ignore_negative_(ignore_negative) {
- }
-
- HistType hist() { return hist_; }
- size_t total() const { return total_.total_; }
- size_t mapped() const { return counted_.total_; }
- size_t negative() const { return negative_.total_; }
-
-
- virtual void StartProcessLibrary(size_t threads_count) {
- hist_.clear();
- tmp_hists_ = vector<HistType>(threads_count);
-
- total_ = count_data(threads_count);
- counted_ = count_data(threads_count);
- negative_ = count_data(threads_count);
- }
-
- virtual void StopProcessLibrary() {
- for (size_t i = 0; i < tmp_hists_.size(); ++i) {
- MergeBuffer(i);
- }
- tmp_hists_.clear();
- total_.merge();
- counted_.merge();
- negative_.merge();
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedRead& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
- (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedReadSeq& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(thread_index, read1, read2, (int) r.second().size(),
- (int) r.first().GetLeftOffset() + (int) r.second().GetRightOffset());
- }
-
- virtual void ProcessSingleRead(size_t /*thread_index*/, const io::SingleRead&, const MappingPath<EdgeId>& /*read*/) {
- }
-
- virtual void ProcessSingleRead(size_t /*thread_index*/, const io::SingleReadSeq&, const MappingPath<EdgeId>& /*read*/) {
- }
-
- virtual void MergeBuffer(size_t thread_index) {
- for (const auto& kv: tmp_hists_[thread_index]) {
- hist_[kv.first] += kv.second;
- }
- tmp_hists_[thread_index].clear();
- }
-
- void FindMean(double& mean, double& delta, std::map<size_t, size_t>& percentiles) const {
- find_mean(hist_, mean, delta, percentiles);
- }
-
- void FindMedian(double& median, double& mad, HistType& histogram) const {
- find_median(hist_, median, mad, histogram);
- }
-
-private:
- virtual void ProcessPairedRead(size_t thread_index,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2,
- int read2_size,
- int is_delta) {
-
- ++total_.arr_[thread_index];
-
- if (read1.size() == 1 && read2.size() == 1 &&
- read2.simple_path().front() == read1.simple_path().front() &&
- gp_.g.length(read1.simple_path().front()) >= edge_length_threshold_) {
-
- auto mapping_edge_1 = read1.front().second;
- auto mapping_edge_2 = read2.front().second;
-
- int read1_start = (int) mapping_edge_1.mapped_range.start_pos - (int) mapping_edge_1.initial_range.start_pos ;
- TRACE("Read 1: " << (int) mapping_edge_1.mapped_range.start_pos << " - " << (int) mapping_edge_1.initial_range.start_pos << " = " << read1_start);
- int read2_start = (int) mapping_edge_2.mapped_range.start_pos - (int) mapping_edge_2.initial_range.start_pos;
- TRACE("Read 2: " << (int) mapping_edge_2.mapped_range.start_pos << " - " << (int) mapping_edge_2.initial_range.start_pos << " = " << read2_start);
- int is = read2_start - read1_start + read2_size + is_delta;
- TRACE("IS: " << read2_start << " - " << read1_start << " + " << (int) is_delta << " = " << is);
-
- if (is > 0 || !ignore_negative_) {
- tmp_hists_[thread_index][is] += 1;
- ++counted_.arr_[thread_index];
- } else {
- ++negative_.arr_[thread_index];
- }
-
- }
-
- }
- struct count_data {
- size_t total_;
- vector<size_t> arr_;
- count_data(): total_(0) {
- }
- count_data(size_t nthreads): total_(0), arr_(nthreads, 0) {
- }
- void inc(size_t i) {
- ++arr_[i];
- }
- void merge() {
- for (size_t i = 0; i < arr_.size(); ++i) {
- total_ += arr_[i];
- }
- }
- };
-
-private:
- const conj_graph_pack& gp_;
-
- HistType hist_;
- vector<HistType> tmp_hists_;
-
- count_data total_;
- count_data counted_;
- count_data negative_;
-
- size_t edge_length_threshold_;
- bool ignore_negative_;
-};
-
-}
-
-
-#endif /* IS_COUNTER_HPP_ */
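
The insert size computed above is read2_start - read1_start + read2_size + is_delta, where the start values are the read starts projected onto the shared edge and is_delta is the left offset of read 1 plus the right offset of read 2. A tiny numeric sketch with invented positions:

    #include <iostream>

    // Mirrors the arithmetic in InsertSizeCounter::ProcessPairedRead above;
    // all positions below are invented for illustration.
    int main() {
        int read1_start = 1000;   // left read start projected onto the edge
        int read2_start = 1230;   // right read start projected onto the edge
        int read2_size  = 100;    // length of the right read
        int is_delta    = 5;      // left offset of read 1 + right offset of read 2
        int insert_size = read2_start - read1_start + read2_size + is_delta;
        std::cout << "estimated insert size: " << insert_size << "\n";   // 335
    }
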
diff --git a/src/modules/paired_info/pair_info_bounds.hpp b/src/modules/paired_info/pair_info_bounds.hpp
deleted file mode 100644
index ae0c041..0000000
--- a/src/modules/paired_info/pair_info_bounds.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef OMNI_UTILS_HPP_
-#define OMNI_UTILS_HPP_
-
-#include "dev_support/standard_base.hpp"
-
-namespace omnigraph {
-
-
-inline size_t PairInfoPathLengthUpperBound(size_t k, size_t insert_size,
- double delta) {
- double answer = 0. + (double) insert_size + delta - (double) k - 2.;
- VERIFY(math::gr(answer, 0.));
- return (size_t)std::floor(answer);
-}
-
-inline size_t PairInfoPathLengthLowerBound(size_t k, size_t l1, size_t l2,
- int gap, double delta) {
- double answer = 0. + (double) gap + (double) k + 2. - (double) l1 - (double) l2 - delta;
- return math::gr(answer, 0.) ? (size_t)std::floor(answer) : 0;
-}
-
-}
-#endif /* OMNI_UTILS_HPP_ */
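
The two bounds above are plain insert-size geometry: the longest path worth searching is roughly insert size + delta - k - 2, and the shortest is the gap between the reads plus k + 2 minus both edge lengths and delta, clamped at zero. A numeric sketch with invented values (k, insert size, read length, edge lengths and delta are all assumptions):

    #include <cmath>
    #include <iostream>

    // Mirrors PairInfoPathLengthUpperBound / PairInfoPathLengthLowerBound above.
    int main() {
        size_t k = 55, insert_size = 350, read_length = 100, l1 = 80, l2 = 120;
        double delta = 20.;
        int gap = (int) insert_size - 2 * (int) read_length;   // 150, as in GraphDistanceFinder

        double upper = (double) insert_size + delta - (double) k - 2.;                        // 313
        double lower = (double) gap + (double) k + 2. - (double) l1 - (double) l2 - delta;    // -13, clamped to 0

        std::cout << "upper bound " << (size_t) std::floor(upper)
                  << ", lower bound " << (lower > 0. ? (size_t) std::floor(lower) : 0) << "\n";
    }
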
diff --git a/src/modules/paired_info/pair_info_filler.hpp b/src/modules/paired_info/pair_info_filler.hpp
deleted file mode 100644
index 3d2ef1b..0000000
--- a/src/modules/paired_info/pair_info_filler.hpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * pair_info_filler.hpp
- *
- * Created on: Oct 3, 2013
- * Author: andrey
- */
-
-#ifndef PAIR_INFO_FILLER_HPP_
-#define PAIR_INFO_FILLER_HPP_
-
-#include "assembly_graph/graph_alignment/sequence_mapper_notifier.hpp"
-
-namespace debruijn_graph {
-
- * For now it ignores the sophisticated case of repeated consecutive
- * occurrences of an edge in a path caused by gaps in the mapping
- * occurrence of edge in path due to gaps in mapping
- *
- * todo talk with Anton about simplification and speed-up of procedure with little quality loss
- */
-class LatePairedIndexFiller : public SequenceMapperListener {
- typedef std::function<double(MappingRange, MappingRange)> WeightF;
- typedef std::pair<EdgeId, EdgeId> EdgePair;
-public:
- LatePairedIndexFiller(const Graph &graph, WeightF weight_f, omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index)
- : graph_(graph),
- weight_f_(weight_f),
- paired_index_(paired_index) {
- }
-
- virtual void StartProcessLibrary(size_t threads_count) {
- paired_index_.Init();
- buffer_pi_ = {graph_, threads_count};
- }
-
- virtual void StopProcessLibrary() {
- for (size_t i = 0; i < buffer_pi_.size(); ++i)
- MergeBuffer(i);
-
- buffer_pi_.Clear();
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedRead& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(buffer_pi_[thread_index], read1, read2, r.distance());
- }
-
- virtual void ProcessPairedRead(size_t thread_index,
- const io::PairedReadSeq& r,
- const MappingPath<EdgeId>& read1,
- const MappingPath<EdgeId>& read2) {
- ProcessPairedRead(buffer_pi_[thread_index], read1, read2, r.distance());
- }
-
- virtual void ProcessSingleRead(size_t,
- const io::SingleReadSeq&,
- const MappingPath<EdgeId>&) {}
-
- virtual void ProcessSingleRead(size_t,
- const io::SingleRead&,
- const MappingPath<EdgeId>&) {}
-
- virtual void MergeBuffer(size_t thread_index) {
- paired_index_.Merge(buffer_pi_[thread_index]);
- buffer_pi_[thread_index].Clear();
- }
-
- virtual ~LatePairedIndexFiller() {}
-
-private:
- void ProcessPairedRead(omnigraph::de::PairedInfoBuffer<Graph>& paired_index,
- const MappingPath<EdgeId>& path1,
- const MappingPath<EdgeId>& path2, size_t read_distance) const {
- for (size_t i = 0; i < path1.size(); ++i) {
- std::pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
- for (size_t j = 0; j < path2.size(); ++j) {
- std::pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
-
- EdgePair ep{mapping_edge_1.first, mapping_edge_2.first};
-
-
- omnigraph::de::DEWeight weight =
- weight_f_(mapping_edge_1.second, mapping_edge_2.second);
- size_t kmer_distance = read_distance
- + mapping_edge_2.second.initial_range.end_pos
- - mapping_edge_1.second.initial_range.start_pos;
- int edge_distance = (int) kmer_distance
- + (int) mapping_edge_1.second.mapped_range.start_pos
- - (int) mapping_edge_2.second.mapped_range.end_pos;
-
- paired_index.Add(mapping_edge_1.first, mapping_edge_2.first,
- omnigraph::de::RawPoint(edge_distance, weight));
- }
- }
- }
-
-private:
- const Graph& graph_;
- WeightF weight_f_;
- omnigraph::de::UnclusteredPairedInfoIndexT<Graph>& paired_index_;
- omnigraph::de::PairedInfoBuffersT<Graph> buffer_pi_;
-
- DECL_LOGGER("LatePairedIndexFiller");
-};
-
-
-}
-
-
-#endif /* PAIR_INFO_FILLER_HPP_ */
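
ProcessPairedRead above turns one pair of mappings into a single distance point: it first measures the span in read coordinates (read_distance plus the distance from the start of the first mapping to the end of the second) and then shifts that span into edge coordinates using the mapped ranges. A minimal sketch of that arithmetic with hypothetical coordinates (the Range/MappingRange structs below are simplified stand-ins, not the SPAdes types):

    // Illustration of the distance arithmetic in ProcessPairedRead above (all numbers are made up).
    #include <cstddef>
    #include <iostream>

    struct Range { std::size_t start_pos, end_pos; };            // half-open interval
    struct MappingRange { Range initial_range, mapped_range; };  // read coordinates vs. edge coordinates

    int main() {
        std::size_t read_distance = 0;           // distance reported by the read pair itself
        MappingRange m1{{10, 60}, {200, 250}};   // read1: bases 10..60 map to edge1 positions 200..250
        MappingRange m2{{5, 55},  {40, 90}};     // read2: bases 5..55 map to edge2 positions 40..90

        // span in read coordinates
        std::size_t kmer_distance = read_distance + m2.initial_range.end_pos - m1.initial_range.start_pos;  // 0 + 55 - 10 = 45
        // shift into edge coordinates using the positions of the mappings on their edges
        int edge_distance = (int) kmer_distance
                          + (int) m1.mapped_range.start_pos
                          - (int) m2.mapped_range.end_pos;       // 45 + 200 - 90 = 155

        std::cout << "stored point distance = " << edge_distance << "\n";  // becomes RawPoint(155, weight)
    }
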
diff --git a/src/modules/paired_info/pair_info_improver.hpp b/src/modules/paired_info/pair_info_improver.hpp
deleted file mode 100644
index ac6475c..0000000
--- a/src/modules/paired_info/pair_info_improver.hpp
+++ /dev/null
@@ -1,280 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "pipeline/graph_pack.hpp"
-#include "split_path_constructor.hpp"
-#include "paired_info/paired_info_helpers.hpp"
-#include "assembly_graph/paths/path_utils.hpp"
-#include <math.h>
-#include <io/reads_io/read_processor.hpp>
-
-namespace debruijn_graph {
-
-inline bool ClustersIntersect(omnigraph::de::Point p1, omnigraph::de::Point p2) {
- return math::le(p1.d, p2.d + p1.var + p2.var) &&
- math::le(p2.d, p1.d + p1.var + p2.var);
-}
-
-
-//todo move out
-template<class Graph>
-class ParallelEdgeProcessor {
- class ConstEdgeIteratorWrapper {
- public:
- typedef typename Graph::EdgeId ReadT;
-
- ConstEdgeIteratorWrapper(const Graph &g)
- : it_(g) {}
-
- bool eof() const { return it_.IsEnd(); }
-
- ConstEdgeIteratorWrapper& operator>>(typename Graph::EdgeId &val) {
- val = *it_;
- ++it_;
- return *this;
- }
-
- private:
- ConstEdgeIterator<Graph> it_;
- };
-
-public:
- ParallelEdgeProcessor(const Graph &g, unsigned nthreads)
- : rp_(nthreads), it_(g) {}
-
- template <class Processor>
- bool Run(Processor &op) { return rp_.Run(it_, op); }
-
- bool IsEnd() const { return it_.eof(); }
- size_t processed() const { return rp_.processed(); }
-
-private:
- hammer::ReadProcessor rp_;
- ConstEdgeIteratorWrapper it_;
-};
-
-template<class Graph>
-static
-bool TryToAddPairInfo(omnigraph::de::PairedInfoIndexT<Graph>& clustered_index,
- typename Graph::EdgeId e1, typename Graph::EdgeId e2,
- const omnigraph::de::Point& point_to_add) {
- auto histogram = clustered_index.Get(e1, e2);
- for (auto i : histogram) {
- if (ClustersIntersect(i, point_to_add))
- return false;
- }
-
- clustered_index.Add(e1, e2, point_to_add);
- return true;
-}
-
-template<class Graph>
-class PairInfoImprover {
- typedef typename Graph::EdgeId EdgeId;
- typedef std::vector<omnigraph::de::PairInfo<EdgeId> > PairInfos;
- typedef std::pair<EdgeId, EdgeId> EdgePair;
- typedef omnigraph::de::PairedInfoIndexT<Graph> Index;
-
- public:
- PairInfoImprover(const Graph& g,
- Index& clustered_index,
- const io::SequencingLibrary<config::DataSetData> &lib, size_t max_repeat_length)
- : graph_(g), index_(clustered_index), lib_(lib), max_repeat_length_(max_repeat_length) { }
-
- void ImprovePairedInfo(unsigned num_threads = 1) {
- CorrectPairedInfo(num_threads);
- CorrectPairedInfo(num_threads);
- }
-
- private:
- void CorrectPairedInfo(unsigned nthreads) {
- size_t missing_paired_info_count = 0;
- size_t extra_paired_info_count = 0;
- extra_paired_info_count = RemoveContradictional(nthreads);
- missing_paired_info_count = FillMissing(nthreads);
-
- INFO("Paired info stats: missing = " << missing_paired_info_count
- << "; contradictional = " << extra_paired_info_count);
- }
-
- class ContradictionalRemover {
- public:
- ContradictionalRemover(omnigraph::de::PairedInfoIndicesT<Graph> &to_remove,
- const Graph &g,
- omnigraph::de::PairedInfoIndexT<Graph>& index, size_t max_repeat_length)
- : to_remove_(to_remove), graph_(g), index_(index), max_repeat_length_(max_repeat_length) {}
-
- bool operator()(std::unique_ptr<EdgeId> e) {
- omnigraph::de::PairedInfoIndexT<Graph> &to_remove = to_remove_[omp_get_thread_num()];
-
- if (graph_.length(*e)>= max_repeat_length_ && index_.contains(*e))
- FindInconsistent(*e, to_remove);
-
- return false;
- }
-
- private:
- bool IsConsistent(EdgeId /*e*/, EdgeId e1, EdgeId e2,
- const omnigraph::de::Point& p1, const omnigraph::de::Point& p2) const {
- if (math::le(p1.d, 0.f) || math::le(p2.d, 0.f) || math::gr(p1.d, p2.d))
- return true;
-
- double pi_dist = p2.d - p1.d;
- int first_length = (int) graph_.length(e1);
- double var = p1.var + p2.var;
-
- TRACE(" PI " << p1 << " tr " << omp_get_thread_num());
- TRACE("vs PI " << p2 << " tr " << omp_get_thread_num());
-
- if (math::le(pi_dist, first_length + var) &&
- math::le((double)first_length, pi_dist + var)) {
- if (graph_.EdgeEnd(e1) == graph_.EdgeStart(e2))
- return true;
-
- auto paths = GetAllPathsBetweenEdges(graph_, e1, e2, 0, (size_t) ceil(pi_dist - first_length + var));
- return (paths.size() > 0);
- } else {
- if (math::gr(p2.d, p1.d + first_length)) {
- auto paths = GetAllPathsBetweenEdges(graph_, e1, e2,
- (size_t) floor(pi_dist - first_length - var),
- (size_t) ceil(pi_dist - first_length + var));
- return (paths.size() > 0);
- }
- return false;
- }
- }
-
-        // Checks the consistency of the edge pairs (base_edge, e1) and (base_edge, e2) over all pairs of their points
- void FindInconsistent(EdgeId base_edge,
- Index& pi) const {
- for (auto i1 : index_.Get(base_edge)) {
- auto e1 = i1.first;
- for (auto i2 : index_.Get(base_edge)) {
- auto e2 = i2.first;
- if (e1 == e2)
- continue;
- for (auto p1 : i1.second) {
- for (auto p2 : i2.second) {
- if (!IsConsistent(base_edge, e1, e2, p1, p2)) {
- if (p1.lt(p2))
- pi.Add(base_edge, e1, p1);
- else
- pi.Add(base_edge, e2, p2);
- }
- }
- }
- }
- }
- }
-
- omnigraph::de::PairedInfoIndicesT<Graph> &to_remove_;
- const Graph &graph_;
- Index& index_;
- size_t max_repeat_length_;
- };
-
- size_t RemoveContradictional(unsigned nthreads) {
- size_t cnt = 0;
-
- omnigraph::de::PairedInfoIndicesT<Graph> to_remove(graph_, nthreads);
-
- // FIXME: Replace with lambda
- ContradictionalRemover remover(to_remove, graph_, index_, max_repeat_length_);
- ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(remover);
-
-        DEBUG("ParallelRemoveContradictional: Threads finished");
-
- DEBUG("Merging maps");
- for (size_t i = 1; i < nthreads; ++i) {
- to_remove[0].Merge(to_remove[i]);
- to_remove[i].Clear();
- }
- DEBUG("Resulting size " << to_remove[0].size());
-
-        DEBUG("Deleting paired infos marked for removal");
- for (auto I = omnigraph::de::half_pair_begin(to_remove[0]);
- I != omnigraph::de::half_pair_end(to_remove[0]); ++I) {
- cnt += DeleteIfExist(I.first(), I.second(), *I);
- }
- to_remove[0].Clear();
-
- DEBUG("Size of index " << index_.size());
-        DEBUG("ParallelRemoveContradictional: Clean finished");
- return cnt;
-
- }
-
- size_t FillMissing(unsigned nthreads) {
- DEBUG("Fill missing: Creating indexes");
- const size_t NUM_CHUNKS = nthreads * 16;
- omnigraph::de::PairedInfoIndicesT<Graph> to_add(graph_, NUM_CHUNKS);
-
- SplitPathConstructor<Graph> spc(graph_);
- IterationHelper<Graph, EdgeId> edges(graph_);
- auto iters = edges.Chunks(NUM_CHUNKS);
-
- DEBUG("Fill missing: Start threads");
- #pragma omp parallel for schedule(guided)
- for (size_t i = 0; i < iters.size() - 1; ++i) {
- TRACE("Processing chunk #" << i);
- for (auto e = iters[i]; e != iters[i + 1]; ++e) {
- TRACE("Checking for edge " << *e);
- auto paths = spc.ConvertPIToSplitPaths(*e, index_,
- lib_.data().mean_insert_size,
- lib_.data().insert_size_deviation);
- for (const auto &path : paths) {
- TRACE("Path " << path.PrintPath(graph_));
- for (const auto &pi : path)
- TryToAddPairInfo(to_add[i], pi.first, pi.second, pi.point);
- }
- }
- }
- //ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(filler);
- DEBUG("Fill missing: Threads finished");
-
- size_t cnt = 0;
- for (size_t i = 0; i < iters.size() - 1; ++i) {
- DEBUG("Adding map #" << i);
- for (auto I = omnigraph::de::half_pair_begin(to_add[i]);
- I != omnigraph::de::half_pair_end(to_add[i]);
- ++I) {
- EdgeId e1 = I.first();
- EdgeId e2 = I.second();
- for (auto p : *I)
- cnt += TryToAddPairInfo(index_, e1, e2, p);
- }
- to_add[i].Clear();
- }
-
- DEBUG("Size of paired index " << index_.size());
-
- DEBUG("Fill missing: Clean finished");
- DEBUG("Added " << cnt);
- return cnt;
- }
-
- private:
- size_t DeleteIfExist(EdgeId e1, EdgeId e2, const typename Index::HistProxy& infos) {
- size_t cnt = 0;
- for (auto point : infos) {
- cnt += index_.Remove(e1, e2, point);
- TRACE("cnt += " << cnt);
- }
-
- return cnt;
- }
-
- const Graph& graph_;
- Index& index_;
- const io::SequencingLibrary<config::DataSetData>& lib_;
- size_t max_repeat_length_;
- DECL_LOGGER("PairInfoImprover")
-};
-
-}
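
ClustersIntersect above is equivalent to requiring that the two distances differ by at most the sum of the two variances, and TryToAddPairInfo only inserts a point when it does not intersect any cluster already stored for that edge pair. A simplified sketch of that check, using a toy Point type and made-up numbers:

    // Simplified sketch of the ClustersIntersect / TryToAddPairInfo logic above.
    #include <cmath>
    #include <iostream>
    #include <vector>

    struct Point { double d; double var; };  // clustered distance estimate and its variance

    // Two clusters intersect iff their distances differ by no more than the sum of their variances.
    bool ClustersIntersect(Point p1, Point p2) {
        return std::abs(p1.d - p2.d) <= p1.var + p2.var;
    }

    // Add a point only if it does not intersect any existing cluster for this edge pair.
    bool TryToAdd(std::vector<Point>& histogram, Point p) {
        for (const Point& q : histogram)
            if (ClustersIntersect(q, p))
                return false;
        histogram.push_back(p);
        return true;
    }

    int main() {
        std::vector<Point> hist = {{100., 5.}, {250., 10.}};
        std::cout << TryToAdd(hist, {104., 2.}) << "\n";  // 0: overlaps the cluster at 100 +/- 5
        std::cout << TryToAdd(hist, {180., 3.}) << "\n";  // 1: far from both existing clusters
    }
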
diff --git a/src/modules/paired_info/paired_info.hpp b/src/modules/paired_info/paired_info.hpp
deleted file mode 100644
index 952617b..0000000
--- a/src/modules/paired_info/paired_info.hpp
+++ /dev/null
@@ -1,712 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "utils/adt/iterator_range.hpp"
-#include <boost/iterator/iterator_facade.hpp>
-#include <btree/safe_btree_map.h>
-#include <sparsehash/sparse_hash_map>
-
-#include <type_traits>
-
-#include "histogram.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-/**
- * @brief Index of paired-read information. For each pair of edges we store a so-called histogram: a set
- * of points holding the distances between those edges. Internally the index is arranged as a map of maps of histograms:
- * edge1 -> (edge2 -> histogram)
- * When a point (a,b)->p is added to the index, the conjugate point (b',a')->p' is inserted automatically
- * (self-conjugate edge pairs are the sole exception), so the index is always conjugate-symmetrical.
- * The index provides several kinds of access:
- * - to get the histogram between two edges, use Get(edge1, edge2);
- * - to get the neighbourhood of an edge (the second edges with their corresponding histograms), use Get(edge1);
- * - to skip the symmetrical half of that neighbourhood, use GetHalf(edge1).
- * Backward information (e.g., (b,a)->-p) is currently inaccessible.
- * @param G graph type
- * @param Traits policy-like structure with the associated types of inner and resulting points, and how to convert between them
- * @param Container map-like container type (parameterized by key and value type)
- */
-template<typename G, typename Traits, template<typename, typename> class Container>
-class PairedIndex {
-
-private:
- typedef typename Traits::Gapped InnerPoint;
- typedef omnigraph::de::Histogram<InnerPoint> InnerHistogram;
-
-public:
- typedef G Graph;
-
- typedef typename Traits::Expanded Point;
- typedef omnigraph::de::Histogram<Point> Histogram;
- typedef typename Graph::EdgeId EdgeId;
- typedef std::pair<EdgeId, EdgeId> EdgePair;
-
- typedef Container<EdgeId, InnerHistogram> InnerMap;
- typedef Container<EdgeId, InnerMap> StorageMap;
-
- typedef PairedIndex<G, Traits, Container> Self;
-
- //--Data access types--
-
- typedef typename StorageMap::const_iterator ImplIterator;
-
-public:
- /**
- * @brief Smart proxy set representing a composite histogram of points between two edges.
- * @detail You can work with the proxy just like any constant set.
- * The only major difference is that it returns all constituent points by value,
- * because some of them don't exist in the underlying sets and are
- * restored from the conjugate info on the fly.
- */
- class HistProxy {
-
- public:
- /**
- * @brief Iterator over a proxy set of points.
- */
- class Iterator: public boost::iterator_facade<Iterator, Point, boost::bidirectional_traversal_tag, Point> {
-
- typedef typename InnerHistogram::const_iterator InnerIterator;
-
- public:
- Iterator(InnerIterator iter, DEDistance offset, bool back = false)
- : iter_(iter), offset_(offset), back_(back)
- {}
-
- private:
- friend class boost::iterator_core_access;
-
- Point dereference() const {
- auto i = iter_;
- if (back_) --i;
- Point result = Traits::Expand(*i, offset_);
- if (back_)
- result.d = -result.d;
- return result;
- }
-
- void increment() {
- back_ ? --iter_ : ++iter_;
- }
-
- void decrement() {
- back_ ? ++iter_ : --iter_;
- }
-
- inline bool equal(const Iterator &other) const {
- return iter_ == other.iter_ && back_ == other.back_;
- }
-
- InnerIterator iter_; //current position
- DEDistance offset_; //edge length
- bool back_;
- };
-
- /**
-         * @brief Constructs a proxy wrapper around an inner histogram.
- */
- HistProxy(const InnerHistogram& hist, DEDistance offset = 0, bool back = false)
- : hist_(hist), offset_(offset), back_(back)
- {}
-
- /**
-         * @brief Returns an empty inner histogram (a shared null object used when no data exists).
- */
- static const InnerHistogram& empty_hist() {
- static InnerHistogram res;
- return res;
- }
-
- /**
- * @brief Adds a point to the histogram.
- */
- //void insert(Point p) {
- // hist_.insert(Traits::Shrink(p, offset_));
- //}
-
- Iterator begin() const {
- return Iterator(back_ ? hist_.end() : hist_.begin(), offset_, back_);
- }
-
- Iterator end() const {
- return Iterator(back_ ? hist_.begin() : hist_.end(), offset_, back_);
- }
-
- /**
- * @brief Finds the point with the minimal distance.
- */
- Point min() const {
- VERIFY(!empty());
- return *begin();
- }
-
- /**
- * @brief Finds the point with the maximal distance.
- */
- Point max() const {
- VERIFY(!empty());
- return *--end();
- }
-
- /**
-         * @brief Returns a copy of all points as a simple flat histogram.
- */
- Histogram Unwrap() const {
- return Histogram(begin(), end());
- }
-
- size_t size() const {
- return hist_.size();
- }
-
- bool empty() const {
- return hist_.empty();
- }
-
- private:
- const InnerHistogram& hist_;
- DEDistance offset_;
- bool back_;
- };
-
- typedef typename HistProxy::Iterator HistIterator;
-
- //---- Traversing edge neighbours ----
-
- using EdgeHist = std::pair<EdgeId, HistProxy>;
-
- /**
-     * @brief A proxy map representing the neighbourhood of an edge,
-     * where `Key` is the graph edge ID and `Value` is the proxy histogram.
-     * @detail You can work with the proxy just like with any constant map.
-     * The only major difference is that it returns all constituent pairs by value,
-     * because proxies are constructed on the fly.
- */
- class EdgeProxy {
- public:
-
- /**
- * @brief Iterator over a proxy map.
- * @detail For a full proxy, traverses both straight and conjugate pairs.
-         *         For a half proxy, traverses only the lesser edge pairs (i.e., (a,b) where (a,b) <= (b',a')).
- */
- class Iterator: public boost::iterator_facade<Iterator, EdgeHist, boost::forward_traversal_tag, EdgeHist> {
-
- typedef typename InnerMap::const_iterator InnerIterator;
-
- void Skip() { //For a half iterator, skip conjugate pairs
- while (half_ && iter_ != stop_ && index_.GreaterPair(edge_, iter_->first))
- ++iter_;
- }
-
- public:
- Iterator(const PairedIndex &index, InnerIterator iter, InnerIterator stop, EdgeId edge, bool half)
- : index_ (index)
- , iter_(iter)
- , stop_(stop)
- , edge_(edge)
- , half_(half)
- {
- Skip();
- }
-
- void increment() {
- ++iter_;
- Skip();
- }
-
- void operator=(const Iterator &other) {
- //TODO: is this risky without an assertion?
- //VERIFY(index_ == other.index_);
- //We shouldn't reassign iterators from one index onto another
- iter_ = other.iter_;
- stop_ = other.stop_;
- edge_ = other.edge_;
- half_ = other.half_;
- }
-
- private:
- friend class boost::iterator_core_access;
-
- bool equal(const Iterator &other) const {
- return iter_ == other.iter_;
- }
-
- EdgeHist dereference() const {
- const auto& hist = iter_->second;
- return std::make_pair(iter_->first, HistProxy(hist, index_.CalcOffset(edge_)));
- }
-
- private:
- const PairedIndex &index_; //TODO: get rid of this somehow
- InnerIterator iter_, stop_;
- EdgeId edge_;
- bool half_;
- };
-
- EdgeProxy(const PairedIndex &index, const InnerMap& map, EdgeId edge, bool half = false)
- : index_(index), map_(map), edge_(edge), half_(half)
- {}
-
- Iterator begin() const {
- return Iterator(index_, map_.begin(), map_.end(), edge_, half_);
- }
-
- Iterator end() const {
- return Iterator(index_, map_.end(), map_.end(), edge_, half_);
- }
-
- HistProxy operator[](EdgeId e2) const {
- if (half_ && index_.GreaterPair(edge_, e2))
- return HistProxy::empty_hist();
- return index_.Get(edge_, e2);
- }
-
- //Currently unused
- /*HistProxy<true> GetBack(EdgeId e2) const {
- return index_.GetBack(edge_, e2);
- }*/
-
- bool empty() const {
- return map_.empty();
- }
-
- private:
- const PairedIndex& index_;
- const InnerMap& map_;
- EdgeId edge_;
-        //When false, represents all neighbours (comprising both directly added data and "restored" conjugates).
-        //When true, exposes only half of the added edge pairs.
- bool half_;
- };
-
- typedef typename EdgeProxy::Iterator EdgeIterator;
-
- //---------------- Constructor ----------------
-
- PairedIndex(const Graph &graph)
- : size_(0), graph_(graph)
- {}
-
-public:
- /**
- * @brief Returns a conjugate pair for two edges.
- */
- EdgePair ConjugatePair(EdgeId e1, EdgeId e2) const {
- return std::make_pair(graph_.conjugate(e2), graph_.conjugate(e1));
- }
- /**
- * @brief Returns a conjugate pair for a pair of edges.
- */
- EdgePair ConjugatePair(EdgePair ep) const {
- return ConjugatePair(ep.first, ep.second);
- }
-
-private:
- bool GreaterPair(EdgeId e1, EdgeId e2) const {
- auto ep = std::make_pair(e1, e2);
- return ep > ConjugatePair(ep);
- }
-
- void SwapConj(EdgeId &e1, EdgeId &e2) const {
- auto tmp = e1;
- e1 = graph_.conjugate(e2);
- e2 = graph_.conjugate(tmp);
- }
-
- size_t CalcOffset(EdgeId e) const {
- return this->graph().length(e);
- }
-
-public:
- //---------------- Data inserting methods ----------------
- /**
- * @brief Adds a point between two edges to the index,
- * merging weights if there's already one with the same distance.
- */
- void Add(EdgeId e1, EdgeId e2, Point p) {
- InnerPoint sp = Traits::Shrink(p, CalcOffset(e1));
- InsertWithConj(e1, e2, sp);
- }
-
- /**
- * @brief Adds a whole set of points between two edges to the index.
- */
- template<typename TH>
- void AddMany(EdgeId e1, EdgeId e2, const TH& hist) {
- for (auto p : hist) {
- InnerPoint sp = Traits::Shrink(p, CalcOffset(e1));
- InsertWithConj(e1, e2, sp);
- }
- }
-
-private:
-
- void InsertWithConj(EdgeId e1, EdgeId e2, InnerPoint p) {
- size_ += storage_[e1][e2].merge_point(p);
- //TODO: deal with loops and self-conj
- SwapConj(e1, e2);
- size_ += storage_[e1][e2].merge_point(p);
- }
-
- bool IsSelfConj(EdgeId e1, EdgeId e2) {
- return e1 == graph_.conjugate(e2);
- }
-
-public:
- /**
-     * @brief Adds all data from another index using a fast merging strategy.
-     *        Should be used instead of a point-by-point index merge.
- */
- template<class Index>
- void Merge(const Index& index_to_add) {
- auto& base_index = storage_;
- for (auto AddI = index_to_add.data_begin(); AddI != index_to_add.data_end(); ++AddI) {
- EdgeId e1_to_add = AddI->first;
- const auto& map_to_add = AddI->second;
- InnerMap& map_already_exists = base_index[e1_to_add];
- MergeInnerMaps(map_to_add, map_already_exists);
- }
- VERIFY(size() >= index_to_add.size());
- }
-
-private:
- template<class OtherMap>
- void MergeInnerMaps(const OtherMap& map_to_add,
- InnerMap& map) {
- for (const auto& to_add : map_to_add) {
- InnerHistogram& hist_exists = map[to_add.first];
- size_ += hist_exists.merge(to_add.second);
- }
- }
-
-public:
- //---------------- Data deleting methods ----------------
-
- /**
- * @brief Removes the specific entry from the index, and its conjugate.
-     * @warning Don't use it on an unclustered index, because hashmaps require set_deleted_item
-     * @return The number of deleted entries (0 if there was no such entry)
- */
- size_t Remove(EdgeId e1, EdgeId e2, Point p) {
- InnerPoint point = Traits::Shrink(p, graph_.length(e1));
- auto res = RemoveSingle(e1, e2, point);
- //TODO: deal with loops and self-conj
- SwapConj(e1, e2);
- res += RemoveSingle(e1, e2, point);
- return res;
- }
-
- /**
- * @brief Removes the whole histogram from the index, and its conjugate.
-     * @warning Don't use it on an unclustered index, because hashmaps require set_deleted_item
- * @return The number of deleted entries
- */
- size_t Remove(EdgeId e1, EdgeId e2) {
- auto res = RemoveAll(e1, e2);
- if (!IsSelfConj(e1, e2)) { //TODO: loops?
- SwapConj(e1, e2);
- res += RemoveAll(e1, e2);
- }
- return res;
- }
-
-private:
-
-    //TODO: remove duplicated code
- size_t RemoveSingle(EdgeId e1, EdgeId e2, InnerPoint point) {
- auto i1 = storage_.find(e1);
- if (i1 == storage_.end())
- return 0;
- auto& map = i1->second;
- auto i2 = map.find(e2);
- if (i2 == map.end())
- return 0;
- InnerHistogram& hist = i2->second;
- if (!hist.erase(point))
- return 0;
- --size_;
- if (hist.empty()) { //Prune empty maps
- map.erase(e2);
- if (map.empty())
- storage_.erase(e1);
- }
- return 1;
- }
-
- size_t RemoveAll(EdgeId e1, EdgeId e2) {
- auto i1 = storage_.find(e1);
- if (i1 == storage_.end())
- return 0;
- auto& map = i1->second;
- auto i2 = map.find(e2);
- if (i2 == map.end())
- return 0;
- InnerHistogram& hist = i2->second;
- size_t size_decrease = hist.size();
- map.erase(i2);
- size_ -= size_decrease;
- if (map.empty()) //Prune empty maps
- storage_.erase(i1);
- return size_decrease;
- }
-
-public:
-
- /**
-     * @brief Removes the entire neighbourhood of an edge (all edges referring to it, and their histograms)
-     * @warning To preserve the symmetry, it also deletes all conjugates, so the actual complexity is O(size).
- * @return The number of deleted entries
- */
- size_t Remove(EdgeId edge) {
- InnerMap &inner_map = storage_[edge];
- std::vector<EdgeId> to_remove;
- to_remove.reserve(inner_map.size());
- size_t old_size = this->size();
- for (const auto& ep : inner_map)
- to_remove.push_back(ep.first);
- for (auto e2 : to_remove)
- this->Remove(edge, e2);
- return old_size - this->size();
- }
-
- //---------------- Data accessing methods ----------------
-
- /**
- * @brief Underlying raw implementation data (for custom iterator helpers).
- */
- ImplIterator data_begin() const {
- return storage_.begin();
- }
-
- /**
- * @brief Underlying raw implementation data (for custom iterator helpers).
- */
- ImplIterator data_end() const {
- return storage_.end();
- }
-
- adt::iterator_range<ImplIterator> data() const {
- return adt::make_range(data_begin(), data_end());
- }
-
-private:
- //When there is no such edge, returns a fake empty map for safety
- const InnerMap& GetImpl(EdgeId e) const {
- auto i = storage_.find(e);
- if (i != storage_.end())
- return i->second;
- return empty_map_;
- }
-
- //When there is no such histogram, returns a fake empty histogram for safety
- const InnerHistogram& GetImpl(EdgeId e1, EdgeId e2) const {
- auto i = storage_.find(e1);
- if (i != storage_.end()) {
- auto j = i->second.find(e2);
- if (j != i->second.end())
- return j->second;
- }
- return HistProxy::empty_hist();
- }
-
-public:
-
- /**
- * @brief Returns a whole proxy map to the neighbourhood of some edge.
- * @param e ID of starting edge
- */
- EdgeProxy Get(EdgeId e) const {
- return EdgeProxy(*this, GetImpl(e), e);
- }
-
- /**
- * @brief Returns a half proxy map to the neighbourhood of some edge.
- * @param e ID of starting edge
- */
- EdgeProxy GetHalf(EdgeId e) const {
- return EdgeProxy(*this, GetImpl(e), e, true);
- }
-
- /**
- * @brief Operator alias of Get(id).
- */
- EdgeProxy operator[](EdgeId e) const {
- return Get(e);
- }
-
- /**
- * @brief Returns a histogram proxy for all points between two edges.
- */
- HistProxy Get(EdgeId e1, EdgeId e2) const {
- return HistProxy(GetImpl(e1, e2), CalcOffset(e1));
- }
-
- /**
- * @brief Operator alias of Get(e1, e2).
- */
- HistProxy operator[](EdgePair p) const {
- return Get(p.first, p.second);
- }
-
- //Currently unused
- /**
- * @brief Returns a backwards histogram proxy for all points between two edges.
- */
- /*HistProxy<true> GetBack(EdgeId e1, EdgeId e2) const {
- return HistProxy<true>(GetImpl(e2, e1), CalcOffset(e2));
- }*/
-
- /**
-     * @brief Checks if an edge (or its conjugate twin) is contained in the index.
- */
- bool contains(EdgeId edge) const {
- return storage_.count(edge) + storage_.count(graph_.conjugate(edge)) > 0;
- }
-
- /**
-     * @brief Checks if there is a histogram for two edges (or their conjugate pair).
- */
- bool contains(EdgeId e1, EdgeId e2) const {
- auto i1 = storage_.find(e1);
- if (i1 != storage_.end() && i1->second.count(e2))
- return true;
- return false;
- }
-
- //---------------- Miscellaneous ----------------
-
- /**
- * Returns the graph the index is based on. Needed for custom iterators.
- */
- const Graph &graph() const { return graph_; }
-
- /**
- * @brief Inits the index with graph data. For each edge, adds a loop with zero weight.
- * @warning Do not call this on non-empty indexes.
- */
- void Init() {
- //VERIFY(size() == 0);
- for (auto it = graph_.ConstEdgeBegin(); !it.IsEnd(); ++it)
- Add(*it, *it, Point());
- }
-
- /**
- * @brief Clears the whole index. Used in merging.
- */
- void Clear() {
- storage_.clear();
- size_ = 0;
- }
-
- /**
-     * @brief Returns the physical index size (the total number of points over all histograms).
- */
- size_t size() const { return size_; }
-
-private:
- PairedIndex(size_t size, const Graph& graph, const StorageMap& storage)
- : size_(size), graph_(graph), storage_(storage) {}
-
-public:
- /**
- * @brief Returns a copy of sub-index.
- * @deprecated Needed only in smoothing distance estimator.
- */
- Self SubIndex(EdgeId e1, EdgeId e2) const {
- InnerMap tmp;
- const auto& h1 = GetImpl(e1, e2);
- size_t size = h1.size();
- tmp[e1][e2] = h1;
- SwapConj(e1, e2);
- const auto& h2 = GetImpl(e1, e2);
- size += h2.size();
- tmp[e1][e2] = h2;
- return Self(size, graph_, tmp);
- };
-
-private:
- size_t size_;
- const Graph& graph_;
- StorageMap storage_;
- InnerMap empty_map_; //null object
-};
-
-//Aliases for common graphs
-template<typename K, typename V>
-using safe_btree_map = btree::safe_btree_map<K, V>; //Two-parameters wrapper
-template<typename Graph>
-using PairedInfoIndexT = PairedIndex<Graph, PointTraits, safe_btree_map>;
-
-template<typename K, typename V>
-using sparse_hash_map = google::sparse_hash_map<K, V>; //Two-parameters wrapper
-template<typename Graph>
-using UnclusteredPairedInfoIndexT = PairedIndex<Graph, RawPointTraits, sparse_hash_map>;
-
-/**
- * @brief A collection of paired indexes which can be manipulated as one.
- * Used as a convenient wrapper in parallel index processing.
- */
-template<class Index>
-class PairedIndices {
- typedef std::vector<Index> Storage;
- Storage data_;
-
-public:
- PairedIndices() {}
-
- PairedIndices(const typename Index::Graph& graph, size_t lib_num) {
- data_.reserve(lib_num);
- for (size_t i = 0; i < lib_num; ++i)
- data_.emplace_back(graph);
- }
-
- /**
- * @brief Initializes all indexes with zero points.
- */
- void Init() { for (auto& it : data_) it.Init(); }
-
- /**
- * @brief Clears all indexes.
- */
- void Clear() { for (auto& it : data_) it.Clear(); }
-
- Index& operator[](size_t i) { return data_[i]; }
-
- const Index& operator[](size_t i) const { return data_[i]; }
-
- size_t size() const { return data_.size(); }
-
- typename Storage::iterator begin() { return data_.begin(); }
- typename Storage::iterator end() { return data_.end(); }
-
- typename Storage::const_iterator begin() const { return data_.begin(); }
- typename Storage::const_iterator end() const { return data_.end(); }
-};
-
-template<class Graph>
-using PairedInfoIndicesT = PairedIndices<PairedInfoIndexT<Graph>>;
-
-template<class Graph>
-using UnclusteredPairedInfoIndicesT = PairedIndices<UnclusteredPairedInfoIndexT<Graph>>;
-
-template<typename K, typename V>
-using unordered_map = std::unordered_map<K, V>; //Two-parameters wrapper
-template<class Graph>
-using PairedInfoBuffer = PairedIndex<Graph, RawPointTraits, unordered_map>;
-
-template<class Graph>
-using PairedInfoBuffersT = PairedIndices<PairedInfoBuffer<Graph>>;
-
-}
-
-}
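
The key invariant of PairedIndex above is conjugate symmetry: Add(e1, e2, p) also records a mirrored entry for (conjugate(e2), conjugate(e1)). The toy model below illustrates only that mirroring; edge IDs are plain ints, conjugation is simple negation, and the Shrink/Expand offset handling done by the real Traits is deliberately omitted:

    // Toy model of the conjugate-symmetric insertion performed by PairedIndex::Add above.
    #include <iostream>
    #include <map>
    #include <set>

    using EdgeId = int;
    EdgeId conjugate(EdgeId e) { return -e; }       // stand-in for Graph::conjugate

    using Histogram = std::multiset<double>;        // distances only; weights omitted
    std::map<EdgeId, std::map<EdgeId, Histogram>> storage;

    void Add(EdgeId e1, EdgeId e2, double d) {
        storage[e1][e2].insert(d);                            // straight entry
        storage[conjugate(e2)][conjugate(e1)].insert(d);      // mirrored conjugate entry
    }

    int main() {
        Add(1, 2, 150.);
        // Both the straight pair and its conjugate pair are now present:
        std::cout << storage[1][2].size() << " " << storage[-2][-1].size() << "\n";  // 1 1
    }
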
diff --git a/src/modules/paired_info/peak_finder.hpp b/src/modules/paired_info/peak_finder.hpp
deleted file mode 100644
index c127108..0000000
--- a/src/modules/paired_info/peak_finder.hpp
+++ /dev/null
@@ -1,385 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * PeakFinder.hpp
- *
- * Created on: Aug 15, 2011
- * Author: alexeyka
- */
-
-#ifndef PEAKFINDER_HPP_
-#define PEAKFINDER_HPP_
-
-#include "dev_support/verify.hpp"
-#include "data_divider.hpp"
-#include "paired_info.hpp"
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <complex>
-#include <cmath>
-
-namespace omnigraph{
-
-namespace de {
-
-template <class EdgeId>
-class PeakFinder {
-
- typedef std::complex<double> complex_t;
-
- public:
- PeakFinder(const vector<PairInfo<EdgeId> >& data,
- size_t begin,
- size_t end,
- size_t /*range*/,
- size_t delta,
- double percentage,
- double der_thr) :
- delta_(delta),
- percentage_(percentage),
- der_thr_(der_thr)
- {
- for (size_t i = begin; i < end; ++i) {
- x_.push_back(rounded_d(data[i]));
- y_.push_back(data[i].weight());
- }
- Init();
- }
-
- double weight() const {
- return weight_;
- }
-
- double GetNormalizedWeight() const {
- return weight_;
- }
-
- void PrintStats(string host) const {
- for (size_t i = 0; i < data_len_; ++i)
- DEBUG(host << (x_left_ + (int) i) << " " << hist_[i]);
- }
-
- void FFTSmoothing(double cutoff) {
- VERIFY(data_len_ > 0);
- if (data_len_ == 1) {
- hist_[0] = y_[0];
- return;
- }
- InitBaseline();
- SubtractBaseline();
- FFTForward(hist_);
- size_t Ncrit = (size_t) (cutoff);
-
- // cutting off - standard parabolic filter
- for (size_t i = 0; i < data_len_ && i < Ncrit; ++i)
- hist_[i] *= 1. - ((double) i * (double) i * 1.) / (double) (Ncrit * Ncrit);
-
- for (size_t i = Ncrit; i < hist_.size(); ++i)
- hist_[i] = 0.;
-
- FFTBackward(hist_);
- AddBaseline();
- }
-
- bool IsPeak(int dist, size_t range) const {
- return IsLocalMaximum(dist, range);
- }
-
- bool IsPeak(int dist) const {
- return IsLocalMaximum(dist, 10);
- }
-
-    // lists the local maxima (peaks) of the smoothed histogram
- vector<pair<int, double> > ListPeaks(/*int delta = 3*/) const {
- TRACE("Smoothed data");
- //size_t index_max = 0;
- //for (size_t i = 0; i < data_len_; ++i) {
- //TRACE(x_left_ + (int) i << " " << hist_[i]);
- //if (hist_[i].real() > hist_[index_max].real())
- //index_max = i;
- //}
- //vector<pair<int, double> > result;
- //result.push_back(make_pair(x_left_ + index_max, hist_[index_max].real()));
- //return result;
- DEBUG("Listing peaks");
- map<int, double> peaks_;
- //another data_len_
- size_t data_len_ = (size_t) (x_right_ - x_left_);
- vector<bool> was;
- srand((unsigned) time(NULL));
- for (size_t i = 0; i < data_len_; ++i)
- was.push_back(false);
-
- size_t iteration = 0;
- for (size_t l = 0; l < data_len_; ++l) {
- //for (size_t k = 0; k < 4; ++k) {
- //size_t v = std::rand() % data_len_;
- size_t v = l;
- if (was[v])
- continue;
-
- was[v] = true;
- int index = (int) v + x_left_;
- while (index < (x_right_ - 1) && index > x_left_ && iteration < 5) {
- // if @index is local maximum, then leave it
- double right_derivative = RightDerivative(index);
- double left_derivative = LeftDerivative(index);
-
- if (math::gr(right_derivative, 0.) && math::gr(right_derivative, -left_derivative)) {
- index++;
- if ((iteration & 1) == 0)
- ++iteration;
- }
- else if (math::le(left_derivative, 0.)) {
- index--;
- if ((iteration & 1) == 1)
- ++iteration;
- }
- else
- break;
- }
-
- TRACE("FOUND " << index);
-
- //double right_derivative = RightDerivative(index);
- //double left_derivative = LeftDerivative(index);
-
- if (index < 0)
- continue;
-
- //if (index >= x_right_ - delta || index < x_left_ + delta)
- //continue;
-
- TRACE("Is in range");
-
- if (IsPeak(index, 5)) {
- TRACE("Is local maximum " << index);
- double weight_ = 0.;
- int left_bound = (x_left_ > (index - 20) ? x_left_ : (index - 20));
- int right_bound = (x_right_ < (index + 1 + 20) ? x_right_ : (index + 1 + 20));
- for (int i = left_bound; i < right_bound; ++i)
- weight_ += hist_[i - x_left_].real();
- TRACE("WEIGHT counted");
- pair<int, double> tmp_pair = make_pair(index, 100. * weight_);
- if (!peaks_.count(index)) {
- TRACE("Peaks size " << peaks_.size() << ", inserting " << tmp_pair);
- peaks_.insert(tmp_pair);
- } else {
- TRACE("NON UNIQUE");
- }
- }
- }
- TRACE("FINISHED " << peaks_.size());
- vector<pair<int, double> > peaks;
- for (auto iter = peaks_.begin(); iter != peaks_.end(); ++iter) {
- const pair<int, double>& tmp_pair = *iter;
- TRACE("next peak " << tmp_pair);
- peaks.push_back(tmp_pair);
- //for (int i = -10; i <= 10; ++i) {
- //peaks.push_back(make_pair(tmp_pair.first + i, tmp_pair.second / 21.));
- //}
- }
- return peaks;
- }
-
- vector<complex_t> getIn() const {
- return hist_;
- }
-
- vector<complex_t> getOut() const {
- return hist_;
- }
-
-private:
- double x1, x2, y1, y2;
- size_t delta_;
- double percentage_;
- double der_thr_;
- double weight_;
- vector<int> x_;
- vector<double> y_;
- size_t data_size_, data_len_;
- int x_left_, x_right_;
- vector<complex_t> hist_;
-
- size_t Rev(size_t num, size_t lg_n) {
- size_t res = 0;
- for (size_t i = 0; i < lg_n; ++i)
- if (num & (1 << i))
- res |= 1 << (lg_n - 1 - i);
- return res;
- }
-
- void FFT(vector<complex_t>& vect, bool invert) {
- size_t n = vect.size();
- size_t lg_n = 0;
- while ( (1u << lg_n) < n)
- ++lg_n;
-
- while (n < (1u << lg_n)) {
- vect.push_back(0.);
- ++n;
- }
-
- for (size_t i = 0; i < n; ++i)
- if (i < Rev(i, lg_n))
- swap(vect[i], vect[Rev(i, lg_n)]);
-
- for (size_t len = 2; len < 1 + n; len <<= 1) {
- double ang = 2 * M_PI / (double) len * (invert ? -1 : 1);
- complex_t wlen(cos(ang), sin(ang));
- for (size_t i = 0; i < n; i += len) {
- complex_t w(1.);
- for (size_t j = 0; j < (len >> 1); ++j) {
- complex_t u = vect[i + j];
- complex_t v = vect[i + j + (len >> 1)] * w;
- vect[i + j] = u + v;
- vect[i + j + (len >> 1)] = u - v;
- w *= wlen;
- }
- }
- }
-
- if (invert)
- for (size_t i = 0; i < n; ++i)
- vect[i] /= (double) n;
- }
-
-
- void FFTForward(vector<complex_t>& vect) {
- FFT(vect, false);
- }
-
- void FFTBackward(vector<complex_t>& vect) {
- FFT(vect, true);
- }
-
- void ExtendLinear(vector<complex_t>& hist) {
- size_t ind = 0;
- weight_ = 0.;
- for (size_t i = 0; i < data_len_; ++i) {
- if (ind == data_size_ - 1)
- hist.push_back((double) x_right_);
- else {
- VERIFY(x_[ind + 1] > x_[ind]);
- hist.push_back(((double) (i + x_left_ - x_[ind]) *
- y_[ind + 1] + y_[ind] *
- (double) (x_[ind + 1] - i - x_left_)) /
- (double) (1 * (x_[ind + 1] - x_[ind])));
- }
- weight_ += hist[i].real(); // filling the array on the fly
-
- if (ind < data_size_ && ((int) i == x_[ind + 1] - x_left_))
- ++ind;
- }
-
- }
-
-
- void InitBaseline() {
- size_t Np = (size_t) ((double) data_len_ * percentage_);
-        if (Np == 0) Np++; // Np must be non-zero
-
- double mean_beg = 0.;
- double mean_end = 0.;
- for (size_t i = 0; i < Np; ++i) {
- mean_beg += hist_[i].real();
- mean_end += hist_[data_len_ - i - 1].real();
- }
- mean_beg /= 1. * (double) Np;
- mean_end /= 1. * (double) Np;
-
- // two points defining the line
- x1 = (double) Np / 2.;
- x2 = (double) data_len_ - (double) Np / 2.;
- y1 = mean_beg;
- y2 = mean_end;
- }
-
- void SubtractBaseline() {
- // subtracting a baseline
-        // it is constructed like this: the first point is (Np/2, mean of the first percentage_ of the data),
-        // the second point is (data_len_ - Np/2, mean of the last percentage_ of the data)
- for (size_t i = 0; i < data_len_; ++i) {
- hist_[i] -= (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
- }
- }
-
- void AddBaseline() {
- for (size_t i = 0; i < data_len_; ++i) {
- hist_[i] += (y1 + (y2 - y1) * ((double) i - x1) / (x2 - x1));
- }
- }
-
- void Init() {
- data_size_ = x_.size();
- x_left_ = x_[0];
- x_right_ = x_[data_size_ - 1] + 1;
- data_len_ = x_right_ - x_left_;
- ExtendLinear(hist_);
- }
-
- bool IsInRange(int peak) const {
- return peak < x_right_ && peak >= x_left_;
- }
-
- double LeftDerivative(int dist) const {
- VERIFY(dist > x_left_);
- return hist_[dist - x_left_].real() - hist_[dist - x_left_ - 1].real();
- }
-
- double RightDerivative(int dist) const {
- VERIFY(dist < x_right_ - 1);
- return hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_].real();
- }
-
- double MiddleDerivative(int dist) const {
- VERIFY(dist > x_left_ && dist < x_right_ - 1);
- return .5 * (hist_[dist - x_left_ + 1].real() - hist_[dist - x_left_ - 1].real());
- }
-
- double Derivative(int dist) const {
- if (dist == x_right_ - 1)
- return LeftDerivative(dist);
- else if (dist == x_left_)
- return RightDerivative(dist);
- else
- return MiddleDerivative(dist);
- }
-
- bool IsLocalMaximum(int peak, size_t range, int left_bound, int right_bound, size_t delta) const {
-
- DEBUG("Is local maximum : peak " << peak << " range " << range
- << " bounds " << left_bound << " " << right_bound << " delta " << delta);
- int index_max = peak;
- TRACE("Looking for the maximum");
- for (int j = left_bound; j < right_bound; ++j)
- if (math::ls(hist_[index_max - x_left_].real(), hist_[j - x_left_].real())) {
- index_max = j;
- }// else if (j < i && hist_[index_max - x_left_][0] == hist_[j - x_left][0] ) index_max = j;
- TRACE("Maximum is " << index_max);
-
- if ((size_t)abs(index_max - peak) <= delta)
- return true;
-
- return false;
- }
-
- bool IsLocalMaximum(int peak, size_t range) const {
- return IsLocalMaximum(peak, range, x_left_, x_right_, delta_);
- }
-
- DECL_LOGGER("PeakFinder");
-};
-
-}
-
-}
-
-#endif /* PEAKFINDER_HPP_ */
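
FFTSmoothing above low-passes the histogram by transforming it, scaling the first Ncrit Fourier coefficients with the parabolic window 1 - (i/Ncrit)^2, zeroing the remaining coefficients, and transforming back. The sketch below shows only the filtering step on a made-up coefficient vector, assuming the forward and inverse transforms are performed elsewhere:

    // Sketch of the parabolic cutoff applied by FFTSmoothing above (coefficients are made up).
    #include <complex>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    using complex_t = std::complex<double>;

    void ParabolicCutoff(std::vector<complex_t>& coeffs, std::size_t Ncrit) {
        // Attenuate the low frequencies by 1 - (i/Ncrit)^2 ...
        for (std::size_t i = 0; i < coeffs.size() && i < Ncrit; ++i)
            coeffs[i] *= 1. - (double) (i * i) / (double) (Ncrit * Ncrit);
        // ... and drop everything at or above the cutoff.
        for (std::size_t i = Ncrit; i < coeffs.size(); ++i)
            coeffs[i] = 0.;
    }

    int main() {
        std::vector<complex_t> coeffs = {{8., 0.}, {4., 1.}, {2., 2.}, {1., 3.}};
        ParabolicCutoff(coeffs, 2);    // keep coefficients 0 and 1 (attenuated), zero the rest
        for (const auto& c : coeffs)
            std::cout << c << " ";     // (8,0) (3,0.75) (0,0) (0,0)
        std::cout << "\n";
    }
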
diff --git a/src/modules/paired_info/smoothing_distance_estimation.hpp b/src/modules/paired_info/smoothing_distance_estimation.hpp
deleted file mode 100644
index 04f9410..0000000
--- a/src/modules/paired_info/smoothing_distance_estimation.hpp
+++ /dev/null
@@ -1,283 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef SMOOTHING_DISTANCE_ESTIMATION_HPP_
-#define SMOOTHING_DISTANCE_ESTIMATION_HPP_
-
-#include "paired_info.hpp"
-#include "data_divider.hpp"
-#include "peak_finder.hpp"
-#include "weighted_distance_estimation.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class Graph>
-class SmoothingDistanceEstimator : public WeightedDistanceEstimator<Graph> {
- //FIXME configure
- static const size_t OVERLAP_TOLERANCE = 1000;
-protected:
- typedef WeightedDistanceEstimator<Graph> base;
- typedef typename base::InPairedIndex InPairedIndex;
- typedef typename base::OutPairedIndex OutPairedIndex;
- typedef typename base::InHistogram InHistogram;
- typedef typename base::OutHistogram OutHistogram;
- typedef typename InPairedIndex::Histogram TempHistogram;
-
-public:
- SmoothingDistanceEstimator(const Graph &graph,
- const InPairedIndex &histogram,
- const GraphDistanceFinder<Graph> &dist_finder,
- std::function<double(int)> weight_f,
- size_t linkage_distance, size_t max_distance, size_t threshold,
- double range_coeff, double delta_coeff,
- size_t cutoff,
- size_t min_peak_points,
- double inv_density,
- double percentage,
- double derivative_threshold,
- bool only_scaffolding = false) :
- base(graph, histogram, dist_finder, weight_f, linkage_distance, max_distance),
- threshold_(threshold),
- range_coeff_(range_coeff),
- delta_coeff_(delta_coeff),
- cutoff_((int) cutoff),
- min_peak_points_(min_peak_points),
- inv_density_(inv_density),
- percentage_(percentage),
- deriv_thr(derivative_threshold),
- only_scaffolding_(only_scaffolding),
- gap_distances(0) { }
-
- virtual ~SmoothingDistanceEstimator() { }
-
-protected:
- typedef typename Graph::EdgeId EdgeId;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<pair<int, double> > EstimHist;
- typedef vector<PairInfo<EdgeId> > PairInfos;
- typedef vector<size_t> GraphLengths;
-
- EstimHist EstimateEdgePairDistances(EdgePair /*ep*/,
- const InHistogram & /*raw_data*/,
- const vector<size_t> & /*forward*/) const override {
-        VERIFY_MSG(false, "Sorry, the SMOOTHING estimator is not available anymore. " <<
-                "SPAdes is going to terminate");
-
- return EstimHist();
- }
-
-private:
- typedef pair<size_t, size_t> Interval;
-
- size_t threshold_;
- double range_coeff_;
- double delta_coeff_;
- int cutoff_;
- size_t min_peak_points_;
- double inv_density_;
- double percentage_;
- double deriv_thr;
- bool only_scaffolding_;
- mutable size_t gap_distances;
-
- EstimHist FindEdgePairDistances(EdgePair ep,
- const TempHistogram &raw_hist) const {
- size_t first_len = this->graph().length(ep.first);
- size_t second_len = this->graph().length(ep.second);
- TRACE("Lengths are " << first_len << " " << second_len);
- TempHistogram data;
- for (auto I = raw_hist.begin(), E = raw_hist.end(); I != E; ++I) {
- Point p = *I;
-            if (math::ge(2 * (long) rounded_d(p) + (long) second_len, (long) first_len) &&
-                    (long) rounded_d(p) + (long) OVERLAP_TOLERANCE >= (long) first_len)
-                data.insert(p);
- }
- EstimHist result;
- double picture_weight = 0.;
- for (auto I = data.begin(), E = data.end(); I != E; ++I)
- picture_weight += I->weight;
- if (math::ls(picture_weight, 3.))
- return result;
-
- DataDivider<EdgeId> data_divider(threshold_,
- vector<Point>(data.begin(), data.end()));
-
- PairInfos infos;
- infos.reserve(data.size());
- const vector<Interval> &clusters =
- data_divider.DivideAndSmoothData(ep, infos, this->weight_f_);
- DEBUG("Seeking for distances");
- TRACE("size " << infos.size());
-
- for (size_t i = 0; i < clusters.size(); ++i) {
- size_t begin = clusters[i].first;
- size_t end = clusters[i].second;
- TRACE("begin " << begin << " at " << rounded_d(infos[begin])
- << ", " << " end " << end << " at " << rounded_d(infos[end - 1]));
- size_t data_length = rounded_d(infos[end - 1]) - rounded_d(infos[begin]) + 1;
- TRACE("data length " << data_length);
- if (end - begin > min_peak_points_) {
- size_t range = (size_t) math::round((double) data_length * range_coeff_);
- size_t delta = (size_t) math::round((double) data_length * delta_coeff_);
- PeakFinder<EdgeId> peakfinder(infos, begin, end, range, delta, percentage_, deriv_thr);
- DEBUG("Processing window : " << rounded_d(infos[begin])
- << " " << rounded_d(infos[end - 1]));
- peakfinder.FFTSmoothing(cutoff_);
- TRACE("Listing peaks");
- const EstimHist &peaks = peakfinder.ListPeaks();
- //for (auto iter = peaks.begin(); iter != peaks.end(); ++iter) {
- //TRACE("PEAKS " << iter->first << " " << iter->second);
- //}
- if (peaks.size() == 0)
- continue;
- size_t index_of_max_weight = 0;
- for (size_t i = 0; i < peaks.size(); ++i)
- if (math::ls(peaks[index_of_max_weight].second, peaks[i].second))
- index_of_max_weight = i;
- result.push_back(peaks[index_of_max_weight]);
- }
- }
-
- if (result.size() == 0)
- return result;
- size_t index_of_max_weight = 0;
- for (size_t i = 0; i < result.size(); ++i)
- if (math::ls(result[index_of_max_weight].second, result[i].second))
- index_of_max_weight = i;
-
- EstimHist new_result;
- for (size_t i = 0; i < result.size(); ++i)
- if (result[i].second > .5 * result[index_of_max_weight].second)
- new_result.push_back(result[i]);
- return new_result;
- }
-
- void ProcessEdge(EdgeId e1,
- const InPairedIndex &pi,
- PairedInfoBuffer<Graph> &result) const override {
- typename base::LengthMap second_edges;
- auto inner_map = pi.GetHalf(e1);
- for (auto I : inner_map)
- second_edges[I.first];
-
- this->FillGraphDistancesLengths(e1, second_edges);
-
- for (const auto &entry: second_edges) {
- EdgeId e2 = entry.first;
- EdgePair ep(e1, e2);
-
- VERIFY(ep <= pi.ConjugatePair(ep));
-
- TRACE("Processing edge pair " << this->graph().int_id(e1)
- << " " << this->graph().int_id(e2));
- const GraphLengths &forward = entry.second;
-
- auto hist = pi.Get(e1, e2).Unwrap();
- EstimHist estimated;
- //DEBUG("Extending paired information");
- //DEBUG("Extend left");
- //this->base::ExtendInfoLeft(e1, e2, hist, 1000);
- DEBUG("Extend right");
- this->ExtendInfoRight(e1, e2, hist, 1000);
- if (forward.size() == 0) {
- estimated = FindEdgePairDistances(ep, hist);
- ++gap_distances;
- } else if (forward.size() > 0 && (!only_scaffolding_)) {
- //TODO: remove THIS
- InPairedIndex temp_index(this->graph());
- temp_index.AddMany(e1, e2, hist);
- auto hist = temp_index.Get(e1, e2);
- estimated = this->base::EstimateEdgePairDistances(ep, hist, forward);
- }
- DEBUG(gap_distances << " distances between gap edge pairs have been found");
- OutHistogram res = this->ClusterResult(ep, estimated);
- this->AddToResult(res, ep, result);
- }
- }
-
- bool IsTipTip(EdgeId e1, EdgeId e2) const {
- return (this->graph().OutgoingEdgeCount(this->graph().EdgeEnd(e1)) == 0 &&
- this->graph().IncomingEdgeCount(this->graph().EdgeEnd(e1)) == 1 &&
- this->graph().IncomingEdgeCount(this->graph().EdgeStart(e2)) == 0 &&
- this->graph().OutgoingEdgeCount(this->graph().EdgeStart(e2)) == 1);
- }
-
- void ExtendInfoRight(EdgeId e1, EdgeId e2, TempHistogram &data, size_t max_shift) const {
- ExtendRightDFS(e1, e2, data, 0, max_shift);
- }
-
- void MergeInto(const InHistogram &what, TempHistogram &where, int shift) const {
- // assuming they are sorted already
- if (what.size() == 0)
- return;
-
- if (where.size() == 0) {
- for (auto to_be_added : what) {
- to_be_added.d += shift;
- where.insert(to_be_added);
- }
-
- return;
- }
-
-        // Check whether the two histograms intersect. If not, we can just merge them
- // straightforwardly.
- if (math::ls(where.rbegin()->d, what.min().d + shift) ||
- math::gr(where.begin()->d, what.max().d + shift)) {
- for (auto to_be_added : what) {
- to_be_added.d += shift;
- where.insert(to_be_added);
- }
- } else {
- for (auto to_be_added : what) {
- to_be_added.d += shift;
- auto low_bound = std::lower_bound(where.begin(), where.end(), to_be_added);
- if (to_be_added == *low_bound) {
- to_be_added.weight += low_bound->weight;
- where.erase(to_be_added);
- where.insert(to_be_added);
- } else
- where.insert(low_bound, to_be_added);
- }
- }
- }
-
- void ExtendRightDFS(const EdgeId &first, EdgeId current, TempHistogram &data, int shift,
- size_t max_shift) const {
- auto end = this->graph().EdgeEnd(current);
- if (current == first)
- return;
- if (this->graph().IncomingEdgeCount(end) > 1)
- return;
-
- for (EdgeId next : this->graph().OutgoingEdges(end)) {
- auto hist = this->index().Get(first, next);
- if (-shift < (int) max_shift)
- ExtendRightDFS(first, next, data, shift - (int) this->graph().length(current), max_shift);
-
- //auto filtered_infos = FilterPositive(hist, this->graph().length(first), this->graph().length(next));
- //if (filtered_infos.size() > 0)
- // MergeInto(filtered_infos, data, shift - (int) this->graph().length(current));
- MergeInto(hist, data, shift - (int) this->graph().length(current));
- }
- }
-
- const string Name() const override {
- static const string my_name = "SMOOTHING";
- return my_name;
- }
-
- DECL_LOGGER("SmoothingDistanceEstimator")
-};
-
-}
-
-}
-
-#endif /* SMOOTHING_DISTANCE_ESTIMATION_HPP_ */
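
MergeInto above merges one histogram into another after shifting its distances, summing weights whenever a shifted point lands on a distance that is already present. The sketch below compresses the three branches of the original into a single accumulation over a map keyed by distance (a simplification; the real code works on sorted point sets):

    // Simplified sketch of the MergeInto step above (distances and weights are made up).
    #include <iostream>
    #include <map>

    using Histogram = std::map<int, double>;  // distance -> weight

    void MergeInto(const Histogram& what, Histogram& where, int shift) {
        for (const auto& p : what)
            where[p.first + shift] += p.second;  // new distance, or weight added to an existing one
    }

    int main() {
        Histogram where = {{100, 2.0}, {150, 1.0}};
        Histogram what  = {{40, 0.5}, {90, 3.0}};
        MergeInto(what, where, 60);   // 40+60 and 90+60 both land on existing distances, so weights are summed
        for (const auto& p : where)
            std::cout << p.first << ":" << p.second << " ";  // 100:2.5 150:4
        std::cout << "\n";
    }
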
diff --git a/src/modules/paired_info/split_path_constructor.hpp b/src/modules/paired_info/split_path_constructor.hpp
deleted file mode 100644
index 9cf0c2f..0000000
--- a/src/modules/paired_info/split_path_constructor.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
-* split_path_constructor.hpp
-*
-* Created on: Jun 14, 2012
-* Author: avsirotkin
-*/
-
-#pragma once
-
-#include "dev_support/logger/logger.hpp"
-#include "paired_info/paired_info.hpp"
-#include "assembly_graph/paths/path_processor.hpp"
-#include "paired_info/pair_info_bounds.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class PathInfoClass {
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
-
- EdgeId base_edge;
- vector<PairInfo> path;
-
- PathInfoClass() : base_edge(NULL) { };
-
- PathInfoClass(const EdgeId Edge) : base_edge(Edge) { };
-
- std::pair<EdgeId, double> operator[](const size_t i) const {
- if (i == 0)
- return std::make_pair(base_edge, 0.0);
-
- VERIFY(i < path.size() + 1);
- return std::make_pair(path[i - 1].second, path[i - 1].d());
- }
-
- size_t size() const { return path.size() + 1; }
-
- void push_back(const PairInfo &pi) { path.push_back(pi); }
-
- typename std::vector<PairInfo>::const_iterator begin() const { return path.begin(); }
-
- typename std::vector<PairInfo>::const_iterator end() const { return path.end(); }
-
- std::string PrintPath(const Graph &graph) const {
- std::ostringstream ss;
- ss << " " << graph.int_id(base_edge) << ": ";
- for (size_t j = 0; j < path.size(); j++) {
- ss << "(" << graph.int_id(path[j].second) << ", " << path[j].d() << "), ";
- }
- return ss.str();
- }
-};
-
-template<class Graph>
-class SplitPathConstructor {
- typedef typename Graph::EdgeId EdgeId;
- typedef PathInfoClass<Graph> PathInfo;
- typedef omnigraph::de::PairInfo<EdgeId> PairInfo;
-
-public:
- SplitPathConstructor(const Graph &graph) : graph_(graph) { }
-
- vector<PathInfo> ConvertPIToSplitPaths(EdgeId cur_edge, const omnigraph::de::PairedInfoIndexT<Graph> &pi,
- double is, double is_var) const {
- vector<PairInfo> pair_infos; //TODO: this is an adaptor for the old implementation
- for (auto i : pi.Get(cur_edge))
- for (auto j : i.second)
- pair_infos.emplace_back(cur_edge, i.first, j);
-
- vector<PathInfo> result;
- if (pair_infos.empty())
- return result;
-
- vector<bool> pair_info_used(pair_infos.size());
- TRACE("Preparing path_processor for this base edge");
- size_t path_upper_bound = PairInfoPathLengthUpperBound(graph_.k(), (size_t) is, is_var);
-
- //FIXME is path_upper_bound enough?
- PathProcessor<Graph> path_processor(graph_,
- graph_.EdgeEnd(cur_edge),
- path_upper_bound);
-
- TRACE("Path_processor is done");
-
- for (size_t i = pair_infos.size(); i > 0; --i) {
- const PairInfo &cur_info = pair_infos[i - 1];
- if (math::le(cur_info.d(), 0.))
- continue;
- if (pair_info_used[i - 1])
- continue;
- DEBUG("SPC: pi " << cur_info);
- vector<EdgeId> common_part = GetCommonPathsEnd(graph_, cur_edge, cur_info.second,
- (size_t) (cur_info.d() - cur_info.var()),
- (size_t) (cur_info.d() + cur_info.var()),
- path_processor);
- DEBUG("Found common part of size " << common_part.size());
- PathInfoClass<Graph> sub_res(cur_edge);
- if (common_part.size() > 0) {
- size_t total_length = 0;
- for (size_t j = 0; j < common_part.size(); ++j)
- total_length += graph_.length(common_part[j]);
-
- DEBUG("Common part " << ToString(common_part));
- for (size_t j = 0; j < common_part.size(); ++j) {
- PairInfo cur_pi(cur_edge, common_part[j],
- cur_info.d() - (double) total_length,
- cur_info.weight(),
- cur_info.var());
-
- sub_res.push_back(cur_pi);
- total_length -= graph_.length(common_part[j]);
- for (size_t ind = 0; ind + 1 < i; ++ind) {
- if (cur_pi == pair_infos[ind])
- pair_info_used[ind] = true;
- }
- }
- }
-
- sub_res.push_back(cur_info);
- result.push_back(sub_res);
- DEBUG(sub_res.PrintPath(graph_));
- }
- return result;
- }
-
-private:
- const Graph &graph_;
-};
-
-
-}
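
The inner loop of ConvertPIToSplitPaths above projects one pair info onto every edge of the common tail shared by all plausible paths: each edge receives the original distance minus the combined length of the tail edges from itself onward. A small sketch of just that bookkeeping, with made-up edge IDs and lengths:

    // Sketch of the projection step inside ConvertPIToSplitPaths above (edge IDs and lengths are made up).
    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        double d = 500.;  // distance of the original pair info (base_edge, e2)
        std::vector<std::pair<int, std::size_t>> common_part = {{7, 120}, {8, 80}, {9, 150}};  // (edge id, length)

        std::size_t total_length = 0;
        for (const auto& e : common_part)
            total_length += e.second;  // 350

        // Same bookkeeping as the original loop: subtract the remaining tail length from d
        // before stepping past each edge.
        for (const auto& e : common_part) {
            std::cout << "edge " << e.first << " at distance " << d - (double) total_length << "\n";
            total_length -= e.second;
        }
        // edge 7 at 150, edge 8 at 270, edge 9 at 350
    }
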
diff --git a/src/modules/paired_info/weighted_distance_estimation.hpp b/src/modules/paired_info/weighted_distance_estimation.hpp
deleted file mode 100644
index 9928ef9..0000000
--- a/src/modules/paired_info/weighted_distance_estimation.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef WEIGHTED_DISTANCE_ESTIMATION_HPP_
-#define WEIGHTED_DISTANCE_ESTIMATION_HPP_
-
-#include "math/xmath.h"
-#include "paired_info.hpp"
-#include "distance_estimation.hpp"
-
-namespace omnigraph {
-
-namespace de {
-
-template<class Graph>
-class WeightedDistanceEstimator : public DistanceEstimator<Graph> {
-protected:
- typedef DistanceEstimator<Graph> base;
- typedef typename base::InPairedIndex InPairedIndex;
- typedef typename base::OutPairedIndex OutPairedIndex;
- typedef typename base::InHistogram InHistogram;
- typedef typename base::OutHistogram OutHistogram;
-
-public:
- WeightedDistanceEstimator(const Graph &graph,
- const InPairedIndex &histogram,
- const GraphDistanceFinder<Graph> &distance_finder,
- std::function<double(int)> weight_f,
- size_t linkage_distance, size_t max_distance) :
- base(graph, histogram, distance_finder, linkage_distance, max_distance), weight_f_(weight_f) { }
-
- virtual ~WeightedDistanceEstimator() { }
-
-protected:
- typedef typename Graph::EdgeId EdgeId;
-
- typedef vector<pair<int, double> > EstimHist;
- typedef pair<EdgeId, EdgeId> EdgePair;
- typedef vector<size_t> GraphLengths;
-
- std::function<double(int)> weight_f_;
-
- virtual EstimHist EstimateEdgePairDistances(EdgePair ep,
- const InHistogram &histogram,
- const GraphLengths &raw_forward) const override {
- using std::abs;
- using namespace math;
- TRACE("Estimating with weight function");
- size_t first_len = this->graph().length(ep.first);
- size_t second_len = this->graph().length(ep.second);
-
- EstimHist result;
- int maxD = rounded_d(histogram.max()), minD = rounded_d(histogram.min());
- vector<int> forward;
- for (auto length : raw_forward) {
- if (minD - (int) this->max_distance_ <= length && length <= maxD + (int) this->max_distance_) {
- forward.push_back(length);
- }
- }
- if (forward.size() == 0)
- return result;
-
- DEDistance max_dist = this->max_distance_;
- size_t cur_dist = 0;
- vector<double> weights(forward.size());
- for (auto point : histogram) {
- if (ls(2. * point.d + (double) second_len, (double) first_len))
- continue;
- while (cur_dist + 1 < forward.size() && (double) forward[cur_dist + 1] < point.d) {
- ++cur_dist;
- }
- if (cur_dist + 1 < forward.size() && ls((double) forward[cur_dist + 1] - point.d,
- point.d - (double) forward[cur_dist])) {
- ++cur_dist;
- if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * weight_f_(forward[cur_dist] - rounded_d(point));
- }
- else if (cur_dist + 1 < forward.size() && eq(forward[cur_dist + 1] - point.d,
- point.d - forward[cur_dist])) {
- if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * 0.5 * weight_f_(forward[cur_dist] - rounded_d(point));
-
- ++cur_dist;
-
- if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * 0.5 * weight_f_(forward[cur_dist] - rounded_d(point));
- } else if (le(abs(forward[cur_dist] - point.d), max_dist))
- weights[cur_dist] += point.weight * weight_f_(forward[cur_dist] - rounded_d(point));
- }
-
- for (size_t i = 0; i < forward.size(); ++i)
- if (gr(weights[i], 0.))
- result.push_back(make_pair(forward[i], weights[i]));
-
- return result;
- }
-
- const string Name() const override {
- static const string my_name = "WEIGHTED";
- return my_name;
- }
-
-};
-
-}
-
-}
-#endif
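
The WeightedDistanceEstimator removed above scores each candidate graph distance by accumulating paired-info weights scaled with a user-supplied weight function. A minimal standalone sketch of that nearest-candidate weighting (simplified: tie-splitting and the edge-overlap check are omitted; all names are illustrative, not part of the removed sources):

    #include <cmath>
    #include <cstdio>
    #include <cstdlib>
    #include <functional>
    #include <utility>
    #include <vector>

    // Each paired-info point (d, w) contributes w * weight_f(candidate - d) to the
    // candidate graph distance closest to d, provided the mismatch stays within
    // max_distance.
    static std::vector<std::pair<int, double>> WeightCandidates(
            const std::vector<int>& candidates,                  // sorted forward distances
            const std::vector<std::pair<double, double>>& hist,  // (distance, weight) points
            const std::function<double(int)>& weight_f,
            double max_distance) {
        if (candidates.empty())
            return {};
        std::vector<double> acc(candidates.size(), 0.0);
        for (const auto& point : hist) {
            size_t best = 0;
            for (size_t i = 1; i < candidates.size(); ++i)
                if (std::abs(candidates[i] - point.first) < std::abs(candidates[best] - point.first))
                    best = i;
            if (std::abs(candidates[best] - point.first) <= max_distance)
                acc[best] += point.second * weight_f(int(candidates[best] - std::lround(point.first)));
        }
        std::vector<std::pair<int, double>> result;
        for (size_t i = 0; i < candidates.size(); ++i)
            if (acc[i] > 0.0)
                result.emplace_back(candidates[i], acc[i]);
        return result;
    }

    int main() {
        auto weight_f = [](int offset) { return 1.0 / (1.0 + std::abs(offset)); };
        for (const auto& p : WeightCandidates({100, 250}, {{102.0, 3.0}, {248.0, 5.0}}, weight_f, 10.0))
            std::printf("distance %d -> weight %.3f\n", p.first, p.second);
    }
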
diff --git a/src/modules/paired_info/weights.hpp b/src/modules/paired_info/weights.hpp
deleted file mode 100644
index 8812d68..0000000
--- a/src/modules/paired_info/weights.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#pragma once
-
-using omnigraph::Path;
-using omnigraph::MappingPath;
-using omnigraph::Range;
-using omnigraph::MappingRange;
-
-namespace debruijn_graph {
-inline double PairedReadCountWeight(const MappingRange&, const MappingRange&) {
- return 1.;
-}
-
-inline double KmerCountProductWeight(const MappingRange& mr1,
- const MappingRange& mr2) {
- return (double)(mr1.initial_range.size() * mr2.initial_range.size());
-}
-
-class WeightDEWrapper {
-private:
-
- vector<double> new_hist;
- int left_x;
- int insert_size;
-
- void ExtendLinear(const std::map<int, size_t> & hist) {
- size_t sum_weight = 0;
-
- for (auto iter = hist.begin(); iter != hist.end(); ++iter)
- sum_weight += iter->second;
- DEBUG(sum_weight);
-
- VERIFY(hist.size() > 0);
- auto iter = hist.begin();
-
- left_x = iter->first;
-
- int prev = iter->first;
- size_t prev_val = iter->second;
-
- new_hist.push_back((double)prev_val / (double)sum_weight);
- ++iter;
-
- for (; iter != hist.end(); ++iter) {
- int x = iter->first;
- size_t y = iter->second;
- double tan = ((double)y - (double)prev_val) / (x - prev);
-
- VERIFY(prev < x);
- for (int i = prev + 1; i <= x; ++i) {
- new_hist.push_back(((double)prev_val + tan * (i - prev)) / (double)sum_weight);
- }
- prev = x;
- prev_val = y;
- DEBUG("hist " << x << " " << y);
- }
- }
-
-public:
- WeightDEWrapper(const map<int, size_t>& hist, double IS) {
- DEBUG("WeightDEWrapper " << IS);
- insert_size = (int) IS;
- DEBUG("Extending linear");
- ExtendLinear(hist);
- }
-
- ~WeightDEWrapper() {
- }
-
-
- double CountWeight(int x) const {
- int xx = insert_size - left_x + x - 1;
-
- if (!(xx >= 0 && xx < (int) new_hist.size())) return 0.;
- VERIFY(math::le(new_hist[xx], 1.));
- return 1000. * new_hist[xx];
- }
-};
-
-inline double UnityFunction(int /*x*/) {
- return 1.;
-}
-}
\ No newline at end of file
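
The removed WeightDEWrapper turns an empirical insert-size histogram into a dense, normalized lookup table via linear interpolation (ExtendLinear) before CountWeight() samples it. A small self-contained sketch of just that interpolation step (illustrative only, not the project code):

    #include <cstdio>
    #include <map>
    #include <vector>

    // Normalize a sparse histogram by its total weight and fill the gaps between
    // consecutive observed x values by linear interpolation, mirroring ExtendLinear().
    static std::vector<double> ExtendLinearSketch(const std::map<int, std::size_t>& hist) {
        if (hist.empty())
            return {};
        double total = 0.0;
        for (const auto& kv : hist)
            total += double(kv.second);

        std::vector<double> dense;
        auto it = hist.begin();
        int prev_x = it->first;
        double prev_y = double(it->second);
        dense.push_back(prev_y / total);
        for (++it; it != hist.end(); ++it) {
            double slope = (double(it->second) - prev_y) / (it->first - prev_x);
            for (int x = prev_x + 1; x <= it->first; ++x)
                dense.push_back((prev_y + slope * (x - prev_x)) / total);
            prev_x = it->first;
            prev_y = double(it->second);
        }
        return dense;
    }

    int main() {
        std::map<int, std::size_t> insert_sizes = {{200, 10}, {204, 30}, {210, 6}};
        for (double w : ExtendLinearSketch(insert_sizes))
            std::printf("%.4f ", w);
        std::printf("\n");
    }
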
diff --git a/src/modules/pipeline/config_common.hpp b/src/modules/pipeline/config_common.hpp
deleted file mode 100755
index e540017..0000000
--- a/src/modules/pipeline/config_common.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/simple_tools.hpp"
-#include "dev_support/path_helper.hpp"
-#include "dev_support/verify.hpp"
-
-// todo: undo dirty fix
-
-#include <boost/property_tree/ptree.hpp>
-#include <boost/property_tree/info_parser.hpp>
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <fstream>
-#include <map>
-
-namespace config_common {
-// for enable_if/disable_if
-namespace details {
-template<class T, class S>
-struct is_equal_type {
- static const bool value = false;
-};
-
-template<class T>
-struct is_equal_type<T, T> {
- static const bool value = true;
-};
-}
-
-template<class T>
-typename boost::enable_if_c<details::is_equal_type<T, std::string>::value ||
- boost::is_arithmetic<T>::value>::type
-load(T &value,
- boost::property_tree::ptree const &pt, std::string const &key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found())
- value = pt.get<T>(key);
-}
-
-template<class T>
-typename boost::disable_if_c<details::is_equal_type<T,
- std::string>::value ||
- boost::is_arithmetic<T>::value>::type
-load(T &value,
- boost::property_tree::ptree const &pt, std::string const &key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found())
- load(value, pt.get_child(key), complete);
-}
-
-template<class T>
-void load_items(std::vector <T> &vec, boost::property_tree::ptree const &pt,
- std::string const &key, bool complete) {
- std::string vector_key = key + std::string(".count");
- if (complete || pt.find(vector_key) != pt.not_found()) {
- size_t count = pt.get<size_t>(vector_key);
-
- for (size_t i = 0; i != count; ++i) {
- T t;
- load(t, pt.get_child(fmt::format("{:s}.item_{:d}", key, i)),
- complete);
- vec.push_back(t);
- }
- }
-}
-
-template<class T>
-void load(std::vector <T> &vec, boost::property_tree::ptree const &pt, std::string const &key,
- bool /*complete*/) {
- boost::optional<T> value = pt.get_optional<T>(key);
- if (value) {
- vec.push_back(*value);
- return;
- }
- for (size_t i = 1; ; i++) {
- value = pt.get_optional<std::string>(key + "#" + ToString(i));
- if (value) {
- vec.push_back(*value);
- continue;
- }
- value = pt.get_optional<std::string>(key + "." + ToString(i));
- if (value) {
- vec.push_back(*value);
- continue;
- }
- if (i > 0) {
- return;
- }
- }
-}
-
-template<class T>
-void load(T &value, boost::property_tree::ptree const &pt, std::string const &key) {
- load(value, pt, key, true);
-}
-
-template<class T>
-void load(T &value, boost::property_tree::ptree const &pt, const char *key) {
- load(value, pt, std::string(key), true);
-}
-
-template<class T>
-void load(T &value, boost::property_tree::ptree const &pt) {
- load(value, pt, true);
-}
-
-template<class T>
-void load_param(const std::string &filename, const std::string &key,
- boost::optional<T> &value) {
- boost::property_tree::ptree pt;
- boost::property_tree::read_info(filename, pt);
- value = pt.get_optional<T>(key);
-}
-
-template<class T>
-void write_param(const std::string &filename, const std::string &key,
- const boost::optional<T> &value) {
- if (value) {
- std::ofstream params_stream(filename.c_str(), std::ios_base::app);
- params_stream << key << "\t" << value << std::endl;
- }
-}
-
-template<class T>
-void write_param(const std::string &filename, const std::string &key,
- const T &value) {
- std::ofstream params_stream(filename.c_str(), std::ios_base::app);
- params_stream << key << "\t" << value << std::endl;
-}
-
-}
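
The config_common::load helpers deleted above all follow the same contract: a value is assigned only when the tree is read "completely" or when the key is actually present, which is what lets a full base .info config be overlaid by partial override files. A hedged standalone sketch of that contract using boost::property_tree directly (toy_params and the file names are made up for illustration):

    #include <boost/property_tree/info_parser.hpp>
    #include <boost/property_tree/ptree.hpp>
    #include <iostream>
    #include <string>

    struct toy_params {
        std::string condition;
        int k = 0;
    };

    // Only touch a field when loading completely or when the key exists in the tree.
    static void load(toy_params& p, const boost::property_tree::ptree& pt, bool complete) {
        if (complete || pt.find("condition") != pt.not_found())
            p.condition = pt.get<std::string>("condition");
        if (complete || pt.find("k") != pt.not_found())
            p.k = pt.get<int>("k");
    }

    int main() {
        toy_params params;

        boost::property_tree::ptree base;
        boost::property_tree::read_info("base.info", base);            // hypothetical file
        load(params, base, /*complete=*/true);                          // every key required

        boost::property_tree::ptree corrective;
        boost::property_tree::read_info("override.info", corrective);   // hypothetical file
        load(params, corrective, /*complete=*/false);                    // only present keys win

        std::cout << params.condition << " " << params.k << "\n";
    }
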
diff --git a/src/modules/pipeline/config_singl.hpp b/src/modules/pipeline/config_singl.hpp
deleted file mode 100644
index 9bf726e..0000000
--- a/src/modules/pipeline/config_singl.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __CONFIG_SINGL_HPP__
-#define __CONFIG_SINGL_HPP__
-
-
-#include "dev_support/verify.hpp"
-
-#include <string>
-
-namespace config_common {
-
-// config singleton-wrap
-template<class Config>
-struct config {
- static std::string dirnameOf(const std::string &fname) {
- size_t pos = fname.find_last_of("\\/");
- return (std::string::npos == pos) ? "" : fname.substr(0, pos);
- }
-
- template<class Source>
- static void create_instance(Source const &source) {
- load(inner_cfg(), source);
- is_initialized() = true;
- }
-
- static Config const &get() {
- VERIFY_MSG(is_initialized(), "Config not initialized");
- return inner_cfg();
- }
-
- static Config &get_writable() {
- VERIFY_MSG(is_initialized(), "Config not initialized");
- return inner_cfg();
- }
-
-private:
- static Config &inner_cfg() {
- static Config config;
- return config;
- }
-
- static bool &is_initialized() {
- static bool is_initialized = false;
- return is_initialized;
- }
-};
-
-}
-
-
-#endif // __CONFIG_SINGL_HPP__
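
The singleton wrapper removed above is what the cfg::get()/cfg::get_writable() calls seen further down in config_struct.cpp resolve to. A usage sketch, assuming the deleted header were still on the include path; toy_config, its load() overload (found via argument-dependent lookup from create_instance), and the paths are illustrative only:

    #include "pipeline/config_singl.hpp"

    #include <string>

    namespace example {
    struct toy_config {
        std::string output_dir;
    };

    // Picked up via ADL by config_common::config<toy_config>::create_instance().
    inline void load(toy_config& cfg, const std::string& source) {
        cfg.output_dir = source + "/out/";
    }
    }  // namespace example

    typedef config_common::config<example::toy_config> toy_cfg;

    int main() {
        toy_cfg::create_instance(std::string("/tmp/project"));   // initialize once
        std::string dir = toy_cfg::get().output_dir;             // read-only access
        toy_cfg::get_writable().output_dir = dir + "K55/";       // mutable access
    }
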
diff --git a/src/modules/pipeline/config_struct.cpp b/src/modules/pipeline/config_struct.cpp
deleted file mode 100644
index fecc73b..0000000
--- a/src/modules/pipeline/config_struct.cpp
+++ /dev/null
@@ -1,819 +0,0 @@
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "pipeline/config_struct.hpp"
-
-#include "pipeline/config_common.hpp"
-#include "dev_support/openmp_wrapper.h"
-
-#include "dev_support/logger/logger.hpp"
-#include "dev_support/verify.hpp"
-
-#include "io/reads_io/file_reader.hpp"
-
-#include <string>
-#include <vector>
-
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-
-using namespace llvm;
-
-namespace io {
-template<>
-void SequencingLibrary<debruijn_graph::config::DataSetData>::yamlize(llvm::yaml::IO &io) {
- // First, load the "common stuff"
- SequencingLibraryBase::yamlize(io);
-
- // Now load the remaining stuff
- io.mapOptional("read length" , data_.read_length);
- io.mapOptional("average read length" , data_.avg_read_length);
- io.mapOptional("insert size mean" , data_.mean_insert_size);
- io.mapOptional("insert size deviation" , data_.insert_size_deviation);
- io.mapOptional("insert size left quantile" , data_.insert_size_left_quantile);
- io.mapOptional("insert size right quantile" , data_.insert_size_right_quantile);
- io.mapOptional("insert size median" , data_.median_insert_size);
- io.mapOptional("insert size mad" , data_.insert_size_mad);
- io.mapOptional("insert size distribution" , data_.insert_size_distribution);
- io.mapOptional("average coverage" , data_.average_coverage);
- io.mapOptional("pi threshold" , data_.pi_threshold);
- io.mapOptional("binary converted" , data_.binary_reads_info.binary_coverted);
- io.mapOptional("single reads mapped" , data_.single_reads_mapped);
-}
-
-template<>
-void SequencingLibrary<debruijn_graph::config::DataSetData>::validate(llvm::yaml::IO &io, llvm::StringRef &res) {
- // Simply ask base class to validate for us
- SequencingLibraryBase::validate(io, res);
-}
-}
-
-#include "pipeline/library.inl"
-
-template class io::DataSet<debruijn_graph::config::DataSetData>;
-
-namespace debruijn_graph {
-namespace config {
-
-template<typename mode_t>
-vector<string> CheckedNames(const vector<pair<string, mode_t>>& mapping, mode_t total) {
- VERIFY_MSG(size_t(total) == mapping.size(), "Names for some modes missing")
- vector<string> answer;
- for (size_t i = 0; i < mapping.size(); ++i) {
- VERIFY_MSG(size_t(mapping[i].second) == i, "Id/name mapping error");
- answer.push_back(mapping[i].first);
- }
- return answer;
-}
-
-vector<string> InfoPrinterPosNames() {
- return CheckedNames<info_printer_pos>({
- {"default", info_printer_pos::default_pos},
- {"before_first_gap_closer", info_printer_pos::before_first_gap_closer},
- {"before_simplification", info_printer_pos::before_simplification},
- {"before_post_simplification", info_printer_pos::before_post_simplification},
- {"final_simplified", info_printer_pos::final_simplified},
- {"final_gap_closed", info_printer_pos::final_gap_closed},
- {"before_repeat_resolution", info_printer_pos::before_repeat_resolution}}, info_printer_pos::total);
-}
-
-vector<string> PipelineTypeNames() {
- return CheckedNames<pipeline_type>({
- {"base", pipeline_type::base},
- {"isolate", pipeline_type::isolate},
- {"mda", pipeline_type::mda},
- {"meta", pipeline_type::meta},
- {"moleculo", pipeline_type::moleculo},
- {"diploid", pipeline_type::diploid},
- {"rna", pipeline_type::rna},
- {"plasmid", pipeline_type::plasmid}}, pipeline_type::total);
-}
-
-vector<string> ConstructionModeNames() {
- return CheckedNames<construction_mode>({
- {"old", construction_mode::old},
- {"extension", construction_mode::extention}}, construction_mode::total);
-}
-
-vector<string> EstimationModeNames() {
- return CheckedNames<estimation_mode>({
- {"simple", estimation_mode::simple},
- {"weighted", estimation_mode::weighted},
- {"smoothing", estimation_mode::smoothing}}, estimation_mode::total);
-}
-
-
-vector<string> ResolveModeNames() {
- return CheckedNames<resolving_mode>({
- {"none", resolving_mode::none},
- {"path_extend", resolving_mode::path_extend}}, resolving_mode::total);
-}
-
-vector<string> SingleReadResolveModeNames() {
- return CheckedNames<single_read_resolving_mode>({
- {"none", single_read_resolving_mode::none},
- {"only_single_libs", single_read_resolving_mode::only_single_libs},
- {"all", single_read_resolving_mode::all}}, single_read_resolving_mode::total);
-}
-
-void load_lib_data(const std::string& prefix) {
- // First, load the data into separate libs
- cfg::get_writable().ds.reads.load(prefix + ".lib_data");
-
- // Now, infer the common parameters
- size_t max_rl = 0;
- double avg_cov = 0.0;
- double avg_rl = 0.0;
- for (const auto& lib : cfg::get().ds.reads.libraries()) {
- auto const& data = lib.data();
- if (lib.is_graph_contructable())
- max_rl = std::max(max_rl, data.read_length);
- if (data.average_coverage > 0)
- avg_cov = data.average_coverage;
- if (data.avg_read_length > 0)
- avg_rl = data.avg_read_length;
- }
-
- cfg::get_writable().ds.set_RL(max_rl);
- cfg::get_writable().ds.set_aRL(avg_rl);
- cfg::get_writable().ds.set_avg_coverage(avg_cov);
-}
-
-void write_lib_data(const std::string& prefix) {
- cfg::get_writable().ds.reads.save(prefix + ".lib_data");
-}
-
-void load(debruijn_config::simplification::tip_clipper &tc,
- boost::property_tree::ptree const &pt, bool /*complete*/) {
- using config_common::load;
- load(tc.condition, pt, "condition");
-}
-
-void load(debruijn_config::simplification::dead_end_clipper& dead_end,
- boost::property_tree::ptree const &pt,
- bool /* complete */) {
- using config_common::load;
- load(dead_end.condition, pt, "condition");
- load(dead_end.enabled, pt, "enabled");
-}
-
-void load(resolving_mode &rm, boost::property_tree::ptree const &pt,
- std::string const &key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- rm = ModeByName<resolving_mode>(pt.get<std::string>(key), ResolveModeNames());
- }
-}
-
-void load(single_read_resolving_mode &rm, boost::property_tree::ptree const &pt,
- std::string const &key, bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- std::string ep = pt.get<std::string>(key);
- rm = ModeByName<single_read_resolving_mode>(ep, SingleReadResolveModeNames());
- }
-}
-
-void load(construction_mode& con_mode,
- boost::property_tree::ptree const& pt, std::string const& key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- con_mode = ModeByName<construction_mode>(pt.get<std::string>(key), ConstructionModeNames());
- }
-}
-
-void load(debruijn_config::construction::early_tip_clipper& etc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(etc.enable, pt, "enable");
- etc.length_bound = pt.get_optional<size_t>("length_bound");
-}
-
-void load(debruijn_config::construction& con,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(con.con_mode, pt, "mode", complete);
- load(con.keep_perfect_loops, pt, "keep_perfect_loops", complete);
- load(con.read_buffer_size, pt, "read_buffer_size", complete);
- con.read_buffer_size *= 1024 * 1024;
- load(con.early_tc, pt, "early_tip_clipper", complete);
-}
-
-void load(debruijn_config::sensitive_mapper& sensitive_map,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(sensitive_map.k, pt, "k", complete);
-}
-
-void load(estimation_mode &est_mode,
- boost::property_tree::ptree const &pt, std::string const &key,
- bool complete) {
- if (complete || pt.find(key) != pt.not_found()) {
- est_mode = ModeByName<estimation_mode>(pt.get<std::string>(key), EstimationModeNames());
- }
-}
-
-void load(debruijn_config::simplification::bulge_remover& br,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(br.enabled , pt, "enabled" , complete);
- load(br.main_iteration_only , pt, "main_iteration_only" , complete);
- load(br.max_bulge_length_coefficient , pt, "max_bulge_length_coefficient", complete);
- load(br.max_additive_length_coefficient , pt,
- "max_additive_length_coefficient", complete);
- load(br.max_coverage, pt, "max_coverage", complete);
- load(br.max_relative_coverage, pt, "max_relative_coverage", complete);
- load(br.max_delta, pt, "max_delta", complete);
- load(br.max_relative_delta, pt, "max_relative_delta", complete);
- load(br.max_number_edges, pt, "max_number_edges", complete);
- load(br.parallel, pt, "parallel", complete);
- load(br.buff_size, pt, "buff_size", complete);
- load(br.buff_cov_diff, pt, "buff_cov_diff", complete);
- load(br.buff_cov_rel_diff, pt, "buff_cov_rel_diff", complete);
-}
-
-void load(debruijn_config::simplification::topology_tip_clipper& ttc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(ttc.length_coeff, pt, "length_coeff");
- load(ttc.plausibility_length, pt, "plausibility_length");
- load(ttc.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::complex_tip_clipper &ctc,
- boost::property_tree::ptree const &pt, bool complete) {
- using config_common::load;
- load(ctc.enabled, pt, "enabled", complete);
- load(ctc.max_relative_coverage, pt, "max_relative_coverage", complete);
- load(ctc.max_edge_len, pt, "max_edge_len", complete);
- load(ctc.condition, pt, "condition", complete);
-}
-
-void load(debruijn_config::simplification::relative_coverage_edge_disconnector& relative_ed,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(relative_ed.enabled, pt, "enabled", complete);
- load(relative_ed.diff_mult, pt, "diff_mult", complete);
-}
-
-void load(debruijn_config::simplification::relative_coverage_comp_remover& rcc,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(rcc.enabled, pt, "enabled", complete);
- load(rcc.coverage_gap, pt, "coverage_gap", complete);
- load(rcc.length_coeff, pt, "max_length_coeff", complete);
- load(rcc.tip_allowing_length_coeff, pt, "max_length_with_tips_coeff", complete);
- load(rcc.vertex_count_limit, pt, "max_vertex_cnt", complete);
- load(rcc.max_ec_length_coefficient, pt, "max_ec_length_coefficient", complete);
- load(rcc.max_coverage_coeff, pt, "max_coverage_coeff", complete);
-}
-
-void load(debruijn_config::simplification::isolated_edges_remover& ier,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(ier.enabled, pt, "enabled", complete);
- load(ier.max_length, pt, "max_length", complete);
- load(ier.max_coverage, pt, "max_coverage", complete);
- load(ier.max_length_any_cov, pt, "max_length_any_cov", complete);
-}
-
-void load(debruijn_config::simplification::init_cleaning& init_clean,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(init_clean.self_conj_condition, pt, "self_conj_condition", complete);
- load(init_clean.early_it_only, pt, "early_it_only", complete);
- load(init_clean.activation_cov, pt, "activation_cov", complete);
- load(init_clean.ier, pt, "ier", complete);
- load(init_clean.tip_condition, pt, "tip_condition", complete);
- load(init_clean.ec_condition, pt, "ec_condition", complete);
- load(init_clean.disconnect_flank_cov, pt, "disconnect_flank_cov", complete);
-}
-
-void load(debruijn_config::simplification::complex_bulge_remover& cbr,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(cbr.enabled, pt, "enabled");
- load(cbr.max_relative_length, pt, "max_relative_length", complete);
- load(cbr.max_length_difference, pt, "max_length_difference", complete);
-}
-
-void load(debruijn_config::simplification::erroneous_connections_remover& ec,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(ec.condition, pt, "condition");
-}
-
-void load(debruijn_config::simplification::relative_coverage_ec_remover& rcec,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(rcec.enabled, pt, "enabled");
- load(rcec.max_ec_length, pt, "rcec_lb");
- load(rcec.rcec_ratio, pt, "rcec_cb");
-}
-
-void load(debruijn_config::simplification::topology_based_ec_remover& tec,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(tec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(tec.plausibility_length, pt, "plausibility_length");
- load(tec.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::interstrand_ec_remover &isec,
- boost::property_tree::ptree const &pt, bool /*complete*/) {
- using config_common::load;
- load(isec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(isec.uniqueness_length, pt, "uniqueness_length");
- load(isec.span_distance, pt, "span_distance");
-}
-
-void load(debruijn_config::simplification::tr_based_ec_remover &trec,
- boost::property_tree::ptree const &pt, bool /*complete*/) {
- using config_common::load;
- load(trec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(trec.unreliable_coverage, pt, "unreliable_coverage");
- load(trec.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::max_flow_ec_remover& mfec,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(mfec.enabled, pt, "enabled");
- load(mfec.max_ec_length_coefficient, pt, "max_ec_length_coefficient");
- load(mfec.plausibility_length, pt, "plausibility_length");
- load(mfec.uniqueness_length, pt, "uniqueness_length");
-}
-
-void load(debruijn_config::simplification::hidden_ec_remover& her,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(her.enabled, pt, "enabled");
- load(her.uniqueness_length, pt, "uniqueness_length");
- load(her.unreliability_threshold, pt, "unreliability_threshold");
- load(her.relative_threshold, pt, "relative_threshold");
-}
-
-void load(debruijn_config::distance_estimator& de,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(de.linkage_distance_coeff, pt, "linkage_distance_coeff");
- load(de.max_distance_coeff, pt, "max_distance_coeff");
- load(de.max_distance_coeff_scaff, pt, "max_distance_coeff_scaff");
- load(de.filter_threshold, pt, "filter_threshold");
-}
-
-void load(debruijn_config::smoothing_distance_estimator& ade,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
-
- load(ade.threshold, pt, "threshold");
- load(ade.range_coeff, pt, "range_coeff");
- load(ade.delta_coeff, pt, "delta_coeff");
- load(ade.percentage, pt, "percentage");
- load(ade.cutoff, pt, "cutoff");
- load(ade.min_peak_points, pt, "min_peak_points");
- load(ade.inv_density, pt, "inv_density");
- load(ade.derivative_threshold, pt, "derivative_threshold");
-}
-
-//FIXME make amb_de optional field
-inline void load(debruijn_config::ambiguous_distance_estimator &amde,
- boost::property_tree::ptree const &pt, bool complete) {
- using config_common::load;
-
- load(amde.enabled, pt, "enabled", complete);
- load(amde.haplom_threshold, pt, "haplom_threshold", complete);
- load(amde.relative_length_threshold, pt, "relative_length_threshold", complete);
- load(amde.relative_seq_threshold, pt, "relative_seq_threshold", complete);
-}
-
-void load(debruijn_config::scaffold_correction& sc_corr,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(sc_corr.scaffolds_file, pt, "scaffolds_file");
- load(sc_corr.output_unfilled, pt, "output_unfilled");
- load(sc_corr.max_insert, pt, "max_insert");
- load(sc_corr.max_cut_length, pt, "max_cut_length");
-}
-
-void load(debruijn_config::truseq_analysis& tsa,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(tsa.scaffolds_file, pt, "scaffolds_file");
- load(tsa.genome_file, pt, "genome_file");
-}
-
-void load(debruijn_config::bwa_aligner& bwa,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(bwa.bwa_enable, pt, "bwa_enable");
- load(bwa.debug, pt, "debug");
- load(bwa.path_to_bwa, pt, "path_to_bwa");
- load(bwa.min_contig_len, pt, "min_contig_len");
-}
-
-void load(debruijn_config::pacbio_processor& pb,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(pb.pacbio_k, pt, "pacbio_k");
- load(pb.additional_debug_info, pt, "additional_debug_info");
- load(pb.compression_cutoff, pt, "compression_cutoff");
- load(pb.domination_cutoff, pt, "domination_cutoff");
- load(pb.path_limit_stretching, pt, "path_limit_stretching");
- load(pb.path_limit_pressing, pt, "path_limit_pressing");
- load(pb.ignore_middle_alignment, pt, "ignore_middle_alignment");
- load(pb.long_seq_limit, pt, "long_seq_limit");
- load(pb.pacbio_min_gap_quantity, pt, "pacbio_min_gap_quantity");
- load(pb.contigs_min_gap_quantity, pt, "contigs_min_gap_quantity");
- load(pb.max_contigs_gap_length, pt, "max_contigs_gap_length");
-
-}
-
-
-void load(debruijn_config::position_handler& pos,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(pos.max_mapping_gap, pt, "max_mapping_gap");
- load(pos.max_gap_diff, pt, "max_gap_diff");
- load(pos.contigs_for_threading, pt, "contigs_for_threading");
- load(pos.contigs_to_analyze, pt, "contigs_to_analyze");
- load(pos.late_threading, pt, "late_threading");
- load(pos.careful_labeling, pt, "careful_labeling");
-}
-void load(debruijn_config::plasmid& pd,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(pd.long_edge_length, pt, "long_edge_length");
- load(pd.edge_length_for_median, pt, "edge_length_for_median");
-
- load(pd.relative_coverage, pt, "relative_coverage");
- load(pd.small_component_size, pt, "small_component_size");
- load(pd.small_component_relative_coverage, pt, "small_component_relative_coverage");
- load(pd.min_component_length, pt, "min_component_length");
- load(pd.min_isolated_length, pt, "min_isolated_length");
-
-}
-
-
-void load(debruijn_config::gap_closer& gc,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(gc.minimal_intersection, pt, "minimal_intersection");
- load(gc.before_simplify, pt, "before_simplify");
- load(gc.in_simplify, pt, "in_simplify");
- load(gc.after_simplify, pt, "after_simplify");
- load(gc.weight_threshold, pt, "weight_threshold");
-}
-
-void load(debruijn_config::graph_read_corr_cfg& graph_read_corr,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(graph_read_corr.enable, pt, "enable");
- load(graph_read_corr.output_dir, pt, "output_dir");
- load(graph_read_corr.binary, pt, "binary");
-}
-
-void load(debruijn_config::kmer_coverage_model& kcm,
- boost::property_tree::ptree const& pt, bool /*complete*/) {
- using config_common::load;
- load(kcm.probability_threshold, pt, "probability_threshold");
- load(kcm.strong_probability_threshold, pt, "strong_probability_threshold");
- load(kcm.coverage_threshold, pt, "coverage_threshold");
- load(kcm.use_coverage_threshold, pt, "use_coverage_threshold");
-}
-
-void load(dataset &ds,
- boost::property_tree::ptree const &pt, bool /*complete*/) {
- using config_common::load;
-
- load(ds.reads_filename, pt, "reads");
-
- ds.reference_genome_filename = "";
- boost::optional<std::string> refgen =
- pt.get_optional<std::string>("reference_genome");
- if (refgen && *refgen != "N/A") {
- ds.reference_genome_filename = *refgen;
- }
-}
-
-void load_reads(dataset &ds,
- std::string input_dir) {
- if (ds.reads_filename[0] != '/')
- ds.reads_filename = input_dir + ds.reads_filename;
- path::CheckFileExistenceFATAL(ds.reads_filename);
- ds.reads.load(ds.reads_filename);
-}
-
-void load_reference_genome(dataset &ds,
- std::string input_dir) {
- if (ds.reference_genome_filename == "") {
- ds.reference_genome = "";
- return;
- }
- if (ds.reference_genome_filename[0] != '/')
- ds.reference_genome_filename = input_dir + ds.reference_genome_filename;
- path::CheckFileExistenceFATAL(ds.reference_genome_filename);
- io::FileReadStream genome_stream(ds.reference_genome_filename);
- io::SingleRead genome;
- genome_stream >> genome;
- ds.reference_genome = genome.GetSequenceString();
-}
-
-void load(debruijn_config::simplification& simp,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
-
- load(simp.cycle_iter_count, pt, "cycle_iter_count", complete);
- load(simp.post_simplif_enabled, pt, "post_simplif_enabled", complete);
- load(simp.topology_simplif_enabled, pt, "topology_simplif_enabled", complete);
- load(simp.tc, pt, "tc", complete); // tip clipper:
-
- load(simp.dead_end, pt, "dead_end", complete); // dead end:
- load(simp.ttc, pt, "ttc", complete); // topology tip clipper:
- load(simp.complex_tc, pt, "complex_tc", complete); // complex tip clipper:
- load(simp.br, pt, "br", complete); // bulge remover:
- load(simp.ec, pt, "ec", complete); // erroneous connections remover:
- load(simp.rcec, pt, "rcec", complete); // relative coverage erroneous connections remover
- load(simp.rcc, pt, "rcc", complete); // relative coverage component remover:
- load(simp.relative_ed, pt, "relative_ed", complete); // relative edge disconnector:
- load(simp.tec, pt, "tec", complete); // topology aware erroneous connections remover:
- load(simp.trec, pt, "trec", complete); // topology and reliability based erroneous connections remover:
- load(simp.isec, pt, "isec", complete); // interstrand erroneous connections remover (thorn remover):
- load(simp.mfec, pt, "mfec", complete); // max flow erroneous connections remover:
- load(simp.ier, pt, "ier", complete); // isolated edges remover
- load(simp.cbr, pt, "cbr", complete); // complex bulge remover
- load(simp.her, pt, "her", complete); // hidden ec remover
- load(simp.init_clean, pt, "init_clean", complete); // presimplification
- load(simp.final_tc, pt, "final_tc", complete);
- load(simp.final_br, pt, "final_br", complete);
- simp.second_final_br = simp.final_br;
- load(simp.second_final_br, pt, "second_final_br", false);
-}
-
-void load(debruijn_config::info_printer& printer,
- boost::property_tree::ptree const& pt, bool complete) {
- using config_common::load;
- load(printer.basic_stats, pt, "basic_stats", complete);
- load(printer.lib_info, pt, "lib_info", complete);
- load(printer.extended_stats, pt, "extended_stats", complete);
- load(printer.write_components, pt, "write_components", complete);
- load(printer.components_for_kmer, pt, "components_for_kmer", complete);
- load(printer.components_for_genome_pos, pt, "components_for_genome_pos",
- complete);
- load(printer.write_components_along_genome, pt,
- "write_components_along_genome", complete);
- load(printer.write_components_along_contigs, pt,
- "write_components_along_contigs", complete);
- load(printer.save_full_graph, pt, "save_full_graph", complete);
- load(printer.save_all, pt, "save_all", complete);
- load(printer.save_graph_pack, pt, "save_graph_pack", complete);
- load(printer.write_full_graph, pt, "write_full_graph", complete);
- load(printer.write_full_nc_graph, pt, "write_full_nc_graph", complete);
- load(printer.write_error_loc, pt, "write_error_loc", complete);
-}
-
-//void clear(debruijn_config::info_printer& printer) {
-// printer.print_stats = false;
-// printer.write_components = false;
-// printer.components_for_kmer = "";
-// printer.components_for_genome_pos = "";
-// printer.write_components_along_genome = false;
-// printer.save_full_graph = false;
-// printer.write_full_graph = false;
-// printer.write_full_nc_graph = false;
-// printer.write_error_loc = false;
-//}
-
-void load(debruijn_config::info_printers_t &printers,
- boost::property_tree::ptree const &pt, bool /*complete*/) {
- using config_common::load;
-
- debruijn_config::info_printer def;
- load(def, pt, ModeName(info_printer_pos::default_pos, InfoPrinterPosNames()), true);
-
- for (size_t pos = size_t(info_printer_pos::default_pos) + 1; pos != size_t(info_printer_pos::total); ++pos) {
- debruijn_config::info_printer printer(def);
- load(printer, pt, ModeName(pos, InfoPrinterPosNames()), false);
-
- printers[info_printer_pos(pos)] = printer;
- }
-}
-
-void load_launch_info(debruijn_config &cfg, boost::property_tree::ptree const &pt) {
- using config_common::load;
- load(cfg.K, pt, "K");
- // input options:
- load(cfg.dataset_file, pt, "dataset");
- // input dir is based on dataset file location (all paths in datasets are relative to its location)
- cfg.input_dir = path::parent_path(cfg.dataset_file);
- if (cfg.input_dir[cfg.input_dir.length() - 1] != '/')
- cfg.input_dir += '/';
-
- load(cfg.output_base, pt, "output_base");
- if (cfg.output_base[cfg.output_base.length() - 1] != '/')
- cfg.output_base += '/';
-
- load(cfg.log_filename, pt, "log_filename");
-
- load(cfg.developer_mode, pt, "developer_mode");
- if (cfg.developer_mode) {
- load(cfg.output_pictures, pt, "output_pictures");
- load(cfg.output_nonfinal_contigs, pt, "output_nonfinal_contigs");
- load(cfg.compute_paths_number, pt, "compute_paths_number");
- } else {
- cfg.output_pictures = false;
- cfg.output_nonfinal_contigs = false;
- cfg.compute_paths_number = false;
- }
-
- load(cfg.load_from, pt, "load_from");
- if (cfg.load_from[0] != '/') { // relative path
- cfg.load_from = cfg.output_dir + cfg.load_from;
- }
-
- load(cfg.tmp_dir, pt, "tmp_dir");
- load(cfg.main_iteration, pt, "main_iteration");
-
- load(cfg.entry_point, pt, "entry_point");
-
- load(cfg.use_additional_contigs, pt, "use_additional_contigs");
- load(cfg.additional_contigs, pt, "additional_contigs");
- INFO("Additional contigs is " << cfg.additional_contigs);
-
- load(cfg.rr_enable, pt, "rr_enable");
-
- load(cfg.buffer_size, pt, "buffer_size");
- cfg.buffer_size <<= 20; //turn MB to bytes
-
- load(cfg.temp_bin_reads_dir, pt, "temp_bin_reads_dir");
- if (cfg.temp_bin_reads_dir[cfg.temp_bin_reads_dir.length() - 1] != '/')
- cfg.temp_bin_reads_dir += '/';
-
- load(cfg.max_threads, pt, "max_threads");
- // Fix number of threads according to OMP capabilities.
- cfg.max_threads = std::min(cfg.max_threads, (size_t) omp_get_max_threads());
- // Inform OpenMP runtime about this :)
- omp_set_num_threads((int) cfg.max_threads);
-
- load(cfg.max_memory, pt, "max_memory");
-
- path::CheckFileExistenceFATAL(cfg.dataset_file);
- boost::property_tree::ptree ds_pt;
- boost::property_tree::read_info(cfg.dataset_file, ds_pt);
- load(cfg.ds, ds_pt, true);
- load_reads(cfg.ds, cfg.input_dir);
- load_reference_genome(cfg.ds, cfg.input_dir);
-}
-
-// main debruijn config load function
-void load_cfg(debruijn_config &cfg, boost::property_tree::ptree const &pt,
- bool complete) {
- using config_common::load;
-
- string mode_str = pt.get("mode", "");
- if (!mode_str.empty()) {
- cfg.mode = ModeByName<pipeline_type>(mode_str, PipelineTypeNames());
- }
-
- //FIXME
- load(cfg.tsa, pt, "tsa", complete);
-
- load(cfg.use_unipaths, pt, "use_unipaths", complete);
-
- load(cfg.pb, pt, "pacbio_processor", complete);
-
- load(cfg.two_step_rr, pt, "two_step_rr", complete);
- load(cfg.use_intermediate_contigs, pt, "use_intermediate_contigs", complete);
- load(cfg.single_reads_rr, pt, "single_reads_rr", complete);
-
- load(cfg.preserve_raw_paired_index, pt, "preserve_raw_paired_index", complete);
-
- load(cfg.correct_mismatches, pt, "correct_mismatches", complete);
- load(cfg.paired_info_statistics, pt, "paired_info_statistics", complete);
- load(cfg.paired_info_scaffolder, pt, "paired_info_scaffolder", complete);
- load(cfg.gap_closer_enable, pt, "gap_closer_enable", complete);
-
- load(cfg.max_repeat_length, pt, "max_repeat_length", complete);
-
- load(cfg.est_mode, pt, "estimation_mode", complete);
- load(cfg.de, pt, "de", complete);
- load(cfg.ade, pt, "ade", complete); // advanced distance estimator:
- load(cfg.amb_de, pt, "amb_de", complete);
-
- load(cfg.con, pt, "construction", complete);
- load(cfg.gc, pt, "gap_closer", complete);
- load(cfg.simp, pt, "simp", complete);
- load(cfg.flanking_range, pt, "flanking_range", complete);
- load(cfg.graph_read_corr, pt, "graph_read_corr", complete);
- load(cfg.kcm, pt, "kmer_coverage_model", complete);
- load(cfg.pos, pt, "pos", complete); // position handler:
-
- load(cfg.rm, pt, "resolving_mode", complete);
- load(cfg.pe_params, pt, "pe", complete);
-
- load(cfg.use_scaffolder, pt, "use_scaffolder", complete);
- load(cfg.avoid_rc_connections, pt, "avoid_rc_connections", complete);
-
- load(cfg.sensitive_map, pt, "sensitive_mapper", complete);
-
- load(cfg.info_printers, pt, "info_printers", complete);
-
- load(cfg.bwa, pt, "bwa_aligner", complete);
-
- if (pt.count("plasmid")) {
- VERIFY_MSG(!cfg.pd, "Option can be loaded only once");
- cfg.pd.reset(debruijn_config::plasmid());
- load(*cfg.pd, pt, "plasmid");
- }
-
- if (pt.count("sc_cor")) {
- VERIFY_MSG(!cfg.sc_cor, "Option sc_cor can be loaded only once");
- cfg.sc_cor.reset(debruijn_config::scaffold_correction());
- load(*cfg.sc_cor, pt, "sc_cor");
- }
-
- if (pt.count("preliminary_simp")) {
- VERIFY_MSG(!cfg.preliminary_simp, "Option preliminary can be loaded only once");
- cfg.preliminary_simp.reset(cfg.simp);
- load(*cfg.preliminary_simp, pt, "preliminary_simp", false);
- }
- if (pt.count("prelim_pe")) {
- VERIFY_MSG(!cfg.prelim_pe_params, "Option prelim_pe can be loaded only once");
- cfg.prelim_pe_params.reset(cfg.pe_params);
- load(*cfg.prelim_pe_params, pt, "prelim_pe", false);
- }
-}
-
-void load(debruijn_config &cfg, const std::string &cfg_fns) {
- load(cfg, std::vector<std::string>({ cfg_fns }));
-}
-
-void load(debruijn_config &cfg, const std::vector<std::string> &cfg_fns) {
- VERIFY_MSG(cfg_fns.size() > 0, "Should provide at least one config file");
- boost::property_tree::ptree base_pt;
- boost::property_tree::read_info(cfg_fns[0], base_pt);
-
- load_launch_info(cfg, base_pt);
- load_cfg(cfg, base_pt, true);
-
- for (size_t i = 1 ; i < cfg_fns.size(); ++i) {
- boost::property_tree::ptree pt;
- boost::property_tree::read_info(cfg_fns[i], pt);
-
- //FIXME add logging of loading configs
- load_cfg(cfg, pt, false);
- }
-
- //some post-loading processing
- using config::pipeline_type;
- cfg.uneven_depth = set<pipeline_type>{pipeline_type::mda, pipeline_type::rna, pipeline_type::meta}.count(cfg.mode);
- if (!cfg.developer_mode) {
- cfg.pe_params.debug_output = false;
- cfg.pe_params.viz.DisableAll();
- cfg.pe_params.output.DisableAll();
- }
-
- if (!cfg.use_scaffolder) {
- cfg.pe_params.param_set.scaffolder_options.enabled = false;
- }
- cfg.need_mapping = cfg.developer_mode || cfg.correct_mismatches
- || cfg.gap_closer_enable || cfg.rr_enable;
-
- cfg.output_dir = cfg.output_base + "/K" + std::to_string(cfg.K) + "/";
-
- cfg.output_saves = cfg.output_dir + "saves/";
-
- if (cfg.tmp_dir[0] != '/') { // relative path
- cfg.tmp_dir = cfg.output_dir + cfg.tmp_dir;
- }
-
- cfg.temp_bin_reads_path =
- cfg.project_name.empty() ?
- (cfg.output_base + "/" + cfg.temp_bin_reads_dir) :
- (cfg.output_base + cfg.project_name + "/"
- + cfg.temp_bin_reads_dir);
- //cfg.temp_bin_reads_info = cfg.temp_bin_reads_path + "INFO";
-
- for (size_t i = 0; i < cfg.ds.reads.lib_count(); ++i) {
- auto& lib = cfg.ds.reads[i];
- lib.data().lib_index = i;
- lib.data().binary_reads_info.chunk_num = cfg.max_threads;
- lib.data().binary_reads_info.bin_reads_info_file = cfg.temp_bin_reads_path + "INFO_" + std::to_string(i);
- lib.data().binary_reads_info.buffer_size = cfg.buffer_size;
- lib.data().binary_reads_info.paired_read_prefix = cfg.temp_bin_reads_path + "paired_" + std::to_string(i);
- lib.data().binary_reads_info.single_read_prefix = cfg.temp_bin_reads_path + "single_" + std::to_string(i);
- }
-}
-
-}
-}
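
CheckedNames()/ModeByName() above implement a simple enum-to-string registry: each mode enum ends in a `total` sentinel, the name table must contain exactly that many entries in id order, and parsing a name is an index lookup into that table. A standalone sketch of the pattern with a toy enum (assert stands in for VERIFY_MSG; names are illustrative):

    #include <algorithm>
    #include <cassert>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    enum class toy_mode : char { simple = 0, weighted, smoothing, total };

    static std::vector<std::string> ToyModeNames() {
        std::vector<std::pair<std::string, toy_mode>> mapping = {
            {"simple", toy_mode::simple},
            {"weighted", toy_mode::weighted},
            {"smoothing", toy_mode::smoothing}};
        assert(mapping.size() == size_t(toy_mode::total) && "Names for some modes missing");
        std::vector<std::string> names;
        for (size_t i = 0; i < mapping.size(); ++i) {
            assert(size_t(mapping[i].second) == i && "Id/name mapping error");
            names.push_back(mapping[i].first);
        }
        return names;
    }

    static toy_mode ToyModeByName(const std::string& name) {
        auto names = ToyModeNames();
        auto it = std::find(names.begin(), names.end(), name);
        assert(it != names.end() && "Unrecognized mode name");
        return toy_mode(it - names.begin());
    }

    int main() {
        std::cout << int(ToyModeByName("weighted")) << "\n";  // prints 1
    }
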
diff --git a/src/modules/pipeline/config_struct.hpp b/src/modules/pipeline/config_struct.hpp
deleted file mode 100644
index 70e4e3b..0000000
--- a/src/modules/pipeline/config_struct.hpp
+++ /dev/null
@@ -1,583 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-#pragma once
-
-#include "pipeline/config_singl.hpp"
-#include "algorithms/path_extend/pe_config_struct.hpp"
-#include "pipeline/library.hpp"
-
-#include <boost/optional.hpp>
-#include "math/xmath.h"
-
-namespace debruijn_graph {
-namespace config {
-
-enum class info_printer_pos : char {
- default_pos = 0,
- before_first_gap_closer,
- before_simplification,
- before_post_simplification,
- final_simplified,
- final_gap_closed,
- before_repeat_resolution,
-
- total
-};
-
-std::vector<std::string> InfoPrinterPosNames();
-
-enum class pipeline_type : char {
- base = 0,
- isolate,
- mda,
- meta,
- moleculo,
- diploid,
- rna,
- plasmid,
-
- total
-};
-
-std::vector<std::string> PipelineTypeNames();
-
-enum class construction_mode : char {
- old = 0,
- extention,
-
- total
-};
-
-std::vector<std::string> ConstructionModeNames();
-
-enum class estimation_mode : char {
- simple = 0,
- weighted,
- smoothing,
-
- total
-};
-
-std::vector<std::string> EstimationModeNames();
-
-enum class resolving_mode : char {
- none = 0,
- path_extend,
-
- total
-};
-
-std::vector<std::string> ResolveModeNames();
-
-enum class single_read_resolving_mode : char {
- none = 0,
- only_single_libs,
- all,
-
- total
-};
-
-std::vector<std::string> SingleReadResolveModeNames();
-
-template<typename mode_t>
-mode_t ModeByName(const std::string& name, const std::vector<std::string>& names) {
- auto it = std::find(names.begin(), names.end(), name);
- VERIFY_MSG(it != names.end(), "Unrecognized mode name");
- return mode_t(it - names.begin());
-}
-
-template<typename mode_t>
-std::string ModeName(const mode_t& mode, const std::vector<std::string>& names) {
- VERIFY_MSG(size_t(mode) < names.size(), "Unrecognized mode id");
- return names[size_t(mode)];
-}
-
-struct DataSetData {
- size_t read_length;
- double avg_read_length;
- double mean_insert_size;
- double insert_size_deviation;
- double insert_size_left_quantile;
- double insert_size_right_quantile;
- double median_insert_size;
- double insert_size_mad;
- std::map<int, size_t> insert_size_distribution;
-
- size_t lib_index;
- bool single_reads_mapped;
- uint64_t total_nucls;
- size_t read_count;
-
- double average_coverage;
- double pi_threshold;
-
- struct BinaryReadsInfo {
- BinaryReadsInfo(): binary_coverted(false), chunk_num(0), buffer_size(0) {}
-
- bool binary_coverted;
- std::string bin_reads_info_file;
- std::string paired_read_prefix;
- std::string single_read_prefix;
- size_t chunk_num;
- size_t buffer_size;
- } binary_reads_info;
-
-
- DataSetData(): read_length(0), avg_read_length(0.0),
- mean_insert_size(0.0),
- insert_size_deviation(0.0),
- insert_size_left_quantile(0.0),
- insert_size_right_quantile(0.0),
- median_insert_size(0.0),
- insert_size_mad(0.0),
- lib_index(0),
- single_reads_mapped(false),
- total_nucls(0),
- read_count(0),
- average_coverage(0.0),
- pi_threshold(0.0),
- binary_reads_info() {}
-};
-
-struct dataset {
- typedef io::DataSet<DataSetData>::Library Library;
-
- io::DataSet<DataSetData> reads;
-
- size_t max_read_length;
- double average_coverage;
- double average_read_length;
-
- size_t RL() const { return max_read_length; }
- void set_RL(size_t RL) {
- max_read_length = RL;
- }
-
- double aRL() const { return average_read_length; }
- void set_aRL(double aRL) {
- average_read_length = aRL;
- for (size_t i = 0; i < reads.lib_count(); ++i) {
- reads[i].data().avg_read_length = aRL;
- }
- }
-
- double avg_coverage() const { return average_coverage; }
- void set_avg_coverage(double avg_coverage) {
- average_coverage = avg_coverage;
- for (size_t i = 0; i < reads.lib_count(); ++i) {
- reads[i].data().average_coverage = avg_coverage;
- }
- }
-
- std::string reference_genome_filename;
- std::string reads_filename;
-
- std::string reference_genome;
-
- dataset(): max_read_length(0), average_coverage(0.0) {
- }
-};
-
-// struct for debruijn project's configuration file
-struct debruijn_config {
-
- pipeline_type mode;
- bool uneven_depth;
-
- bool developer_mode;
-
- bool preserve_raw_paired_index;
-
- struct simplification {
- struct tip_clipper {
- std::string condition;
- tip_clipper() {}
- tip_clipper(std::string condition_) : condition(condition_) {}
- };
-
- struct dead_end_clipper {
- std::string condition;
- bool enabled;
- };
-
- struct topology_tip_clipper {
- double length_coeff;
- size_t uniqueness_length;
- size_t plausibility_length;
- };
-
- struct complex_tip_clipper {
- bool enabled;
- double max_relative_coverage;
- size_t max_edge_len;
- std::string condition;
- };
-
- struct bulge_remover {
- bool enabled;
- bool main_iteration_only;
- double max_bulge_length_coefficient;
- size_t max_additive_length_coefficient;
- double max_coverage;
- double max_relative_coverage;
- size_t max_delta;
- double max_relative_delta;
- size_t max_number_edges;
- bool parallel;
- size_t buff_size;
- double buff_cov_diff;
- double buff_cov_rel_diff;
- };
-
- struct erroneous_connections_remover {
- std::string condition;
- erroneous_connections_remover() {}
- erroneous_connections_remover(std::string condition_) : condition(condition_) {}
- };
-
- struct relative_coverage_ec_remover {
- bool enabled;
- size_t max_ec_length;
- double rcec_ratio;
- };
-
- struct topology_based_ec_remover {
- size_t max_ec_length_coefficient;
- size_t uniqueness_length;
- size_t plausibility_length;
- };
-
- struct tr_based_ec_remover {
- size_t max_ec_length_coefficient;
- size_t uniqueness_length;
- double unreliable_coverage;
- };
-
- struct interstrand_ec_remover {
- size_t max_ec_length_coefficient;
- size_t uniqueness_length;
- size_t span_distance;
- };
-
- struct max_flow_ec_remover {
- bool enabled;
- double max_ec_length_coefficient;
- size_t uniqueness_length;
- size_t plausibility_length;
- };
-
- struct isolated_edges_remover {
- bool enabled;
- size_t max_length;
- double max_coverage;
- size_t max_length_any_cov;
- };
-
- struct complex_bulge_remover {
- bool enabled;
- double max_relative_length;
- size_t max_length_difference;
- };
-
- struct hidden_ec_remover {
- bool enabled;
- size_t uniqueness_length;
- double unreliability_threshold;
- double relative_threshold;
- };
-
- struct relative_coverage_edge_disconnector {
- bool enabled;
- double diff_mult;
- };
-
- struct relative_coverage_comp_remover {
- bool enabled;
- double coverage_gap;
- double length_coeff;
- double tip_allowing_length_coeff;
- size_t max_ec_length_coefficient;
- double max_coverage_coeff;
- size_t vertex_count_limit;
- };
-
- struct init_cleaning {
- std::string self_conj_condition;
-
- bool early_it_only;
- double activation_cov;
- isolated_edges_remover ier;
- std::string tip_condition;
- std::string ec_condition;
- double disconnect_flank_cov;
- };
-
- size_t cycle_iter_count;
- bool post_simplif_enabled;
- bool topology_simplif_enabled;
- tip_clipper tc;
- dead_end_clipper dead_end;
- complex_tip_clipper complex_tc;
- topology_tip_clipper ttc;
- bulge_remover br;
- erroneous_connections_remover ec;
- relative_coverage_ec_remover rcec;
- relative_coverage_comp_remover rcc;
- relative_coverage_edge_disconnector relative_ed;
- topology_based_ec_remover tec;
- tr_based_ec_remover trec;
- interstrand_ec_remover isec;
- max_flow_ec_remover mfec;
- isolated_edges_remover ier;
- complex_bulge_remover cbr;
- hidden_ec_remover her;
-
- tip_clipper final_tc;
- bulge_remover final_br;
- bulge_remover second_final_br;
-
- init_cleaning init_clean;
- };
-
- struct construction {
- struct early_tip_clipper {
- bool enable;
- boost::optional<size_t> length_bound;
- early_tip_clipper() : enable(false) {}
- };
-
- construction_mode con_mode;
- early_tip_clipper early_tc;
- bool keep_perfect_loops;
- size_t read_buffer_size;
- construction() :
- con_mode(construction_mode::extention),
- keep_perfect_loops(true),
- read_buffer_size(0) {}
- };
-
- simplification simp;
- boost::optional<simplification> preliminary_simp;
-
- struct sensitive_mapper {
- size_t k;
- };
-
- struct distance_estimator {
- double linkage_distance_coeff;
- double max_distance_coeff;
- double max_distance_coeff_scaff;
- double filter_threshold;
- };
-
- struct smoothing_distance_estimator {
- size_t threshold;
- double range_coeff;
- double delta_coeff;
- double percentage;
- size_t cutoff;
- size_t min_peak_points;
- double inv_density;
- double derivative_threshold;
- };
-
- struct ambiguous_distance_estimator {
- bool enabled;
- double haplom_threshold;
- double relative_length_threshold;
- double relative_seq_threshold;
- };
-
- struct plasmid {
- size_t long_edge_length;
- size_t edge_length_for_median;
- double relative_coverage;
- size_t small_component_size;
- double small_component_relative_coverage;
- size_t min_component_length;
- size_t min_isolated_length;
- };
-
- struct pacbio_processor {
- //align and traverse.
- size_t pacbio_k; //13
- bool additional_debug_info; //false
- double compression_cutoff;// 0.6
- double domination_cutoff; //1.5
- double path_limit_stretching; //1.3
- double path_limit_pressing;//0.7
- bool ignore_middle_alignment; //true; false for stats and mate_pairs;
- //gap_closer
- size_t long_seq_limit; //400
- size_t pacbio_min_gap_quantity; //2
- size_t contigs_min_gap_quantity; //1
- size_t max_contigs_gap_length; // 10000
- };
-
- struct position_handler {
- size_t max_mapping_gap;
- size_t max_gap_diff;
- std::string contigs_for_threading;
- std::string contigs_to_analyze;
- bool late_threading;
- bool careful_labeling;
- };
-
- struct gap_closer {
- int minimal_intersection;
- bool before_simplify;
- bool in_simplify;
- bool after_simplify;
- double weight_threshold;
- };
-
- struct info_printer {
- bool basic_stats;
- bool lib_info;
- bool extended_stats;
- bool write_components;
- std::string components_for_kmer;
- std::string components_for_genome_pos;
- bool write_components_along_genome;
- bool write_components_along_contigs;
- bool save_full_graph;
- bool save_all;
- bool save_graph_pack;
- bool write_error_loc;
- bool write_full_graph;
- bool write_full_nc_graph;
- };
-
- struct graph_read_corr_cfg {
- bool enable;
- std::string output_dir;
- bool binary;
- };
-
- struct kmer_coverage_model {
- double probability_threshold;
- double strong_probability_threshold;
- double coverage_threshold;
- bool use_coverage_threshold;
- };
-
- struct bwa_aligner {
- bool bwa_enable;
- bool debug;
- std::string path_to_bwa;
- size_t min_contig_len;
- };
-
- typedef std::map<info_printer_pos, info_printer> info_printers_t;
-
- std::string dataset_file;
- std::string project_name;
- std::string input_dir;
- std::string output_base;
- std::string output_dir;
- std::string tmp_dir;
- std::string output_suffix;
- std::string output_saves;
- std::string final_contigs_file;
- std::string log_filename;
-
- bool output_pictures;
- bool output_nonfinal_contigs;
- bool compute_paths_number;
-
- bool use_additional_contigs;
- bool use_unipaths;
- std::string additional_contigs;
-
- struct scaffold_correction {
- std::string scaffolds_file;
- bool output_unfilled;
- size_t max_insert;
- size_t max_cut_length;
- };
-
- struct truseq_analysis {
- std::string scaffolds_file;
- std::string genome_file;
- };
-
- boost::optional<scaffold_correction> sc_cor;
- truseq_analysis tsa;
- std::string load_from;
-
- std::string entry_point;
-
- bool rr_enable;
- bool two_step_rr;
- bool use_intermediate_contigs;
-
- single_read_resolving_mode single_reads_rr;
- bool use_single_reads;
-
- bool correct_mismatches;
- bool paired_info_statistics;
- bool paired_info_scaffolder;
- bool gap_closer_enable;
-
- size_t max_repeat_length;
-
- //Conversion options
- size_t buffer_size;
- std::string temp_bin_reads_dir;
- std::string temp_bin_reads_path;
- std::string temp_bin_reads_info;
- std::string paired_read_prefix;
- std::string single_read_prefix;
-
- size_t K;
-
- bool main_iteration;
-
- size_t max_threads;
- size_t max_memory;
-
- estimation_mode est_mode;
- resolving_mode rm;
- path_extend::pe_config::MainPEParamsT pe_params;
- boost::optional<path_extend::pe_config::MainPEParamsT> prelim_pe_params;
- bool avoid_rc_connections;
-
- construction con;
- sensitive_mapper sensitive_map;
- distance_estimator de;
- smoothing_distance_estimator ade;
- ambiguous_distance_estimator amb_de;
- pacbio_processor pb;
- bool use_scaffolder;
- dataset ds;
- position_handler pos;
- gap_closer gc;
- graph_read_corr_cfg graph_read_corr;
- info_printers_t info_printers;
- kmer_coverage_model kcm;
- bwa_aligner bwa;
- boost::optional<plasmid> pd;
- size_t flanking_range;
-
- bool need_mapping;
-
- debruijn_config() :
- use_single_reads(false) {
-
- }
-};
-
-void load(debruijn_config& cfg, const std::vector<std::string> &filenames);
-void load(debruijn_config& cfg, const std::string &filename);
-void load_lib_data(const std::string& prefix);
-void write_lib_data(const std::string& prefix);
-
-} // config
-} // debruijn_graph
-
-
-typedef config_common::config<debruijn_graph::config::debruijn_config> cfg;
diff --git a/src/modules/pipeline/genomic_info_filler.cpp b/src/modules/pipeline/genomic_info_filler.cpp
deleted file mode 100644
index 65a8eda..0000000
--- a/src/modules/pipeline/genomic_info_filler.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "genomic_info_filler.hpp"
-
-#include "math/kmer_coverage_model.hpp"
-#include "algorithms/simplification/ec_threshold_finder.hpp"
-
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-
-#include <string>
-
-#include <map>
-#include <vector>
-
-using namespace llvm;
-using namespace debruijn_graph;
-
-static std::vector<size_t> extract(const std::map<size_t, size_t> &hist) {
- std::map<size_t, size_t> tmp = hist;
-
- size_t maxcov = 0;
- for (auto it = tmp.cbegin(), et = tmp.cend(); it != et; ++it)
- maxcov = std::max(maxcov, it->first);
-
- // Touch all the values until maxcov to make sure all the values exist in the map
- for (size_t i = 0; i <= maxcov; ++i)
- tmp[i];
-
- // Extract the values
- std::vector<size_t> res(maxcov);
- for (size_t i = 0; i < maxcov; ++i)
- res[i] = tmp[i + 1];
-
- return res;
-}
-
-namespace llvm { namespace yaml {
-template <>
-struct MappingTraits<GenomicInfo> {
- static void mapping(yaml::IO &io, GenomicInfo &info) {
- info.yamlize(io);
- }
-};
-
-
-template <>
-struct SequenceTraits<std::vector<std::size_t>> {
- static size_t size(IO &, std::vector<std::size_t> &seq) {
- return seq.size();
- }
- static size_t&
- element(IO &, std::vector<std::size_t> &seq, size_t index) {
- if (index >= seq.size())
- seq.resize(index+1);
- return seq[index];
- }
- static const bool flow = true;
-};
-}}
-
-void GenomicInfo::yamlize(yaml::IO &io) {
- io.mapOptional("ec bound", ec_bound_, 0.0);
- io.mapOptional("estimated mean", estimated_mean_, 0.0);
- io.mapOptional("trusted bound", trusted_bound_, 0.0);
- io.mapOptional("genome size", genome_size_, size_t(0));
- io.mapOptional("coverage histogram", cov_histogram_);
-}
-
-
-bool GenomicInfo::Load(const std::string &filename) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getFile(filename);
- if (!Buf)
- return false;
-
- yaml::Input yin(*Buf.get());
- yin >> *this;
-
- if (yin.error())
- return false;
-
- return true;
-}
-
-void GenomicInfo::Save(const std::string &filename) const {
- std::error_code EC;
- llvm::raw_fd_ostream ofs(filename, EC, llvm::sys::fs::OpenFlags::F_Text);
- llvm::yaml::Output yout(ofs);
- yout << const_cast<GenomicInfo&>(*this);
-}
-
-void GenomicInfoFiller::run(conj_graph_pack &gp, const char*) {
- if (cfg::get().uneven_depth) {
- ErroneousConnectionThresholdFinder<decltype(gp.g)> finder(gp.g);
- std::map<size_t, size_t> hist = finder.ConstructHistogram();
- double avg = finder.AvgCoverage();
- double gthr = finder.FindThreshold(hist);
- INFO("Average edge coverage: " << avg);
- INFO("Graph threshold: " << gthr);
-
- gp.ginfo.set_cov_histogram(extract(hist));
- gp.ginfo.set_ec_bound(std::min(avg, gthr));
- } else {
- // First, get k-mer coverage histogram
- std::map<size_t, size_t> tmp;
- size_t maxcov = 0;
- size_t kmer_per_record = 1;
- if (conj_graph_pack::index_t::InnerIndex::storing_type::IsInvertable())
- kmer_per_record = 2;
-
- for (auto I = gp.index.inner_index().value_cbegin(), E = gp.index.inner_index().value_cend(); I != E; ++I) {
- size_t ccov = I->count;
- maxcov = std::max(ccov, maxcov);
- tmp[ccov] += kmer_per_record;
- }
-
- gp.ginfo.set_cov_histogram(extract(tmp));
-
- // Fit the coverage model and get the threshold
- cov_model::KMerCoverageModel CovModel(gp.ginfo.cov_histogram(), cfg::get().kcm.probability_threshold, cfg::get().kcm.strong_probability_threshold);
- CovModel.Fit();
-
- gp.ginfo.set_genome_size(CovModel.GetGenomeSize());
- gp.ginfo.set_ec_bound((double)CovModel.GetErrorThreshold());
- if (CovModel.converged()) {
- gp.ginfo.set_estimated_mean((double)CovModel.GetMeanCoverage());
- INFO("Mean coverage was calculated as " << gp.ginfo.estimated_mean());
- } else
- INFO("Failed to estimate mean coverage");
-
- if (cfg::get().kcm.use_coverage_threshold) {
- double coef = (cfg::get().ds.aRL() - double(cfg::get().K) + 1) / cfg::get().ds.aRL();
- if (coef < 0)
- coef = double(cfg::get().ds.RL() - cfg::get().K + 1) / double(cfg::get().ds.RL());
- gp.ginfo.set_trusted_bound(CovModel.converged() && cfg::get().kcm.coverage_threshold == 0.0 ?
- double(CovModel.GetLowThreshold()) :
- cfg::get().kcm.coverage_threshold * coef);
- }
- }
-
- INFO("EC coverage threshold value was calculated as " << gp.ginfo.ec_bound());
- INFO("Trusted kmer low bound: " << gp.ginfo.trusted_bound());
-}
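
For reference, the serialization above uses LLVM's YAML I/O: a MappingTraits
specialization names the keys, yaml::Output writes and yaml::Input parses.
Below is a minimal stand-alone sketch of the same pattern; ToyInfo and its
fields are invented here purely for illustration.

#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"
#include <cstddef>

struct ToyInfo {
    double ec_bound = 0.0;
    size_t genome_size = 0;
};

namespace llvm { namespace yaml {
template <>
struct MappingTraits<ToyInfo> {
    static void mapping(IO &io, ToyInfo &info) {
        // Same key style as GenomicInfo::yamlize() above.
        io.mapOptional("ec bound", info.ec_bound, 0.0);
        io.mapOptional("genome size", info.genome_size, size_t(0));
    }
};
}}

int main() {
    ToyInfo info;
    info.ec_bound = 2.5;
    info.genome_size = 4600000;

    // Write YAML to stdout.
    llvm::yaml::Output yout(llvm::outs());
    yout << info;

    // Parse it back from a string.
    ToyInfo parsed;
    llvm::yaml::Input yin("{'ec bound': 2.5, 'genome size': 4600000}");
    yin >> parsed;
    return yin.error() ? 1 : 0;
}
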
diff --git a/src/modules/pipeline/graph_pack.hpp b/src/modules/pipeline/graph_pack.hpp
deleted file mode 100644
index e445ba0..0000000
--- a/src/modules/pipeline/graph_pack.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "data_structures/indices/edge_position_index.hpp"
-#include "data_structures/indices/storing_traits.hpp"
-#include "data_structures/sequence/genome_storage.hpp"
-#include "assembly_graph/handlers/id_track_handler.hpp"
-#include "assembly_graph/handlers/edges_position_handler.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
-#include "paired_info/paired_info.hpp"
-#include "pipeline/config_struct.hpp"
-#include "assembly_graph/graph_alignment/edge_index.hpp"
-#include "assembly_graph/graph_support/genomic_quality.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-#include "genomic_info.hpp"
-#include "assembly_graph/graph_alignment/long_read_storage.hpp"
-#include "assembly_graph/graph_support/detail_coverage.hpp"
-#include "assembly_graph/components/connected_component.hpp"
-#include "assembly_graph/graph_alignment/kmer_mapper.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-struct graph_pack: private boost::noncopyable {
- typedef Graph graph_t;
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- typedef runtime_k::RtSeq seq_t;
- typedef EdgeIndex<graph_t> index_t;
- using PairedInfoIndicesT = omnigraph::de::PairedInfoIndicesT<Graph>;
- //typedef omnigraph::de::PairedInfoIndicesT<Graph> PairedInfoIndicesT;
- typedef omnigraph::de::UnclusteredPairedInfoIndicesT<Graph> UnclusteredPairedInfoIndicesT;
- typedef LongReadContainer<Graph> LongReadContainerT;
-
- size_t k_value;
-
- graph_t g;
- index_t index;
- KmerMapper<graph_t> kmer_mapper;
- FlankingCoverage<graph_t> flanking_cov;
- UnclusteredPairedInfoIndicesT paired_indices;
- PairedInfoIndicesT clustered_indices;
- PairedInfoIndicesT scaffolding_indices;
- LongReadContainerT single_long_reads;
- GenomicInfo ginfo;
-
- GenomeStorage genome;
- EdgeQuality<Graph> edge_qual;
- mutable EdgesPositionHandler<graph_t> edge_pos;
- ConnectedComponentCounter components;
- graph_pack(size_t k, const std::string &workdir, size_t lib_count,
- const std::string &genome = "",
- size_t flanking_range = 50,
- size_t max_mapping_gap = 0,
- size_t max_gap_diff = 0,
- bool detach_indices = true)
- : k_value(k), g(k), index(g, workdir),
- kmer_mapper(g),
- flanking_cov(g, flanking_range),
- paired_indices(g, lib_count),
- clustered_indices(g, lib_count),
- scaffolding_indices(g, lib_count),
- single_long_reads(g, lib_count),
- genome(genome),
- edge_qual(g),
- edge_pos(g, max_mapping_gap + k, max_gap_diff),
- components(g)
- {
- if (detach_indices) {
- DetachAll();
- }
- }
-
- void FillQuality() {
- edge_qual.Fill(index, kmer_mapper, genome.GetSequence());
- }
-
- // TODO: remove together with its usages after checking
- void ClearQuality() {
- edge_qual.clear();
- }
-
- void EnsureIndex() {
- if (!index.IsAttached()) {
- INFO("Index refill");
- index.Refill();
- index.Attach();
- }
- }
-
- void EnsureBasicMapping() {
- VERIFY(kmer_mapper.IsAttached());
- EnsureIndex();
- INFO("Normalizing k-mer map. Total " << kmer_mapper.size() << " kmers to process");
- kmer_mapper.Normalize();
- INFO("Normalizing done");
- }
-
- void EnsureQuality() {
- if (!edge_qual.IsAttached()) {
- ClearQuality();
- FillQuality();
- edge_qual.Attach();
- }
- }
-
- //positions are refilled every time
- void EnsurePos() {
- if (!edge_pos.IsAttached()) {
- edge_pos.Attach();
- }
- edge_pos.clear();
- FillPos(*this, genome.GetSequence(), "ref0");
- FillPos(*this, !genome.GetSequence(), "ref1");
- }
-
- void EnsureDebugInfo() {
- EnsureBasicMapping();
- EnsureQuality();
- EnsurePos();
- }
-
- void InitRRIndices() {
- clustered_indices.Init();
- scaffolding_indices.Init();
- }
-
- void ClearRRIndices() {
- for (auto& pi : paired_indices) {
- pi.Clear();
- }
- clustered_indices.Clear();
- scaffolding_indices.Clear();
- single_long_reads.Clear();
- }
-
- void DetachAll() {
- index.Detach();
- kmer_mapper.Detach();
- edge_pos.Detach();
- edge_qual.Detach();
- }
-
-};
-
-typedef graph_pack<ConjugateDeBruijnGraph> conj_graph_pack;
-typedef conj_graph_pack::index_t Index;
-
-typedef conj_graph_pack::PairedInfoIndicesT PairedIndicesT;
-typedef conj_graph_pack::UnclusteredPairedInfoIndicesT UnclusteredPairedIndicesT;
-typedef conj_graph_pack::LongReadContainerT LongReadContainerT;
-typedef omnigraph::de::PairedInfoIndexT<ConjugateDeBruijnGraph> PairedIndexT;
-typedef omnigraph::de::UnclusteredPairedInfoIndexT<ConjugateDeBruijnGraph> UnclusteredPairedIndexT;
-
-} // namespace debruijn_graph
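
A sketch of the typical lifecycle of the pack declared above; it assumes the
SPAdes include paths, and the k value, library count and the elided stage body
are placeholders.

#include "pipeline/graph_pack.hpp"
#include <string>

void example_graph_pack_lifecycle(const std::string &workdir) {
    using namespace debruijn_graph;

    // k = 55, two read libraries; indices start detached (detach_indices = true).
    conj_graph_pack gp(55, workdir, 2);

    gp.EnsureIndex();    // refills and attaches the edge index if it is detached
    gp.InitRRIndices();  // initializes clustered/scaffolding paired indices

    // ... run assembly stages on gp ...

    gp.ClearRRIndices(); // drop repeat-resolution indices and long-read storage
    gp.DetachAll();      // detach index, kmer mapper, edge positions and quality
}
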
diff --git a/src/modules/pipeline/graphio.hpp b/src/modules/pipeline/graphio.hpp
deleted file mode 100644
index d47d00a..0000000
--- a/src/modules/pipeline/graphio.hpp
+++ /dev/null
@@ -1,1040 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-
-#include "assembly_graph/handlers/id_track_handler.hpp"
-#include "assembly_graph/handlers/edges_position_handler.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-
-#include "paired_info/paired_info.hpp"
-
-#include "assembly_graph/graph_core/graph.hpp"
-#include "assembly_graph/graph_support/detail_coverage.hpp"
-#include "assembly_graph/graph_alignment/long_read_storage.hpp"
-
-#include "assembly_graph/graph_core/order_and_law.hpp"
-
-#include <cmath>
-#include <set>
-#include <map>
-#include <algorithm>
-#include <fstream>
-#include <cstdio>
-
-namespace debruijn_graph {
-
-namespace graphio {
-
-using namespace omnigraph;
-using namespace omnigraph::de;
-//todo think of inner namespace
-
-template<class KmerMapper>
-void SaveKmerMapper(const string& file_name,
- const KmerMapper& mapper) {
- std::ofstream file;
- file.open((file_name + ".kmm").c_str(),
- std::ios_base::binary | std::ios_base::out);
- DEBUG("Saving kmer mapper, " << file_name <<" created");
- VERIFY(file.is_open());
-
- uint32_t k_ = (uint32_t) mapper.get_k();
- file.write((char *) &k_, sizeof(uint32_t));
- mapper.BinWrite(file);
-
- file.close();
- DEBUG("kmer mapper saved ")
-}
-
-template<class KmerMapper>
-bool LoadKmerMapper(const string& file_name,
- KmerMapper& kmer_mapper) {
- kmer_mapper.clear();
- std::ifstream file;
- file.open((file_name + ".kmm").c_str(),
- std::ios_base::binary | std::ios_base::in);
- if (!file.is_open()) {
- return false;
- }
- INFO("Reading kmer mapper, " << file_name <<" started");
-
- uint32_t k_;
- file.read((char *) &k_, sizeof(uint32_t));
-
- VERIFY_MSG(k_ == kmer_mapper.get_k(), "Cannot read kmer mapper, different Ks");
- kmer_mapper.BinRead(file);
-
- file.close();
- return true;
-}
-
-template<class EdgeIndex>
-void SaveEdgeIndex(const std::string& file_name,
- const EdgeIndex& index) {
- std::ofstream file;
- file.open((file_name + ".kmidx").c_str(),
- std::ios_base::binary | std::ios_base::out);
- DEBUG("Saving kmer index, " << file_name <<" created");
- VERIFY(file.is_open());
-
- uint32_t k_ = index.k();
- file.write((char *) &k_, sizeof(uint32_t));
- index.BinWrite(file);
-
- file.close();
- DEBUG("index saved ")
-}
-
-template<class EdgeIndex>
-bool LoadEdgeIndex(const std::string& file_name,
- EdgeIndex& index) {
- std::ifstream file;
- file.open((file_name + ".kmidx").c_str(),
- std::ios_base::binary | std::ios_base::in);
- INFO("Reading kmer index, " << file_name <<" started");
- if (!file.is_open())
- return false;
-
- uint32_t k_;
- file.read((char *) &k_, sizeof(uint32_t));
- VERIFY_MSG(k_ == index.k(), "Cannot read edge index, different Ks:");
-
- index.BinRead(file, file_name + ".kmidx");
-
- file.close();
-
- return true;
-}
-
-inline
-void SaveMapCoverage(const std::string& path, const std::map<int, int>& data ) {
- std::ofstream outFile;
- outFile.open(path.c_str());
-
- INFO("Saving detailed coverage in file " << path <<" started");
- outFile << data.size() << "\n";
- for (auto dataIterator = data.begin(); dataIterator != data.end(); ++dataIterator){
- outFile << dataIterator->first << " " << dataIterator->second << " .\n";
- }
-}
-
-template<class KmerIndex>
-void SaveDetailCoverage(const std::string& pathInCov, const std::string& pathOutCov, const KmerIndex& index ) {
- SaveMapCoverage(pathInCov, index.inCoverage);
- SaveMapCoverage(pathOutCov, index.outCoverage);
-}
-
-inline void SerializePoint(FILE* file, size_t e1, size_t e2, const RawPoint &p) {
- fprintf(file, "%zu %zu %.2f %.2f 0.00 .\n", e1, e2, (double)p.d, (double)p.weight);
-}
-
-inline void SerializePoint(FILE* file, size_t e1, size_t e2, const Point &p) {
- fprintf(file, "%zu %zu %.2f %.2f %.2f .\n", e1, e2, (double)p.d, (double)p.weight, (double)p.var);
-}
-
-inline void DeserializePoint(FILE* file, size_t& e1, size_t& e2, RawPoint &p) {
- float unused;
- size_t read_count = fscanf(file, "%zu %zu %f %f %f .\n", &e1, &e2,
- (float *)&p.d, (float *)&p.weight, (float *)&unused);
- VERIFY(read_count == 5);
-
-}
-
-inline void DeserializePoint(FILE* file, size_t& e1, size_t& e2, Point &p) {
- size_t read_count = fscanf(file, "%zu %zu %f %f %f .\n", &e1, &e2,
- (float *)&p.d, (float *)&p.weight, (float *)&p.var);
- VERIFY(read_count == 5);
-}
-
-
-template<class Graph>
-class DataPrinter {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- //todo reduce duplication
- template<class T>
- void SaveEdgeAssociatedInfo(std::function<T (EdgeId)> access_f, ostream& out) const {
- out << component_.e_size() << endl;
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- EdgeId e = *iter;
- // TODO/FIXME: currently matches the old .cvr format
- out << e.int_id()/* << endl*/;
- out << " " << access_f(e) << " ." << endl;
- }
- }
-
-// template<class C>
-// void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
-// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
-// }
-
- template<class C>
- void SaveEdgeAssociatedInfo(const C& c, ostream& out) const {
- out << component_.e_size() << endl;
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- EdgeId e = *iter;
- // TODO/FIXME: currently matches the old .cvr format
- out << e.int_id()/* << endl*/;
- out << " ";
- c.Save(e, out);
- out << " ." << endl;
- }
- }
-
- public:
-
- void SaveGraph(const string& file_name) const {
- FILE* gid_file = fopen((file_name + ".gid").c_str(), "w");
- size_t max_id = this->component().g().GetGraphIdDistributor().GetMax();
- fprintf(gid_file, "%zu\n", max_id);
- fclose(gid_file);
- FILE* file = fopen((file_name + ".grp").c_str(), "w");
- DEBUG("Graph saving to " << file_name << " started");
- VERIFY_MSG(file != NULL,
- "Couldn't open file " << (file_name + ".grp") << " on write");
- size_t vertex_count = component_.v_size();
- size_t edge_count = component_.e_size();
- fprintf(file, "%zu %zu \n", vertex_count, edge_count);
- for (auto iter = component_.v_begin(); iter != component_.v_end(); ++iter) {
- Save(file, *iter);
- }
-
- fprintf(file, "\n");
-
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- Save(file, *iter);
- }
- DEBUG("Graph saving to " << file_name << " finished");
-
- fclose(file);
- }
-
- void SaveEdgeSequences(const string& file_name) const {
- ofstream out(file_name + ".sqn");
- // TODO: switch to the general function once it outputs FASTA
- //SaveEdgeAssociatedInfo<Sequence>(boost::bind(&Graph::EdgeNucls, component_.g(), _1), out);
- DEBUG("Saving sequences, " << file_name <<" created");
- for (auto iter = component_.e_begin(); iter != component_.e_end(); ++iter) {
- EdgeId e = *iter;
- out << ">" << e.int_id() << endl;
- out << component_.g().EdgeNucls(e) << endl;
- }
- }
-
- void SaveCoverage(const string& file_name) const {
- ofstream out(file_name + ".cvr");
- DEBUG("Saving coverage, " << file_name <<" created");
- SaveEdgeAssociatedInfo(component_.g().coverage_index(), out);
- }
-
- void SaveFlankingCoverage(const string& file_name, const FlankingCoverage<Graph>& flanking_cov) const {
- ofstream out(file_name + ".flcvr");
- DEBUG("Saving flanking coverage, " << file_name <<" created");
- SaveEdgeAssociatedInfo(flanking_cov, out);
- }
-
- template<class Index>
- void SavePaired(const string& file_name,
- Index const& paired_index) const {
- FILE* file = fopen((file_name + ".prd").c_str(), "w");
- DEBUG("Saving paired info, " << file_name <<" created");
- VERIFY(file != NULL);
-
- size_t comp_size = 0;
- for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
- EdgeId e1 = *I;
- auto inner_map = paired_index.GetHalf(e1);
- for (auto entry : inner_map) {
- if (component_.contains(entry.first)) { // if the second edge also lies in the same component
- comp_size += entry.second.size();
- continue;
- }
- }
- }
-
- fprintf(file, "%zu\n", comp_size);
-
- for (auto I = component_.e_begin(), E = component_.e_end(); I != E; ++I) {
- EdgeId e1 = *I;
- const auto& inner_map = paired_index.GetHalf(e1);
- std::map<typename Graph::EdgeId, typename Index::HistProxy> ordermap(inner_map.begin(), inner_map.end());
- for (auto entry : ordermap) {
- EdgeId e2 = entry.first;
- if (component_.contains(e2))
- for (auto point : entry.second)
- SerializePoint(file, e1.int_id(), e2.int_id(), point);
- }
- }
-
- fclose(file);
- }
-
- void SavePositions(const string& file_name,
- EdgesPositionHandler<Graph> const& ref_pos) const {
- ofstream file((file_name + ".pos").c_str());
- DEBUG("Saving edges positions, " << file_name << " created");
- VERIFY(file.is_open());
- file << component_.e_size() << endl;
- for (auto it = component_.e_begin(); it != component_.e_end(); ++it) {
- vector<omnigraph::EdgePosition> pos_it = ref_pos.GetEdgePositions(*it);
- file << it->int_id() << " " << pos_it.size() << endl;
- for (size_t i = 0; i < pos_it.size(); i++) {
- file << " " << pos_it[i].contigId << " " << pos_it[i].mr << endl;
- }
- }
- }
-
- private:
- void Save(FILE* file, EdgeId eid) const {
- fprintf(file, "%s\n", ToPrint(eid).c_str());
- }
-
- void Save(FILE* file, VertexId vid) const {
- fprintf(file, "%s\n", ToPrint(vid).c_str());
- }
-
- const GraphComponent<Graph> component_;
-
- virtual std::string ToPrint(VertexId v) const = 0;
- virtual std::string ToPrint(EdgeId e) const = 0;
-
- protected:
-
- //todo optimize component copy
- DataPrinter(const GraphComponent<Graph>& component) :
- component_(component) {
- }
-
- const GraphComponent<Graph>& component() const {
- return component_;
- }
-
- public:
- virtual ~DataPrinter() {
- }
-};
-
-template<class Graph>
-class ConjugateDataPrinter: public DataPrinter<Graph> {
- typedef DataPrinter<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- public:
- ConjugateDataPrinter(Graph const& g) :
- base(g) {
- }
-
- ConjugateDataPrinter(const GraphComponent<Graph>& graph_component) :
- base(GraphComponent<Graph>(graph_component, true)) {
- }
-
- template<class VertexIt>
- ConjugateDataPrinter(const Graph& g, VertexIt begin, VertexIt end) :
- base(GraphComponent<Graph>(g, begin, end, true)) {
- }
-
- std::string ToPrint(VertexId v) const {
- stringstream ss;
- ss
- << "Vertex "
- << v.int_id()
- << " ~ "
- << this->component().g().conjugate(v).int_id() << " .";
- return ss.str();
- }
-
- std::string ToPrint(EdgeId e) const {
- stringstream ss;
- ss
- << "Edge "
- << e.int_id()
- << " : "
- << this->component().g().EdgeStart(e).int_id()
- << " -> "
- << this->component().g().EdgeEnd(e).int_id()
- << ", l = "
- << this->component().g().length(e)
- << " ~ "
- << this->component().g().conjugate(e).int_id() << " .";
- return ss.str();
- }
-
-};
-
-template<class Graph>
-class DataScanner {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- template<class T>
- void LoadEdgeAssociatedInfo(std::function<void (EdgeId, T)> setting_f, istream& in) const {
- size_t cnt;
- in >> cnt;
- for (size_t i = 0 ; i < cnt; ++i) {
- size_t edge_id;
- T t;
- string delim;
- in >> edge_id;
- in >> t;
- in >> delim;
- VERIFY(delim == ".");
- VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
- setting_f(this->edge_id_map()[edge_id], t);
- }
- }
-
- template<class T>
- void LoadEdgeAssociatedInfo(T& t, istream& in) const {
- size_t cnt;
- in >> cnt;
- for (size_t i = 0 ; i < cnt; ++i) {
- size_t edge_id;
- in >> edge_id;
- VERIFY(this->edge_id_map().find(edge_id) != this->edge_id_map().end());
- EdgeId eid = this->edge_id_map().find(edge_id)->second;
- t.Load(eid, in);
- string delim;
- in >> delim;
- VERIFY(delim == ".");
- }
- }
-
-// template<class C>
-// void LoadEdgeAssociatedInfo(const C& c, ostream& out) const {
-// SaveEdgeAssociatedInfo<decltype(C::operator[])>(boost::bind(&C::operator[], c, _1), out);
-// }
-
- public:
- virtual void LoadGraph(const string& file_name) = 0;
-
- void LoadCoverage(const string& file_name) {
- INFO("Reading coverage from " << file_name);
- ifstream in(file_name + ".cvr");
- LoadEdgeAssociatedInfo(g_.coverage_index(), in);
- }
-
- bool LoadFlankingCoverage(const string& file_name, FlankingCoverage<Graph>& flanking_cov) {
- if (!path::FileExists(file_name + ".flcvr")) {
- INFO("Flanking coverage saves are absent");
- return false;
- }
- INFO("Reading flanking coverage from " << file_name);
- ifstream in(file_name + ".flcvr");
- LoadEdgeAssociatedInfo(flanking_cov, in);
- return true;
- }
-
- template<typename Index>
- void LoadPaired(const string& file_name,
- Index& paired_index,
- bool force_exists = true) {
- typedef typename Graph::EdgeId EdgeId;
- FILE* file = fopen((file_name + ".prd").c_str(), "r");
- INFO((file_name + ".prd"));
- if (force_exists) {
- VERIFY(file != NULL);
- } else if (file == NULL) {
- INFO("Paired info not found, skipping");
- return;
- }
- INFO("Reading paired info from " << file_name << " started");
-
- size_t paired_count;
- int read_count = fscanf(file, "%zu \n", &paired_count);
- VERIFY(read_count == 1);
- while (!feof(file)) {
- size_t first_real_id, second_real_id;
-
- typename Index::Point point;
- DeserializePoint(file, first_real_id, second_real_id, point);
-
- TRACE(first_real_id << " " << second_real_id << " " << point);
- VERIFY(this->edge_id_map().find(first_real_id) != this->edge_id_map().end());
- EdgeId e1 = this->edge_id_map()[first_real_id];
- EdgeId e2 = this->edge_id_map()[second_real_id];
- if (e1 == EdgeId(NULL) || e2 == EdgeId(NULL))
- continue;
- TRACE(e1 << " " << e2 << " " << point);
- // Need to prevent doubling of self-conjugate edge pairs.
- // Their weight is always even, so we do not lose precision.
- auto ep = std::make_pair(e1, e2);
- if (ep == paired_index.ConjugatePair(ep))
- point.weight = math::round(point.weight / 2);
- paired_index.Add(e1, e2, point);
- }
- DEBUG("PII SIZE " << paired_index.size());
- fclose(file);
- }
-
- bool LoadPositions(const string& file_name,
- EdgesPositionHandler<Graph>& edge_pos) {
- FILE* file = fopen((file_name + ".pos").c_str(), "r");
- if (file == NULL) {
- INFO("No positions were saved");
- return false;
- }
- VERIFY(!edge_pos.IsAttached());
- edge_pos.Attach();
- INFO("Reading edges positions, " << file_name <<" started");
- VERIFY(file != NULL);
- size_t pos_count;
- int read_count = fscanf(file, "%zu\n", &pos_count);
- VERIFY(read_count == 1);
- for (size_t i = 0; i < pos_count; i++) {
- size_t edge_real_id, pos_info_count;
- char contigId[500];
- char cur_str[500];
- read_count = fscanf(file, "%zu %zu\n", &edge_real_id, &pos_info_count);
- VERIFY(read_count == 2);
- // INFO( edge_real_id);
- for (size_t j = 0; j < pos_info_count; j++) {
- int start_pos, end_pos;
- int m_start_pos, m_end_pos;
- read_count = fscanf(file, "%[^\n]s", cur_str);
- read_count = fscanf(file, "\n");
- read_count = sscanf(cur_str, "%s [%d - %d] --> [%d - %d]", contigId,
- &start_pos, &end_pos, &m_start_pos, &m_end_pos);
- // INFO(cur_str);
- // INFO (contigId<<" "<< start_pos<<" "<<end_pos);
- // VERIFY(read_count == 3);
- VERIFY(read_count == 5);
- VERIFY(this->edge_id_map().find(edge_real_id) != this->edge_id_map().end());
- EdgeId eid = this->edge_id_map()[edge_real_id];
- edge_pos.AddEdgePosition(eid, string(contigId), start_pos - 1, end_pos, m_start_pos - 1, m_end_pos);
- }
- }
- fclose(file);
- return true;
- }
-
- private:
- Graph& g_;
- // int edge_count_;
- map<size_t, EdgeId> edge_id_map_;
- map<size_t, VertexId> vertex_id_map_;
-
- protected:
- DataScanner(Graph &g) : g_(g) {
- INFO("Creating of scanner started");
- // edge_count_ = 0;
- }
-
- Graph& g() {
- return g_;
- }
-
- map<size_t, EdgeId> &edge_id_map() {
- return edge_id_map_;
- }
-
- map<size_t, VertexId> &vertex_id_map() {
- return vertex_id_map_;
- }
-
- const map<size_t, EdgeId> &edge_id_map() const {
- return edge_id_map_;
- }
-
- const map<size_t, VertexId> &vertex_id_map() const {
- return vertex_id_map_;
- }
-
- public:
- virtual ~DataScanner() {
-
- }
-};
-
-template<class Graph>
-class ConjugateDataScanner: public DataScanner<Graph> {
- typedef DataScanner<Graph> base;
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-private:
- restricted::IdSegmentStorage CreateIdStorage(const string& file_name) {
- FILE* file = fopen((file_name + ".gid").c_str(), "r");
- // This is to support compatibility with old saves. Will be removed soon.
- if(file == NULL) {
- return this->g().GetGraphIdDistributor().ReserveUpTo(1000000000);
- }
- VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".gid"));
- size_t max;
- int flag = fscanf(file, "%zu\n", &max);
- VERIFY(flag == 1);
- fclose(file);
- return this->g().GetGraphIdDistributor().ReserveUpTo(max);
- }
-
- public:
- /*virtual*/
- void LoadGraph(const string& file_name) {
- restricted::IdSegmentStorage id_storage = CreateIdStorage(file_name);
- INFO("Trying to read conjugate de bruijn graph from " << file_name << ".grp");
- FILE* file = fopen((file_name + ".grp").c_str(), "r");
- VERIFY_MSG(file != NULL, "Couldn't find file " << (file_name + ".grp"));
- FILE* sequence_file = fopen((file_name + ".sqn").c_str(), "r");
- VERIFY_MSG(sequence_file != NULL, "Couldn't find file " << (file_name + ".sqn"));
- INFO("Reading conjugate de bruijn graph from " << file_name << " started");
- size_t vertex_count;
- size_t edge_count;
- int flag = fscanf(file, "%zu %zu \n", &vertex_count, &edge_count);
- VERIFY(flag == 2);
- for (size_t i = 0; i < vertex_count; i++) {
- size_t vertex_real_id, conjugate_id;
- flag = fscanf(file, "Vertex %zu ~ %zu .\n", &vertex_real_id, &conjugate_id);
- TRACE("Vertex "<<vertex_real_id<<" ~ "<<conjugate_id<<" .");
- VERIFY(flag == 2);
-
- if (this->vertex_id_map().find((int) vertex_real_id) == this->vertex_id_map().end()) {
- size_t ids[2] = {vertex_real_id, conjugate_id};
- auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
- VertexId vid = this->g().AddVertex(typename Graph::VertexData(), id_distributor);
- VertexId conj_vid = this->g().conjugate(vid);
-
- this->vertex_id_map()[vertex_real_id] = vid;
- this->vertex_id_map()[conjugate_id] = conj_vid;
- }
- }
-
- char first_char = (char) getc(sequence_file);
- VERIFY(!ferror(sequence_file));
- ungetc(first_char, sequence_file);
- bool fasta = (first_char == '>'); // if it's not FASTA, it's the old .sqn format
-
-
- if (!fasta) {
- size_t tmp_edge_count;
- flag = fscanf(sequence_file, "%zu", &tmp_edge_count);
- VERIFY(flag == 1);
- VERIFY(edge_count == tmp_edge_count);
- }
-
- const size_t longstring_size = 1000500; // TODO: magic constant; edges >= 1 Mbp cannot be loaded
- char longstring[longstring_size];
- for (size_t i = 0; i < edge_count; i++) {
- size_t e_real_id, start_id, fin_id, length, conjugate_edge_id;
- flag = fscanf(file, "Edge %zu : %zu -> %zu, l = %zu ~ %zu .\n",
- &e_real_id, &start_id, &fin_id, &length, &conjugate_edge_id);
- VERIFY(flag == 5);
- VERIFY(length < longstring_size);
- if (fasta) {
- flag = fscanf(sequence_file, ">%zu\n%s\n", &e_real_id, longstring);
- }
- else {
- flag = fscanf(sequence_file, "%zu %s .", &e_real_id, longstring);
- }
- VERIFY(flag == 2);
- TRACE("Edge " << e_real_id << " : " << start_id << " -> "
- << fin_id << " l = " << length << " ~ " << conjugate_edge_id);
- if (this->edge_id_map().find((int) e_real_id) == this->edge_id_map().end()) {
- size_t ids[2] = {e_real_id, conjugate_edge_id};
- auto id_distributor = id_storage.GetSegmentIdDistributor(ids, ids + 2);
- Sequence tmp(longstring);
- EdgeId eid = this->g().AddEdge(this->vertex_id_map()[start_id], this->vertex_id_map()[fin_id], tmp, id_distributor);
- this->edge_id_map()[e_real_id] = eid;
- this->edge_id_map()[conjugate_edge_id] = this->g().conjugate(eid);
- }
- }
- fclose(file);
- fclose(sequence_file);
- }
- public:
- ConjugateDataScanner(Graph& g) :
- base(g) {
- }
-};
-
-inline std::string MakeSingleReadsFileName(const std::string& file_name,
- size_t index) {
- return file_name + "_paths_" + ToString(index) + ".mpr";
-}
-
-//helper methods
- // TODO: think about how to organize these helpers in the most natural way
-
-template<class Graph>
-void PrintBasicGraph(const string& file_name, DataPrinter<Graph>& printer) {
- printer.SaveGraph(file_name);
- printer.SaveEdgeSequences(file_name);
- printer.SaveCoverage(file_name);
-}
-
-template<class graph_pack>
-void PrintGraphPack(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp) {
- PrintBasicGraph(file_name, printer);
- // printer.SavePaired(file_name + "_et", gp.etalon_paired_index);
- if (gp.edge_pos.IsAttached())
- printer.SavePositions(file_name, gp.edge_pos);
- if (gp.index.IsAttached())
- SaveEdgeIndex(file_name, gp.index.inner_index());
- if (gp.kmer_mapper.IsAttached())
- SaveKmerMapper(file_name, gp.kmer_mapper);
- if (gp.flanking_cov.IsAttached())
- printer.SaveFlankingCoverage(file_name, gp.flanking_cov);
-}
-
-template<class graph_pack>
-void PrintGraphPack(const string& file_name, const graph_pack& gp) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g);
- PrintGraphPack(file_name, printer, gp);
-}
-
-template<class Graph>
-void PrintPairedIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& paired_index) {
- printer.SavePaired(file_name, paired_index);
-}
-
-template<class Graph>
-void PrintUnclusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
- const UnclusteredPairedInfoIndexT<Graph>& paired_index) {
- printer.SavePaired(file_name, paired_index);
-}
-
-template<class Graph>
-void PrintClusteredIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& clustered_index) {
- PrintPairedIndex(file_name + "_cl", printer, clustered_index);
-}
-
-template<class Graph>
-void PrintScaffoldingIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& clustered_index) {
- PrintPairedIndex(file_name + "_scf", printer, clustered_index);
-}
-
-template<class Graph>
-void PrintScaffoldIndex(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndexT<Graph>& scaffold_index) {
- PrintPairedIndex(file_name + "_scf", printer, scaffold_index);
-}
-
-template<class Graph>
-void PrintUnclusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
- const UnclusteredPairedInfoIndicesT<Graph>& paired_indices) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- PrintUnclusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
-}
-
-template<class Graph>
-void PrintClusteredIndices(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndicesT<Graph>& paired_indices) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- PrintClusteredIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
-}
-
-template<class Graph>
-void PrintScaffoldingIndices(const string& file_name, DataPrinter<Graph>& printer,
- const PairedInfoIndicesT<Graph>& paired_indices) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- PrintScaffoldingIndex(file_name + "_" + ToString(i), printer, paired_indices[i]);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndex(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
-
- PrintGraphPack(file_name, printer, gp);
- if (!clustered_index) {
- PrintPairedIndex(file_name, printer, paired_index);
- } else {
- PrintClusteredIndex(file_name, printer, paired_index);
- }
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndex(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index) {
- PrintWithPairedIndex(file_name, printer, gp, paired_index, true);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndices(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
- PrintGraphPack(file_name, printer, gp);
- if (!clustered_index)
- PrintPairedIndices(file_name, printer, paired_indices);
- else
- PrintClusteredIndices(file_name, printer, paired_indices);
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndices(const string& file_name,
- DataPrinter<typename graph_pack::graph_t>& printer,
- const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- PrintWithPairedIndices(file_name, printer, gp, paired_indices, true);
-}
-
-template<class Graph>
-void PrintSingleLongReads(const string& file_name, const LongReadContainer<Graph>& single_long_reads) {
- for (size_t i = 0; i < single_long_reads.size(); ++i){
- single_long_reads[i].DumpToFile(MakeSingleReadsFileName(file_name, i));
- }
-}
-
-template<class graph_pack>
-void PrintAll(const string& file_name, const graph_pack& gp) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
- PrintGraphPack(file_name, printer, gp);
- PrintUnclusteredIndices(file_name, printer, gp.paired_indices);
- PrintClusteredIndices(file_name, printer, gp.clustered_indices);
- PrintScaffoldingIndices(file_name, printer, gp.scaffolding_indices);
- PrintSingleLongReads(file_name, gp.single_long_reads);
- gp.ginfo.Save(file_name + ".ginfo");
-}
-
-template<class graph_pack, class VertexIt>
-void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
- VertexIt begin, VertexIt end,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
- begin, end);
- PrintWithPairedIndex(file_name, printer, gp, paired_index, clustered_index);
-}
-
-template<class graph_pack, class VertexIt>
-void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
- VertexIt begin, VertexIt end,
- const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
- begin, end);
- PrintWithPairedIndex(file_name, printer, gp, clustered_index, true);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndex(const string& file_name, const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- PrintWithPairedIndex(file_name, gp, gp.g.begin(), gp.g.end(), paired_index,
- clustered_index);
-}
-
-template<class graph_pack, class VertexIt>
-void PrinGraphPack(const string& file_name, const graph_pack& gp,
- VertexIt begin, VertexIt end) {
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g,
- begin, end);
- PrintGraphPack(file_name, printer, gp);
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndex(const string& file_name, const graph_pack& gp,
- const PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
- PrintWithPairedIndex(file_name, gp, clustered_index, true);
-}
-
-template<class graph_pack>
-void PrintWithPairedIndices(const string& file_name, const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
-
- ConjugateDataPrinter<typename graph_pack::graph_t> printer(gp.g, gp.g.begin(), gp.g.end());
-
- PrintWithPairedIndices(file_name, printer, gp, paired_indices, clustered_index);
-}
-
-template<class graph_pack>
-void PrintWithClusteredIndices(const string& file_name, const graph_pack& gp,
- const PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- PrintWithPairedIndices(file_name, gp, paired_indices, true);
-}
-
-template<class Graph>
-void ScanBasicGraph(const string& file_name, DataScanner<Graph>& scanner) {
- scanner.LoadGraph(file_name);
- scanner.LoadCoverage(file_name);
-}
-
-template<class graph_pack>
-void ScanGraphPack(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp) {
- ScanBasicGraph(file_name, scanner);
- gp.index.Attach();
- if (LoadEdgeIndex(file_name, gp.index.inner_index())) {
- gp.index.Update();
- } else {
- WARN("Cannot load edge index, kmer coverages will be missed");
- gp.index.Refill();
- }
- // scanner.LoadPaired(file_name + "_et", gp.etalon_paired_index);
- scanner.LoadPositions(file_name, gp.edge_pos);
- //load kmer_mapper only if needed
- if (gp.kmer_mapper.IsAttached())
- if (!LoadKmerMapper(file_name, gp.kmer_mapper)) {
- WARN("Cannot load kmer_mapper, information on projected kmers will be missed");
- }
- if (!scanner.LoadFlankingCoverage(file_name, gp.flanking_cov)) {
- gp.flanking_cov.Fill(gp.index.inner_index());
- }
-}
-
-template<class Graph>
-void ScanPairedIndex(const string& file_name, DataScanner<Graph>& scanner,
- UnclusteredPairedInfoIndexT<Graph>& paired_index,
- bool force_exists = true) {
- scanner.LoadPaired(file_name, paired_index, force_exists);
-}
-
-template<class Graph>
-void ScanClusteredIndex(const string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndexT<Graph>& clustered_index,
- bool force_exists = true) {
- scanner.LoadPaired(file_name + "_cl", clustered_index, force_exists);
-}
-
-template<class Graph>
-void ScanScaffoldingIndex(const string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndexT<Graph>& clustered_index,
- bool force_exists = true) {
- scanner.LoadPaired(file_name + "_scf", clustered_index, force_exists);
-}
-
-template<class Graph>
-void ScanPairedIndices(const std::string& file_name, DataScanner<Graph>& scanner,
- UnclusteredPairedInfoIndicesT<Graph>& paired_indices,
- bool force_exists = true) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- ScanPairedIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
-}
-
-template<class Graph>
-void ScanClusteredIndices(const std:: string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndicesT<Graph>& paired_indices,
- bool force_exists = true) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- ScanClusteredIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
-}
-
-template<class Graph>
-void ScanScaffoldingIndices(const std:: string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndicesT<Graph>& paired_indices,
- bool force_exists = true) {
- for (size_t i = 0; i < paired_indices.size(); ++i)
- ScanScaffoldingIndex(file_name + "_" + ToString(i), scanner, paired_indices[i], force_exists);
-}
-
-template<class Graph>
-void ScanScaffoldIndices(const string& file_name, DataScanner<Graph>& scanner,
- PairedInfoIndicesT<Graph>& scaffold_indices) {
-
- for (size_t i = 0; i < scaffold_indices.size(); ++i) {
- ScanScaffoldIndex(file_name + "_" + ToString(i), scanner, scaffold_indices[i]);
- }
-}
-
-template<class graph_pack>
-void ScanWithPairedIndex(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
- PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- ScanGraphPack(file_name, scanner, gp);
- if (!clustered_index) {
- ScanPairedIndex(file_name, scanner, paired_index);
- } else {
- ScanClusteredIndex(file_name, scanner, paired_index);
- }
-}
-
-template<class graph_pack>
-void ScanWithPairedIndices(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
-
- ScanGraphPack(file_name, scanner, gp);
- if (!clustered_index) {
- ScanPairedIndices(file_name, scanner, paired_indices);
- } else {
- ScanClusteredIndices(file_name, scanner, paired_indices);
- }
-}
-
-template<class graph_pack>
-void ScanWithPairedIndex(const string& file_name, graph_pack& gp,
- PairedInfoIndexT<typename graph_pack::graph_t>& paired_index,
- bool clustered_index = false) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanWithPairedIndex(file_name, scanner, gp, paired_index, clustered_index);
-}
-
-template<class graph_pack>
-void ScanWithClusteredIndex(const string& file_name, graph_pack& gp,
- PairedInfoIndexT<typename graph_pack::graph_t>& clustered_index) {
- ScanWithPairedIndex(file_name, gp, clustered_index, true);
-}
-
-template<class graph_pack>
-void ScanWithClusteredIndices(const string& file_name,
- DataScanner<typename graph_pack::graph_t>& scanner, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- ScanWithPairedIndices(file_name, scanner, gp, paired_indices, true);
-}
-
-template<class graph_pack>
-void ScanWithPairedIndices(const string& file_name, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices,
- bool clustered_index = false) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanWithPairedIndices(file_name, scanner, gp, paired_indices, clustered_index);
-}
-
-
-template<class graph_pack>
-void ScanWithClusteredIndices(const string& file_name, graph_pack& gp,
- PairedInfoIndicesT<typename graph_pack::graph_t>& paired_indices) {
- ScanWithPairedIndices(file_name, gp, paired_indices, true);
-}
-
-template<class Graph>
-void ScanBasicGraph(const string& file_name, Graph& g) {
- ConjugateDataScanner<Graph> scanner(g);
- ScanBasicGraph<Graph>(file_name, scanner);
-}
-
-template<class Graph>
-void ScanSingleLongReads(const string& file_name, LongReadContainer<Graph>& single_long_reads) {
- for (size_t i = 0; i < single_long_reads.size(); ++i){
- single_long_reads[i].LoadFromFile(MakeSingleReadsFileName(file_name, i), false);
- }
-}
-
-template<class graph_pack>
-void ScanGraphPack(const string& file_name, graph_pack& gp) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanGraphPack(file_name, scanner, gp);
-}
-
-template<class graph_pack>
-void ScanAll(const std::string& file_name, graph_pack& gp,
- bool force_exists = true) {
- ConjugateDataScanner<typename graph_pack::graph_t> scanner(gp.g);
- ScanGraphPack(file_name, scanner, gp);
- ScanPairedIndices(file_name, scanner, gp.paired_indices, force_exists);
- ScanClusteredIndices(file_name, scanner, gp.clustered_indices, force_exists);
- ScanScaffoldingIndices(file_name, scanner, gp.scaffolding_indices, force_exists);
- ScanSingleLongReads(file_name, gp.single_long_reads);
- gp.ginfo.Load(file_name + ".ginfo");
-}
-}
-}
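
A sketch of how a stage would dump and restore the whole pack with the helpers
above; the prefix argument is a placeholder path, and the SPAdes include paths
are assumed to be available.

#include "pipeline/graph_pack.hpp"
#include "pipeline/graphio.hpp"
#include <string>

void example_save_and_reload(debruijn_graph::conj_graph_pack &gp,
                             const std::string &prefix) {
    using namespace debruijn_graph;

    // Writes <prefix>.gid/.grp/.sqn/.cvr/.prd/... plus <prefix>.ginfo.
    graphio::PrintAll(prefix, gp);

    // Later, possibly in another stage: restore everything;
    // force_exists = false tolerates missing paired-info files.
    graphio::ScanAll(prefix, gp, false);
}
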
diff --git a/src/modules/pipeline/library.cpp b/src/modules/pipeline/library.cpp
deleted file mode 100644
index 6852156..0000000
--- a/src/modules/pipeline/library.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "pipeline/library.hpp"
-#include "dev_support/path_helper.hpp"
-
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-
-#include <string>
-#include <fstream>
-#include <iostream>
-
-using namespace llvm;
-using namespace io;
-
-namespace llvm { namespace yaml {
-template <>
-struct ScalarEnumerationTraits<LibraryOrientation> {
- static void enumeration(yaml::IO &io, LibraryOrientation &value) {
- io.enumCase(value, "fr", LibraryOrientation::FR);
- io.enumCase(value, "rf", LibraryOrientation::RF);
- io.enumCase(value, "ff", LibraryOrientation::FF);
- io.enumCase(value, "rr", LibraryOrientation::RR);
- }
-};
-
-template <>
-struct ScalarEnumerationTraits<LibraryType> {
- static void enumeration(yaml::IO &io, LibraryType &value) {
- io.enumCase(value, "paired-end", LibraryType::PairedEnd);
- io.enumCase(value, "mate-pairs", LibraryType::MatePairs);
- io.enumCase(value, "hq-mate-pairs", LibraryType::HQMatePairs);
- io.enumCase(value, "pacbio", LibraryType::PacBioReads);
- io.enumCase(value, "single", LibraryType::SingleReads);
- io.enumCase(value, "sanger", LibraryType::SangerReads);
- io.enumCase(value, "nanopore", LibraryType::NanoporeReads);
- io.enumCase(value, "trusted-contigs", LibraryType::TrustedContigs);
- io.enumCase(value, "untrusted-contigs", LibraryType::UntrustedContigs);
- io.enumCase(value, "path-extend-contigs", LibraryType::PathExtendContigs);
- }
-};
-
-template <>
-struct SequenceTraits<std::vector<std::string>> {
- static size_t size(IO &, std::vector<std::string> &seq) {
- return seq.size();
- }
- static std::string&
- element(IO &, std::vector<std::string> &seq, size_t index) {
- if (index >= seq.size())
- seq.resize(index+1);
- return seq[index];
- }
-};
-}}
-
-namespace io {
-template<>
-void SequencingLibrary<io::NoData>::yamlize(llvm::yaml::IO &io) {
- SequencingLibraryBase::yamlize(io);
-}
-template<>
-void SequencingLibrary<io::NoData>::validate(llvm::yaml::IO &io, llvm::StringRef &res) {
- SequencingLibraryBase::validate(io, res);
-}
-}
-
-void SequencingLibraryBase::yamlize(llvm::yaml::IO &io) {
- io.mapRequired("type", type_);
- io.mapOptional("orientation", orientation_, LibraryOrientation::Undefined);
- io.mapOptional("left reads", left_paired_reads_);
- io.mapOptional("right reads", right_paired_reads_);
- io.mapOptional("single reads", single_reads_);
-}
-
-void SequencingLibraryBase::validate(llvm::yaml::IO &, llvm::StringRef &res) {
- switch (type_) {
- case LibraryType::PairedEnd:
- case LibraryType::MatePairs:
- case LibraryType::HQMatePairs:
- if (left_paired_reads_.size() != right_paired_reads_.size()) {
- res = "Left and right reads lists should have equal length";
- return;
- }
-
- if (orientation_ == LibraryOrientation::Undefined) {
- res = "Orientation for paired reads should be specified";
- return;
- }
- break;
- case LibraryType::SingleReads:
- case LibraryType::PacBioReads:
- case LibraryType::SangerReads:
- case LibraryType::NanoporeReads:
- case LibraryType::TrustedContigs:
- case LibraryType::UntrustedContigs:
- case LibraryType::PathExtendContigs:
- if (left_paired_reads_.size() || right_paired_reads_.size()) {
- res = "Paired reads should not be set for this library type";
- return;
- }
- break;
- default:
- // Impossible
- res = "Unsupported library type";
- return;
- }
-}
-
- // FIXME: replace this functor with a lambda
-struct update_relative_filename : public std::binary_function<std::string, std::string, std::string> {
- std::string operator() (const std::string &filename, const std::string &input_dir) const {
- if (filename[0] == '/')
- return filename;
- return input_dir + filename;
- }
-};
-
-void SequencingLibraryBase::update_relative_reads_filenames(const std::string &input_dir) {
- std::transform(left_paired_reads_.begin(), left_paired_reads_.end(), left_paired_reads_.begin(),
- std::bind2nd(update_relative_filename(), input_dir));
- std::transform(right_paired_reads_.begin(), right_paired_reads_.end(), right_paired_reads_.begin(),
- std::bind2nd(update_relative_filename(), input_dir));
- std::transform(single_reads_.begin(), single_reads_.end(), single_reads_.begin(),
- std::bind2nd(update_relative_filename(), input_dir));
-}
-
-#include "pipeline/library.inl"
-
-// Provide default implementation here (e.g. in case of Data == io::NoData)
-template class io::DataSet<>;
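
A sketch of loading a read-library description through io::DataSet<>. The file
name and read paths are placeholders, and the top-level layout (a YAML sequence
of library entries, as in the usual SPAdes dataset YAML) comes from
library.inl, which is not shown here.

#include "pipeline/library.hpp"
#include <fstream>

void example_load_dataset() {
    const char *yaml =
        "- type: paired-end\n"
        "  orientation: fr\n"
        "  left reads: [ left_1.fastq ]\n"
        "  right reads: [ right_1.fastq ]\n"
        "- type: pacbio\n"
        "  single reads: [ pacbio_ccs.fastq ]\n";
    std::ofstream("dataset.yaml") << yaml;

    io::DataSet<> dataset("dataset.yaml");  // equivalent to calling load()
    for (const auto &lib : dataset.libraries())
        (void) lib.type();                  // PairedEnd, then PacBioReads
}
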
diff --git a/src/modules/pipeline/library.hpp b/src/modules/pipeline/library.hpp
deleted file mode 100644
index a183fe9..0000000
--- a/src/modules/pipeline/library.hpp
+++ /dev/null
@@ -1,366 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __IO_LIBRARY_HPP__
-#define __IO_LIBRARY_HPP__
-
-#include "utils/adt/chained_iterator.hpp"
-#include "utils/adt/iterator_range.hpp"
-
-#include <boost/iterator/iterator_facade.hpp>
-
-#include <string>
-#include <vector>
-
-// Forward decls for YAML API
-namespace llvm { namespace yaml { class IO; template<typename T> struct MappingTraits; } }
-namespace llvm { class StringRef; }
-
-namespace io {
-
-enum class LibraryType {
- SingleReads,
- PairedEnd,
- MatePairs,
- HQMatePairs,
- PacBioReads,
- SangerReads,
- NanoporeReads,
- TrustedContigs,
- UntrustedContigs,
- PathExtendContigs
-};
-
-static std::vector<LibraryType> LibraryPriotity = {
- LibraryType::SingleReads,
- LibraryType::SangerReads,
- LibraryType::PacBioReads,
- LibraryType::NanoporeReads,
- LibraryType::PairedEnd,
- LibraryType::HQMatePairs,
- LibraryType::MatePairs,
- LibraryType::TrustedContigs,
- LibraryType::PathExtendContigs,
- LibraryType::UntrustedContigs
-};
-
-enum class LibraryOrientation {
- FR,
- FF,
- RF,
- RR,
- Undefined
-};
-
-class SequencingLibraryBase {
-public:
- class paired_reads_iterator :
- public boost::iterator_facade<paired_reads_iterator,
- std::pair<std::string, std::string>,
- boost::forward_traversal_tag,
- std::pair<std::string, std::string> > {
-
- typedef std::vector<std::string>::const_iterator inner_iterator;
-
- public:
- paired_reads_iterator(inner_iterator left, inner_iterator right)
- : left_(left), right_(right){}
-
- private:
- friend class boost::iterator_core_access;
-
- void increment() { ++left_; ++right_; }
- bool equal(const paired_reads_iterator &other) const {
- return this->left_ == other.left_ && this->right_ == other.right_;
- }
- std::pair<std::string, std::string> dereference() const {
- return std::make_pair(*left_, *right_);
- }
-
- inner_iterator left_;
- inner_iterator right_;
- };
-
- typedef chained_iterator<std::vector<std::string>::const_iterator> single_reads_iterator;
-
- SequencingLibraryBase()
- : type_(LibraryType::PairedEnd), orientation_(LibraryOrientation::FR) {}
-
- // YAML API. Public because we cannot have template friend class.
- void yamlize(llvm::yaml::IO &io);
- void validate(llvm::yaml::IO &io, llvm::StringRef &res);
-
- LibraryType type() const { return type_; }
- void set_type(LibraryType type) { type_ = type; }
- LibraryOrientation orientation() const { return orientation_; }
- void set_orientation(LibraryOrientation orientation) { orientation_ = orientation; }
-
- void clear() {
- left_paired_reads_.clear();
- right_paired_reads_.clear();
- single_reads_.clear();
- }
-
- void update_relative_reads_filenames(const std::string &input_dir);
-
- void push_back_single(const std::string &reads) {
- single_reads_.push_back(reads);
- }
-
- void push_back_paired(const std::string &left, const std::string &right) {
- left_paired_reads_.push_back(left);
- right_paired_reads_.push_back(right);
- }
-
- paired_reads_iterator paired_begin() const {
- return paired_reads_iterator(left_paired_reads_.begin(), right_paired_reads_.begin());
- }
- paired_reads_iterator paired_end() const {
- return paired_reads_iterator(left_paired_reads_.end(), right_paired_reads_.end());
- }
-
- adt::iterator_range<paired_reads_iterator> paired_reads() const {
- return adt::make_range(paired_begin(), paired_end());
- }
-
- single_reads_iterator reads_begin() const {
- // NOTE: We have a contract with single_end here. Single reads always go last!
- single_reads_iterator res(left_paired_reads_.begin(), left_paired_reads_.end());
- res.join(right_paired_reads_.begin(), right_paired_reads_.end());
- res.join(single_reads_.begin(), single_reads_.end());
-
- return res;
- }
- single_reads_iterator reads_end() const {
- // NOTE: Do not forget about the contract with single_begin here!
- return single_reads_iterator(single_reads_.end(), single_reads_.end());
- }
-
- adt::iterator_range<single_reads_iterator> reads() const {
- return adt::make_range(reads_begin(), reads_end());
- }
-
- single_reads_iterator single_begin() const {
- return single_reads_iterator(single_reads_.begin(), single_reads_.end());
- }
- single_reads_iterator single_end() const {
- // NOTE: Do not forget about the contract with single_begin here!
- return single_reads_iterator(single_reads_.end(), single_reads_.end());
- }
-
- adt::iterator_range<single_reads_iterator> single_reads() const {
- return adt::make_range(single_begin(), single_end());
- }
-
- bool is_graph_contructable() const {
- return (type_ == io::LibraryType::PairedEnd ||
- type_ == io::LibraryType::SingleReads ||
- type_ == io::LibraryType::HQMatePairs);
- }
-
- bool is_bwa_alignable() const {
- return type_ == io::LibraryType::MatePairs;
- }
-
- bool is_mismatch_correctable() const {
- return is_graph_contructable();
- }
-
- bool is_binary_covertable() {
- return is_graph_contructable() || is_mismatch_correctable() || is_paired();
- }
-
- bool is_paired() const {
- return (type_ == io::LibraryType::PairedEnd ||
- type_ == io::LibraryType::MatePairs||
- type_ == io::LibraryType::HQMatePairs);
- }
-
- bool is_repeat_resolvable() const {
- return (type_ == io::LibraryType::PairedEnd ||
- type_ == io::LibraryType::HQMatePairs ||
- type_ == io::LibraryType::MatePairs ||
- type_ == io::LibraryType::PacBioReads ||
- type_ == io::LibraryType::SangerReads ||
- type_ == io::LibraryType::NanoporeReads ||
- type_ == io::LibraryType::TrustedContigs ||
- type_ == io::LibraryType::UntrustedContigs ||
- type_ == io::LibraryType::PathExtendContigs);
- }
-
- static bool is_contig_lib(LibraryType type) {
- return type == io::LibraryType::TrustedContigs ||
- type == io::LibraryType::UntrustedContigs ||
- type == io::LibraryType::PathExtendContigs;
- }
-
- static bool is_long_read_lib(LibraryType type) {
- return type == io::LibraryType::PacBioReads ||
- type == io::LibraryType::SangerReads ||
- type == io::LibraryType::NanoporeReads;
- }
-
- bool is_contig_lib() const {
- return is_contig_lib(type_);
- }
-
- bool is_long_read_lib() const {
- return is_long_read_lib(type_);
- }
-
- bool is_pacbio_alignable() const {
- return (type_ == io::LibraryType::PacBioReads ||
- type_ == io::LibraryType::SangerReads ||
- type_ == io::LibraryType::NanoporeReads ||
- // comment out the next line to switch the alignment method for trusted contigs
- type_ == io::LibraryType::TrustedContigs ||
- type_ == io::LibraryType::UntrustedContigs);
- }
-
-private:
- LibraryType type_;
- LibraryOrientation orientation_;
-
- std::vector<std::string> left_paired_reads_;
- std::vector<std::string> right_paired_reads_;
- std::vector<std::string> single_reads_;
-};
-
-struct NoData {};
-
-template<class Data = NoData>
-class SequencingLibrary: public SequencingLibraryBase {
-public:
- const Data& data() const {
- return data_;
- }
- Data& data() {
- return data_;
- }
-
- void yamlize(llvm::yaml::IO &io);
- void validate(llvm::yaml::IO &io, llvm::StringRef &res);
-
-private:
- Data data_;
-};
-
- // Just a convenient wrapper to "unwrap" the iterators over libraries.
-template<class Data = NoData>
-class DataSet {
-public:
- typedef SequencingLibrary<Data> Library;
- typedef std::vector<Library> LibraryStorage;
-
-public:
- typedef typename LibraryStorage::iterator iterator;
- typedef typename LibraryStorage::const_iterator const_iterator;
- typedef chained_iterator<typename Library::single_reads_iterator> single_reads_iterator;
- typedef chained_iterator<typename Library::paired_reads_iterator> paired_reads_iterator;
-
- DataSet() {}
- explicit DataSet(const std::string &path) { load(path); }
-
- void load(const std::string &filename);
- void save(const std::string &filename);
-
- void clear() { libraries_.clear(); }
- void push_back(const Library &lib) {
- libraries_.push_back(lib);
- }
- Library& operator[](size_t n) { return libraries_[n]; }
- const Library& operator[](size_t n) const { return libraries_[n]; }
- size_t lib_count() const { return libraries_.size(); }
-
- iterator library_begin() { return libraries_.begin(); }
- const_iterator library_begin() const { return libraries_.begin(); }
- iterator begin() { return libraries_.begin(); }
- const_iterator begin() const { return libraries_.begin(); }
-
- iterator library_end() { return libraries_.end(); }
- const_iterator library_end() const { return libraries_.end(); }
- iterator end() { return libraries_.end(); }
- const_iterator end() const { return libraries_.end(); }
-
- adt::iterator_range<iterator> libraries() {
- return adt::make_range(library_begin(), library_end());
- }
- adt::iterator_range<const_iterator> libraries() const {
- return adt::make_range(library_begin(), library_end());
- }
-
- single_reads_iterator reads_begin() const {
- auto it = libraries_.begin();
- single_reads_iterator res(it->reads_begin(), it->reads_end());
- ++it;
- for (auto end = libraries_.end(); it != end; ++it)
- res.join(it->reads_begin(), it->reads_end());
-
- return res;
- }
- single_reads_iterator reads_end() const {
- return single_reads_iterator(libraries_.back().reads_end(), libraries_.back().reads_end());
- }
- adt::iterator_range<single_reads_iterator> reads() const {
- return adt::make_range(reads_begin(), reads_end());
- }
-
- single_reads_iterator single_begin() const {
- auto it = libraries_.begin();
- single_reads_iterator res(it->single_begin(), it->single_end());
- ++it;
- for (auto end = libraries_.end(); it != end; ++it)
- res.join(it->single_begin(), it->single_end());
-
- return res;
- }
- single_reads_iterator single_end() const {
- return single_reads_iterator(libraries_.back().single_end(), libraries_.back().single_end());
- }
- adt::iterator_range<single_reads_iterator> single_reads() const {
- return adt::make_range(single_begin(), single_end());
- }
-
- paired_reads_iterator paired_begin() const {
- auto it = libraries_.begin();
- paired_reads_iterator res(it->paired_begin(), it->paired_end());
- ++it;
- for (auto end = libraries_.end(); it != end; ++it)
- res.join(it->paired_begin(), it->paired_end());
-
- return res;
- }
- paired_reads_iterator paired_end() const {
- return paired_reads_iterator(libraries_.back().paired_end(), libraries_.back().paired_end());
- }
-
- adt::iterator_range<paired_reads_iterator> paired_reads() const {
- return adt::make_range(paired_begin(), paired_end());
- }
-
-private:
- LibraryStorage libraries_;
-};
-
-}
-
-namespace llvm { namespace yaml {
-template <>
-struct MappingTraits<io::SequencingLibraryBase> {
- static void mapping(llvm::yaml::IO &io, io::SequencingLibraryBase &lib);
- static StringRef validate(llvm::yaml::IO &io, io::SequencingLibraryBase &lib);
-};
-
-template <class Data>
-struct MappingTraits<io::SequencingLibrary<Data> > {
- static void mapping(llvm::yaml::IO &io, io::SequencingLibrary<Data> &lib);
- static StringRef validate(llvm::yaml::IO &io, io::SequencingLibrary<Data> &lib);
-};
-
-}}
-
-#endif // __IO_LIBRARY_HPP__
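
The DataSet wrapper deleted above exists mainly to chain the per-library read
iterators into single flat ranges (reads(), single_reads(), paired_reads()).
A minimal standalone sketch of that chaining idea, using plain std::vector in
place of the chained_iterator machinery (illustrative only, not the SPAdes API):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // Each inner vector stands in for the single-read paths of one sequencing library.
        std::vector<std::vector<std::string>> libraries = {
            {"lib1_left.fastq", "lib1_right.fastq"},   // hypothetical file names
            {"lib2_single.fastq"}
        };

        // "Unwrapping" the libraries: walk every path as one flat range,
        // which is what DataSet::reads() provides via chained_iterator.
        for (const auto &lib : libraries)
            for (const auto &path : lib)
                std::cout << path << "\n";
        return 0;
    }
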
diff --git a/src/modules/pipeline/stage.cpp b/src/modules/pipeline/stage.cpp
deleted file mode 100644
index 4477536..0000000
--- a/src/modules/pipeline/stage.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "pipeline/stage.hpp"
-#include "pipeline/graphio.hpp"
-
-#include "dev_support/logger/log_writers.hpp"
-
-#include <algorithm>
-#include <cstring>
-
-namespace spades {
-
-void AssemblyStage::load(debruijn_graph::conj_graph_pack& gp,
- const std::string &load_from,
- const char* prefix) {
- std::string p = path::append_path(load_from, prefix == NULL ? id_ : prefix);
- INFO("Loading current state from " << p);
-
- debruijn_graph::graphio::ScanAll(p, gp, false);
- debruijn_graph::config::load_lib_data(p);
-}
-
-void AssemblyStage::save(const debruijn_graph::conj_graph_pack& gp,
- const std::string &save_to,
- const char* prefix) const {
- std::string p = path::append_path(save_to, prefix == NULL ? id_ : prefix);
- INFO("Saving current state to " << p);
-
- debruijn_graph::graphio::PrintAll(p, gp);
- debruijn_graph::config::write_lib_data(p);
-}
-
-class StageIdComparator {
- public:
- StageIdComparator(const char* id)
- : id_(id) {
- const char* pos = strstr(id, ":");
- len_ = (pos != NULL ? pos - id : strlen(id));
- }
-
- bool operator()(const std::unique_ptr<AssemblyStage> &stage) const {
- const char* sid = stage->id();
- return (0 == strncmp(id_, sid, len_) && sid[len_] == 0);
- }
-
- private:
- const char* id_;
- size_t len_;
-};
-
-class PhaseIdComparator {
- public:
- PhaseIdComparator(const char* id) {
- const char* pos = strstr(id, ":");
- VERIFY(pos != NULL);
- id_ = pos + 1;
- }
-
- bool operator()(const std::unique_ptr<CompositeStageBase::PhaseBase> &phase) const {
- return 0 == strcmp(id_, phase->id());
- }
-
- private:
- const char* id_;
-};
-
-void CompositeStageBase::run(debruijn_graph::conj_graph_pack& gp,
- const char* started_from) {
- VERIFY(parent_);
- auto start_phase = phases_.begin();
- if (started_from &&
- strstr(started_from, ":") &&
- started_from == strstr(started_from, id())) {
- start_phase = std::find_if(phases_.begin(), phases_.end(), PhaseIdComparator(started_from));
- if (start_phase == phases_.end()) {
- ERROR("Invalid start stage / phase combination specified: " << started_from);
- exit(-1);
- }
- if (start_phase != phases_.begin()) {
- PhaseBase * prev_phase = std::prev(start_phase)->get();
- std::string composite_id(id());
- composite_id += ":";
- composite_id += prev_phase->id();
- prev_phase->load(gp, parent_->saves_policy().load_from_, composite_id.c_str());
- }
- }
-
- for (auto et = phases_.end(); start_phase != et; ++start_phase) {
- PhaseBase *phase = start_phase->get();
-
- INFO("PROCEDURE == " << phase->name());
- phase->run(gp, started_from);
-
- if (parent_->saves_policy().make_saves_) {
- std::string composite_id(id());
- composite_id += ":";
- composite_id += phase->id();
-
- phase->save(gp, parent_->saves_policy().save_to_, composite_id.c_str());
- }
-
- }
-}
-
-void StageManager::run(debruijn_graph::conj_graph_pack& g,
- const char* start_from) {
- auto start_stage = stages_.begin();
- if (start_from) {
- start_stage = std::find_if(stages_.begin(), stages_.end(), StageIdComparator(start_from));
- if (start_stage == stages_.end()) {
- ERROR("Invalid start stage specified: " << start_from);
- exit(-1);
- }
- if (start_stage != stages_.begin())
- (*std::prev(start_stage))->load(g, saves_policy_.load_from_);
- }
-
- for (; start_stage != stages_.end(); ++start_stage) {
- AssemblyStage *stage = start_stage->get();
-
- INFO("STAGE == " << stage->name());
- stage->run(g, start_from);
- if (saves_policy_.make_saves_)
- stage->save(g, saves_policy_.save_to_);
- }
-}
-
-}
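
For resuming a run, stages are addressed by their id and phases of a composite
stage by a combined "stage:phase" id: StageIdComparator above matches the part
before the colon, PhaseIdComparator the part after it. A standalone sketch of
that id splitting (the id below is hypothetical, not a real stage name):

    #include <cstring>
    #include <iostream>
    #include <string>

    int main() {
        // Hypothetical resume id of the form "stage" or "stage:phase".
        const char *started_from = "my_stage:my_phase";

        const char *pos = std::strchr(started_from, ':');
        std::size_t stage_len = pos ? std::size_t(pos - started_from) : std::strlen(started_from);

        std::cout << "stage id: " << std::string(started_from, stage_len) << "\n";
        if (pos)
            std::cout << "phase id: " << (pos + 1) << "\n";  // what PhaseIdComparator compares against
        return 0;
    }
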
diff --git a/src/modules/stages/construction.cpp b/src/modules/stages/construction.cpp
deleted file mode 100644
index 5702185..0000000
--- a/src/modules/stages/construction.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "io/reads_io/vector_reader.hpp"
-#include "io/dataset_support/dataset_readers.hpp"
-#include "pipeline/graph_pack.hpp"
-#include "io/dataset_support/read_converter.hpp"
-
-#include "algorithms/graph_construction.hpp"
-#include "assembly_graph/stats/picture_dump.hpp"
-#include "construction.hpp"
-
-namespace debruijn_graph {
-
-template<class Read>
-void construct_graph(io::ReadStreamList<Read>& streams,
- conj_graph_pack& gp, io::SingleStreamPtr contigs_stream = io::SingleStreamPtr()) {
- config::debruijn_config::construction params = cfg::get().con;
- params.early_tc.enable &= !cfg::get().gap_closer_enable;
-
- ReadStatistics stats = ConstructGraphWithCoverage(params, streams, gp.g,
- gp.index, gp.flanking_cov, contigs_stream);
- size_t rl = stats.max_read_length_;
-
- if (!cfg::get().ds.RL()) {
- INFO("Figured out: read length = " << rl);
- cfg::get_writable().ds.set_RL(rl);
- cfg::get_writable().ds.set_aRL(1.0 * stats.bases_ / stats.reads_);
- } else if (cfg::get().ds.RL() != rl)
- WARN("In datasets.info, wrong RL is specified: " << cfg::get().ds.RL() << ", not " << rl);
-}
-
-void Construction::run(conj_graph_pack &gp, const char*) {
-    // Has to be a separate stream so that it is not counted in coverage
- io::ReadStreamList<io::SingleRead> trusted_contigs;
- if (cfg::get().use_additional_contigs) {
- DEBUG("Contigs from previous K will be used: " << cfg::get().additional_contigs);
- trusted_contigs.push_back(io::EasyStream(cfg::get().additional_contigs, true));
- }
-
- bool trusted_contigs_exist = false;
- for (const auto& lib : cfg::get().ds.reads) {
- if (lib.type() != io::LibraryType::TrustedContigs)
- continue;
-
- for (const auto& read : lib.single_reads()) {
- trusted_contigs.push_back(io::EasyStream(read, true));
- trusted_contigs_exist = true;
- }
- }
-
- if (trusted_contigs_exist)
- INFO("Trusted contigs will be used in graph construction");
- auto contigs_stream = MultifileWrap(trusted_contigs);
-
- auto& dataset = cfg::get_writable().ds;
- std::vector<size_t> libs_for_construction;
- for (size_t i = 0; i < dataset.reads.lib_count(); ++i)
- if (dataset.reads[i].is_graph_contructable())
- libs_for_construction.push_back(i);
-
- auto streams = single_binary_readers_for_libs(dataset, libs_for_construction, true, true);
- construct_graph<io::SingleReadSeq>(streams, gp, contigs_stream);
-}
-
-} //namespace debruijn_graph
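
construct_graph() above records the read length only when it was not preset:
the maximum observed length becomes RL, the average length (total bases divided
by read count) becomes aRL, and a preset RL that disagrees with the observation
only triggers a warning. A standalone sketch of that reconciliation, with
made-up numbers:

    #include <iostream>

    int main() {
        // Illustrative statistics only (stand-ins for ReadStatistics).
        std::size_t observed_max_rl = 100;
        std::size_t bases = 9500000, reads = 100000;
        std::size_t configured_rl = 0;  // 0 plays the role of "RL not set yet"

        if (configured_rl == 0) {
            configured_rl = observed_max_rl;
            double avg_rl = double(bases) / double(reads);
            std::cout << "Figured out: read length = " << configured_rl
                      << ", average read length = " << avg_rl << "\n";
        } else if (configured_rl != observed_max_rl) {
            std::cout << "Warning: configured read length " << configured_rl
                      << " does not match observed " << observed_max_rl << "\n";
        }
        return 0;
    }
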
diff --git a/src/modules/stages/simplification.cpp b/src/modules/stages/simplification.cpp
deleted file mode 100644
index cd46d1a..0000000
--- a/src/modules/stages/simplification.cpp
+++ /dev/null
@@ -1,574 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
-#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
-#include "stages/simplification_pipeline/simplification_settings.hpp"
-#include "stages/simplification_pipeline/graph_simplification.hpp"
-#include "algorithms/simplification/parallel_simplification_algorithms.hpp"
-
-#include "simplification.hpp"
-
-namespace debruijn_graph {
-
-using namespace debruijn::simplification;
-using namespace config;
-
-class GraphSimplifier {
- typedef std::function<void(EdgeId)> HandlerF;
- typedef omnigraph::PersistentEdgeRemovingAlgorithm<Graph,
- omnigraph::ParallelInterestingElementFinder<Graph, EdgeId>,
- LengthComparator<Graph>> TipClipperT;
- typedef omnigraph::PersistentEdgeRemovingAlgorithm<Graph,
- omnigraph::ParallelInterestingElementFinder<Graph, EdgeId>,
- CoverageComparator<Graph>> ECRemoverT;
-
- typedef std::vector<std::pair<AlgoPtr<Graph>, std::string>> AlgoStorageT;
-
- conj_graph_pack& gp_;
- Graph& g_;
- SimplifInfoContainer info_container_;
- const debruijn_config::simplification simplif_cfg_;
-
- CountingCallback<Graph> cnt_callback_;
- HandlerF removal_handler_;
- stats::detail_info_printer& printer_;
-
-// bool FastModeAvailable(const SimplifInfoContainer& info, double activation_cov_threshold) {
-// const auto& cfg = cfg::get();
-//
-// //todo fix logic
-// //also handles meta case for now
-// if (cfg.ds.single_cell) {
-// return !cfg::get().main_iteration;
-// }
-//
-// if (math::eq(info.detected_mean_coverage(), 0.) &&
-// !cfg.kcm.use_coverage_threshold) {
-// WARN("Mean coverage wasn't reliably estimated");
-// return false;
-// }
-//
-// //todo review logic
-// if (math::ls(info.detected_mean_coverage(), activation_cov_threshold) &&
-// !(cfg.kcm.use_coverage_threshold &&
-// math::ge(cfg.kcm.coverage_threshold, activation_cov_threshold))) {
-// INFO("Estimated mean coverage " << info.detected_mean_coverage() <<
-// " is less than fast mode activation coverage " << activation_cov_threshold);
-// return false;
-// }
-//
-// return true;
-// }
-
- bool PerformInitCleaning() {
-
- if (simplif_cfg_.init_clean.early_it_only && info_container_.main_iteration()) {
- INFO("Most init cleaning disabled on main iteration");
- return false;
- }
- if (math::ge(simplif_cfg_.init_clean.activation_cov, 0.)
- && math::ls(info_container_.detected_mean_coverage(), simplif_cfg_.init_clean.activation_cov)) {
- INFO("Most init cleaning disabled since detected mean " << info_container_.detected_mean_coverage()
- << " was less than activation coverage " << simplif_cfg_.init_clean.activation_cov);
- return false;
- }
-
- return true;
- }
-
- void RemoveShortPolyATEdges(size_t max_length,
- HandlerF removal_handler = 0, size_t chunk_cnt = 1) {
- INFO("Removing short polyAT");
- EdgeRemover<Graph> er(g_, removal_handler);
-        ATCondition<Graph> condition(g_, 0.8, max_length, false);
- for (auto iter = g_.SmartEdgeBegin(); !iter.IsEnd(); ++iter){
- if (g_.length(*iter) == 1 && condition.Check(*iter)) {
- er.DeleteEdgeWithNoCompression(*iter);
- }
- }
- ParallelCompress(g_, chunk_cnt);
- }
-
- void InitialCleaning() {
- INFO("PROCEDURE == InitialCleaning");
-
- AlgoStorageT algos;
-
- PushValid(
- SelfConjugateEdgeRemoverInstance(g_,
- simplif_cfg_.init_clean.self_conj_condition,
- info_container_, removal_handler_),
- "Self conjugate edge remover",
- algos);
-
- if (info_container_.mode() == config::pipeline_type::rna){
- RemoveShortPolyATEdges(1, removal_handler_, info_container_.chunk_cnt());
-            PushValid(ShortPolyATEdgesRemoverInstance(g_, 1, removal_handler_, info_container_.chunk_cnt()), "Short PolyA/T Edges", algos);
- PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
- }
-
- if (PerformInitCleaning()) {
- PushValid(
- IsolatedEdgeRemoverInstance(g_,
- simplif_cfg_.init_clean.ier,
- info_container_, removal_handler_),
- "Initial isolated edge remover",
- algos);
-
- PushValid(
- TipClipperInstance(g_,
- debruijn_config::simplification::tip_clipper(simplif_cfg_.init_clean.tip_condition),
- info_container_,
- removal_handler_),
- "Initial tip clipper",
- algos);
-
- PushValid(
- ECRemoverInstance(g_,
- debruijn_config::simplification::erroneous_connections_remover(simplif_cfg_.init_clean.ec_condition),
- info_container_,
- removal_handler_),
- "Initial ec remover",
- algos);
-
- PushValid(
- LowFlankDisconnectorInstance(g_, gp_.flanking_cov,
- simplif_cfg_.init_clean.disconnect_flank_cov, info_container_,
- removal_handler_),
- "Disconnecting edges with low flanking coverage",
- algos);
- }
-
- RunAlgos(algos);
-
- //FIXME why called directly?
- if (info_container_.mode() == config::pipeline_type::rna){
- RemoveHiddenLoopEC(g_, gp_.flanking_cov, info_container_.detected_coverage_bound(), simplif_cfg_.her, removal_handler_);
- cnt_callback_.Report();
- }
- }
-
- bool AllTopology() {
- bool res = TopologyRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
- removal_handler_);
- cnt_callback_.Report();
- res |= TopologyReliabilityRemoveErroneousEdges(gp_.g, simplif_cfg_.trec,
- removal_handler_);
- cnt_callback_.Report();
- res |= RemoveThorns(gp_.g, simplif_cfg_.isec, removal_handler_);
- cnt_callback_.Report();
- res |= MultiplicityCountingRemoveErroneousEdges(gp_.g, simplif_cfg_.tec,
- removal_handler_);
- cnt_callback_.Report();
- return res;
- }
-
- bool FinalRemoveErroneousEdges() {
-
- // gp.ClearQuality();
- // gp.FillQuality();
- // auto colorer = debruijn_graph::DefaultGPColorer(gp);
- // omnigraph::DefaultLabeler<typename gp_t::graph_t> labeler(gp.g, gp.edge_pos);
- // QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
- // cfg::get().output_dir + "pictures/colored_edges_deleted/");
- //
- // //positive quality edges removed (folder colored_edges_deleted)
- // std::function<void(EdgeId)> qual_removal_handler_f = boost::bind(
- // // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
- // &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
- // boost::ref(qual_removal_handler), _1);
- //
- // std::function<void(set<EdgeId>)> set_removal_handler_f = boost::bind(
- // &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, _1, qual_removal_handler_f);
- //
-
- std::function<void(set<EdgeId>)> set_removal_handler_f(0);
- if (removal_handler_) {
- set_removal_handler_f = std::bind(
- &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, std::placeholders::_1, removal_handler_);
- }
-
- bool changed = RemoveRelativelyLowCoverageComponents(gp_.g, gp_.flanking_cov,
- simplif_cfg_.rcc, info_container_, set_removal_handler_f);
-
- cnt_callback_.Report();
-
- changed |= DisconnectRelativelyLowCoverageEdges(gp_.g, gp_.flanking_cov, simplif_cfg_.relative_ed);
-
- if (simplif_cfg_.topology_simplif_enabled && info_container_.main_iteration()) {
- changed |= AllTopology();
- changed |= MaxFlowRemoveErroneousEdges(gp_.g, simplif_cfg_.mfec,
- removal_handler_);
- cnt_callback_.Report();
- }
- return changed;
- }
-
- void PostSimplification() {
- INFO("PROCEDURE == Post simplification");
- size_t iteration = 0;
-
- AlgoStorageT algos;
-
- PushValid(
- TipClipperInstance(g_, simplif_cfg_.tc,
- info_container_, removal_handler_),
- "Tip clipper",
- algos);
-
- PushValid(
- TipClipperInstance(g_, simplif_cfg_.final_tc,
- info_container_, removal_handler_),
- "Final tip clipper",
- algos);
-
- PushValid(
- BRInstance(g_, simplif_cfg_.br,
- info_container_, removal_handler_),
- "Bulge remover",
- algos);
-
- PushValid(
- BRInstance(g_, simplif_cfg_.final_br,
- info_container_, removal_handler_),
- "Final bulge remover",
- algos);
-
- if (simplif_cfg_.topology_simplif_enabled) {
- PushValid(
- TopologyTipClipperInstance(g_, simplif_cfg_.ttc,
- info_container_, removal_handler_),
- "Topology tip clipper",
- algos);
- }
-
- //FIXME need better configuration
-
- if (info_container_.mode() == config::pipeline_type::meta) {
- PushValid(
- BRInstance(g_, simplif_cfg_.second_final_br,
- info_container_, removal_handler_),
- "Yet another final bulge remover",
- algos);
- }
-
- if (info_container_.mode() == config::pipeline_type::rna) {
- PushValid(ATTipClipperInstance(g_, removal_handler_, info_container_.chunk_cnt()), "AT Tips", algos);
- }
-
- bool enable_flag = true;
- while (enable_flag) {
- enable_flag = false;
-
- INFO("Iteration " << iteration);
-
- enable_flag |= FinalRemoveErroneousEdges();
- cnt_callback_.Report();
-
- enable_flag |= ClipComplexTips(gp_.g, simplif_cfg_.complex_tc, info_container_, removal_handler_);
- cnt_callback_.Report();
-
- enable_flag |= RemoveComplexBulges(gp_.g, simplif_cfg_.cbr, iteration);
- cnt_callback_.Report();
-
- enable_flag |= RunAlgos(algos);
-
- iteration++;
-
- // printer(ipp_before_final_err_con_removal);
- // printer(ipp_final_tip_clipping, str(format("_%d") % iteration));
- // printer(ipp_final_err_con_removal, str(format("_%d") % iteration));
- // printer(ipp_final_bulge_removal, str(format("_%d") % iteration));
- }
-
- //fixme move to AllTopology?
- if (simplif_cfg_.topology_simplif_enabled) {
- RemoveHiddenEC(gp_.g, gp_.flanking_cov, simplif_cfg_.her, info_container_, removal_handler_);
-
- cnt_callback_.Report();
- }
-
- INFO("Disrupting self-conjugate edges");
- SelfConjugateDisruptor<Graph>(gp_.g, removal_handler_).Run();
- cnt_callback_.Report();
- }
-
- //inline
- //void IdealSimplification(Graph& graph,
- // std::function<double(EdgeId)> quality_handler_f) {
- // for (auto iterator = graph.SmartEdgeBegin(); !iterator.IsEnd();
- // ++iterator) {
- // if (math::eq(quality_handler_f(*iterator), 0.))
- // graph.DeleteEdge(*iterator);
- // }
- // CompressAllVertices(graph);
- //}
-
-// std::shared_ptr<Predicate<EdgeId>> ParseCondition(const string& condition) const {
-// ConditionParser<Graph> parser(g_, condition, info_container_);
-// return parser();
-// }
-
- void PushValid(const AlgoPtr<Graph>& algo_ptr, std::string comment, AlgoStorageT& algos) const {
- if (algo_ptr) {
- algos.push_back(std::make_pair(algo_ptr, comment));
- }
- }
-
- bool RunAlgos(AlgoStorageT& algos, bool force_primary_launch = false) {
- bool changed = false;
- for (auto algo_comment : algos) {
- INFO("Running " << algo_comment.second);
- changed |= algo_comment.first->Run(force_primary_launch);
- cnt_callback_.Report();
- }
- return changed;
- }
-
-public:
- GraphSimplifier(conj_graph_pack &gp, const SimplifInfoContainer& info_container,
- const debruijn_config::simplification& simplif_cfg,
- const std::function<void(EdgeId)>& removal_handler,
- stats::detail_info_printer& printer)
- : gp_(gp),
- g_(gp_.g),
- info_container_(info_container),
- simplif_cfg_(simplif_cfg),
- removal_handler_(AddCountingCallback(cnt_callback_, removal_handler)),
- printer_(printer) {
-
- }
-
- void SimplifyGraph() {
- printer_(info_printer_pos::before_simplification);
- INFO("Graph simplification started");
-
- InitialCleaning();
-
- AlgoStorageT algos;
-
- PushValid(
- TipClipperInstance(g_, simplif_cfg_.tc, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
- "Tip clipper",
- algos);
- PushValid(
- BRInstance(g_, simplif_cfg_.br, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
- "Bulge remover",
- algos);
- PushValid(
- ECRemoverInstance(g_, simplif_cfg_.ec, info_container_, removal_handler_, simplif_cfg_.cycle_iter_count),
- "Low coverage edge remover",
- algos);
-
- size_t iteration = 0;
- bool graph_changed = true;
-        //cannot simply stop once nothing has changed, since the thresholds change on every iteration
- while (iteration < simplif_cfg_.cycle_iter_count || graph_changed) {
- INFO("PROCEDURE == Simplification cycle, iteration " << iteration + 1);
- graph_changed = RunAlgos(algos);
- ++iteration;
- }
-
- printer_(info_printer_pos::before_post_simplification);
-
- if (simplif_cfg_.post_simplif_enabled) {
- PostSimplification();
- } else {
- INFO("PostSimplification disabled");
- }
- }
-
- void SimplifyRNAGraph() {
- printer_(info_printer_pos::before_simplification);
- INFO("Graph simplification started");
-
- InitialCleaning();
-
- if (gp_.genome.GetSequence().size() > 0) {
- DEBUG("Reference genome length = " + std::to_string(gp_.genome.GetSequence().size()));
- }
-
- AlgoStorageT ec_algo;
-
- PushValid(ECRemoverInstance(g_, simplif_cfg_.ec, info_container_, removal_handler_,
- simplif_cfg_.cycle_iter_count), "Low coverage edge remover", ec_algo);
-
- size_t iteration = 0;
- bool graph_changed_ec = true;
- //TODO: config. Or just graph_changed?
- size_t tc_max_iteration = 2;
-        //cannot simply stop once nothing has changed, since the thresholds change on every iteration
- while (iteration < simplif_cfg_.cycle_iter_count || graph_changed_ec) {
- AlgoStorageT algos;
- PushValid(
- TipClipperInstance(g_, simplif_cfg_.tc, info_container_, removal_handler_, tc_max_iteration),
- "Tip clipper",
- algos);
- PushValid(
- DeadEndInstance(g_, simplif_cfg_.dead_end, info_container_, removal_handler_, tc_max_iteration),
- "Dead end clipper",
- algos);
- PushValid(
- BRInstance(g_, simplif_cfg_.br, info_container_, removal_handler_, tc_max_iteration),
- "Bulge remover",
- algos);
- bool graph_changed = true;
- size_t tc_iteration = 0;
-
- while (tc_iteration < tc_max_iteration || graph_changed) {
- INFO("PROCEDURE == Tip clipper and bulge removal cycle, iteration " << iteration + 1 << "." << tc_iteration);
- graph_changed = RunAlgos(algos);
- ++tc_iteration;
- }
- INFO("PROCEDURE == Erroneous connection, iteration " << iteration + 1);
- graph_changed_ec = RunAlgos(ec_algo);
- ++iteration;
- }
-
- printer_(info_printer_pos::before_post_simplification);
-
- if (simplif_cfg_.post_simplif_enabled) {
- PostSimplification();
- } else {
- INFO("PostSimplification disabled");
- }
- }
-};
-
-
-void Simplification::run(conj_graph_pack &gp, const char*) {
- using namespace omnigraph;
-
- //no other handlers here, todo change with DetachAll
- gp.index.Detach();
- gp.index.clear();
-
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
-
- stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
-
- // QualityLoggingRemovalHandler<Graph> qual_removal_handler(gp.g, edge_qual);
-// auto colorer = debruijn_graph::DefaultGPColorer(gp);
-// QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
-// cfg::get().output_dir + "pictures/colored_edges_deleted/");
-//
-// //positive quality edges removed (folder colored_edges_deleted)
-// std::function<void(EdgeId)> removal_handler_f = boost::bind(
-// // &QualityLoggingRemovalHandler<Graph>::HandleDelete,
-// &QualityEdgeLocalityPrintingRH<Graph>::HandleDelete,
-// boost::ref(qual_removal_handler), _1);
-
-
- SimplifInfoContainer info_container(cfg::get().mode);
- info_container.set_read_length(cfg::get().ds.RL())
- .set_main_iteration(cfg::get().main_iteration)
- .set_chunk_cnt(5 * cfg::get().max_threads);
-
- //0 if model didn't converge
- //todo take max with trusted_bound
- //FIXME add warning when used for uneven coverage applications
- info_container.set_detected_mean_coverage(gp.ginfo.estimated_mean())
- .set_detected_coverage_bound(gp.ginfo.ec_bound());
-
- GraphSimplifier simplifier(gp, info_container,
- preliminary_ ? *cfg::get().preliminary_simp : cfg::get().simp,
- nullptr/*removal_handler_f*/,
- printer);
- if (cfg::get().mode == pipeline_type::rna)
- simplifier.SimplifyRNAGraph();
- else
- simplifier.SimplifyGraph();
-
-}
-
-
-void SimplificationCleanup::run(conj_graph_pack &gp, const char*) {
- SimplifInfoContainer info_container(cfg::get().mode);
- info_container
- .set_read_length(cfg::get().ds.RL())
- .set_main_iteration(cfg::get().main_iteration)
- .set_chunk_cnt(5 * cfg::get().max_threads);
-
-
- auto isolated_edge_remover =
- IsolatedEdgeRemoverInstance(gp.g, cfg::get().simp.ier, info_container, (HandlerF<Graph>)nullptr);
- if (isolated_edge_remover != nullptr)
- isolated_edge_remover->Run();
-
- double low_threshold = gp.ginfo.trusted_bound();
- if (math::gr(low_threshold, 0.0)) {
-        INFO("Removing all edges with coverage " << low_threshold << " or less");
- ParallelEdgeRemovingAlgorithm<Graph, CoverageComparator<Graph>>
- cov_cleaner(gp.g,
- CoverageUpperBound<Graph>(gp.g, low_threshold),
- info_container.chunk_cnt(),
- (HandlerF<Graph>)nullptr,
- /*canonical_only*/true,
- CoverageComparator<Graph>(gp.g));
- cov_cleaner.Run();
- }
-
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
- stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
- printer(info_printer_pos::final_simplified);
-
- DEBUG("Graph simplification finished");
-
- INFO("Counting average coverage");
- AvgCovereageCounter<Graph> cov_counter(gp.g);
-
- cfg::get_writable().ds.set_avg_coverage(cov_counter.Count());
-
- INFO("Average coverage = " << cfg::get().ds.avg_coverage());
- if (!cfg::get().uneven_depth) {
- if (cfg::get().ds.avg_coverage() < gp.ginfo.ec_bound())
-            WARN("The erroneous connection coverage threshold may have been determined improperly\n");
- }
-}
-
-
-#if 0
-void corrected_and_save_reads(const conj_graph_pack& gp) {
- //saving corrected reads
- //todo read input files, correct, save and use on the next iteration
-
- auto_ptr<io::IReader<io::PairedReadSeq>> paired_stream =
- paired_binary_multireader(false, /*insert_size*/0);
- io::ModifyingWrapper<io::PairedReadSeq> refined_paired_stream(
- *paired_stream,
- GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
-
- auto_ptr<io::IReader<io::SingleReadSeq>> single_stream =
- single_binary_multireader(false, /*include_paired_reads*/false);
- io::ModifyingWrapper<io::SingleReadSeq> refined_single_stream(
- *single_stream,
- GraphReadCorrectorInstance(gp.g, *MapperInstance(gp)));
-
- if (cfg::get().graph_read_corr.binary) {
- INFO("Correcting paired reads");
-
- io::BinaryWriter paired_converter(
- cfg::get().paired_read_prefix + "_cor", cfg::get().max_threads,
- cfg::get().buffer_size);
- paired_converter.ToBinary(refined_paired_stream);
-
- INFO("Correcting single reads");
- io::BinaryWriter single_converter(
- cfg::get().single_read_prefix + "_cor", cfg::get().max_threads,
- cfg::get().buffer_size);
- single_converter.ToBinary(refined_single_stream);
- } else {
- //save in fasta
- VERIFY(false);
- }
-
- INFO("Error correction done");
-}
-#endif
-
-} //debruijn_graph
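
The SimplifyGraph()/SimplifyRNAGraph() drivers above keep iterating while either
the configured cycle_iter_count has not been reached or the previous pass still
changed the graph, because iteration-scaled conditions (see the "icb" token of
ConditionParser below) raise their coverage bound on every pass. A standalone
sketch of just that stopping rule, with the algorithms stubbed out:

    #include <iostream>

    // Stub standing in for RunAlgos(): pretend the graph stops changing after pass 3.
    bool run_algos(std::size_t iteration) {
        return iteration < 3;
    }

    int main() {
        const std::size_t cycle_iter_count = 10;  // hypothetical config value
        std::size_t iteration = 0;
        bool graph_changed = true;

        // Run at least cycle_iter_count passes (the thresholds grow each pass),
        // then keep going for as long as the graph still changes.
        while (iteration < cycle_iter_count || graph_changed) {
            graph_changed = run_algos(iteration);
            ++iteration;
        }
        std::cout << "stopped after " << iteration << " passes\n";
        return 0;
    }
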
diff --git a/src/modules/stages/simplification_pipeline/graph_simplification.hpp b/src/modules/stages/simplification_pipeline/graph_simplification.hpp
deleted file mode 100644
index 013443e..0000000
--- a/src/modules/stages/simplification_pipeline/graph_simplification.hpp
+++ /dev/null
@@ -1,1034 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * graph_simplification.hpp
- *
- * Created on: Aug 12, 2011
- * Author: sergey
- */
-
-#pragma once
-
-#include "pipeline/config_struct.hpp"
-
-#include "algorithms/simplification/tip_clipper.hpp"
-#include "algorithms/simplification/complex_tip_clipper.hpp"
-#include "algorithms/simplification/bulge_remover.hpp"
-#include "algorithms/simplification/complex_bulge_remover.hpp"
-#include "algorithms/simplification/erroneous_connection_remover.hpp"
-#include "algorithms/simplification/relative_coverage_remover.hpp"
-#include "algorithms/simplification/mf_ec_remover.hpp"
-#include "algorithms/simplification/parallel_simplification_algorithms.hpp"
-#include "stages/simplification_pipeline/simplification_settings.hpp"
-#include "stages/simplification_pipeline/single_cell_simplification.hpp"
-
-#include "algorithms/graph_read_correction.hpp"
-
-#include "assembly_graph/graph_support/chimera_stats.hpp"
-#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
-#include "assembly_graph/stats/picture_dump.hpp"
-#include "assembly_graph/graph_support/parallel_processing.hpp"
-#include "assembly_graph/graph_support/detail_coverage.hpp"
-
-#include "assembly_graph/graph_core/graph.hpp"
-
-#include "visualization/graph_colorer.hpp"
-#include "dev_support/standard_base.hpp"
-
-namespace debruijn {
-
-namespace simplification {
-
-//todo remove this line
-using namespace debruijn_graph;
-
-template<class Graph>
-using AlgoPtr = std::shared_ptr<omnigraph::PersistentAlgorithmBase<Graph>>;
-
-template<class Graph>
-using EdgeConditionT = pred::TypedPredicate<typename Graph::EdgeId>;
-
-template<class Graph>
-class ConditionParser {
-private:
- typedef typename Graph::EdgeId EdgeId;
-
- const Graph &g_;
- string next_token_;
- string input_;
- const SimplifInfoContainer settings_;
- size_t curr_iteration_;
- size_t iteration_cnt_;
- std::queue<string> tokenized_input_;
-
- size_t max_length_bound_;
- double max_coverage_bound_;
-
- string ReadNext() {
- if (!tokenized_input_.empty()) {
- next_token_ = tokenized_input_.front();
- tokenized_input_.pop();
- } else {
- next_token_ = "";
- }
- return next_token_;
- }
-
- template<typename T>
- bool RelaxMax(T &cur_max, T t) {
- if (t > cur_max) {
- cur_max = t;
- return true;
- }
- return false;
- }
-
- template<typename T>
- bool RelaxMin(T &cur_min, T t) {
- if (t < cur_min) {
- cur_min = t;
- return true;
- }
- return false;
- }
-
- double GetCoverageBound() {
- if (next_token_ == "auto") {
- return settings_.detected_coverage_bound();
- } else {
- return std::stod(next_token_);
- }
- }
-
- pred::TypedPredicate<EdgeId> ParseCondition(size_t &min_length_bound,
- double &min_coverage_bound) {
- if (next_token_ == "tc_lb") {
- double length_coeff = std::stod(ReadNext());
-
- DEBUG("Creating tip length bound. Coeff " << length_coeff);
- size_t length_bound = LengthThresholdFinder::MaxTipLength(
- settings_.read_length(), g_.k(), length_coeff);
-
- DEBUG("Length bound " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- DEBUG("Min length bound - " << min_length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
-
- } else if (next_token_ == "rlmk") {
- //Read length minus k
- VERIFY_MSG(settings_.read_length() > g_.k(), "Read length was shorter than K");
- DEBUG("Creating (rl - k) bound");
- size_t length_bound = settings_.read_length() - g_.k();
- RelaxMin(min_length_bound, length_bound);
- DEBUG("Min length bound - " << min_length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
-
- } else if (next_token_ == "to_ec_lb") {
- double length_coeff = std::stod(ReadNext());
-
- DEBUG( "Creating length bound for erroneous connections originated from tip merging. Coeff " << length_coeff);
- size_t length_bound =
- LengthThresholdFinder::MaxTipOriginatedECLength(
- settings_.read_length(), g_.k(), length_coeff);
-
- DEBUG("Length bound " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- DEBUG("Min length bound - " << min_length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
-
- } else if (next_token_ == "ec_lb") {
- size_t length_coeff = std::stoll(ReadNext());
-
- DEBUG("Creating ec length bound. Coeff " << length_coeff);
- size_t length_bound =
- LengthThresholdFinder::MaxErroneousConnectionLength(
- g_.k(), length_coeff);
-
- DEBUG("Length bound " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- DEBUG("Min length bound - " << min_length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
- } else if (next_token_ == "lb") {
- size_t length_bound = std::stoll(ReadNext());
-
- DEBUG("Creating length bound. Value " << length_bound);
-
- RelaxMin(min_length_bound, length_bound);
- DEBUG("Min length bound - " << min_length_bound);
- return LengthUpperBound<Graph>(g_, length_bound);
- } else if (next_token_ == "cb") {
- ReadNext();
- double cov_bound = GetCoverageBound();
- DEBUG("Creating coverage upper bound " << cov_bound);
- RelaxMin(min_coverage_bound, cov_bound);
- return CoverageUpperBound<Graph>(g_, cov_bound);
- } else if (next_token_ == "icb") {
- VERIFY(iteration_cnt_ != -1ul && curr_iteration_ != -1ul);
- ReadNext();
- double cov_bound = GetCoverageBound();
- cov_bound = cov_bound / (double) iteration_cnt_ * (double) (curr_iteration_ + 1);
- DEBUG("Creating iterative coverage upper bound " << cov_bound);
- RelaxMin(min_coverage_bound, cov_bound);
- return CoverageUpperBound<Graph>(g_, cov_bound);
- } else if (next_token_ == "rctc") {
- ReadNext();
- DEBUG("Creating relative cov tip cond " << next_token_);
- return RelativeCoverageTipCondition<Graph>(g_, std::stod(next_token_));
- } else if (next_token_ == "disabled") {
- DEBUG("Creating disabling condition");
- return pred::AlwaysFalse<EdgeId>();
- } else if (next_token_ == "mmm") {
- ReadNext();
- DEBUG("Creating max mismatches cond " << next_token_);
- return MismatchTipCondition<Graph>(g_, std::stoll(next_token_));
- } else {
- VERIFY(false);
- return pred::AlwaysTrue<EdgeId>();
- }
- }
-
- pred::TypedPredicate<EdgeId> ParseConjunction(size_t &min_length_bound,
- double &min_coverage_bound) {
- pred::TypedPredicate<EdgeId> answer = pred::AlwaysTrue<EdgeId>();
- VERIFY(next_token_ == "{");
- ReadNext();
- while (next_token_ != "}") {
- answer = pred::And(answer,
- ParseCondition(min_length_bound, min_coverage_bound));
- ReadNext();
- }
- return answer;
- }
-
-public:
-
- ConditionParser(const Graph &g, string input, const SimplifInfoContainer &settings,
- size_t curr_iteration = -1ul, size_t iteration_cnt = -1ul)
- : g_(g),
- input_(input),
- settings_(settings),
- curr_iteration_(curr_iteration),
- iteration_cnt_(iteration_cnt),
- max_length_bound_(0),
- max_coverage_bound_(0.) {
- DEBUG("Creating parser for string " << input);
- using namespace boost;
- vector<string> tmp_tokenized_input;
- boost::split(tmp_tokenized_input, input_, boost::is_any_of(" ,;"), boost::token_compress_on);
- for (auto it = tmp_tokenized_input.begin();
- it != tmp_tokenized_input.end(); ++it) {
- tokenized_input_.push(*it);
- }
- ReadNext();
- }
-
- pred::TypedPredicate<EdgeId> operator()() {
- DEBUG("Parsing");
- pred::TypedPredicate<EdgeId> answer = pred::AlwaysFalse<EdgeId>();
- VERIFY_MSG(next_token_ == "{", "Expected \"{\", but next token was " << next_token_);
- while (next_token_ == "{") {
- size_t min_length_bound = numeric_limits<size_t>::max();
- double min_coverage_bound = numeric_limits<double>::max();
- answer = pred::Or(answer,
- ParseConjunction(min_length_bound, min_coverage_bound));
- RelaxMax(max_length_bound_, min_length_bound);
- RelaxMax(max_coverage_bound_, min_coverage_bound);
- ReadNext();
- }
- return answer;
- }
-
- size_t max_length_bound() const {
- return max_length_bound_;
- }
-
- double max_coverage_bound() const {
- return max_coverage_bound_;
- }
-
-private:
- DECL_LOGGER("ConditionParser");
-};
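
ConditionParser accepts a whitespace- or comma-separated expression in which
every "{ ... }" group is a conjunction of primitive conditions (tc_lb, rlmk,
to_ec_lb, ec_lb, lb, cb, icb, rctc, mmm, disabled) and the groups themselves
are OR-ed together, e.g. a string of the shape "{ ec_lb 30 , icb auto }"
(shown only to illustrate the grammar). The "icb" token relaxes the parsed
coverage bound linearly with the iteration number; a standalone sketch of that
arithmetic:

    #include <iostream>

    int main() {
        // "icb": at iteration i out of iteration_cnt the coverage bound cb becomes
        // cb / iteration_cnt * (i + 1), reaching the full value on the last pass.
        double cb = 30.0;                // hypothetical parsed bound ("auto" would take it from the settings)
        std::size_t iteration_cnt = 10;  // hypothetical total iteration estimate

        for (std::size_t i = 0; i < iteration_cnt; ++i)
            std::cout << "iteration " << i << ": coverage bound = "
                      << cb / double(iteration_cnt) * double(i + 1) << "\n";
        return 0;
    }
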
-
-//todo move to visualization
-template<class graph_pack>
-shared_ptr<omnigraph::visualization::GraphColorer<typename graph_pack::graph_t>> DefaultGPColorer(
- const graph_pack &gp) {
- auto mapper = MapperInstance(gp);
- auto path1 = mapper->MapSequence(gp.genome.GetSequence()).path();
- auto path2 = mapper->MapSequence(!gp.genome.GetSequence()).path();
- return omnigraph::visualization::DefaultColorer(gp.g, path1, path2);
-}
-
-template<class Graph>
-class EditDistanceTrackingCallback {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::EdgeData EdgeData;
- const Graph &g_;
-
-public:
- EditDistanceTrackingCallback(const Graph &g)
- : g_(g) {
- }
-
- bool operator()(EdgeId edge, const vector<EdgeId> &path) const {
- vector<Sequence> path_sequences;
- for (auto it = path.begin(); it != path.end(); ++it) {
- path_sequences.push_back(g_.EdgeNucls(*it));
- }
- Sequence path_sequence(
- MergeOverlappingSequences(path_sequences, g_.k()));
- size_t dist = EditDistance(g_.EdgeNucls(edge), path_sequence);
- TRACE( "Bulge sequences with distance " << dist << " were " << g_.EdgeNucls(edge) << " and " << path_sequence);
- return true;
- }
-
-private:
- DECL_LOGGER("EditDistanceTrackingCallback");
-};
-
-//template<class Graph, class SmartEdgeIt>
-//bool ClipTips(
-// Graph &g,
-// SmartEdgeIt &it,
-// const config::debruijn_config::simplification::tip_clipper &tc_config,
-// const SimplifInfoContainer &info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// INFO("Clipping tips");
-//
-// string condition_str = tc_config.condition;
-//
-// ConditionParser<Graph> parser(g, condition_str, info);
-// auto condition = parser();
-//
-// omnigraph::EdgeRemovingAlgorithm<Graph> tc(g,
-// omnigraph::AddTipCondition(g, condition),
-// removal_handler, true);
-//
-// TRACE("Tip length bound " << parser.max_length_bound());
-// return tc.RunFromIterator(it,
-// make_shared<LengthUpperBound<Graph>>(g, parser.max_length_bound()));
-//}
-
-//template<class Graph>
-//bool ClipTips(
-// Graph &g,
-// const config::debruijn_config::simplification::tip_clipper &tc_config,
-// const SimplifInfoContainer &info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// auto it = g.SmartEdgeBegin(LengthComparator<Graph>(g), true);
-// return ClipTips(g, it, tc_config, info, removal_handler);
-//}
-
-//enabling tip projection, todo optimize if hotspot
-template<class gp_t>
-HandlerF<typename gp_t::graph_t> WrapWithProjectionCallback(
- gp_t &gp,
- HandlerF<typename gp_t::graph_t> removal_handler) {
- typedef typename gp_t::graph_t Graph;
- typedef typename Graph::EdgeId EdgeId;
- TipsProjector<gp_t> tip_projector(gp);
-
- HandlerF<Graph> projecting_callback = std::bind(&TipsProjector<gp_t>::ProjectTip,
- tip_projector, std::placeholders::_1);
-
- return func::Composition<EdgeId>(std::ref(removal_handler), projecting_callback);
-}
-
-template<class Graph, class InterestingEdgeFinder>
-class LowCoverageEdgeRemovingAlgorithm : public PersistentEdgeRemovingAlgorithm<Graph,
- InterestingEdgeFinder, CoverageComparator<Graph>> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentEdgeRemovingAlgorithm<Graph, InterestingEdgeFinder, CoverageComparator<Graph>> base;
- SimplifInfoContainer simplif_info_;
- std::string condition_str_;
- pred::TypedPredicate<EdgeId> remove_condition_;
- pred::TypedPredicate<EdgeId> proceed_condition_;
-
-protected:
-
- void PrepareIteration(size_t it_cnt, size_t total_it_estimate) override {
- TRACE("Preparing iteration " << it_cnt << " out of total estimate " << total_it_estimate);
- ConditionParser<Graph> parser(this->g(), condition_str_,
- simplif_info_, it_cnt, total_it_estimate);
- remove_condition_ = omnigraph::AddAlternativesPresenceCondition(this->g(), parser());
- TRACE("Updated remove condition");
- proceed_condition_ = CoverageUpperBound<Graph>(this->g(), parser.max_coverage_bound());
- TRACE("Updated proceed condition up to coverage " << parser.max_coverage_bound());
- }
-
- bool Proceed(EdgeId e) const override {
- return proceed_condition_(e);
- }
-
- bool ShouldRemove(EdgeId e) const override {
- return remove_condition_(e);
- }
-
-public:
- LowCoverageEdgeRemovingAlgorithm(Graph &g,
- const InterestingEdgeFinder &interest_edge_finder,
- const SimplifInfoContainer &simplif_info,
- const std::string &condition_str,
- std::function<void(EdgeId)> removal_handler = nullptr,
- bool canonical_only = false,
- bool track_changes = true,
- size_t total_iteration_estimate = -1ul)
- : base(g, interest_edge_finder,
- removal_handler,
- canonical_only,
- CoverageComparator<Graph>(g),
- track_changes,
- total_iteration_estimate),
- simplif_info_(simplif_info),
- condition_str_(condition_str),
- remove_condition_(pred::AlwaysFalse<EdgeId>()),
- proceed_condition_(pred::AlwaysTrue<EdgeId>()) {}
-
-private:
- DECL_LOGGER("LowCoverageEdgeRemovingAlgorithm");
-};
-
-template<class Graph>
-AlternativesAnalyzer<Graph> ParseBRConfig(const Graph &g,
- const config::debruijn_config::simplification::bulge_remover &config) {
- size_t max_length = LengthThresholdFinder::MaxBulgeLength(
- g.k(), config.max_bulge_length_coefficient,
- config.max_additive_length_coefficient);
-
- DEBUG("Length bound " << max_length);
-
- return AlternativesAnalyzer<Graph>(g, config.max_coverage,
- max_length,
- config.max_relative_coverage,
- config.max_delta,
- config.max_relative_delta,
- config.max_number_edges);
-}
-
-template<class Graph>
-AlgoPtr<Graph> SelfConjugateEdgeRemoverInstance(Graph &g, const string &condition_str,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler = 0) {
- ConditionParser<Graph> parser(g, condition_str, info);
- auto condition = pred::And(SelfConjugateCondition<Graph>(g), parser());
-
- return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
- condition,
- info.chunk_cnt(),
- removal_handler,
- /*canonical_only*/true);
-}
-
-template<class Graph>
-bool RemoveRelativelyLowCoverageComponents(
- Graph &g,
- const FlankingCoverage<Graph> &flanking_cov,
- const config::debruijn_config::simplification::relative_coverage_comp_remover &rcc_config,
- const SimplifInfoContainer &info,
- typename ComponentRemover<Graph>::HandlerF removal_handler = 0) {
- if (rcc_config.enabled) {
- INFO("Removing relatively low covered connections");
- size_t connecting_path_length_bound = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), rcc_config.max_ec_length_coefficient);
-
- std::string pics_dir = "";
-
- double max_coverage = math::ge(rcc_config.max_coverage_coeff, 0.)
- ? info.detected_coverage_bound() * rcc_config.max_coverage_coeff
- : std::numeric_limits<double>::max();
-
- omnigraph::simplification::relative_coverage::
- RelativeCoverageComponentRemover<Graph> rel_rem(
- g,
- std::bind(&FlankingCoverage<Graph>::LocalCoverage,
- std::cref(flanking_cov), std::placeholders::_1, std::placeholders::_2),
- rcc_config.coverage_gap, size_t(double(info.read_length()) * rcc_config.length_coeff),
- size_t(double(info.read_length()) * rcc_config.tip_allowing_length_coeff),
- connecting_path_length_bound,
- max_coverage,
- removal_handler, rcc_config.vertex_count_limit, pics_dir);
- return rel_rem.Run();
- } else {
- INFO("Removal of relatively low covered connections disabled");
- return false;
- }
-}
-
-template<class Graph>
-bool DisconnectRelativelyLowCoverageEdges(Graph &g,
- const FlankingCoverage<Graph> &flanking_cov,
- const config::debruijn_config::simplification::relative_coverage_edge_disconnector &rced_config) {
- if (rced_config.enabled) {
- INFO("Disconnecting edges with relatively low coverage");
- omnigraph::simplification::relative_coverage::RelativeCoverageDisconnector<
- Graph> disconnector(g, std::bind(&FlankingCoverage<Graph>::LocalCoverage,
- std::cref(flanking_cov), std::placeholders::_1,
- std::placeholders::_2), rced_config.diff_mult);
- return disconnector.Run();
- } else {
- INFO("Disconnection of relatively low covered edges disabled");
- return false;
- }
-}
-
-template<class Graph>
-bool RemoveComplexBulges(
- Graph &g,
- config::debruijn_config::simplification::complex_bulge_remover cbr_config,
- size_t /*iteration*/ = 0) {
- if (!cbr_config.enabled)
- return false;
- INFO("Removing complex bulges");
- size_t max_length = (size_t) ((double) g.k() * cbr_config.max_relative_length);
- size_t max_diff = cbr_config.max_length_difference;
- omnigraph::complex_br::ComplexBulgeRemover<Graph> complex_bulge_remover(
- g, max_length, max_diff);
- return complex_bulge_remover.Run();
-}
-
-//template<class Graph>
-//bool RemoveIsolatedEdges(Graph &g, size_t max_length, double max_coverage, size_t max_length_any_cov,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0, size_t chunk_cnt = 1) {
-// typedef typename Graph::EdgeId EdgeId;
-//
-// //todo add info that some other edges might be removed =)
-// INFO("Removing isolated edges");
-// INFO("All edges shorter than " << max_length_any_cov << " will be removed");
-// INFO("Also edges shorter than " << max_length << " and coverage smaller than " << max_coverage << " will be removed");
-// //todo add warn on max_length_any_cov > max_length
-//
-// auto condition = func::And<EdgeId>(
-// make_shared<IsolatedEdgeCondition<Graph>>(g),
-// func::Or<EdgeId>(
-// make_shared<LengthUpperBound<Graph>>(g, max_length_any_cov),
-// func::And<EdgeId>(
-// make_shared<LengthUpperBound<Graph>>(g, max_length),
-// make_shared<CoverageUpperBound<Graph>>(g, max_coverage)
-// )));
-//
-// if (chunk_cnt == 1) {
-// omnigraph::EdgeRemovingAlgorithm<Graph> removing_algo(g, condition, removal_handler);
-//
-// return removing_algo.Run(LengthComparator<Graph>(g),
-// make_shared<LengthUpperBound<Graph>>(g, std::max(max_length, max_length_any_cov)));
-// } else {
-// SemiParallelAlgorithmRunner<Graph, EdgeId> runner(g);
-// SemiParallelEdgeRemovingAlgorithm<Graph> removing_algo(g, condition, removal_handler);
-//
-// return RunEdgeAlgorithm(g, runner, removing_algo, chunk_cnt);
-// }
-//}
-
-template<class Graph>
-bool ClipComplexTips(Graph &g, config::debruijn_config::simplification::complex_tip_clipper ctc_conf, const SimplifInfoContainer &info, HandlerF<Graph> removal_handler = 0) {
- if (!ctc_conf.enabled) {
- INFO("Complex tip clipping disabled");
- return false;
- }
-
- std::function<void(set<EdgeId>)> set_removal_handler_f(0);
- if (removal_handler) {
- set_removal_handler_f = std::bind(
- &omnigraph::simplification::SingleEdgeAdapter<set<EdgeId>>, std::placeholders::_1, removal_handler);
- }
-
- INFO("Complex tip clipping");
-
- ConditionParser<Graph> parser(g, ctc_conf.condition, info);
- parser();
-
- ComplexTipClipper<Graph> tip_clipper(g, ctc_conf.max_relative_coverage, ctc_conf.max_edge_len, parser.max_length_bound(), "", set_removal_handler_f);
- return tip_clipper.Run();
-}
-
-template<class Graph>
-AlgoPtr<Graph> ShortPolyATEdgesRemoverInstance(Graph &g, size_t max_length, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
- auto condition = pred::And(ATCondition<Graph>(g, 0.8, max_length, false), LengthUpperBound<Graph>(g, 1));
- return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g, condition, chunk_cnt, removal_handler, true);
-}
-
-template<class Graph>
-AlgoPtr<Graph> ATTipClipperInstance(Graph &g, HandlerF<Graph> removal_handler = 0, size_t chunk_cnt = 1) {
-//TODO: review params 0.8, 200?
- return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g, ATCondition<Graph>(g, 0.8, 200, true), chunk_cnt, removal_handler, true);
-}
-
-template<class Graph>
-AlgoPtr<Graph> IsolatedEdgeRemoverInstance(Graph &g,
- config::debruijn_config::simplification::isolated_edges_remover ier,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler = 0) {
- if (!ier.enabled) {
- return nullptr;
- }
- size_t max_length_any_cov = std::max(info.read_length(), ier.max_length_any_cov);
-
- INFO("Removing isolated edges");
- INFO("All isolated edges shorter than " << max_length_any_cov << " will be removed");
- INFO("Also isolated edges shorter than " << ier.max_length << " and coverage smaller than " << ier.max_coverage << " will be removed");
-
- auto condition = pred::And(IsolatedEdgeCondition<Graph>(g),
- pred::Or(LengthUpperBound<Graph>(g, max_length_any_cov),
- pred::And(LengthUpperBound<Graph>(g, ier.max_length),
- CoverageUpperBound<Graph>(g, ier.max_coverage))));
-
- return std::make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
- condition,
- info.chunk_cnt(),
- removal_handler,
- /*canonical_only*/true);
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryBulgeCondition(const Graph &g,
- const config::debruijn_config::simplification::bulge_remover &br_config,
- const SimplifInfoContainer&) {
- auto analyzer = ParseBRConfig(g, br_config);
- return omnigraph::NecessaryBulgeCondition(g, analyzer.max_length(), analyzer.max_coverage());
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryTipCondition(const Graph &g,
- const config::debruijn_config::simplification::tip_clipper &tc_config,
- const SimplifInfoContainer &info) {
- ConditionParser<Graph> parser(g, tc_config.condition, info);
- auto condition = parser();
- return omnigraph::NecessaryTipCondition(g, parser.max_length_bound(),
- parser.max_coverage_bound());
-}
-
-template<class Graph>
-pred::TypedPredicate<typename Graph::EdgeId> NecessaryECCondition(const Graph &g,
- const config::debruijn_config::simplification::erroneous_connections_remover &ec_config,
- const SimplifInfoContainer &info,
- size_t current_iteration = 0,
- size_t iteration_cnt = 1) {
- ConditionParser<Graph> parser(g, ec_config.condition, info, current_iteration, iteration_cnt);
- auto condition = parser();
- return omnigraph::NecessaryECCondition(g, parser.max_length_bound(),
- parser.max_coverage_bound());
-}
-
-template<class Graph>
-AlgoPtr<Graph> ECRemoverInstance(Graph &g,
- const config::debruijn_config::simplification::erroneous_connections_remover &ec_config,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler,
- size_t iteration_cnt = 1) {
- if (ec_config.condition.empty())
- return nullptr;
-
- typedef omnigraph::ParallelInterestingElementFinder<Graph> InterestingFinderT;
- InterestingFinderT interesting_finder(g,
- NecessaryECCondition(g, ec_config, info, iteration_cnt - 1, iteration_cnt),
- info.chunk_cnt());
- return make_shared<LowCoverageEdgeRemovingAlgorithm<Graph, InterestingFinderT>>(
- g, interesting_finder, info, ec_config.condition, removal_handler,
- /*canonical only*/ true, /*track changes*/ true, iteration_cnt);
-}
-
-template<class Graph>
-AlgoPtr<Graph> RelativeECRemoverInstance(Graph &g,
- const config::debruijn_config::simplification::relative_coverage_ec_remover &rcec_config,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler,
- size_t iteration_cnt = 1) {
- if (!rcec_config.enabled)
- return nullptr;
-
- return make_shared<ParallelEdgeRemovingAlgorithm<Graph>>(g,
- AddRelativeCoverageECCondition(g, rcec_config.rcec_ratio,
- AddAlternativesPresenceCondition(g, pred::TypedPredicate<typename Graph::EdgeId>
- (LengthUpperBound<Graph>(g, rcec_config.max_ec_length)))),
- info.chunk_cnt(), removal_handler, /*canonical_only*/true);
-}
-
-template<class Graph>
-AlgoPtr<Graph> NotBulgeECRemoverInstance(Graph &g,
- const config::debruijn_config::simplification::erroneous_connections_remover &ec_config,
- const SimplifInfoContainer &info, HandlerF<Graph> removal_handler,
- size_t iteration_cnt = 1) {
- if (ec_config.condition.empty())
- return nullptr;
-
- std::string curr_condition = ec_config.condition;
- ConditionParser<Graph> parser(g, curr_condition, info, iteration_cnt - 1, iteration_cnt);
- auto condition = parser();
-
- typedef omnigraph::ParallelInterestingElementFinder<Graph> InterestingFinderT;
- InterestingFinderT interesting_finder(g, AddNotBulgeECCondition(g, AddAlternativesPresenceCondition(g, pred::And(
- LengthUpperBound<Graph>(g, parser.max_length_bound()),
- CoverageUpperBound<Graph>(g, parser.max_coverage_bound())))),
- info.chunk_cnt());
- return make_shared<LowCoverageEdgeRemovingAlgorithm<Graph, InterestingFinderT>>(
- g, interesting_finder, info, ec_config.condition, removal_handler,
- /*canonical only*/ true, /*track changes*/ true, iteration_cnt);
-}
-
-template<class Graph>
-AlgoPtr<Graph> TipClipperInstance(Graph &g,
- const EdgeConditionT<Graph> &condition,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler,
- bool track_changes = true,
- size_t /*iteration_cnt*/ = 1) {
- return make_shared<ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>>>(g,
- AddTipCondition(g, condition),
- info.chunk_cnt(),
- removal_handler,
- /*canonical_only*/true,
- LengthComparator<Graph>(g),
- track_changes);
-}
-
-template<class Graph>
-AlgoPtr<Graph> TipClipperInstance(Graph &g,
- const config::debruijn_config::simplification::tip_clipper &tc_config,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler,
- size_t iteration_cnt = 1) {
- if (tc_config.condition.empty())
- return nullptr;
-
- ConditionParser<Graph> parser(g, tc_config.condition, info);
- auto condition = parser();
- return TipClipperInstance(g, condition, info, removal_handler, /*track changes*/true, iteration_cnt);
-}
-
-template<class Graph>
-AlgoPtr<Graph> DeadEndInstance(Graph &g,
- const config::debruijn_config::simplification::dead_end_clipper &dead_end_config,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler,
- size_t /*iteration_cnt*/ = 1) {
- if (!dead_end_config.enabled || dead_end_config.condition.empty())
- return nullptr;
-
- ConditionParser<Graph> parser(g, dead_end_config.condition, info);
- auto condition = parser();
- return make_shared<ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>>>(g,
- AddDeadEndCondition(g, condition), info.chunk_cnt(), removal_handler, /*canonical_only*/true,
- LengthComparator<Graph>(g), /*track changes*/true);
-}
-
-template<class Graph>
-AlgoPtr<Graph> TopologyTipClipperInstance(
- Graph &g,
- const config::debruijn_config::simplification::topology_tip_clipper &ttc_config,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler) {
-
- auto condition
- = pred::And(LengthUpperBound<Graph>(g,
- LengthThresholdFinder::MaxTipLength(info.read_length(), g.k(), ttc_config.length_coeff)),
- DefaultUniquenessPlausabilityCondition<Graph>(g,
- ttc_config.uniqueness_length, ttc_config.plausibility_length));
-
- return TipClipperInstance(g,
- condition, info, removal_handler, /*track changes*/false);
-}
-
-template<class Graph>
-AlgoPtr<Graph> BRInstance(Graph &g,
- const config::debruijn_config::simplification::bulge_remover &br_config,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler,
- size_t /*iteration_cnt*/ = 1) {
- typedef ParallelInterestingElementFinder<Graph,
- typename Graph::EdgeId> InterestingEdgeFinder;
- if (!br_config.enabled || (br_config.main_iteration_only && !info.main_iteration())) {
- return nullptr;
- }
-
- auto alternatives_analyzer = ParseBRConfig(g, br_config);
-
-
- InterestingEdgeFinder interesting_edge_finder(g,
- NecessaryBulgeCondition(g,
- alternatives_analyzer.max_length(),
- alternatives_analyzer.max_coverage()),
- info.chunk_cnt());
- if (br_config.parallel) {
- INFO("Creating parallel br instance");
- return make_shared<ParallelBulgeRemover<Graph, InterestingEdgeFinder>>(g,
- interesting_edge_finder,
- br_config.buff_size,
- br_config.buff_cov_diff,
- br_config.buff_cov_rel_diff,
- alternatives_analyzer,
- nullptr,
- removal_handler,
- /*track_changes*/true);
- } else {
- INFO("Creating br instance");
- return make_shared<BulgeRemover<Graph, InterestingEdgeFinder>>(g,
- interesting_edge_finder,
- alternatives_analyzer,
- nullptr,
- removal_handler,
- /*track_changes*/true);
- }
-}
-
-//todo make this all work for end of the edges also? switch to canonical iteration?
-//todo rename, since checking topology also
-template<class Graph>
-class FlankingCovBound : public EdgeCondition<Graph> {
- typedef EdgeCondition<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- const FlankingCoverage<Graph> &flanking_cov_;
- double max_coverage_;
-public:
- FlankingCovBound(const Graph &g,
- const FlankingCoverage<Graph> &flanking_cov,
- double max_coverage)
- : base(g),
- flanking_cov_(flanking_cov),
- max_coverage_(max_coverage) {
- }
-
- bool Check(EdgeId e) const override {
- return this->g().length(e) > 1
- && this->g().OutgoingEdgeCount(this->g().EdgeStart(e)) > 1
- && math::le(flanking_cov_.CoverageOfStart(e), max_coverage_);
- }
-
-};
-
-template<class Graph, class Comparator = std::less<typename Graph::EdgeId>>
-class ParallelDisconnectionAlgorithm : public PersistentProcessingAlgorithm<Graph,
- typename Graph::EdgeId,
- ParallelInterestingElementFinder<Graph>, Comparator> {
- typedef typename Graph::EdgeId EdgeId;
- typedef PersistentProcessingAlgorithm<Graph, EdgeId,
- ParallelInterestingElementFinder<Graph>, Comparator> base;
- pred::TypedPredicate<EdgeId> condition_;
- omnigraph::simplification::relative_coverage::EdgeDisconnector<Graph> disconnector_;
-
-public:
- ParallelDisconnectionAlgorithm(Graph &g,
- pred::TypedPredicate<EdgeId> condition,
- size_t chunk_cnt,
- HandlerF<Graph> removal_handler,
- const Comparator &comp = Comparator(),
- bool track_changes = true)
- : base(g,
- ParallelInterestingElementFinder<Graph>(g, condition, chunk_cnt),
- /*canonical_only*/false, comp, track_changes),
- condition_(condition),
- disconnector_(g, removal_handler) {
- }
-
- bool Process(EdgeId e) override {
- if (condition_(e)) {
- disconnector_(e);
- return true;
- }
- return false;
- }
-
-};
-
-template<class Graph>
-AlgoPtr<Graph> LowFlankDisconnectorInstance(Graph &g,
- const FlankingCoverage<Graph> &flanking_cov,
- double cov_bound,
- const SimplifInfoContainer &info,
- HandlerF<Graph> removal_handler) {
- if (math::ls(cov_bound, 0.)) {
- INFO("Flanking coverage based disconnection disabled");
- return nullptr;
- }
-
- return make_shared<ParallelDisconnectionAlgorithm<Graph>>(g,
- FlankingCovBound<Graph>(g, flanking_cov, cov_bound),
- info.chunk_cnt(),
- removal_handler);
-}
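
Several of the *Instance factories above return nullptr when the corresponding stage is disabled in the config, so callers have to guard the launch. A minimal sketch of that pattern, assuming the surrounding header's AlgoPtr typedef and that the algorithm interface exposes a bool Run() as in the commented-out callers below; RunIfEnabled is a hypothetical helper, not part of these sources:

    // Hypothetical helper showing how a nullable AlgoPtr<Graph> is consumed.
    template<class Graph>
    bool RunIfEnabled(AlgoPtr<Graph> algo) {
        if (!algo)           // factory returned nullptr: stage disabled in config
            return false;
        return algo->Run();  // by convention, true if the graph was changed
    }
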
-
-template<class Graph>
-bool RemoveHiddenLoopEC(Graph &g,
- const FlankingCoverage<Graph> &flanking_cov,
- double determined_coverage_threshold,
- config::debruijn_config::simplification::hidden_ec_remover her_config,
- HandlerF<Graph> removal_handler) {
- if (her_config.enabled) {
- INFO("Removing loops and rc loops with erroneous connections");
- ECLoopRemover<Graph> hc(g, flanking_cov,
- determined_coverage_threshold,
- her_config.relative_threshold, removal_handler);
- bool res = hc.Run();
- hc.PrintLoopStats();
- return res;
- }
- return false;
-}
-
-
-////todo add chunk_cnt
-//template<class Graph>
-//bool ClipTips(
-// Graph &g,
-// const std::string &condition,
-// const SimplifInfoContainer &info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// if (condition != "") {
-// ConditionParser<Graph> parser(g, condition, info);
-// auto condition = parser();
-// ParallelEdgeRemovingAlgorithm<Graph, LengthComparator<Graph>> algo(g,
-// AddTipCondition(g, condition),
-// info.chunk_cnt(),
-// removal_handler,
-// /*canonical_only*/true,
-// LengthComparator<Graph>(g));
-// return algo.Run();
-// } else {
-// return false;
-// }
-//}
-
-//template<class Graph>
-//bool RemoveLowCoverageEdges(
-// Graph &g,
-// const std::string &condition,
-// const SimplifInfoContainer &info,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-//
-// if (condition != "") {
-// ConditionParser<Graph> parser(g, condition, info);
-// auto condition = parser();
-// blahblahblah
-// ParallelEdgeRemovingAlgorithm<Graph, CoverageComparator<Graph>> algo(g,
-// condition,
-// info.chunk_cnt(),
-// removal_handler,
-// /*canonical_only*/true,
-// CoverageComparator<Graph>(g));
-// return algo.Run();
-// } else {
-// return false;
-// }
-//}
-
-
-//Parallel algo launch
-
-template<class Graph>
-void ParallelCompress(Graph &g, size_t chunk_cnt, bool loop_post_compression = true) {
- INFO("Parallel compression");
- debruijn::simplification::ParallelCompressor<Graph> compressor(g);
- TwoStepAlgorithmRunner<Graph, typename Graph::VertexId> runner(g, false);
- RunVertexAlgorithm(g, runner, compressor, chunk_cnt);
-
- //have to call cleaner to get rid of new isolated vertices
- omnigraph::Cleaner<Graph>(g, chunk_cnt).Run();
-
- if (loop_post_compression) {
- INFO("Launching post-compression to compress loops");
- CompressAllVertices(g, chunk_cnt);
- }
-}
-
-template<class Graph>
-bool ParallelClipTips(Graph &g,
- const string &tip_condition,
- const SimplifInfoContainer &info,
- std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
- INFO("Parallel tip clipping");
-
- string condition_str = tip_condition;
-
- ConditionParser<Graph> parser(g, condition_str, info);
-
- parser();
-
- debruijn::simplification::ParallelTipClippingFunctor<Graph> tip_clipper(g,
- parser.max_length_bound(), parser.max_coverage_bound(), removal_handler);
-
- AlgorithmRunner<Graph, typename Graph::VertexId> runner(g);
-
- RunVertexAlgorithm(g, runner, tip_clipper, info.chunk_cnt());
-
- ParallelCompress(g, info.chunk_cnt());
- //Cleaner is launched inside ParallelCompress
- //CleanGraph(g, info.chunk_cnt());
-
- return true;
-}
-
-//template<class Graph>
-//bool ParallelRemoveBulges(Graph &g,
-// const config::debruijn_config::simplification::bulge_remover &br_config,
-// size_t /*read_length*/,
-// std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
-// INFO("Parallel bulge remover");
-//
-// size_t max_length = LengthThresholdFinder::MaxBulgeLength(
-// g.k(), br_config.max_bulge_length_coefficient,
-// br_config.max_additive_length_coefficient);
-//
-// DEBUG("Max bulge length " << max_length);
-//
-// debruijn::simplification::ParallelSimpleBRFunctor<Graph> bulge_remover(g,
-// max_length,
-// br_config.max_coverage,
-// br_config.max_relative_coverage,
-// br_config.max_delta,
-// br_config.max_relative_delta,
-// removal_handler);
-// for (VertexId v : g) {
-// bulge_remover(v);
-// }
-//
-// Compress(g);
-// return true;
-//}
-
-template<class Graph>
-bool ParallelEC(Graph &g,
- const string &ec_condition,
- const SimplifInfoContainer &info,
- std::function<void(typename Graph::EdgeId)> removal_handler = 0) {
- INFO("Parallel ec remover");
-
- ConditionParser<Graph> parser(g, ec_condition, info);
-
- auto condition = parser();
-
- size_t max_length = parser.max_length_bound();
- double max_coverage = parser.max_coverage_bound();
-
- debruijn::simplification::CriticalEdgeMarker<Graph> critical_marker(g, info.chunk_cnt());
- critical_marker.PutMarks();
-
- debruijn::simplification::ParallelLowCoverageFunctor<Graph> ec_remover(g,
- max_length,
- max_coverage,
- removal_handler);
-
- TwoStepAlgorithmRunner<Graph, typename Graph::EdgeId> runner(g, true);
-
- RunEdgeAlgorithm(g, runner, ec_remover, info.chunk_cnt());
-
- critical_marker.ClearMarks();
-
- ParallelCompress(g, info.chunk_cnt());
- //called in parallel compress
- //CleanGraph(g, info.chunk_cnt());
- return true;
-}
-
-}
-}
diff --git a/src/modules/stages/simplification_pipeline/simplification_settings.hpp b/src/modules/stages/simplification_pipeline/simplification_settings.hpp
deleted file mode 100644
index efaf4d6..0000000
--- a/src/modules/stages/simplification_pipeline/simplification_settings.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-#include "modules/pipeline/config_struct.hpp"
-
-namespace debruijn {
-
-namespace simplification {
-
-class LengthThresholdFinder {
-public:
- static size_t MaxTipLength(size_t read_length, size_t k, double coeff) {
- return std::max((size_t) math::round((double)std::min(k, read_length / 2) * coeff),
- read_length);
- }
-
- static size_t MaxBulgeLength(size_t k, double coeff,
- size_t additive_coeff) {
- return std::max((size_t) math::round((double)k * coeff), k + additive_coeff);
- }
-
- static size_t MaxErroneousConnectionLength(size_t k, size_t param) {
- return k + param;
- }
-
- static size_t MaxTipOriginatedECLength(size_t read_length, size_t k,
- double coeff) {
- return 2 * MaxTipLength(read_length, k, coeff) - 1;
- }
-};
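
A small standalone sketch (not part of the deleted sources) that reproduces the threshold arithmetic above with standard-library calls; the coefficients 3.5, 2.0 and 5 are illustrative values only:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t read_length = 100, k = 55;
        // MaxTipLength: max(round(min(k, read_length / 2) * coeff), read_length)
        std::size_t max_tip = std::max<std::size_t>(
            (std::size_t) std::lround((double) std::min(k, read_length / 2) * 3.5),
            read_length);
        // MaxBulgeLength: max(round(k * coeff), k + additive_coeff)
        std::size_t max_bulge = std::max<std::size_t>(
            (std::size_t) std::lround((double) k * 2.0), k + 5);
        std::printf("max tip: %zu, max bulge: %zu\n", max_tip, max_bulge);
        // Prints: max tip: 175, max bulge: 110
        return 0;
    }
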
-
-//todo use GenomicInfo as field!
-class SimplifInfoContainer {
- size_t read_length_;
- double detected_mean_coverage_;
- double detected_coverage_bound_;
- bool main_iteration_;
- size_t chunk_cnt_;
- debruijn_graph::config::pipeline_type mode_;
-
-public:
- SimplifInfoContainer(debruijn_graph::config::pipeline_type mode) :
- read_length_(-1ul),
- detected_mean_coverage_(-1.0),
- detected_coverage_bound_(-1.0),
- main_iteration_(false),
- chunk_cnt_(-1ul),
- mode_(mode) {
- }
-
- size_t read_length() const {
- VERIFY(read_length_ != -1ul);
- return read_length_;
- }
-
- double detected_mean_coverage() const {
- VERIFY(math::ge(detected_mean_coverage_, 0.));
- return detected_mean_coverage_;
- }
-
- double detected_coverage_bound() const {
- VERIFY(math::ge(detected_coverage_bound_, 0.));
- return detected_coverage_bound_;
- }
-
- bool main_iteration() const {
- return main_iteration_;
- }
-
- size_t chunk_cnt() const {
- VERIFY(chunk_cnt_ != -1ul);
- return chunk_cnt_;
- }
-
- debruijn_graph::config::pipeline_type mode() const {
- return mode_;
- }
-
- SimplifInfoContainer& set_read_length(size_t read_length) {
- read_length_ = read_length;
- return *this;
- }
-
- SimplifInfoContainer& set_detected_coverage_bound(double detected_coverage_bound) {
- detected_coverage_bound_ = detected_coverage_bound;
- return *this;
- }
-
- SimplifInfoContainer& set_detected_mean_coverage(double detected_mean_coverage) {
- detected_mean_coverage_ = detected_mean_coverage;
- return *this;
- }
-
- SimplifInfoContainer& set_main_iteration(bool main_iteration) {
- main_iteration_ = main_iteration;
- return *this;
- }
-
- SimplifInfoContainer& set_chunk_cnt(size_t chunk_cnt) {
- chunk_cnt_ = chunk_cnt;
- return *this;
- }
-};
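
A minimal usage sketch of the builder-style setters above, assuming this header is included; the mode argument and all numeric values are placeholders:

    debruijn::simplification::SimplifInfoContainer
    MakeInfo(debruijn_graph::config::pipeline_type mode) {
        return debruijn::simplification::SimplifInfoContainer(mode)
                .set_read_length(100)              // queried later via read_length()
                .set_detected_mean_coverage(25.0)
                .set_detected_coverage_bound(30.0)
                .set_main_iteration(true)
                .set_chunk_cnt(16);                // VERIFY fires if left unset
    }
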
-
-}
-
-}
diff --git a/src/modules/stages/simplification_pipeline/single_cell_simplification.hpp b/src/modules/stages/simplification_pipeline/single_cell_simplification.hpp
deleted file mode 100644
index 49dbc27..0000000
--- a/src/modules/stages/simplification_pipeline/single_cell_simplification.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-#pragma once
-
-#include "pipeline/config_struct.hpp"
-#include "algorithms/simplification/erroneous_connection_remover.hpp"
-#include "algorithms/simplification/mf_ec_remover.hpp"
-#include "stages/simplification_pipeline/simplification_settings.hpp"
-#include "assembly_graph/graph_support/detail_coverage.hpp"
-
-namespace debruijn {
-namespace simplification {
-
-template<class Graph>
-bool TopologyRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::config::debruijn_config::simplification::topology_based_ec_remover& tec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing connections based on topology");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), tec_config.max_ec_length_coefficient);
-
- pred::TypedPredicate<typename Graph::EdgeId>
- condition(omnigraph::DefaultUniquenessPlausabilityCondition<Graph>(g, tec_config.uniqueness_length, tec_config.plausibility_length));
-
- return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
-}
-
-template<class Graph>
-bool MultiplicityCountingRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::config::debruijn_config::simplification::topology_based_ec_remover& tec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing connections based on topological multiplicity counting");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), tec_config.max_ec_length_coefficient);
-
- pred::TypedPredicate<typename Graph::EdgeId>
- condition(omnigraph::MultiplicityCountingCondition<Graph>(g, tec_config.uniqueness_length,
- /*plausibility*/ MakePathLengthLowerBound(g,
- omnigraph::PlausiblePathFinder<Graph>(g, 2 * tec_config.plausibility_length), tec_config.plausibility_length)));
-
- return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
-}
-
-template<class Graph>
-bool RemoveThorns(
- Graph &g,
- const debruijn_graph::config::debruijn_config::simplification::interstrand_ec_remover& isec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing interstrand connections");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), isec_config.max_ec_length_coefficient);
-
- auto condition
- = pred::And(omnigraph::LengthUpperBound<Graph>(g, max_length),
- omnigraph::ThornCondition<Graph>(g, isec_config.uniqueness_length, isec_config.span_distance));
-
- return omnigraph::RemoveErroneousEdgesInCoverageOrder(g, condition, numeric_limits<double>::max(), removal_handler);
-}
-
-template<class Graph>
-bool TopologyReliabilityRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::config::debruijn_config::simplification::tr_based_ec_remover& trec_config,
- std::function<void(typename Graph::EdgeId)> removal_handler) {
- INFO("Removing connections based on topology and reliable coverage");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), trec_config.max_ec_length_coefficient);
-
- auto condition
- = pred::And(omnigraph::CoverageUpperBound<Graph>(g, trec_config.unreliable_coverage),
- omnigraph::PredicateUniquenessPlausabilityCondition<Graph>(g,
- /*uniqueness*/omnigraph::MakePathLengthLowerBound(g, omnigraph::UniquePathFinder<Graph>(g), trec_config.uniqueness_length),
- /*plausibility*/pred::AlwaysTrue<typename Graph::EdgeId>()));
-
- return omnigraph::RemoveErroneousEdgesInLengthOrder(g, condition, max_length, removal_handler);
-}
-
-template<class Graph>
-bool MaxFlowRemoveErroneousEdges(
- Graph &g,
- const debruijn_graph::config::debruijn_config::simplification::max_flow_ec_remover& mfec_config,
- omnigraph::HandlerF<Graph> removal_handler = 0) {
- if (!mfec_config.enabled)
- return false;
- INFO("Removing connections based on max flow strategy");
- size_t max_length = LengthThresholdFinder::MaxErroneousConnectionLength(
- g.k(), (size_t) mfec_config.max_ec_length_coefficient);
- omnigraph::MaxFlowECRemover<Graph> erroneous_edge_remover(
- g, max_length, mfec_config.uniqueness_length,
- mfec_config.plausibility_length, removal_handler);
- return erroneous_edge_remover.Process();
-}
-
-template<class Graph>
-bool RemoveHiddenEC(Graph& g,
- const debruijn_graph::FlankingCoverage<Graph>& flanking_cov,
- const debruijn_graph::config::debruijn_config::simplification::hidden_ec_remover& her_config,
- const SimplifInfoContainer& info,
- omnigraph::HandlerF<Graph> removal_handler) {
- if (her_config.enabled) {
- INFO("Removing hidden erroneous connections");
- return omnigraph::HiddenECRemover<Graph>(g, her_config.uniqueness_length, flanking_cov,
- her_config.unreliability_threshold, info.detected_coverage_bound(),
- her_config.relative_threshold, removal_handler).Run();
- }
- return false;
-}
-
-}
-}
diff --git a/src/modules/visualization/graph_colorer.hpp b/src/modules/visualization/graph_colorer.hpp
deleted file mode 100644
index 234e1c1..0000000
--- a/src/modules/visualization/graph_colorer.hpp
+++ /dev/null
@@ -1,340 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "assembly_graph/components/graph_component.hpp"
-#include "assembly_graph/paths/mapping_path.hpp"
-#include "visualization/printing_parameter_storage.hpp"
-//#include "edges_position_handler.hpp"
-
-namespace omnigraph {
-namespace visualization {
-
-template<typename ElementId>
-class ElementColorer : public virtual ParameterStorage<ElementId, string> {
-public:
- template<typename Iter>
- set<ElementId> ColoredWith(Iter begin, Iter end, const string &color) {
- set<ElementId> result;
- for(Iter it = begin; it != end; ++it) {
- if(this->GetValue(*it) == color)
- result.insert(*it);
- }
- return result;
- }
-};
-
-//TODO remove all default color parameters!
-
-template<typename ElementId>
-class MapColorer : public ElementColorer<ElementId>, public MapParameterStorage<ElementId, string> {
-public:
- MapColorer(const string &default_color) : MapParameterStorage<ElementId, string>(default_color) {
- }
-
- MapColorer(const map<ElementId, string> &color_map) : MapParameterStorage<ElementId, string>(color_map) {
- }
-
- MapColorer(const map<ElementId, string> &color_map, const string& default_color) : MapParameterStorage<ElementId, string>(color_map, default_color) {
- }
-
- template<class It>
- MapColorer(It begin, It end, const string& color, const string& default_color) : MapParameterStorage<ElementId, string>(begin, end, color, default_color) {
- }
-
- virtual ~MapColorer() {
- }
-};
-
-template<typename ElementId>
-class FixedColorer: public MapColorer<ElementId> {
-public:
- FixedColorer(const string& default_color): MapColorer<ElementId>(default_color) {
- }
-};
-
-template<class Graph>
-class SetColorer : public MapColorer<typename Graph::EdgeId> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph &graph_;
-
- template<class It>
- map<EdgeId, string> ConstructColorMap(It begin, It end, const string &color) {
- map<EdgeId, string> result;
- for (auto it = begin; it != end; ++it) {
- result[*it] = color;
- }
- return result;
- }
-
-public:
- template<class It>
- SetColorer(const Graph &graph, It begin, It end, const string &color) :
- MapColorer<typename Graph::EdgeId>(ConstructColorMap(begin, end, color), "black"), graph_(graph) {
- }
-
- template<class Collection>
- SetColorer(const Graph &graph, const Collection& c, const string &color) :
- MapColorer<typename Graph::EdgeId>(ConstructColorMap(c.begin(), c.end(), color), "black"), graph_(graph) {
- }
-
-};
-//
-//template<class Graph>
-//class PositionsEdgeColorer: public ElementColorer<typename Graph::EdgeId> {
-//private:
-// typedef typename Graph::VertexId VertexId;
-// typedef typename Graph::EdgeId EdgeId;
-// const Graph &graph_;
-// EdgesPositionHandler<Graph> &positions_;
-//public:
-// PositionsEdgeColorer(const Graph &graph, EdgesPositionHandler<Graph> &positions):
-// graph_(graph), positions_(positions) {
-// }
-// string GetValue(EdgeId element) const {
-// std::vector<EdgeId> path;
-// path.push_back(element);
-// if (positions_.GetEdgePositions(element).size() == 0) return "black";
-// else {
-// if (positions_.IsConsistentWithGenome(path)) return "green";
-// else return "orange";
-// }
-// }
-//};
-
-
-template<class Graph>
-class CompositeEdgeColorer: public ElementColorer<typename Graph::EdgeId> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- string default_color_;
- vector<shared_ptr<ElementColorer<typename Graph::EdgeId>>> colorers_;
-
- vector<string> CollectColors(EdgeId edge) const {
- vector<string> result = {default_color_};
- for(auto it = colorers_.begin(); it != colorers_.end(); ++it) {
- string next_color = (*it)->GetValue(edge);
- if(std::find(result.begin(), result.end(), next_color) == result.end())
- result.push_back(next_color);
- }
- return result;
- }
-
- string ConstructColorString(const vector<string> &colors) const {
- if(colors.size() == 1)
- return default_color_;
- string result = "";
- for(size_t i = 1; i < colors.size(); i++)
- result += ":" + colors[i];
- return result.substr(1, result.size());
- }
-
-public:
- CompositeEdgeColorer(const string &default_color): default_color_(default_color) {
- }
-
- CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer, const string &default_color): default_color_(default_color) {
- AddColorer(colorer);
- }
-
- CompositeEdgeColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer1, shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer2,
- const string &default_color): default_color_(default_color) {
- AddColorer(colorer1);
- AddColorer(colorer2);
- }
-
- void AddColorer(shared_ptr<ElementColorer<typename Graph::EdgeId>> colorer) {
- colorers_.push_back(colorer);
- }
-
- string GetValue(EdgeId edge) const {
- return ConstructColorString(CollectColors(edge));
- }
-};
-
-template<class Graph>
-class GraphColorer : public ElementColorer<typename Graph::VertexId>, public ElementColorer<typename Graph::EdgeId>{
-public:
- string GetValue(typename Graph::VertexId) const = 0;
- string GetValue(typename Graph::EdgeId) const = 0;
-
- template<typename Iter>
- set<typename Iter::value_type> ColoredWith(Iter begin, Iter end, const string &color) {
- return ElementColorer<typename Iter::value_type>::ColoredWith(begin, end, color);
- }
-};
-
-template<class Graph>
-class DelegatingGraphColorer : public GraphColorer<Graph> {
-private:
- const GraphColorer<Graph> &inner_colorer_;
-public:
- DelegatingGraphColorer(const GraphColorer<Graph> &inner_colorer) : inner_colorer_(inner_colorer) {
- }
-
- string GetValue(typename Graph::VertexId v) const {
- return inner_colorer_.GetValue(v);
- }
- string GetValue(typename Graph::EdgeId e) const {
- return inner_colorer_.GetValue(e);
- }
-};
-
-template<typename Graph>
-class BorderDecorator : public GraphColorer<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const GraphComponent<Graph> &component_;
-// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
-// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
- const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
- const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
- const string border_color_;
-public:
-// BorderDecorator(const GraphComponent<Graph> &component,
-// const shared_ptr<const GraphColorer<Graph>> colorer,
-// const string &border_color) :
-// component_(component), vertex_colorer_ptr_(colorer), edge_colorer_ptr_(
-// colorer), vertex_colorer_(*colorer), edge_colorer_(
-// *colorer), border_color_(border_color) {
-// }
-
- BorderDecorator(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &border_color = "yellow") :
- component_(component), vertex_colorer_(colorer), edge_colorer_(colorer), border_color_(border_color) {
- }
-
- string GetValue(VertexId v) const {
- if(component_.IsBorder(v)) {
- return border_color_;
- } else {
- return vertex_colorer_.GetValue(v);
- }
- }
-
- string GetValue(EdgeId e) const {
- return edge_colorer_.GetValue(e);
- }
-
- static shared_ptr<BorderDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &border_color = "yellow") {
- return make_shared<BorderDecorator<Graph>>(component, colorer, border_color);
- }
-};
-
-
-template<typename Graph>
-class SinkSourceDecorator : public GraphColorer<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const GraphComponent<Graph> &component_;
-// const shared_ptr<const ElementColorer<typename Graph::VertexId>> vertex_colorer_ptr_;
-// const shared_ptr<const ElementColorer<typename Graph::EdgeId>> edge_colorer_ptr_;
- const ElementColorer<typename Graph::VertexId> &vertex_colorer_;
- const ElementColorer<typename Graph::EdgeId> &edge_colorer_;
- const string sink_color_;
- const string source_color_;
- const string sinksource_color_;
-public:
-
- SinkSourceDecorator(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &sink_color = "red", const string &source_color = "orange", const string &sinksource_color = "green") :
- component_(component), vertex_colorer_(colorer), edge_colorer_(colorer), sink_color_(sink_color), source_color_(source_color), sinksource_color_(sinksource_color) {
- }
-
- string GetValue(VertexId v) const {
- if(component_.sinks().count(v) && !component_.sources().count(v)) {
- return sink_color_;
- }
- if(component_.sources().count(v) && !component_.sinks().count(v))
- {
- return source_color_;
- }
- if(component_.sources().count(v) && component_.sinks().count(v))
- {
- return sinksource_color_;
- }
-
- return vertex_colorer_.GetValue(v);
- }
-
- string GetValue(EdgeId e) const {
- return edge_colorer_.GetValue(e);
- }
-
- static shared_ptr<SinkSourceDecorator<Graph>> GetInstance(const GraphComponent<Graph> &component,
- const GraphColorer<Graph> &colorer, const string &sink_color = "red", const string &source_color = "orange") {
- return make_shared<SinkSourceDecorator<Graph>>(component, colorer, sink_color, source_color);
- }
-};
-
-template<class Graph>
-class CompositeGraphColorer: public GraphColorer<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- const shared_ptr<ElementColorer<VertexId>> vertex_colorer_;
- const shared_ptr<ElementColorer<EdgeId>> edge_colorer_;
-public:
- CompositeGraphColorer(shared_ptr<ElementColorer<VertexId>> vertex_colorer
- , shared_ptr<ElementColorer<EdgeId>> edge_colorer) :
- vertex_colorer_(vertex_colorer),
- edge_colorer_(edge_colorer) {
- }
-
-// explicit CompositeGraphColorer(shared_ptr<ElementColorer<EdgeId>> edge_colorer = make_shared<FixedColorer<EdgeId>>("black")) :
-// vertex_colorer_(shared_ptr<ElementColorer<VertexId>>(new FixedColorer<VertexId>("white"))),
-// edge_colorer_(edge_colorer) {
-// }
-
- string GetValue(VertexId v) const {
- return vertex_colorer_->GetValue(v);
- }
-
- string GetValue(EdgeId e) const {
- return edge_colorer_->GetValue(e);
- }
-
-};
-
-
-
-// ownership of edge_colorer is passed to the returned colorer here
-//TODO check all usages
-template <class Graph>
-shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& /*g*/,
- shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer) {
- return shared_ptr<GraphColorer<Graph>>(new CompositeGraphColorer<Graph>(make_shared<FixedColorer<typename Graph::VertexId>>("white"), edge_colorer));
-}
-
-template <class Graph>
-shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& g,
- const Path<typename Graph::EdgeId>& path1,
- const Path<typename Graph::EdgeId>& path2) {
- shared_ptr<ElementColorer<typename Graph::EdgeId>> edge_colorer =
- make_shared<CompositeEdgeColorer<Graph>>(
- make_shared<SetColorer<Graph>>(g, path1.sequence(), "red"),
- make_shared<SetColorer<Graph>>(g, path2.sequence(), "blue"), "black");
- return DefaultColorer(g, edge_colorer);
-}
-
-template<class Graph>
-shared_ptr<GraphColorer<Graph>> DefaultColorer(const Graph& /*g*/) {
- return shared_ptr<GraphColorer<Graph>>(new CompositeGraphColorer<Graph>(
- make_shared<FixedColorer<typename Graph::VertexId>>("white"),
- make_shared<FixedColorer<typename Graph::EdgeId>>("black")));
-}
-
-}
-}
diff --git a/src/modules/visualization/graph_labeler.hpp b/src/modules/visualization/graph_labeler.hpp
deleted file mode 100644
index 733ca0f..0000000
--- a/src/modules/visualization/graph_labeler.hpp
+++ /dev/null
@@ -1,304 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef GRAPH_LABELER_HPP_
-#define GRAPH_LABELER_HPP_
-
-#include "dev_support/simple_tools.hpp"
-#include "dev_support/standard_base.hpp"
-#include "assembly_graph/handlers/edges_position_handler.hpp"
-
-namespace omnigraph {
-
-/**
- * (Interface)
- * Provides string labels for vertices and edges of some graph.
- * Used with GraphPrinter to visualize graphs.
- */
-template<class Graph>
-class GraphLabeler {
-public:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-
- virtual ~GraphLabeler() {
- }
-
- virtual string label(VertexId v) const = 0;
-
- virtual string label(EdgeId e) const = 0;
-
-};
-
-//template<class Graph>
-//class MapGraphLabeler {
-// typedef typename Graph::EdgeId EdgeId;
-// typedef typename Graph::VertexId VertexId;
-// map<EdgeId, string> edge_map_;
-// map<VertexId, string> vertex_map_;
-//
-//public:
-//
-// string label(VertexId v) const {
-// auto it = vertex_map_.find(v);
-// if (it == vertex_map_.end())
-// return "";
-// else
-// return it->second;
-// }
-//
-// string label(EdgeId e) const {
-// auto it = edge_map_.find(e);
-// if (it == edge_map_.end())
-// return "";
-// else
-// return it->second;
-// }
-//
-//};
-
-template<class Graph>
-class AbstractGraphLabeler: public GraphLabeler<Graph> {
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
- const Graph& g_;
-protected:
- AbstractGraphLabeler(const Graph& g): g_(g) {
-
- }
-
- const Graph& graph() const {
- return g_;
- }
-
-public:
- /*virtual*/ std::string label(VertexId /*v*/) const {
- return "";
- }
-
- /*virtual*/ std::string label(EdgeId /*e*/) const {
- return "";
- }
-
-};
-
-/**
- * Trivial implementation of GraphLabeler.
- * All labels are "".
- */
-template<class Graph>
-class EmptyGraphLabeler : public GraphLabeler<Graph> {
- typedef GraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- EmptyGraphLabeler() {}
-
- std::string label(VertexId /*v*/) const {
- return "";
- }
-
- std::string label(EdgeId /*e*/) const {
- return "";
- }
-};
-
-/**
- * Implementation of GraphLabeler for Graphs that have methods
- * str(VertexId) and str(EdgeId), such as AbstractGraph.
- */
-template<class Graph>
-class StrGraphLabeler : public AbstractGraphLabeler<Graph> {
- typedef AbstractGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- StrGraphLabeler(const Graph& g) : base(g) {}
-
- /*virtual*/ std::string label(VertexId v) const {
- return this->graph().str(v);
- }
-
- /*virtual*/ std::string label(EdgeId e) const {
- return this->graph().str(e);
- }
-
- /*virtual*/ ~StrGraphLabeler() {
-
- }
-};
-
-template <class Graph>
-shared_ptr<GraphLabeler<Graph>> StrGraphLabelerInstance(const Graph& g) {
- return make_shared<StrGraphLabeler<Graph>>(g);
-}
-
-template<class Graph>
-class LengthIdGraphLabeler : public StrGraphLabeler<Graph> {
- typedef StrGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- LengthIdGraphLabeler(const Graph& g) : base(g) {}
-
- /*virtual*/ std::string label(EdgeId e) const {
- std::stringstream ss;
- ss << this->graph().length(e) << " (id: " << this->graph().int_id(e) << ")";
- return ss.str();
- }
-
-};
-
-template<class Graph>
-class LengthGraphLabeler : public StrGraphLabeler<Graph> {
- typedef StrGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- LengthGraphLabeler(const Graph& g) : base(g) {}
-
- /*virtual*/ std::string label(EdgeId e) const {
- return ToString(this->graph().length(e));
- }
-
-};
-
-template<class Graph>
-class CoverageGraphLabeler : public AbstractGraphLabeler<Graph> {
- typedef AbstractGraphLabeler<Graph> base;
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- CoverageGraphLabeler(const Graph& g) : base(g) {}
-
- std::string label(EdgeId e) const {
- double coverage = this->graph().coverage(e);
- return " {Cov:" + ToString(coverage) + "}";
- }
-};
-
-template<class Graph>
-class CompositeLabeler : public GraphLabeler<Graph> {
-private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- vector<GraphLabeler<Graph>*> list_;
-
- template<typename ElementId>
- string ConstructLabel(ElementId id) const {
- vector<string> to_print;
- for(size_t i = 0; i < list_.size(); i++) {
- string next = list_[i]->label(id);
- if(next.size() != 0) {
- to_print.push_back(next);
- }
- }
- string result = "";
- for(size_t i = 0; i < to_print.size(); i++) {
- result += to_print[i];
- if(i + 1 < to_print.size())
- result += "\\n";
- }
- return result;
- }
-
-public:
- CompositeLabeler() {
- }
-
- CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2, GraphLabeler<Graph> &labeler3, GraphLabeler<Graph> &labeler4) {
- AddLabeler(labeler1);
- AddLabeler(labeler2);
- AddLabeler(labeler3);
- AddLabeler(labeler4);
- }
-
- CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2, GraphLabeler<Graph> &labeler3) {
- AddLabeler(labeler1);
- AddLabeler(labeler2);
- AddLabeler(labeler3);
- }
-
- CompositeLabeler(GraphLabeler<Graph> &labeler1, GraphLabeler<Graph> &labeler2) {
- AddLabeler(labeler1);
- AddLabeler(labeler2);
- }
-
- virtual ~CompositeLabeler() {
- }
-
- void AddLabeler(GraphLabeler<Graph> &labeler) {
- list_.push_back(&labeler);
- }
-
- virtual string label(VertexId vertexId) const {
- return ConstructLabel<VertexId>(vertexId);
- }
-
- virtual string label(EdgeId edgeId) const {
- return ConstructLabel<EdgeId>(edgeId);
- }
-};
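
A short sketch of composing the labelers above so each edge is annotated with both its length/id and its coverage; it assumes this header is included and that Graph provides the usual length(), int_id() and coverage() methods:

    template<class Graph>
    std::string DescribeEdge(const Graph& g, typename Graph::EdgeId e) {
        omnigraph::LengthIdGraphLabeler<Graph> len_labeler(g);
        omnigraph::CoverageGraphLabeler<Graph> cov_labeler(g);
        // Non-empty labels are joined with a literal "\n", as expected by dot.
        omnigraph::CompositeLabeler<Graph> labeler(len_labeler, cov_labeler);
        return labeler.label(e);   // e.g. "150 (id: 42)\n {Cov:12.3}"
    }
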
-
-template<class Graph>
-class EdgePosGraphLabeler: public AbstractGraphLabeler<Graph> {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
-public:
- const EdgesPositionHandler<Graph>& edge_pos_;
-
- EdgePosGraphLabeler(const Graph& g, const EdgesPositionHandler<Graph>& edge_pos) :
- AbstractGraphLabeler<Graph>(g), edge_pos_(edge_pos) {
- }
-
- virtual std::string label(EdgeId edgeId) const {
- return "Positions: " + edge_pos_.str(edgeId);
- }
-
- virtual ~EdgePosGraphLabeler() {
-// TRACE("~EdgePosGraphLabeler");
- }
-private:
- DECL_LOGGER("EdgePosGraphLabeler")
-};
-
-template<class Graph>
-class DefaultLabeler: public GraphLabeler<Graph> {
-private:
- const Graph& g_;
- const EdgesPositionHandler<Graph> &edges_positions_;
-protected:
- typedef GraphLabeler<Graph> super;
- typedef typename super::EdgeId EdgeId;
- typedef typename super::VertexId VertexId;
-public:
-
- DefaultLabeler(const Graph &g, const EdgesPositionHandler<Graph> &position_handler) :
- g_(g), edges_positions_(position_handler) {
- }
-
- virtual std::string label(VertexId vertexId) const {
- return ToString(vertexId.int_id());
- }
-
- virtual std::string label(EdgeId edgeId) const {
- std::string ret_label;
- ret_label += "Id " + g_.str(edgeId) + "\\n";
- ret_label += "Positions:\\n"+ edges_positions_.str(edgeId);
- size_t len = g_.length(edgeId);
- double cov = g_.coverage(edgeId);
- ret_label += "Len(cov): " + ToString(len) + "(" + ToString(cov) + ")";
- return ret_label;
- }
-
- virtual ~DefaultLabeler() {
- }
-};
-
-}
-
-#endif /* GRAPH_LABELER_HPP_ */
diff --git a/src/modules/visualization/graph_printer.hpp b/src/modules/visualization/graph_printer.hpp
deleted file mode 100644
index 9a9927f..0000000
--- a/src/modules/visualization/graph_printer.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/standard_base.hpp"
-#include "io/graph_io/graph_print_utils.hpp"
-#include "graph_labeler.hpp"
-#include "graph_colorer.hpp"
-#include "vertex_linker.hpp"
-
-namespace omnigraph {
-namespace visualization {
-
-template<class Graph>
-class GraphPrinter {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-// ostream& os_;
- const Graph &graph_;
-protected:
- const GraphLabeler<Graph> &labeler_;
- const GraphColorer<Graph> &colorer_;
- const VertexLinker<Graph> &linker_;
-
-protected:
-// ostream& os() {
-// return os_;
-// }
-
-
- const Graph &graph() {
- return graph_;
- }
-
- template<class GvisVertexId>
- gvis::BaseVertex<GvisVertexId> CreateBaseVertex(GvisVertexId id, VertexId v) {
- return gvis::BaseVertex<GvisVertexId>(id, labeler_.label(v), linker_.GetValue(v), colorer_.GetValue(v));
- }
-
- template<class GvisVertexId>
- gvis::BaseEdge<GvisVertexId> CreateBaseEdge(GvisVertexId from, GvisVertexId to, EdgeId e){
- return gvis::BaseEdge<GvisVertexId>(from, to, this->labeler_.label(e), this->colorer_.GetValue(e));
- }
-
- virtual void ManageDrawn(VertexId v, set<VertexId> &visited) {
- visited.insert(v);
- }
-
-public:
- GraphPrinter(const Graph &graph, /*ostream &os,*/
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) :
- /*os_(os), */graph_(graph), labeler_(labeler), colorer_(colorer), linker_(
- linker) {
- }
-
- virtual void open() = 0;
-
- virtual void close() = 0;
-
- virtual void AddVertex(VertexId v1) = 0;
-
- template<class iter>
- void AddVertices(iter vbegin, iter vend) {
- set<VertexId> drawn;
- for(;vbegin != vend; ++vbegin) {
- if(drawn.count(*vbegin) == 0) {
- AddVertex(*vbegin);
- ManageDrawn(*vbegin, drawn);
- }
- }
- }
-
- virtual void AddEdge(EdgeId e) = 0;
-
- template<class iter>
- void AddEdges(iter ebegin, iter eend) {
- for(;ebegin != eend; ++ebegin) {
- AddEdge(*ebegin);
- }
- }
-
- virtual ~GraphPrinter() {
- }
-};
-
-template<typename Graph>
-class SingleGraphPrinter : public GraphPrinter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- gvis::DotSingleGraphRecorder<size_t> recorder_;
-
-public:
- SingleGraphPrinter(const Graph &graph, ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler, colorer, linker), recorder_(os){
- }
-
- void open() {
- recorder_.startGraphRecord("graph_picture");
- }
-
- void close() {
- recorder_.endGraphRecord();
- }
-
- void AddVertex(VertexId v) {
- recorder_.recordVertex(this->CreateBaseVertex((size_t)this->graph().int_id(v), v));
- }
-
- void AddEdge(EdgeId edge) {
- recorder_.recordEdge(this->CreateBaseEdge((size_t)this->graph().int_id(this->graph().EdgeStart(edge)), (size_t)this->graph().int_id(this->graph().EdgeEnd(edge)), edge));
- }
-};
-
-template<typename Graph>
-class PairedGraphPrinter : public GraphPrinter<Graph> {
-private:
- typedef typename Graph::VertexId VertexId;
- typedef typename Graph::EdgeId EdgeId;
-
- gvis::DotPairedGraphRecorder<size_t> recorder_;
-
- pair<gvis::BaseVertex<size_t>, gvis::BaseVertex<size_t>> CreateDoubleVertex(VertexId v) {
- gvis::BaseVertex<size_t> u1 = this->CreateBaseVertex((size_t)this->graph().int_id(v), v);
- gvis::BaseVertex<size_t> u2 = this->CreateBaseVertex((size_t)this->graph().int_id(this->graph().conjugate(v)), this->graph().conjugate(v));
- return make_pair(u1, u2);
- }
-
- pair<size_t, size_t> CreateDoubleVertexId(VertexId v) {
- return make_pair(this->graph().int_id(v), this->graph().int_id(this->graph().conjugate(v)));
- }
-protected:
- /*virtual */void ManageDrawn(VertexId v, set<VertexId> &visited) {
- visited.insert(v);
- visited.insert(this->graph().conjugate(v));
- }
-
-public:
- PairedGraphPrinter(const Graph &graph, ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) : GraphPrinter<Graph>(/*os_, */graph, labeler, colorer, linker), recorder_(os) {
- }
-
- void open() {
- recorder_.startGraphRecord("graph_picture");
- }
-
- void close() {
- recorder_.endGraphRecord();
- }
-
- void AddVertex(VertexId v) {
- recorder_.recordVertex(CreateDoubleVertex(v));
- }
-
- void AddEdge(EdgeId edge) {
- auto vid1 = CreateDoubleVertexId(this->graph().EdgeStart(edge));
- auto vid2 = CreateDoubleVertexId(this->graph().EdgeEnd(edge));
- recorder_.recordEdge(gvis::BaseEdge<pair<size_t, size_t>>(vid1, vid2, this->labeler_.label(edge), this->colorer_.GetValue(edge)));
- }
-};
-
-}
-}
diff --git a/src/modules/visualization/position_filler.hpp b/src/modules/visualization/position_filler.hpp
deleted file mode 100644
index 406d679..0000000
--- a/src/modules/visualization/position_filler.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
-#include "assembly_graph/handlers/edges_position_handler.hpp"
-#include "io/reads_io/wrapper_collection.hpp"
-#include "io/reads_io/easy_reader.hpp"
-#include "io/reads_io/io_helper.hpp"
-
-namespace debruijn_graph {
-
-template<class Graph>
-class PosFiller {
- typedef typename Graph::EdgeId EdgeId;
- typedef std::shared_ptr<SequenceMapper<Graph>> MapperPtr;
- const Graph &g_;
- MapperPtr mapper_;
- EdgesPositionHandler<Graph> &edge_pos_;
-
-public:
- PosFiller(const Graph &g, MapperPtr mapper,
- EdgesPositionHandler<Graph> &edge_pos) :
- g_(g), mapper_(mapper), edge_pos_(edge_pos) {
-
- }
-
- void Process(const Sequence &s, string name) const {
- //todo stupid conversion!
- return Process(io::SingleRead(name, s.str()));
- }
-
- void Process(const io::SingleRead &read) const {
- MappingPath<EdgeId> path = mapper_->MapRead(read);
- const string name = read.name();
- int cur_pos = 0;
- TRACE("Contig " << name << " mapped on " << path.size()
- << " fragments.");
- for (size_t i = 0; i < path.size(); i++) {
- EdgeId ei = path[i].first;
- MappingRange mr = path[i].second;
- int len = (int) (mr.mapped_range.end_pos - mr.mapped_range.start_pos);
- if (i > 0 && path[i - 1].first != ei
- && g_.EdgeStart(ei) != g_.EdgeEnd(path[i - 1].first)) {
- TRACE("Contig " << name
- << " mapped onto non-adjacent edges. Position in contig is "
- << path[i - 1].second.initial_range.start_pos + 1
- << "--" << path[i - 1].second.initial_range.end_pos
- << " and " << mr.initial_range.start_pos + 1
- << "--" << mr.initial_range.end_pos);
- }
- edge_pos_.AddEdgePosition(ei, name, mr.initial_range.start_pos,
- mr.initial_range.end_pos,
- mr.mapped_range.start_pos,
- mr.mapped_range.end_pos);
- cur_pos += len;
- }
- }
-
- void Process(io::SingleStream &stream) const {
- io::SingleRead read;
- while (!stream.eof()) {
- stream >> read;
- Process(read);
- }
- }
-
-private:
- DECL_LOGGER("PosFiller");
-};
-
-template<class gp_t>
-void FillPos(gp_t &gp, const string &contig_file, string prefix, bool with_rc = false) {
- PosFiller<typename gp_t::graph_t> pos_filler(gp.g, MapperInstance(gp), gp.edge_pos);
- auto irs = std::make_shared<io::PrefixAddingReaderWrapper>(io::EasyStream(contig_file, with_rc, false), prefix);
- pos_filler.Process(*irs);
-}
-
-template<class gp_t>
-void FillPos(gp_t &gp, const Sequence &s, string name) {
- PosFiller<typename gp_t::graph_t> pos_filler(gp.g, MapperInstance(gp), gp.edge_pos);
- pos_filler.Process(s, name);
-}
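
A hedged usage sketch for the FillPos helpers above; the graph-pack type, the file name and the prefixes are assumptions standing in for a real call site, and Sequence is assumed to be visible as in the includes above:

    template<class gp_t>
    void AnnotatePositions(gp_t &gp) {
        // Map contigs of an (assumed) FASTA file onto the graph and store the
        // resulting edge positions under names prefixed with "ref_".
        debruijn_graph::FillPos(gp, "reference_contigs.fasta", "ref_", /*with_rc*/ true);

        // Single-sequence variant: positions are recorded under the given name.
        debruijn_graph::FillPos(gp, Sequence("ACGTACGTACGT"), "toy_sequence");
    }
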
-
-}
diff --git a/src/modules/visualization/printing_parameter_storage.hpp b/src/modules/visualization/printing_parameter_storage.hpp
deleted file mode 100644
index f052733..0000000
--- a/src/modules/visualization/printing_parameter_storage.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#include "dev_support/standard_base.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-namespace omnigraph {
-namespace visualization {
-
-template<typename ElementId, typename Value>
-class ParameterStorage {
-public:
- virtual Value GetValue(ElementId element) const = 0;
-
- virtual ~ParameterStorage() {
- }
-};
-
-template<typename ElementId, typename Value>
-class MapParameterStorage : public virtual ParameterStorage<ElementId, Value> {
-private:
- template<class It>
- static map<ElementId, string> ConstructMap(It begin, It end, const string& color) {
- map<ElementId, string> result;
- for (auto it = begin; it != end; ++it) {
- result.insert(make_pair(*it, color));
- }
- return result;
- }
-
-protected:
- map<ElementId, Value> storage_;
-private:
- boost::optional<Value> default_value_;
-public:
- MapParameterStorage(const string &default_value) : default_value_(default_value) {
- }
-
- MapParameterStorage(map<ElementId, Value> storage, Value default_value) : storage_(storage), default_value_(default_value) {
- }
-
- MapParameterStorage(map<ElementId, Value> storage) : storage_(storage) {
- }
-
- template<class It>
- MapParameterStorage(It begin, It end, const Value& value, const string& default_value) : storage_(ConstructMap(begin, end, value)), default_value_(default_value) {
- }
-
-
- Value GetValue(ElementId element) const {
- auto it = storage_.find(element);
- if (it == storage_.end()) {
- VERIFY(default_value_);
- return default_value_.get();
- }
- return it->second;
- }
-};
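
A tiny illustration of the default-value fallback implemented above; the keys and colors are arbitrary:

    void Example() {
        omnigraph::visualization::MapParameterStorage<int, std::string>
                colors({{1, "red"}, {2, "blue"}}, std::string("black"));
        // colors.GetValue(1) == "red"
        // colors.GetValue(7) == "black"   (missing key falls back to the default)
    }
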
-
-template<typename ElementId, typename Value>
-class DecoratorParameterStorage : public virtual ParameterStorage<ElementId, Value> {
-private:
- ParameterStorage<ElementId, Value> inner_storage_;
-public:
- DecoratorParameterStorage(ParameterStorage<ElementId, Value> inner_storage) : inner_storage_(inner_storage) {
- }
-
- Value GetInnerValue(ElementId element) {
- return inner_storage_.GetValue(element);
- }
-};
-
-}
-}
diff --git a/src/modules/visualization/vertex_linker.hpp b/src/modules/visualization/vertex_linker.hpp
deleted file mode 100644
index f960b20..0000000
--- a/src/modules/visualization/vertex_linker.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "dev_support/standard_base.hpp"
-#include "printing_parameter_storage.hpp"
-
-namespace omnigraph {
-namespace visualization {
-
-template<class Graph>
-class VertexLinker : public virtual ParameterStorage<typename Graph::VertexId, string> {
-};
-
-template<class Graph>
-class MapVertexLinker : public VertexLinker<Graph>, public MapParameterStorage<typename Graph::VertexId, string> {
-public:
- MapVertexLinker() : MapParameterStorage<typename Graph::VertexId, string>("") {
- }
-
- MapVertexLinker(const map<typename Graph::VertexId, string> &link_map) : MapParameterStorage<typename Graph::VertexId, string>(link_map, "") {
- }
-
- virtual ~MapVertexLinker() {
- }
-};
-
-template<class Graph>
-class EmptyGraphLinker : public MapVertexLinker<Graph> {
-public:
- EmptyGraphLinker() {
- }
-};
-
-}
-}
diff --git a/src/modules/visualization/visualization_utils.hpp b/src/modules/visualization/visualization_utils.hpp
deleted file mode 100644
index 72d4f74..0000000
--- a/src/modules/visualization/visualization_utils.hpp
+++ /dev/null
@@ -1,210 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "graph_printer.hpp"
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
-#include "assembly_graph/components/splitters.hpp"
-#include "assembly_graph/components/graph_component.hpp"
-#include "visualizers.hpp"
-#include "vertex_linker.hpp"
-
-#include <fstream>
-
-namespace omnigraph {
-namespace visualization {
-
-
-template<class Graph>
-void WriteComponents(const Graph& g,
- const string& folder_name,
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
-// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
- auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
- shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(inner_splitter, filter);
- omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(*splitter, folder_name);
-}
-
-template<class Graph>
-void DrawComponentsOfShortEdges(const Graph& g, const string &output_dir, size_t min_length, size_t sinks, size_t sources)
-{
- vector<typename Graph::EdgeId> short_edges;
- std::string pics_folder_ = output_dir + ToString(min_length) + "_" + ToString(sinks) + "_" + ToString(sources) + "_"+ "pics_polymorphic/";
- make_dir(pics_folder_);
- INFO("Writing pics with components consisting of short edges to " + pics_folder_);
- shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g, min_length);
- while (splitter->HasNext()) {
- GraphComponent<Graph> component = splitter->Next();
- if(component.v_size() > 3 && component.sinks().size() == sinks && component.sources().size() == sources)
- {
- bool fail = false;
- for(auto v : component.sources()) {
- if(component.g().IncomingEdgeCount(v) != 1) {
- fail = true;
- }
- }
- for(auto v : component.sinks()) {
- if(component.g().OutgoingEdgeCount(v) != 1) {
- fail = true;
- }
- }
-
- if(fail)
- {
- continue;
- }
-
- StrGraphLabeler<Graph> labeler(component.g());
- CoverageGraphLabeler<Graph> labeler2(component.g());
- CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
- WriteComponentSinksSources(component, pics_folder_ + ToString(g.int_id(*component.vertices().begin()))
- + ".dot", visualization::DefaultColorer(g),
- compositeLabeler);
- INFO("Component is written to " + ToString(g.int_id(*component.vertices().begin())) + ".dot");
-
- // PrintComponent(component,
-// pics_folder_ + "ShortComponents/"
-// + ToString(gp.g.int_id(component.vertices_[0]))
-// + ".dot");
- }
- }
-}
-
-
-template<class Graph>
-void WriteSizeLimitedComponents(const Graph& g,
- const string& folder_name,
- shared_ptr<GraphSplitter<Graph>> inner_splitter,
- shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler, int min_component_size, int max_component_size, size_t max_components) {
- EmptyGraphLinker<Graph> linker;
-
- auto filter = make_shared<omnigraph::ComponentSizeFilter<Graph>>(g, 1000000000, (size_t) min_component_size, (size_t) max_component_size);
- shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CollectingSplitterWrapper<Graph>>(inner_splitter, filter);
- omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker, false, max_components).SplitAndVisualize(*splitter, folder_name);
-}
-
-template<class Graph>
-void WriteComponent(const GraphComponent<Graph>& gc,
- const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
- BorderDecorator<Graph> component_colorer(gc, *colorer, "yellow");
- std::ofstream os;
- os.open(file_name);
- omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), true).Visualize(gc, os, labeler, component_colorer, linker);
- os.close();
-}
-
-template<class Graph>
-void WriteComponentSinksSources(const GraphComponent<Graph>& gc,
- const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
- SinkSourceDecorator<Graph> component_colorer(gc, *colorer);
- std::ofstream os;
- os.open(file_name);
- omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), true).Visualize(gc, os, labeler, component_colorer, linker);
- os.close();
-}
-
-template<class Graph>
-void WriteComponentSinksSources(const GraphComponent<Graph>& gc,
- const string& file_name) {
-
- StrGraphLabeler<Graph> labeler(gc.g());
- CoverageGraphLabeler<Graph> labeler2(gc.g());
- CompositeLabeler<Graph> compositeLabeler(labeler, labeler2);
- EmptyGraphLinker<Graph> linker;
- WriteComponentSinksSources(gc, file_name, DefaultColorer(gc.g()),
- compositeLabeler);
-}
-
-template<class Graph>
-void WriteSimpleComponent(const GraphComponent<Graph>& gc,
- const string& file_name, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler) {
- EmptyGraphLinker<Graph> linker;
- std::ofstream os;
- os.open(file_name);
- omnigraph::visualization::ComponentVisualizer<Graph>(gc.g(), false).Visualize(gc, os, labeler, *colorer, linker);
- os.close();
-}
-
-template<class Graph>
-void WriteComponentsAlongPath(const Graph& g, vector<typename Graph::EdgeId> path,
- const string& prefix_path, shared_ptr<GraphColorer<Graph>> colorer,
- const GraphLabeler<Graph> &labeler, bool color_path = true) {
- auto edge_colorer = make_shared<CompositeEdgeColorer<Graph>>("black");
- edge_colorer->AddColorer(colorer);
- if (color_path) {
- edge_colorer->AddColorer(make_shared<SetColorer<Graph>>(g, path, "green"));
- }
- shared_ptr<GraphColorer<Graph>> resulting_colorer = make_shared<CompositeGraphColorer<Graph>>(colorer, edge_colorer);
- shared_ptr<GraphSplitter<Graph>> rs = ReliableSplitterAlongPath<Graph>(g, path);
- auto filter = make_shared<omnigraph::SmallComponentFilter<Graph>>(g, 3);
- shared_ptr<GraphSplitter<Graph>> splitter = make_shared<omnigraph::CondensingSplitterWrapper<Graph>>(rs, filter);
- WriteComponents<Graph>(g, prefix_path, splitter, resulting_colorer, labeler);
-}
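
A hypothetical call site for WriteComponentsAlongPath above; the output folder is assumed to exist already:

    template<class Graph>
    void DumpPathNeighbourhood(const Graph &g,
                               const std::vector<typename Graph::EdgeId> &path,
                               const std::string &folder) {
        omnigraph::StrGraphLabeler<Graph> labeler(g);
        // Components touched by the path end up as <folder><N>.dot files; the
        // path itself is additionally colored green (color_path defaults to true).
        omnigraph::visualization::WriteComponentsAlongPath(
                g, path, folder, omnigraph::visualization::DefaultColorer(g), labeler);
    }
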
-
-template<class Graph>
-class LocalityPrintingRH {
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
- const Graph& g_;
- const GraphLabeler<Graph>& labeler_;
- std::shared_ptr<visualization::GraphColorer<Graph>> colorer_;
- const string output_folder_;
-public:
- LocalityPrintingRH(const Graph& g
- , const GraphLabeler<Graph>& labeler
- , std::shared_ptr<visualization::GraphColorer<Graph>> colorer
- , const string& output_folder) :
- g_(g),
- labeler_(labeler),
- colorer_(colorer),
- output_folder_(output_folder) {
-// path::make_dirs(output_folder_);
- }
-
- void HandleDelete(EdgeId e, const string& add_label = "") {
- //todo magic constant
-// map<EdgeId, string> empty_coloring;
- auto edge_colorer = make_shared<visualization::CompositeEdgeColorer<Graph>>("black");
- edge_colorer->AddColorer(colorer_);
- edge_colorer->AddColorer(make_shared<visualization::SetColorer<Graph>>(g_, vector<EdgeId>(1, e), "green"));
- shared_ptr<visualization::GraphColorer<Graph>> resulting_colorer = make_shared<visualization::CompositeGraphColorer<Graph>>(colorer_, edge_colorer);
-
- string fn = output_folder_ + "/edge_" + ToString(g_.int_id(e)) + add_label + ".dot";
- omnigraph::visualization::WriteComponent(omnigraph::EdgeNeighborhood<Graph>(g_, e, 50, 250)
- , fn
- , resulting_colorer, labeler_);
- }
-
-private:
- DECL_LOGGER("LocalityPrintingRH")
- ;
-};
-
-//static void WriteFilteredComponents(const Graph& g,
-// const string& folder_name,
-// shared_ptr<GraphComponentFilter<Graph>> filter,
-// shared_ptr<GraphSplitter<Graph>> splitter,
-// shared_ptr<GraphColorer<Graph>> colorer,
-// const GraphLabeler<Graph> &labeler) {
-// EmptyGraphLinker<Graph> linker;
-//// shared_ptr<GraphComponentFilter<Graph>> checker = make_shared<ComponentSizeFilter<Graph>>(g, 1500, 2, 300);
-// omnigraph::FilteringSplitterWrapper<Graph> filtered_splitter(splitter, filter);
-// omnigraph::visualization::SplittingGraphVisualizer<Graph>(g, labeler, *colorer, linker).SplitAndVisualize(filtered_splitter, folder_name);
-//}
-
-}
-}
diff --git a/src/modules/visualization/visualizers.hpp b/src/modules/visualization/visualizers.hpp
deleted file mode 100644
index 6b35a94..0000000
--- a/src/modules/visualization/visualizers.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-#pragma once
-
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-
-#include "dev_support/standard_base.hpp"
-#include "graph_printer.hpp"
-#include <fstream>
-
-namespace omnigraph {
-namespace visualization {
-
-//DECL_LOGGER("omg.gvis")
-
-template<class Graph>
-class ComponentVisualizer {
- const Graph& graph_;
- const bool paired_;
-
-private:
- void Visualize(const GraphComponent<Graph>& component, GraphPrinter<Graph> &printer) {
- printer.open();
- printer.AddVertices(component.vertices().begin(), component.vertices().end());
- for (auto e_it = component.e_begin(); e_it != component.e_end();
- ++e_it) {
- printer.AddEdge(*e_it);
- }
- printer.close();
- }
-
-public:
- ComponentVisualizer(const Graph& graph, bool paired = true) :
- graph_(graph), paired_(paired) {
- }
-
- void Visualize(const GraphComponent<Graph>& component, ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) {
- if(paired_) {
- PairedGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
- Visualize(component, printer);
- } else {
- SingleGraphPrinter<Graph> printer(graph_, os, labeler, colorer, linker);
- Visualize(component, printer);
- }
- }
-
- void Visualize(ostream &os,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker) {
- GraphComponent<Graph> component(graph_, graph_.begin(), graph_.end(), false);
- Visualize(component, os, labeler, colorer, linker);
- }
-};
-
-
-template<class Graph>
-class ComponentNameGenerator {
-public:
- virtual string ComponentName(const GraphComponent<Graph>& component) = 0;
-
- virtual ~ComponentNameGenerator() {
- }
-};
-
-template<class Graph>
-class SimpleCountingComponentNameGenerator: public ComponentNameGenerator<Graph> {
-private:
- string name_;
- string extension_;
- size_t cnt_;
-public:
- SimpleCountingComponentNameGenerator(string name, string extension): name_(name), extension_(extension), cnt_(0) {
- }
-
- string ComponentName(const GraphComponent<Graph>& component) {
- cnt_++;
- stringstream ss;
- ss << name_ << "_" << cnt_;
- if(component.name().size() > 0)
- ss << "_" << component.name();
- ss << "." << extension_;
- return ss.str();
- }
-};
-
-template<class Graph>
-class CountingSizeComponentNameGenerator: public ComponentNameGenerator<Graph> {
-private:
- string name_;
- string extension_;
- size_t cnt_;
-public:
- CountingSizeComponentNameGenerator(string name, string extension): name_(name), extension_(extension), cnt_(0) {
- }
-
- string ComponentName(const GraphComponent<Graph>& component) {
- cnt_++;
- stringstream ss;
- ss << name_ << "_" << cnt_;
- if(component.name().size() > 0)
- ss << "_" << component.name();
- ss << "_size_" << component.size();
- ss << "." << extension_;
-
- return ss.str();
- }
-};
-
-
-template<class Graph>
-class SplittingGraphVisualizer {
-private:
- const Graph& graph_;
- const GraphLabeler<Graph> &labeler_;
- const GraphColorer<Graph> &colorer_;
- const VertexLinker<Graph> &linker_;
- const bool paired_;
- const size_t max_component_number_;
- static const size_t DEFAULT_MAX_COMPONENT_NUMBER = 500;
-
- string ComponentFileName(size_t cnt, const string &folder, const GraphComponent<Graph>& component) {
- stringstream ss;
- ss << folder << cnt;
- if(component.name().size() > 0)
- ss << "graph_" << component.name();
- ss << ".dot";
- return ss.str();
- }
-
-public:
- SplittingGraphVisualizer(const Graph& graph,
- const GraphLabeler<Graph> &labeler,
- const GraphColorer<Graph> &colorer,
- const VertexLinker<Graph> &linker,
- bool paired = true,
- size_t max_component_number = DEFAULT_MAX_COMPONENT_NUMBER) :
- graph_(graph), labeler_(labeler), colorer_(colorer), linker_(linker), paired_(paired), max_component_number_(max_component_number) {
- }
-
- size_t SplitAndVisualize(GraphSplitter<Graph> &splitter, const string &folder) {
- INFO("Writing components to folder " << folder);
- ComponentVisualizer<Graph> visualizer(graph_, paired_);
- size_t cnt = 0;
- while(splitter.HasNext()) {
- if(cnt > max_component_number_) {
- INFO("The number of graph components exceeded " << max_component_number_ << ". Aborting current visualization.");
- break;
- }
- cnt++;
- GraphComponent<Graph> component = splitter.Next();
- BorderDecorator<Graph> border_colorer(component, colorer_, "yellow");
- ofstream os(ComponentFileName(cnt, folder, component));
- visualizer.Visualize(component, os, labeler_, border_colorer, linker_);
- os.close();
- }
- return cnt;
- }
-
-private:
- DECL_LOGGER("SplittingGraphVisualizer");
-};
-
-}
-}
-
diff --git a/src/projects/CMakeLists.txt b/src/projects/CMakeLists.txt
index 4fd1f77..02eca68 100644
--- a/src/projects/CMakeLists.txt
+++ b/src/projects/CMakeLists.txt
@@ -19,4 +19,4 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
endif()
else()
add_subdirectory(mph_test)
-endif()
\ No newline at end of file
+endif()
diff --git a/src/projects/cap/assembly_compare.hpp b/src/projects/cap/assembly_compare.hpp
index e879bc0..ec86be5 100644
--- a/src/projects/cap/assembly_compare.hpp
+++ b/src/projects/cap/assembly_compare.hpp
@@ -9,10 +9,10 @@
#include "pipeline/graph_pack.hpp"
#include "pipeline/graphio.hpp"
-#include "dev_support/simple_tools.hpp"
-#include "algorithms/simplification/cleaner.hpp"
-#include "io/reads_io/splitting_wrapper.hpp"
-#include "io/reads_io/multifile_reader.hpp"
+#include "utils/simple_tools.hpp"
+#include "modules/simplification/cleaner.hpp"
+#include "io/reads/splitting_wrapper.hpp"
+#include "io/reads/multifile_reader.hpp"
#include <boost/algorithm/string/predicate.hpp>
#include "coloring.hpp"
@@ -113,7 +113,7 @@ public:
// typedef typename gp_t::graph_t Graph;
// typedef typename Graph::EdgeId EdgeId;
// typedef typename Graph::VertexId VertexId;
-// typedef NewExtendedSequenceMapper<Graph, typename gp_t::seq_t> Mapper; // gp_t::k_value + 1
+// typedef BasicSequenceMapper<Graph, typename gp_t::seq_t> Mapper; // gp_t::k_value + 1
//
// gp_t gp_;
// ColorHandler<Graph> coloring_;
@@ -137,7 +137,7 @@ public:
//
// template<class gp_t2>
// void UniversalSaveGP(
-// const gp_t2& gp/*, const omnigraph::visualization::GraphColorer<typename gp_t2::graph_t> coloring*/,
+// const gp_t2& gp/*, const visualization::graph_colorer::GraphColorer<typename gp_t2::graph_t> coloring*/,
// const string& filename) {
// typename PrinterTraits<Graph>::Printer printer(gp.g);
// INFO("Saving graph to " << filename);
@@ -464,12 +464,12 @@ void ThreadAssemblies(const string& base_saves, ContigStream& base_assembly,
// ConstructGraph<gp_t::k_value, Graph>(gp.g, gp.index, base_assembly);
ScanGraphPack(base_saves, gp);
base_assembly.reset();
- FillPos(gp, base_assembly, base_prefix);
- FillPos(gp, assembly_to_thread, to_thread_prefix);
+ visualization::position_filler::FillPos(gp, base_assembly, base_prefix);
+ visualization::position_filler::FillPos(gp, assembly_to_thread, to_thread_prefix);
- EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
- StrGraphLabeler<Graph> str_labeler(gp.g);
- CompositeLabeler<Graph> labeler(pos_labeler, str_labeler);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(gp.g, gp.edge_pos);
+ visualization::graph_labeler::StrGraphLabeler<Graph> str_labeler(gp.g);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(pos_labeler, str_labeler);
auto mapper = MapperInstance(gp);
diff --git a/src/projects/cap/assembly_problem_detection.hpp b/src/projects/cap/assembly_problem_detection.hpp
index 7040caf..6ad4075 100644
--- a/src/projects/cap/assembly_problem_detection.hpp
+++ b/src/projects/cap/assembly_problem_detection.hpp
@@ -47,7 +47,7 @@
// typedef io::SingleRead Contig;
// typedef io::IReader<io::SingleRead> ContigStream;
// typedef io::MultifileReader<io::SingleRead> CompositeStream;
-// typedef debruijn_graph::NewExtendedSequenceMapper<Graph, Index> Mapper;
+// typedef debruijn_graph::BasicSequenceMapper<Graph, Index> Mapper;
//
// const gp_t& gp_;
// const ColorHandler<Graph>& coloring_;
@@ -204,7 +204,7 @@
// }
//
// void ReportLocality(VertexId v, const vector<EdgeId>& good_contig_path, const string& best_contig, const Contig& c, const string& folder) {
-// using namespace omnigraph::visualization;
+// using namespace visualization;
// make_dir(folder);
// LengthIdGraphLabeler<Graph> basic_labeler(gp_.g);
// EdgePosGraphLabeler<Graph> pos_labeler(gp_.g, gp_.edge_pos);
@@ -398,7 +398,7 @@
// }
//
// void ReportEdge(EdgeId e, const string& folder) {
-// using namespace omnigraph::visualization;
+// using namespace visualization;
// INFO(
// "Can close gap between edges " << g_.str(g_.GetUniqueIncomingEdge(g_.EdgeStart(e))) << " and " << g_.str(g_.GetUniqueOutgoingEdge(g_.EdgeEnd(e))) << " with edge " << g_.str(e));
// LengthIdGraphLabeler<Graph> basic_labeler(g_);
@@ -407,7 +407,7 @@
// CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
// GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g_, e);
// auto colorer = coloring_.ConstructColorer(component);
-// omnigraph::visualization::WriteComponent(component, folder + ToString(g_.int_id(e)) + "_loc.dot", colorer, labeler);
+// visualization::visualization_utils::WriteComponent(component, folder + ToString(g_.int_id(e)) + "_loc.dot", colorer, labeler);
// }
//
//// bool CheckEdges(const vector<EdgeId>& edges) {
diff --git a/src/projects/cap/cap_commands.hpp b/src/projects/cap/cap_commands.hpp
index c4c637f..1c0c945 100644
--- a/src/projects/cap/cap_commands.hpp
+++ b/src/projects/cap/cap_commands.hpp
@@ -10,8 +10,8 @@
#include "cap_environment.hpp"
#include "cap_environment_manager.hpp"
#include "mosaic.hpp"
-#include "io/reads_io/sequence_reader.hpp"
-#include "dev_support/path_helper.hpp"
+#include "io/reads/sequence_reader.hpp"
+#include "utils/path_helper.hpp"
namespace online_visualization {
diff --git a/src/projects/cap/cap_environment.hpp b/src/projects/cap/cap_environment.hpp
index f0f24d4..fa41558 100644
--- a/src/projects/cap/cap_environment.hpp
+++ b/src/projects/cap/cap_environment.hpp
@@ -29,8 +29,8 @@ class CapEnvironment : public Environment {
typedef Graph::VertexId VertexId;
typedef Graph::EdgeId EdgeId;
- typedef debruijn_graph::KmerStoringEdgeIndex<Graph, runtime_k::RtSeq, kmer_index_traits<runtime_k::RtSeq>, debruijn_graph::SimpleStoring> RtSetIndex;
- typedef debruijn_graph::graph_pack<Graph, runtime_k::RtSeq, RtSetIndex> RtSeqGraphPack;
+ typedef debruijn_graph::KmerStoringEdgeIndex<Graph, RtSeq, kmer_index_traits<RtSeq>, debruijn_graph::SimpleStoring> RtSetIndex;
+ typedef debruijn_graph::graph_pack<Graph, RtSeq, RtSetIndex> RtSeqGraphPack;
typedef debruijn_graph::KmerStoringEdgeIndex<Graph, cap::LSeq, kmer_index_traits<cap::LSeq>, debruijn_graph::SimpleStoring> LSeqIndex;
typedef debruijn_graph::graph_pack<Graph, cap::LSeq, LSeqIndex> LSeqGraphPack;
diff --git a/src/projects/cap/cap_environment_manager.hpp b/src/projects/cap/cap_environment_manager.hpp
index 9628fdb..33a39f3 100644
--- a/src/projects/cap/cap_environment_manager.hpp
+++ b/src/projects/cap/cap_environment_manager.hpp
@@ -22,7 +22,7 @@
#include "test_utils.hpp"
#include "cap_environment.hpp"
-#include "io/reads_io/sequence_reader.hpp"
+#include "io/reads/sequence_reader.hpp"
#include "pipeline/config_struct.hpp"
#include "junk_cropping_reader.hpp"
diff --git a/src/projects/cap/cap_kmer_index.hpp b/src/projects/cap/cap_kmer_index.hpp
index feab11a..5b7414b 100644
--- a/src/projects/cap/cap_kmer_index.hpp
+++ b/src/projects/cap/cap_kmer_index.hpp
@@ -10,11 +10,11 @@
#include "compare_standard.hpp"
#include "longseq.hpp"
#include "polynomial_hash.hpp"
-#include "utils/adt/kmer_map.hpp"
-#include "data_structures/indices/edge_position_index.hpp"
+#include "common/adt/kmer_map.hpp"
+#include "utils/indices/edge_position_index.hpp"
-#include "io/reads_io/sequence_reader.hpp"
-#include "data_structures/mph_index/base_hash.hpp"
+#include "io/reads/sequence_reader.hpp"
+#include "utils/mph_index/base_hash.hpp"
template<>
struct kmer_index_traits<cap::LSeq> {
diff --git a/src/projects/cap/cap_logger.hpp b/src/projects/cap/cap_logger.hpp
index b54bc48..c8bf020 100644
--- a/src/projects/cap/cap_logger.hpp
+++ b/src/projects/cap/cap_logger.hpp
@@ -5,7 +5,7 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/log_writers.hpp"
/*
#undef INFO
diff --git a/src/projects/cap/colored_graph_construction.hpp b/src/projects/cap/colored_graph_construction.hpp
index bfceb8c..091c41c 100644
--- a/src/projects/cap/colored_graph_construction.hpp
+++ b/src/projects/cap/colored_graph_construction.hpp
@@ -7,10 +7,10 @@
#pragma once
-#include "data_structures/sequence/runtime_k.hpp"
+#include "sequence/runtime_k.hpp"
#include "compare_standard.hpp"
#include "cap_kmer_index.hpp"
-#include "algorithms/graph_construction.hpp"
+#include "modules/graph_construction.hpp"
namespace cap {
@@ -76,7 +76,7 @@ public:
void FindCoveredRanges(CoveredRanges& crs, ContigStream& stream) const {
io::SingleRead read;
stream.reset();
-// NewExtendedSequenceMapper<gp_t::k_value + 1, Graph> mapper(gp_.g,
+// BasicSequenceMapper<gp_t::k_value + 1, Graph> mapper(gp_.g,
// gp_.index, gp_.kmer_mapper);
while (!stream.eof()) {
stream >> read;
@@ -366,7 +366,7 @@ void ConstructColoredGraph(gp_t& gp,
// vector<ContigStream*>& streams, const string& reference, bool fill_pos = true, int br_delta = -1) {
// typedef typename gp_t::graph_t Graph;
// const size_t k = gp_t::k_value;
-// typedef NewExtendedSequenceMapper<k + 1, Graph> Mapper;
+// typedef BasicSequenceMapper<k + 1, Graph> Mapper;
//
// INFO("Constructing de Bruijn graph for k=" << k);
//
@@ -389,7 +389,7 @@ void ConstructColoredGraph(gp_t& gp,
// for (auto it = streams.begin(); it != streams.end(); ++it) {
// ContigStream& stream = **it;
// stream.reset();
-// FillPos(gp, stream);
+// visualization::position_filler::FillPos(gp, stream);
// }
// }
//}
diff --git a/src/projects/cap/coloring.hpp b/src/projects/cap/coloring.hpp
index 3916129..2e33e92 100644
--- a/src/projects/cap/coloring.hpp
+++ b/src/projects/cap/coloring.hpp
@@ -8,6 +8,7 @@
#pragma once
#include <boost/format/format_fwd.hpp>
+#include <common/visualization/graph_colorer.hpp>
namespace cap {
@@ -195,7 +196,7 @@ public:
};
template<class Graph, class Element>
-class ElementColorHandler: public GraphActionHandler<Graph>, public visualization::ElementColorer<Element> {
+class ElementColorHandler: public GraphActionHandler<Graph>, public visualization::graph_colorer::ElementColorer<Element> {
typedef GraphActionHandler<Graph> base;
// For each element will store a bitmask of used there colors.
@@ -273,7 +274,7 @@ public:
};
template<class Graph>
-class ColorHandler: public visualization::GraphColorer<Graph>, public GraphActionHandler<Graph> {
+class ColorHandler: public visualization::graph_colorer::GraphColorer<Graph>, public GraphActionHandler<Graph> {
typedef GraphActionHandler<Graph> base;
typedef typename Graph::EdgeId EdgeId;
typedef typename Graph::VertexId VertexId;
@@ -369,14 +370,14 @@ public:
//This is a bad unsafe code! The right way is to use shared_ptr of this class in all interfaces.
//Then one can easily draw with this colorer without any delegation
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructColorer() const {
- using namespace omnigraph::visualization;
- return shared_ptr<GraphColorer<Graph>>(new omnigraph::visualization::DelegatingGraphColorer<Graph>(*this));
+ shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> ConstructColorer() const {
+ using namespace visualization;
+ return shared_ptr<GraphColorer<Graph>>(new visualization::DelegatingGraphColorer<Graph>(*this));
}
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructColorer(GraphComponent<Graph> gc) const {
- shared_ptr<omnigraph::visualization::GraphColorer<Graph>> colorer = ConstructColorer();
- return omnigraph::visualization::BorderDecorator<Graph>::GetInstance(gc, colorer);
+ shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> ConstructColorer(GraphComponent<Graph> gc) const {
+ shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> colorer = ConstructColorer();
+ return visualization::BorderDecorator<Graph>::GetInstance(gc, colorer);
}
size_t max_colors() const {
@@ -430,9 +431,9 @@ void LoadColoring(const Graph& /*g*/
template<class Graph>
-std::auto_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructColorer(
+std::auto_ptr<visualization::graph_colorer::GraphColorer<Graph>> ConstructColorer(
const ColorHandler<Graph>& coloring) {
- using namespace omnigraph::visualization;
+ using namespace visualization;
return std::auto_ptr<GraphColorer<Graph>>(
new CompositeGraphColorer<Graph>(
make_shared<MapColorer<typename Graph::VertexId>>(coloring.VertexColorMap()),
@@ -440,9 +441,9 @@ std::auto_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructColorer(
}
template<class Graph>
-std::auto_ptr<omnigraph::visualization::GraphColorer<Graph>> ConstructBorderColorer(const Graph& /*g*/,
+std::auto_ptr<visualization::graph_colorer::GraphColorer<Graph>> ConstructBorderColorer(const Graph& /*g*/,
const ColorHandler<Graph>& coloring) {
- using namespace omnigraph::visualization;
+ using namespace visualization;
return std::auto_ptr<GraphColorer<Graph>>(
new CompositeGraphColorer<Graph>(
make_shared<FixedColorer<Graph>>("white"),
diff --git a/src/projects/cap/compare_standard.hpp b/src/projects/cap/compare_standard.hpp
index 7e0f85e..426b3f5 100644
--- a/src/projects/cap/compare_standard.hpp
+++ b/src/projects/cap/compare_standard.hpp
@@ -7,16 +7,16 @@
#pragma once
-#include "dev_support/standard_base.hpp"
+#include "utils/standard_base.hpp"
// log
-#include "dev_support/logger/logger.hpp"
+#include "utils/logger/logger.hpp"
// utils
-#include "dev_support/cpp_utils.hpp"
-#include "dev_support/path_helper.hpp"
+#include "utils/cpp_utils.hpp"
+#include "utils/path_helper.hpp"
-#include "dev_support/simple_tools.hpp"
+#include "utils/simple_tools.hpp"
// longseq
#include "longseq.hpp"
@@ -25,12 +25,12 @@
#include "cap_config_struct.hpp"
// io
-#include "io/reads_io/ireader.hpp"
-#include "io/reads_io/converting_reader_wrapper.hpp"
-#include "io/reads_io/vector_reader.hpp"
-#include "io/reads_io/multifile_reader.hpp"
-#include "io/reads_io/rc_reader_wrapper.hpp"
-#include "io/reads_io/osequencestream.hpp"
+#include "io/reads/ireader.hpp"
+#include "io/reads/converting_reader_wrapper.hpp"
+#include "io/reads/vector_reader.hpp"
+#include "io/reads/multifile_reader.hpp"
+#include "io/reads/rc_reader_wrapper.hpp"
+#include "io/reads/osequencestream.hpp"
namespace cap {
typedef io::SingleRead Contig;
@@ -42,6 +42,6 @@ typedef io::ReadStreamList<Contig> ContigStreams;
}
// debruijn
-#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/core/graph.hpp"
#include "pipeline/graph_pack.hpp"
-#include "algorithms/graph_construction.hpp"
+#include "modules/graph_construction.hpp"
diff --git a/src/projects/cap/comparison_utils.hpp b/src/projects/cap/comparison_utils.hpp
index eefe93d..2dddb7a 100644
--- a/src/projects/cap/comparison_utils.hpp
+++ b/src/projects/cap/comparison_utils.hpp
@@ -8,17 +8,17 @@
#pragma once
#include "pipeline/graphio.hpp"
-#include "dev_support/simple_tools.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
+#include "utils/simple_tools.hpp"
+#include "assembly_graph/core/graph.hpp"
#include "coordinates_handler.hpp"
#include "math/xmath.h"
#include <iostream>
#include <vector>
-#include "dev_support/logger/logger.hpp"
-#include "io/reads_io/multifile_reader.hpp"
-#include "io/reads_io/splitting_wrapper.hpp"
-#include "io/reads_io/modifying_reader_wrapper.hpp"
-#include "io/reads_io/vector_reader.hpp"
+#include "utils/logger/logger.hpp"
+#include "io/reads/multifile_reader.hpp"
+#include "io/reads/splitting_wrapper.hpp"
+#include "io/reads/modifying_reader_wrapper.hpp"
+#include "io/reads/vector_reader.hpp"
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/xml_parser.hpp>
@@ -137,7 +137,7 @@ inline void PrintGraphComponentContainingEdge(const string& file_name, const Gra
}
template<class Graph>
-class EdgeCoordinatesGraphLabeler: public AbstractGraphLabeler<Graph> {
+class EdgeCoordinatesGraphLabeler: public visualization::graph_labeler::AbstractGraphLabeler<Graph> {
typedef typename Graph::EdgeId EdgeId;
typedef typename Graph::VertexId VertexId;
public:
diff --git a/src/projects/cap/coordinates_handler.hpp b/src/projects/cap/coordinates_handler.hpp
index 3caeb4c..a5a7ad1 100644
--- a/src/projects/cap/coordinates_handler.hpp
+++ b/src/projects/cap/coordinates_handler.hpp
@@ -10,8 +10,8 @@
#include <cstring>
#include <vector>
#include <algorithm>
-#include "data_structures/sequence/sequence.hpp"
-#include "data_structures/sequence/sequence_tools.hpp"
+#include "sequence/sequence.hpp"
+#include "sequence/sequence_tools.hpp"
namespace cap {
diff --git a/src/projects/cap/deprecated/tools_deprecated.cpp b/src/projects/cap/deprecated/tools_deprecated.cpp
index 63883cf..6f13424 100644
--- a/src/projects/cap/deprecated/tools_deprecated.cpp
+++ b/src/projects/cap/deprecated/tools_deprecated.cpp
@@ -64,8 +64,8 @@
// ConstructColoredGraph(gp, coloring, streams, false, br_delta);
// // INFO("Filling ref pos " << gp.genome.size());
-// // FillPos(gp_, gp_.genome, "ref_0");
-// // FillPos(gp_, !gp_.genome, "ref_1");
+// // visualization::position_filler::FillPos(gp_, gp_.genome, "ref_0");
+// // visualization::position_filler::FillPos(gp_, !gp_.genome, "ref_1");
// //Indels
// // make_dir(output_folder + "indels/");
diff --git a/src/projects/cap/diff_masking.hpp b/src/projects/cap/diff_masking.hpp
index b4027be..67ef45e 100644
--- a/src/projects/cap/diff_masking.hpp
+++ b/src/projects/cap/diff_masking.hpp
@@ -7,10 +7,10 @@
#pragma once
-#include "io/reads_io/read_stream_vector.hpp"
-#include "algorithms/graph_construction.hpp"
+#include "io/reads/read_stream_vector.hpp"
+#include "modules/graph_construction.hpp"
#include "stages/simplification_pipeline/graph_simplification.hpp"
-#include "algorithms/graph_read_correction.hpp"
+#include "modules/graph_read_correction.hpp"
#include "test_utils.hpp"
#include "coloring.hpp"
@@ -135,7 +135,7 @@ ContigStreams RefineStreams(ContigStreams& streams,
size_t k,
size_t delta = 5,
const std::string &workdir = "tmp") {
- typedef debruijn_graph::KmerStoringEdgeIndex<Graph, Seq, kmer_index_traits<runtime_k::RtSeq>, debruijn_graph::SimpleStoring> RefiningIndex;
+ typedef debruijn_graph::KmerStoringEdgeIndex<Graph, Seq, kmer_index_traits<RtSeq>, debruijn_graph::SimpleStoring> RefiningIndex;
typedef graph_pack<ConjugateDeBruijnGraph, Seq, RefiningIndex> refining_gp_t;
refining_gp_t gp(k, workdir);
@@ -281,7 +281,7 @@ inline void PerformIterativeRefinement(ContigStreams& streams,
gene_collection);
} else {
omp_set_num_threads(8);
- PerformRefinement<runtime_k::RtSeq>(streams, root, suffixes, current_k,
+ PerformRefinement<RtSeq>(streams, root, suffixes, current_k,
gene_root, gene_collection);
}
diff --git a/src/projects/cap/gene_analysis.hpp b/src/projects/cap/gene_analysis.hpp
index a174024..07f99fe 100644
--- a/src/projects/cap/gene_analysis.hpp
+++ b/src/projects/cap/gene_analysis.hpp
@@ -7,8 +7,8 @@
#pragma once
-#include "dev_support/standard_base.hpp"
-#include "dev_support/simple_tools.hpp"
+#include "utils/standard_base.hpp"
+#include "utils/simple_tools.hpp"
#include "comparison_utils.hpp"
#include "boost/tokenizer.hpp"
#include "coloring.hpp"
diff --git a/src/projects/cap/genome_correction.hpp b/src/projects/cap/genome_correction.hpp
index e9ba688..52ba5c4 100644
--- a/src/projects/cap/genome_correction.hpp
+++ b/src/projects/cap/genome_correction.hpp
@@ -11,7 +11,8 @@
#include <vector>
#include <map>
-#include "utils/adt/bag.hpp"
+#include <common/visualization/graph_labeler.hpp>
+#include "common/adt/bag.hpp"
namespace cap {
@@ -376,7 +377,7 @@ class SimpleInDelCorrector {
void GenPicAlongPath(const vector<EdgeId> path, size_t cnt) {
utils::MakeDirPath("ref_correction");
- WriteComponentsAlongPath(g_, StrGraphLabeler<Graph>(g_),
+ WriteComponentsAlongPath(g_, visualization::graph_labeler::StrGraphLabeler<Graph>(g_),
"ref_correction/" + ToString(cnt) + ".dot", 100000, 10,
TrivialMappingPath(g_, path), *ConstructColorer(coloring_));
}
@@ -384,7 +385,8 @@ class SimpleInDelCorrector {
void GenPicAroundEdge(EdgeId e, size_t cnt) {
utils::MakeDirPath("ref_correction");
GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g_, e, 10, 100000);
- omnigraph::visualization::WriteComponent(g_, "ref_correction/" + ToString(cnt) + ".dot", component, coloring_.GetInstance(), StrGraphLabeler<Graph>(g_));
+ visualization::visualization_utils::WriteComponent(g_, "ref_correction/" + ToString(cnt) + ".dot", component, coloring_.GetInstance(),
+ visualization::graph_labeler::StrGraphLabeler<Graph>(g_));
}
void CorrectGenomePath(size_t genome_start, size_t genome_end,
diff --git a/src/projects/cap/junk_cropping_reader.hpp b/src/projects/cap/junk_cropping_reader.hpp
index 5927d75..9c21196 100644
--- a/src/projects/cap/junk_cropping_reader.hpp
+++ b/src/projects/cap/junk_cropping_reader.hpp
@@ -6,8 +6,8 @@
//***************************************************************************
#pragma once
-#include "dev_support/standard_base.hpp"
-#include "io/reads_io/delegating_reader_wrapper.hpp"
+#include "utils/standard_base.hpp"
+#include "io/reads/delegating_reader_wrapper.hpp"
namespace cap {
diff --git a/src/projects/cap/longseq.hpp b/src/projects/cap/longseq.hpp
index 571e69e..7a454ed 100644
--- a/src/projects/cap/longseq.hpp
+++ b/src/projects/cap/longseq.hpp
@@ -10,9 +10,9 @@
#include <cstdlib>
#include <cstdint>
#include "polynomial_hash.hpp"
-#include "dev_support/log.hpp"
-#include "data_structures/sequence/sequence.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "utils/log.hpp"
+#include "sequence/sequence.hpp"
+#include "utils/openmp_wrapper.h"
namespace cap {
diff --git a/src/projects/cap/main.cpp b/src/projects/cap/main.cpp
index 7228aab..2b646c2 100644
--- a/src/projects/cap/main.cpp
+++ b/src/projects/cap/main.cpp
@@ -9,14 +9,14 @@
#include "cap_kmer_index.hpp"
#include "cap_logger.hpp"
-#include "dev_support/segfault_handler.hpp"
-#include "dev_support/stacktrace.hpp"
+#include "utils/segfault_handler.hpp"
+#include "utils/stacktrace.hpp"
#include "pipeline/config_struct.hpp"
-#include "dev_support/simple_tools.hpp"
+#include "utils/simple_tools.hpp"
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
-#include "dev_support/memory_limit.hpp"
+#include "utils/memory_limit.hpp"
#include "io/dataset_support/read_converter.hpp"
#include "cap_online_visualizer.hpp"
diff --git a/src/projects/cap/mosaic.hpp b/src/projects/cap/mosaic.hpp
index 1939a3f..ac75fbb 100644
--- a/src/projects/cap/mosaic.hpp
+++ b/src/projects/cap/mosaic.hpp
@@ -5,12 +5,12 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/standard_base.hpp"
-#include "io/reads_io/rc_reader_wrapper.hpp"
-#include "io/reads_io/sequence_reader.hpp"
+#include "utils/standard_base.hpp"
+#include "io/reads/rc_reader_wrapper.hpp"
+#include "io/reads/sequence_reader.hpp"
#include "diff_masking.hpp"
-#include "utils/adt/bag.hpp"
-#include "io/reads_io/vector_reader.hpp"
+#include "common/adt/bag.hpp"
+#include "io/reads/vector_reader.hpp"
#include "visualization/graph_colorer.hpp"
namespace cap {
@@ -950,7 +950,7 @@ void DrawGraph(const vector<StrandRange>& all_ranges,
const vector<StrandRange>& full_mosaic_ranges,
const GenomeBlockComposition& block_composition) {
make_dir("tmp");
- graph_pack<Graph, runtime_k::RtSeq> gp(block_composition.block_info().g().k(), "tmp", 0);
+ graph_pack<Graph, RtSeq> gp(block_composition.block_info().g().k(), "tmp", 0);
auto stream = io::RCWrap(StreamInstance(ExtractSequences(all_ranges, block_composition)));
auto streams = io::ReadStreamList<io::SingleRead>(stream);
@@ -959,9 +959,9 @@ void DrawGraph(const vector<StrandRange>& all_ranges,
auto full_mosaic_pos_stream = io::RCWrap(StreamInstance(ExtractSequences(full_mosaic_ranges, block_composition), mosaic_names(full_mosaic_ranges.size())));
INFO("Threading " << full_mosaic_ranges.size() << " full mosaics");
- FillPos(gp, *full_mosaic_pos_stream);
+ visualization::position_filler::FillPos(gp, *full_mosaic_pos_stream);
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
shared_ptr<GraphSplitter<Graph>> splitter = omnigraph::ReliableSplitter(gp.g,
numeric_limits<size_t>::max(),
@@ -971,8 +971,8 @@ void DrawGraph(const vector<StrandRange>& all_ranges,
path::remove_if_exists("mosaic_pics");
path::make_dir("mosaic_pics");
INFO("Writing components");
- omnigraph::visualization::WriteComponents(gp.g, "mosaic_pics/", splitter,
- omnigraph::visualization::DefaultColorer(gp.g), labeler);
+ visualization::visualization_utils::WriteComponents(gp.g, "mosaic_pics/", splitter,
+ visualization::graph_colorer::DefaultColorer(gp.g), labeler);
INFO("Components written");
}
diff --git a/src/projects/cap/repeat_masking.hpp b/src/projects/cap/repeat_masking.hpp
index 5928bb8..ad1e19c 100644
--- a/src/projects/cap/repeat_masking.hpp
+++ b/src/projects/cap/repeat_masking.hpp
@@ -7,9 +7,9 @@
#pragma once
-#include "data_structures/sequence/nucl.hpp"
-#include "io/reads_io/modifying_reader_wrapper.hpp"
-#include "utils/adt/bag.hpp"
+#include "sequence/nucl.hpp"
+#include "io/reads/modifying_reader_wrapper.hpp"
+#include "common/adt/bag.hpp"
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_01.hpp>
#include <boost/random/uniform_int.hpp>
@@ -132,7 +132,7 @@ public:
class RepeatMasker : public io::SequenceModifier {
private:
- typedef runtime_k::RtSeq Kmer;
+ typedef RtSeq Kmer;
typedef KeyIteratingMap<Kmer, Count, kmer_index_traits<Kmer>, SimpleStoring> KmerCountIndex;
typedef typename KmerCountIndex::KeyWithHash KeyWithHash;
typedef KmerCountIndex::KMerIdx KmerIdx;
diff --git a/src/projects/cap/serialization.hpp b/src/projects/cap/serialization.hpp
index 7fb38f2..1a94ce2 100644
--- a/src/projects/cap/serialization.hpp
+++ b/src/projects/cap/serialization.hpp
@@ -13,7 +13,7 @@
#include <string>
#include <vector>
-#include "data_structures/sequence/sequence.hpp"
+#include "sequence/sequence.hpp"
namespace cap {
diff --git a/src/projects/cap/simple_inversion_finder.hpp b/src/projects/cap/simple_inversion_finder.hpp
index 3088a0a..d29a272 100644
--- a/src/projects/cap/simple_inversion_finder.hpp
+++ b/src/projects/cap/simple_inversion_finder.hpp
@@ -12,7 +12,7 @@
#include "coordinates_handler.hpp"
#include "compare_standard.hpp"
#include "comparison_utils.hpp"
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
namespace cap {
@@ -299,9 +299,9 @@ class SimpleInversionFinder {
MappingPath<EdgeId> mpath = TrivialMappingPath(g_, path);
//Path<EdgeId> cpath(path, mpath.start_pos(), mpath.end_pos());
- LengthIdGraphLabeler<Graph> basic_labeler(g_);
- EdgePosGraphLabeler<Graph> pos_labeler(g_, gp_.edge_pos);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(g_, gp_.edge_pos);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
WriteComponentsAlongPath(g_, labeler, out_file, edge_length, max_vertices,
mpath, *ConstructBorderColorer(g_, coloring_));
diff --git a/src/projects/cap/stats.hpp b/src/projects/cap/stats.hpp
index 4d7f1ef..abb916e 100644
--- a/src/projects/cap/stats.hpp
+++ b/src/projects/cap/stats.hpp
@@ -11,7 +11,7 @@
#include "assembly_graph/components/graph_component.hpp"
#include "assembly_graph/components/splitters.hpp"
#include "utils.hpp"
-#include "dev_support/simple_tools.hpp"
+#include "utils/simple_tools.hpp"
#include "comparison_utils.hpp"
#include "assembly_graph/graph_support/basic_graph_stats.hpp"
#include "coloring.hpp"
@@ -377,12 +377,12 @@ public:
}
void CountStats() {
- EmptyGraphLabeler<Graph> labeler;
+ visualization::graph_labeler::EmptyGraphLabeler<Graph> labeler;
make_dir("assembly_compare");
shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(this->graph(), 1000000000);
WriteComponents(this->graph(), *splitter, *this,
"assembly_compare/breakpoint_graph.dot",
- *ConstructColorer(coloring_), labeler);
+ *visualization::ConstructColorer(coloring_), labeler);
ready_ = true;
for (size_t i = 0; i < component_type::size; ++i) {
INFO("Number of components of type " << ComponentClassifier<Graph>::info_printer_pos_name(i) << " is " << GetComponentNumber(i));
@@ -419,7 +419,7 @@ public:
}
void PrintComponents(component_type c_type,
- const GraphLabeler<Graph>& labeler,
+ const visualization::graph_labeler::GraphLabeler<Graph>& labeler,
bool create_subdir = true) const {
string filename;
if (create_subdir) {
@@ -460,7 +460,7 @@ public:
}
}
- void CountStats(const GraphLabeler<Graph>& labeler, bool detailed_output =
+ void CountStats(const visualization::graph_labeler::GraphLabeler<Graph>& labeler, bool detailed_output =
true) const {
make_dir(output_folder_);
BreakPointGraphStatistics<Graph> stats(graph_, coloring_);
@@ -524,14 +524,14 @@ class TrivialBreakpointFinder: public AbstractFilter<
void ReportBreakpoint(VertexId v, const string& folder,
const string& prefix) {
TRACE("Vertex " << g_.str(v) << " identified as breakpoint");
- LengthIdGraphLabeler<Graph> basic_labeler(g_);
- EdgePosGraphLabeler<Graph> pos_labeler(g_, pos_);
+ visualization::graph_labeler::LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(g_, pos_);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
VERIFY(g_.OutgoingEdgeCount(v) > 0);
EdgeId e = g_.OutgoingEdges(v).front();
GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g_, e);
- visualization::WriteComponent(
+ visualization::visualization_utils::WriteComponent(
component,
folder + prefix + ToString(g_.int_id(v)) + "_loc.dot",
coloring_.ConstructColorer(component), labeler);
@@ -695,10 +695,10 @@ class SimpleInDelAnalyzer {
}
void WriteAltPath(EdgeId e, const vector<EdgeId>& genome_path) {
- LengthIdGraphLabeler<Graph> basic_labeler(g_);
- EdgePosGraphLabeler<Graph> pos_labeler(g_, edge_pos_);
+ visualization::graph_labeler::LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(g_, edge_pos_);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
string alt_path_folder = folder_ + ToString(g_.int_id(e)) + "/";
make_dir(alt_path_folder);
@@ -843,10 +843,10 @@ private:
INFO(
"Edge " << gp_.g.str(e)
<< " identified as rearrangement connection");
- LengthIdGraphLabeler<Graph> basic_labeler(gp_.g);
- EdgePosGraphLabeler<Graph> pos_labeler(gp_.g, gp_.edge_pos);
+ visualization::graph_labeler::LengthIdGraphLabeler<Graph> basic_labeler(gp_.g);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(gp_.g, gp_.edge_pos);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
INFO(
count_ << " example start_ref_pos: " << start_ref_pos
@@ -855,7 +855,7 @@ private:
boost::format("%s%d_%d_%d_%d.dot") % folder % count_
% gp_.g.int_id(e) % start_ref_pos % end_ref_pos);
GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(gp_.g, e);
- omnigraph::visualization::WriteComponent(component, filename, coloring_.ConstructColorer(component), labeler);
+ visualization::visualization_utils::WriteComponent(component, filename, coloring_.ConstructColorer(component), labeler);
count_++;
}
@@ -1467,10 +1467,10 @@ class MissingGenesAnalyser {
const string output_dir_;
void ReportLocality(const Sequence& s, const string& out_file) {
- LengthIdGraphLabeler<Graph> basic_labeler(g_);
- EdgePosGraphLabeler<Graph> pos_labeler(g_, edge_pos_);
+ visualization::graph_labeler::LengthIdGraphLabeler<Graph> basic_labeler(g_);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(g_, edge_pos_);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
WriteComponentsAlongPath(g_, labeler, out_file, /*split_length*/1000, /*vertex_number*/15
, mapper_.MapSequence(s), *ConstructBorderColorer(g_, coloring_));
diff --git a/src/projects/cap/tools.cpp b/src/projects/cap/tools.cpp
index 41cd674..9a63635 100755
--- a/src/projects/cap/tools.cpp
+++ b/src/projects/cap/tools.cpp
@@ -6,7 +6,7 @@
//***************************************************************************
#include "compare_standard.hpp"
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/log_writers.hpp"
#include "pipeline/graphio.hpp"
#include <boost/test/unit_test.hpp>
diff --git a/src/projects/cap/untangling.hpp b/src/projects/cap/untangling.hpp
index dc8737b..0c2cca7 100644
--- a/src/projects/cap/untangling.hpp
+++ b/src/projects/cap/untangling.hpp
@@ -242,8 +242,8 @@
// Untangle(stream2, 1);
//
// UntangledGraphContigMapper<bp_graph_pack<Graph>> contig_mapper(new_gp_);
-// FillPos(new_gp_.g, contig_mapper, new_gp_.edge_pos, stream1);
-// FillPos(new_gp_.g, contig_mapper, new_gp_.edge_pos, stream2);
+// visualization::position_filler::FillPos(new_gp_.g, contig_mapper, new_gp_.edge_pos, stream1);
+// visualization::position_filler::FillPos(new_gp_.g, contig_mapper, new_gp_.edge_pos, stream2);
// }
//private:
// DECL_LOGGER("UntangledGraphConstructor")
diff --git a/src/projects/cap/visualization.hpp b/src/projects/cap/visualization.hpp
index 7b862bb..c4105b2 100644
--- a/src/projects/cap/visualization.hpp
+++ b/src/projects/cap/visualization.hpp
@@ -6,7 +6,7 @@
//***************************************************************************
#pragma once
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
#include "visualization/visualization_utils.hpp"
namespace cap {
@@ -75,10 +75,10 @@ template<class Graph>
void PrintColoredGraph(const Graph& g, const ColorHandler<Graph>& coloring,
const EdgesPositionHandler<Graph>& pos, const string& output_filename) {
shared_ptr<GraphSplitter<Graph>> splitter = ReliableSplitter<Graph>(g, 1000000, 30);
- LengthIdGraphLabeler<Graph> basic_labeler(g);
- EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
+ visualization::graph_labeler::LengthIdGraphLabeler<Graph> basic_labeler(g);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
WriteComponents(g, splitter, output_filename,
// *ConstructColorer(coloring),
*ConstructBorderColorer(g, coloring), labeler);
@@ -89,12 +89,12 @@ void PrintColoredGraphAroundEdge(const Graph& g,
const ColorHandler<Graph>& coloring, const EdgeId edge,
const EdgesPositionHandler<Graph>& pos, const string& output_filename) {
INFO(output_filename);
- LengthIdGraphLabeler<Graph> basic_labeler(g);
- EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
+ visualization::graph_labeler::LengthIdGraphLabeler<Graph> basic_labeler(g);
+ visualization::graph_labeler::EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(g, edge);
- omnigraph::visualization::WriteComponent(component, output_filename, coloring.ConstructColorer(component), labeler);
+ visualization::visualization_utils::WriteComponent(component, output_filename, coloring.ConstructColorer(component), labeler);
}
template<class Graph>
@@ -111,8 +111,8 @@ void PrintColoredGraphWithColorFilter(const Graph &g, const ColorHandler<Graph>
LengthIdGraphLabeler<Graph> basic_labeler(g);
EdgeCoordinatesGraphLabeler<Graph> pos_labeler(g, pos, genome_names);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
- omnigraph::visualization::WriteComponents(g, output_folder, fs, coloring.ConstructColorer(), labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::visualization_utils::WriteComponents(g, output_folder, fs, coloring.ConstructColorer(), labeler);
}
//fixme code duplication
@@ -130,8 +130,8 @@ void PrintColoredGraphWithColorFilter(const Graph &g, const ColorHandler<Graph>
LengthIdGraphLabeler<Graph> basic_labeler(g);
EdgePosGraphLabeler<Graph> pos_labeler(g, pos);
- CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
- omnigraph::visualization::WriteComponents(g, output_folder, fs, coloring.ConstructColorer(), labeler);
+ visualization::graph_labeler::CompositeLabeler<Graph> labeler(basic_labeler, pos_labeler);
+ visualization::visualization_utils::WriteComponents(g, output_folder, fs, coloring.ConstructColorer(), labeler);
}
//todo alert!!! magic constants!!!
@@ -146,7 +146,7 @@ void WriteComponentsAlongSequence(
typedef typename gp_t::graph_t Graph;
LengthIdGraphLabeler < Graph > basic_labeler(gp.g);
EdgePosGraphLabeler < Graph > pos_labeler(gp.g, gp.edge_pos);
- CompositeLabeler < Graph > labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler < Graph > labeler(basic_labeler, pos_labeler);
}
template<class gp_t>
@@ -156,7 +156,7 @@ void PrintColoredGraphAlongRef(const gp_t& gp,
LengthIdGraphLabeler < Graph > basic_labeler(gp.g);
EdgePosGraphLabeler < Graph > pos_labeler(gp.g, gp.edge_pos);
- CompositeLabeler < Graph > labeler(basic_labeler, pos_labeler);
+ visualization::graph_labeler::CompositeLabeler < Graph > labeler(basic_labeler, pos_labeler);
// only breakpoints
TrivialBreakpointFinder<Graph> bp_f(gp.g, coloring, gp.edge_pos);
diff --git a/src/projects/cclean/CMakeLists.txt b/src/projects/cclean/CMakeLists.txt
new file mode 100644
index 0000000..24ce7b9
--- /dev/null
+++ b/src/projects/cclean/CMakeLists.txt
@@ -0,0 +1,30 @@
+############################################################################
+# Copyright (c) 2015 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint-Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+project(cclean CXX)
+aux_source_directory(. SRC_LIST)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+file(GLOB ${CMAKE_CURRENT_SOURCE_DIR}
+ "*.hh"
+ "*.h"
+ "*.hpp"
+ "*.cpp"
+)
+add_executable(${PROJECT_NAME} ${SRC_LIST})
+
+target_link_libraries(cclean ssw input cityhash ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(cclean PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+install(TARGETS cclean
+ DESTINATION bin
+ COMPONENT runtime)
+install(DIRECTORY "${SPADES_CFG_DIR}/cclean"
+ DESTINATION share/spades/configs
+ FILES_MATCHING PATTERN "*.info")
diff --git a/src/projects/cclean/adapter_index.cpp b/src/projects/cclean/adapter_index.cpp
new file mode 100644
index 0000000..29d7f3a
--- /dev/null
+++ b/src/projects/cclean/adapter_index.cpp
@@ -0,0 +1,50 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "adapter_index.hpp"
+#include "io/read_processor.hpp"
+#include "valid_kmer_generator.hpp"
+
+#include "io/ireadstream.hpp"
+#include "config_struct_cclean.hpp"
+
+#include <libcxx/sort.hpp>
+
+using namespace cclean;
+
+void AdapterIndexBuilder::FillAdapterIndex(const std::string &db, AdapterIndex &data) {
+ data.clear();
+
+ INFO("Reading adapter database from " << db);
+ ireadstream irs(db);
+ while (!irs.eof()) {
+ Read r;
+ irs >> r;
+ const std::string &seq = r.getSequenceString();
+
+ data.seqs_.push_back(seq);
+ data.seqs_.push_back(ReverseComplement(seq));
+ }
+
+ INFO("Filling adapter index");
+ for (size_t i = 0, e = data.seqs_.size(); i !=e; ++i) {
+ const std::string &seq = data.seqs_[i];
+ ValidKMerGenerator<cclean::K> gen(seq.c_str(), NULL, seq.size());
+
+ while (gen.HasMore()) {
+ KMer kmer = gen.kmer();
+
+ auto& entry = data.index_[kmer];
+ entry.insert(i);
+
+ gen.Next();
+ }
+ }
+
+ INFO("Done. Total " << data.seqs_.size() << " adapters processed. Total "
+ << data.index_.size() << " unique k-mers.");
+}
diff --git a/src/projects/cclean/adapter_index.hpp b/src/projects/cclean/adapter_index.hpp
new file mode 100644
index 0000000..1bcc21f
--- /dev/null
+++ b/src/projects/cclean/adapter_index.hpp
@@ -0,0 +1,61 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef CCLEAN_ADAPTERINDEX_HPP
+#define CCLEAN_ADAPTERINDEX_HPP
+
+#include "sequence/seq.hpp"
+#include "utils/mph_index/kmer_index.hpp"
+
+#include <string>
+#include <set>
+#include <unordered_map>
+
+namespace cclean {
+const unsigned K = 10;
+typedef Seq<K> KMer;
+
+class AdapterIndex {
+ typedef std::set<std::size_t> IndexValueType;
+ std::unordered_map<KMer, IndexValueType, KMer::hash> index_;
+
+ public:
+ AdapterIndex() {}
+
+ void clear() {
+ index_.clear();
+ seqs_.clear();
+ }
+ IndexValueType& operator[](cclean::KMer s) { return index_[s]; }
+ auto find(cclean::KMer s) const -> decltype(index_.find(s)) { return index_.find(s); }
+ auto end() const -> decltype(index_.end()) { return index_.end(); }
+
+ bool contains(cclean::KMer s) const {
+ return index_.find(s) != index_.end();
+ }
+ const std::string& seq(size_t idx) const { return seqs_[idx]; }
+
+ private:
+ std::vector<std::string> seqs_;
+
+ friend class AdapterIndexBuilder;
+};
+
+class AdapterIndexBuilder {
+ public:
+ AdapterIndexBuilder() {}
+
+ void FillAdapterIndex(const std::string &db, AdapterIndex &index);
+
+ private:
+ DECL_LOGGER("Index Building");
+};
+
+ // end of namespace
+}
+
+#endif // CCLEAN_ADAPTERINDEX_HPP
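
The two files above define the cclean adapter index: FillAdapterIndex() reads every adapter (plus its reverse complement) from the database and records, for each 10-mer, which adapter sequences contain it. The sketch below restates that lookup idea as a small standalone C++ program; it uses std::string k-mers and toy adapter sequences instead of the project's Seq<K>/ireadstream types, so every name in it is illustrative rather than part of the patch.

#include <iostream>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

// Toy stand-in for AdapterIndex: every k-mer of every adapter maps to the ids of the
// adapters containing it, so a read's k-mers select candidate adapters to align.
static const size_t kK = 10;

std::unordered_map<std::string, std::set<size_t>>
BuildToyIndex(const std::vector<std::string> &adapters) {
  std::unordered_map<std::string, std::set<size_t>> index;
  for (size_t i = 0; i < adapters.size(); ++i) {
    const std::string &seq = adapters[i];
    if (seq.size() < kK) continue;
    for (size_t pos = 0; pos + kK <= seq.size(); ++pos)
      index[seq.substr(pos, kK)].insert(i);  // same k-mer -> {adapter ids} layout as index_
  }
  return index;
}

int main() {
  std::vector<std::string> adapters = {"AGATCGGAAGAGCACACGTC", "CTGTCTCTTATACACATCT"};
  auto index = BuildToyIndex(adapters);

  std::string read = "TTTTAGATCGGAAGAGCACA";
  std::set<size_t> candidates;  // adapters sharing at least one k-mer with the read
  for (size_t pos = 0; pos + kK <= read.size(); ++pos) {
    auto hit = index.find(read.substr(pos, kK));
    if (hit != index.end()) candidates.insert(hit->second.begin(), hit->second.end());
  }
  std::cout << candidates.size() << " candidate adapter(s)" << std::endl;  // prints 1
  return 0;
}
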
diff --git a/src/projects/cclean/additional.cpp b/src/projects/cclean/additional.cpp
new file mode 100644
index 0000000..ed0065f
--- /dev/null
+++ b/src/projects/cclean/additional.cpp
@@ -0,0 +1,69 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef ADDITIONAL_CPP
+#define ADDITIONAL_CPP
+
+#include "output.hpp"
+#include "config_struct_cclean.hpp"
+#include "io/read_processor.hpp"
+
+ enum WorkModeType {
+ NONE = 0,
+ SINGLE_END = 1,
+ SINGLE_END_Q = 2,
+ BRUTE_SIMPLE = 3,
+ BRUTE_WITH_Q = 4
+ };
+
+ constexpr double MatchScore = 0.6;
+ constexpr double MismatchScore = 100;
+
+ class AbstractCclean {
+ // Abstract base class for cclean functors
+ public:
+ AbstractCclean(std::ostream &aligned_output, std::ostream &bed,
+ const std::string &db,
+ const WorkModeType &mode,
+ const unsigned mlen,
+ const bool full_inform = false)
+ :aligned_(0), full_inform_(full_inform), read_mlen_(mlen),
+ mismatch_threshold_(cfg::get().mismatch_threshold),
+ score_threshold_(cfg::get().score_treshold),
+ aligned_part_fraction_(cfg::get().aligned_part_fraction),
+ db_name_(db), mode_(mode), aligned_output_stream_(aligned_output),
+ bad_stream_(bed) {}
+ virtual Read operator()(const Read &read, bool *ok) = 0;
+ inline size_t aligned() { return aligned_; }
+ virtual ~AbstractCclean() {}
+
+ protected:
+ size_t aligned_;
+
+ const bool full_inform_;
+ const uint read_mlen_;
+ const double mismatch_threshold_; // for nonquality mode
+ const double score_threshold_; // for quality mode
+
+ const double aligned_part_fraction_;
+ const std::string &db_name_;
+ const WorkModeType mode_;
+
+ std::ostream &aligned_output_stream_;
+ std::ostream &bad_stream_;
+ // Abstract base class for clean functors
+ class AbstractCleanFunctor {
+ public:
+ inline virtual bool operator()(const Read &r,
+ const StripedSmithWaterman::Alignment &a,
+ double aligned_part, const std::string &adapter,
+ double *best_score) = 0;
+ virtual ~AbstractCleanFunctor() {}
+ };
+ };
+
+#endif // ADDITIONAL_CPP
diff --git a/src/projects/cclean/brute_force_clean.cpp b/src/projects/cclean/brute_force_clean.cpp
new file mode 100644
index 0000000..de35bb3
--- /dev/null
+++ b/src/projects/cclean/brute_force_clean.cpp
@@ -0,0 +1,97 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "brute_force_clean.hpp"
+
+#include <string>
+#include <vector>
+#include <iostream>
+
+#include "adapter_index.hpp"
+#include <ssw/ssw_cpp.h> // Striped Smith-Waterman aligner
+#include "additional.cpp"
+#include "output.hpp"
+
+using std::string;
+using std::vector;
+using StripedSmithWaterman::Filter;
+using StripedSmithWaterman::Aligner;
+using StripedSmithWaterman::Alignment;
+using cclean_output::print_alignment;
+using cclean_output::print_bad;
+using cclean_output::print_match;
+using cclean_output::print_read;
+
+static inline bool is_alignment_good(const StripedSmithWaterman::Alignment& a,
+ const std::string& sequence,
+ const std::string& query,
+ double aligned_part_fraction) {
+ // Check that the query adjoins or even overlaps the sequence edge
+ return (std::min(a.query_end - a.query_begin + 1, a.ref_end - a.ref_begin + 1)
+ / (double) query.size() > aligned_part_fraction) &&
+ (a.ref_begin == 0 || a.ref_end == sequence.size() - 1);
+}
+
+Read BruteForceClean::operator()(const Read &read, bool *ok) {
+ const string &read_name = read.getName();
+ const string &seq_string = read.getSequenceString();
+ Filter filter; // SSW filter
+ Aligner aligner; // SSW aligner
+ aligner.SetReferenceSequence(seq_string.c_str(),
+ static_cast<int>(seq_string.size()));
+ Alignment alignment;
+
+ // There can be many adapter alignments, so we search for the most probable one
+ double best_score;
+ if (mode_ == BRUTE_SIMPLE) // in either mode the first overlap initializes the best score
+ best_score = mismatch_threshold_;
+ if (mode_ == BRUTE_WITH_Q)
+ best_score = score_threshold_;
+ std::string best_adapter = "";
+
+ // Align the read against each adapter
+ for (std::string adapt_string: adap_seqs_) {
+
+ aligner.Align(adapt_string.c_str(), filter, &alignment);
+ if((*checker)(read, alignment, aligned_part_fraction_, adapt_string,
+ &best_score)) {
+ best_adapter = adapt_string;
+ }
+ }
+
+ if (!best_adapter.empty()) {
+ aligner.Align(best_adapter.c_str(), filter, &alignment);
+ aligned_ += 1;
+ Read cuted_read = cclean_utils::CutRead(read, alignment.ref_begin,
+ alignment.ref_end);
+ if (full_inform_) // If the user wants full output
+# pragma omp critical
+ print_alignment(aligned_output_stream_, alignment, seq_string,
+ best_adapter, read_name, db_name_);
+
+ // The cut read must be >= the minimum length specified by the argument
+ if (cuted_read.getSequenceString().size() >= read_mlen_) {
+ if (full_inform_) // If the user wants full output
+# pragma omp critical
+ print_bad(bad_stream_, read_name, alignment.ref_begin, alignment.ref_end);
+ (*ok) = true;
+ return cuted_read;
+ }
+ else {
+ if (full_inform_)
+# pragma omp critical
+ print_bad(bad_stream_, read_name, 0, alignment.ref_end);
+ (*ok) = false;
+ return cuted_read;
+ }
+ }
+ else {
+ // Read was not aligned with any adapter
+ (*ok) = true;
+ return read;
+ }
+}
diff --git a/src/projects/cclean/brute_force_clean.hpp b/src/projects/cclean/brute_force_clean.hpp
new file mode 100644
index 0000000..daeabe5
--- /dev/null
+++ b/src/projects/cclean/brute_force_clean.hpp
@@ -0,0 +1,72 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef BRUTE_FORCE_CLEAN_HPP
+#define BRUTE_FORCE_CLEAN_HPP
+
+#include "utils.hpp"
+#include "additional.cpp"
+
+class BruteForceClean: public AbstractCclean {
+ // Class that receives a read via operator() and cleans it, if possible
+ public:
+ BruteForceClean(std::ostream& aligned_output,
+ std::ostream& bed,const std::string &db,
+ const WorkModeType &mode,
+ const uint mlen,
+ const std::vector<std::string> &gen,
+ const bool full_inform = false)
+ : AbstractCclean(aligned_output, bed, db, mode, mlen, full_inform),
+ adap_seqs_(gen) {
+ if(mode == BRUTE_SIMPLE) checker = new BruteCleanFunctor;
+ if(mode == BRUTE_WITH_Q) checker = new BruteQualityCleanFunctor;
+ }
+ virtual ~BruteForceClean() { delete checker; }
+ // The ReadProcessor class feeds each read to this operator
+ virtual Read operator()(const Read &read, bool *ok);
+
+ private:
+ const std::vector<std::string> &adap_seqs_;
+ std::string best_adapter_;
+ AbstractCleanFunctor *checker; // Checks whether the adapter is in the read
+
+ // Functors for cleaning in the different modes
+ class BruteCleanFunctor: public AbstractCleanFunctor {
+ virtual inline bool operator()(const Read &r,
+ const StripedSmithWaterman::Alignment &a,
+ double aligned_part, const std::string &adapter,
+ double *best_score) {
+ double cur_score = cclean_utils::
+ GetMismatches(r.getSequenceString(), adapter, a);
+ if (cur_score < (*best_score) &&
+ cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter,
+ aligned_part)) {
+ (*best_score) = cur_score;
+ return true;
+ }
+ return false;
+ }
+ };
+ class BruteQualityCleanFunctor: public AbstractCleanFunctor {
+ virtual inline bool operator()(const Read &r,
+ const StripedSmithWaterman::Alignment &a,
+ double aligned_part, const std::string &adapter,
+ double *best_score) {
+ double cur_score = cclean_utils::
+ GetScoreWithQuality(a, r.getQuality().str());
+ if (cur_score >= (*best_score) &&
+ cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter,
+ aligned_part)) {
+ (*best_score) = cur_score;
+ return true;
+ }
+ return false;
+ }
+ };
+};
+
+#endif // BRUTE_FORCE_CLEAN_HPP
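
BruteForceClean aligns every adapter in the database against the read with the striped Smith-Waterman aligner and remembers the adapter that beats the current best score (a mismatch count in BRUTE_SIMPLE mode, a quality-weighted score in BRUTE_WITH_Q mode), seeding the best score with the configured threshold so weak hits are ignored. The following standalone sketch reproduces just that selection loop; the scoring callback stands in for the SSW alignment plus the AbstractCleanFunctor check, and all names and values are illustrative, not part of the patch.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Result of the "align every adapter, keep the best" loop: adapter stays empty
// when nothing beats the threshold, mirroring best_adapter in the patch.
struct BestAdapter {
  std::string adapter;
  double score;
};

BestAdapter PickBestAdapter(const std::string &read,
                            const std::vector<std::string> &adapters,
                            double threshold,
                            const std::function<double(const std::string &, const std::string &)> &score) {
  BestAdapter best{"", threshold};  // the first candidate must beat the threshold
  for (const std::string &a : adapters) {
    double s = score(read, a);
    if (s < best.score) {           // BRUTE_SIMPLE keeps the lowest mismatch score
      best.score = s;
      best.adapter = a;
    }
  }
  return best;
}

int main() {
  auto toy_score = [](const std::string &read, const std::string &adapter) {
    // Toy mismatch score: 0 if the adapter occurs verbatim in the read, large otherwise.
    return read.find(adapter) != std::string::npos ? 0.0 : 1000.0;
  };
  BestAdapter hit = PickBestAdapter("TTTTAGATCGGAAGAGC",
                                    {"AGATCGGAAGAGC", "CTGTCTCTTATA"},
                                    /*threshold=*/100.0, toy_score);
  std::cout << (hit.adapter.empty() ? "no adapter" : hit.adapter) << std::endl;
  return 0;
}
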
diff --git a/src/projects/cclean/comparator.hpp b/src/projects/cclean/comparator.hpp
new file mode 100644
index 0000000..355431e
--- /dev/null
+++ b/src/projects/cclean/comparator.hpp
@@ -0,0 +1,18 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef COMPARATOR_H_
+#define COMPARATOR_H_
+
+class Compare {
+ public:
+ bool operator() (std::string * lhs, std::string * rhs) const {
+ return *lhs < *rhs;
+ }
+};
+
+#endif /* COMPARATOR_H_ */
diff --git a/src/projects/cclean/config_struct_cclean.cpp b/src/projects/cclean/config_struct_cclean.cpp
new file mode 100644
index 0000000..c9e9eda
--- /dev/null
+++ b/src/projects/cclean/config_struct_cclean.cpp
@@ -0,0 +1,44 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "config_struct_cclean.hpp"
+#include "pipeline/config_common.hpp"
+#include "utils/openmp_wrapper.h"
+
+void load(cclean_config& cfg, const std::string &filename) {
+ boost::property_tree::ptree pt;
+ boost::property_tree::read_info(filename, pt);
+
+ load(cfg, pt);
+}
+
+void load(cclean_config& cfg, boost::property_tree::ptree const& pt) {
+ using config_common::load;
+ load(cfg.use_quality, pt, "use_quality");
+ load(cfg.use_bruteforce, pt, "use_bruteforce");
+ load(cfg.debug_information, pt, "debug_information");
+
+ load(cfg.score_treshold, pt, "score_treshold");
+ load(cfg.mismatch_threshold, pt, "mismatch_threshold");
+ load(cfg.minimum_lenght, pt, "minimum_lenght");
+ load(cfg.nthreads, pt, "nthreads");
+ load(cfg.aligned_part_fraction, pt, "aligned_part_fraction");
+ load(cfg.buffer_size, pt, "buffer_size");
+
+ load(cfg.dataset_file_name, pt, "dataset");
+ load(cfg.database, pt, "database");
+ load(cfg.input_working_dir, pt, "input_working_dir");
+ load(cfg.output_working_dir, pt, "output_working_dir");
+
+ std::string file_name = cfg.dataset_file_name;
+ cfg.dataset.load(file_name);
+
+ // Fix number of threads according to OMP capabilities.
+ cfg.nthreads = std::min(cfg.nthreads, (unsigned)omp_get_max_threads());
+ // Inform OpenMP runtime about this :)
+ omp_set_num_threads(cfg.nthreads);
+}
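
load() fills cclean_config from a Boost property-tree INFO file, reading each field by its key and then clamping nthreads to what OpenMP allows. A hypothetical cclean.info covering the keys read above could look like the snippet below; the values are placeholders rather than defaults shipped with SPAdes, and the key spellings ("score_treshold", "minimum_lenght") deliberately follow the field names used in the code.

; illustrative cclean configuration (INFO format)
use_quality              true
use_bruteforce           false
debug_information        false

score_treshold           14
mismatch_threshold       3
minimum_lenght           30
nthreads                 8
aligned_part_fraction    0.25
buffer_size              100000

dataset                  dataset.info
database                 adapters.fasta
input_working_dir        input_tmp
output_working_dir       output_tmp
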
diff --git a/src/projects/cclean/config_struct_cclean.hpp b/src/projects/cclean/config_struct_cclean.hpp
new file mode 100644
index 0000000..e56cc92
--- /dev/null
+++ b/src/projects/cclean/config_struct_cclean.hpp
@@ -0,0 +1,42 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef CONFIG_STRUCT_CCLEAN_HPP
+#define CONFIG_STRUCT_CCLEAN_HPP
+
+#include "pipeline/config_singl.hpp"
+#include <boost/property_tree/ptree_fwd.hpp>
+#include "pipeline/library.hpp"
+
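+// The fields below map one-to-one onto keys of the Boost property-tree INFO
+// config parsed in config_struct_cclean.cpp, for example (illustrative values
+// only; the file name and path shown here are hypothetical):
+//   nthreads               16
+//   score_treshold         10
+//   aligned_part_fraction  0.5
+//   database               adapters.fasta
+//   output_working_dir     /tmp/cclean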
+struct cclean_config {
+
+ bool use_quality;
+ bool use_bruteforce;
+ bool debug_information;
+
+ unsigned score_treshold;
+ unsigned mismatch_threshold;
+ unsigned minimum_lenght;
+ unsigned nthreads;
+ unsigned buffer_size;
+ double aligned_part_fraction;
+
+ std::string dataset_file_name;
+ std::string database;
+ std::string input_working_dir;
+ std::string output_working_dir;
+
+ io::DataSet<> dataset;
+};
+
+// main config load function
+void load(cclean_config& cfg, const std::string &filename);
+void load(cclean_config& cfg, boost::property_tree::ptree const& pt);
+
+typedef config_common::config<cclean_config> cfg;
+
+#endif
diff --git a/src/projects/cclean/job_wrappers.cpp b/src/projects/cclean/job_wrappers.cpp
new file mode 100644
index 0000000..3ea37c3
--- /dev/null
+++ b/src/projects/cclean/job_wrappers.cpp
@@ -0,0 +1,97 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <set>
+
+#include "job_wrappers.hpp"
+#include "utils/logger/log_writers.hpp"
+#include "adapter_index.hpp"
+#include "valid_kmer_generator.hpp"
+#include "adapter_index.hpp"
+#include "output.hpp"
+#include "ssw/ssw_cpp.h"
+#include "utils.hpp"
+
+using cclean_output::print_alignment;
+using cclean_output::print_bad;
+using cclean_output::print_match;
+using cclean_output::print_read;
+
+Read SimpleClean::operator()(const Read &read, bool *ok)
+{
+ const std::string& name = read.getName();
+ const std::string& sequence = read.getSequenceString();
+
+ std::set<size_t> to_check;
+ ValidKMerGenerator<cclean::K> gen(sequence.c_str(), NULL, sequence.size());
+ while (gen.HasMore()) {
+ cclean::KMer kmer = gen.kmer();
+
+ auto it = index_.find(kmer);
+ if (it != index_.end())
+ to_check.insert(it->second.begin(), it->second.end());
+
+ gen.Next();
+ }
+
+  // Try to align the adapter artifacts corresponding to the matched k-mers
+ StripedSmithWaterman::Aligner aligner;
+ StripedSmithWaterman::Filter filter;
+  StripedSmithWaterman::Alignment alignment; // Reused for every candidate adapter
+ aligner.SetReferenceSequence(sequence.c_str(), sequence.size());
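+  // Note that the read serves as the SSW reference; each candidate adapter is
+  // aligned against it as the query below.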
+
+  // Pointer to the best-matching adapter
+  const std::string *best_adapter = nullptr;
+  // In both modes the first acceptable overlap initializes the best score.
+  double best_score = 0.0;
+  if (mode_ == SINGLE_END)
+    best_score = mismatch_threshold_;
+  if (mode_ == SINGLE_END_Q)
+    best_score = score_threshold_;
+
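+  // Because best_score is seeded with the configured threshold, only alignments
+  // that pass the threshold can ever become the best match.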
+ for (auto it = to_check.begin(), et = to_check.end(); it != et; ++it) {
+ const std::string &query = index_.seq(*it);
+ aligner.Align(query.c_str(), filter, &alignment);
+    // Check whether this adapter is better than the previous best
+ if((*checker)(read, alignment, aligned_part_fraction_, query,
+ &best_score)) {
+ best_adapter = &query;
+ }
+ }
+
+ if (best_adapter != nullptr) {
+ aligner.Align(best_adapter->c_str(), filter, &alignment);
+ aligned_ += 1;
+ Read cuted_read = cclean_utils::CutRead(read, alignment.ref_begin,
+ alignment.ref_end);
+    if (full_inform_) // If the user requested full output
+# pragma omp critical
+ print_alignment(aligned_output_stream_, alignment, sequence,
+ *best_adapter,name, db_name_);
+
+    // The trimmed read must be at least the minimum length specified in the config
+ if (cuted_read.getSequenceString().size() >= read_mlen_) {
+ if (full_inform_)
+# pragma omp critical
+ print_bad(bad_stream_, name, alignment.ref_begin, alignment.ref_end);
+ (*ok) = true;
+ return cuted_read;
+ }
+ else {
+ if (full_inform_)
+# pragma omp critical
+ print_bad(bad_stream_, name, 0, alignment.ref_end);
+ (*ok) = false;
+ return cuted_read;
+ }
+ }
+ else {
+ // Read was not aligned with any adapter
+ (*ok) = true;
+ return read;
+ }
+}
diff --git a/src/projects/cclean/job_wrappers.hpp b/src/projects/cclean/job_wrappers.hpp
new file mode 100644
index 0000000..7adccb1
--- /dev/null
+++ b/src/projects/cclean/job_wrappers.hpp
@@ -0,0 +1,73 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef JOB_WRAPPERS_HPP
+#define JOB_WRAPPERS_HPP
+
+#include "additional.cpp"
+#include "utils.hpp"
+
+namespace cclean {
+ class AdapterIndex;
+}
+
+class SimpleClean: public AbstractCclean {
+ public:
+ SimpleClean(std::ostream &aligned_output,
+ std::ostream &bed, const std::string &db,
+ const WorkModeType &mode,
+ const unsigned mlen,
+ const cclean::AdapterIndex &index,
+ const bool full_inform = false)
+ : AbstractCclean(aligned_output, bed, db, mode, mlen, full_inform),
+ index_(index) {
+ if(mode_ == SINGLE_END) checker = new SimpleCleanFunctor;
+ if(mode_ == SINGLE_END_Q) checker = new SimpleQualityCleanFunctor;
+ }
+ virtual ~SimpleClean() { delete checker; }
+ virtual Read operator()(const Read &read, bool *ok);
+
+ private:
+ const cclean::AdapterIndex &index_;
+  AbstractCleanFunctor *checker; // Checks whether an adapter occurs in the read
+
+  // Cleaning functors for the different work modes
+ class SimpleCleanFunctor: public AbstractCleanFunctor {
+ virtual inline bool operator()(const Read &r,
+ const StripedSmithWaterman::Alignment &a,
+ double aligned_part, const std::string &adapter,
+ double *best_score) {
+ double cur_score = cclean_utils::
+ GetMismatches(r.getSequenceString(), adapter, a);
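+        // Mismatch count: lower is better, so the candidate wins only if it has
+        // strictly fewer mismatches than the current best.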
+ if (cur_score < (*best_score) &&
+ cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter,
+ aligned_part)) {
+ (*best_score) = cur_score;
+ return true;
+ }
+ return false;
+ }
+ };
+ class SimpleQualityCleanFunctor: public AbstractCleanFunctor {
+ virtual inline bool operator()(const Read &r,
+ const StripedSmithWaterman::Alignment &a,
+ double aligned_part, const std::string &adapter,
+ double *best_score) {
+ double cur_score = cclean_utils::
+ GetScoreWithQuality(a, r.getQuality().str());
+ if (cur_score >= (*best_score) &&
+ cclean_utils::is_alignment_good(a, r.getSequenceString(), adapter,
+ aligned_part)) {
+ (*best_score) = cur_score;
+ return true;
+ }
+ return false;
+ }
+ };
+};
+
+#endif /* JOB_WRAPPERS_HPP */
diff --git a/src/projects/cclean/main.cpp b/src/projects/cclean/main.cpp
new file mode 100644
index 0000000..4d50785
--- /dev/null
+++ b/src/projects/cclean/main.cpp
@@ -0,0 +1,86 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <iostream>
+#include <string>
+#include <map>
+#include <exception>
+
+#include "sequence/seq.hpp"
+#include "utils/logger/log_writers.hpp"
+#include "utils/memory_limit.hpp"
+#include "running_modes.hpp"
+#include "config_struct_cclean.hpp"
+#include "utils/simple_tools.hpp"
+#include "adapter_index.hpp"
+#include "utils.hpp"
+
+#include "valid_kmer_generator.hpp"
+#include "io/read_processor.hpp"
+#include "modules/ssw_cpp.h"
+#include "additional.cpp"
+
+#include "job_wrappers.hpp"
+#include "brute_force_clean.hpp"
+
+using logging::logger;
+using logging::create_logger;
+using logging::console_writer;
+using std::string;
+
+constexpr int CONFIG_FILE_ARG = 1;
+
+void usage() {
+ std::cout << "usage: cclean [program config file]" << std::endl;
+}
+
+void create_console_logger() {
+ logger *lg = create_logger("");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
+
+int main(int argc, char *argv[]) {
+
+ create_console_logger();
+
+ if (argc < 2) {
+ usage();
+ return EXIT_FAILURE;
+ }
+
+ std::string config_file = argv[CONFIG_FILE_ARG];
+ INFO("Loading config from " << config_file.c_str());
+ if (!path::FileExists(config_file)) {
+ ERROR("File " + config_file + " doesn't exists.");
+ return EXIT_FAILURE;
+ }
+ cfg::create_instance(config_file);
+
+ const std::string &database = cfg::get().database;
+ if (!path::FileExists(database)) {
+ ERROR("File " + database + " doesn't exists.");
+ return EXIT_FAILURE;
+ }
+ const std::string &dataset = cfg::get().dataset_file_name;
+ if (!path::FileExists(dataset)) {
+ ERROR("File " + dataset + " doesn't exists.");
+ return EXIT_FAILURE;
+ }
+
+ clock_t start = clock();
+
+ Cleaner::ProcessDataset(); // Main work here
+
+ INFO("DONE");
+ clock_t ends = clock();
+ INFO("Processor Time Spent: " << (double) (ends - start) / CLOCKS_PER_SEC
+ << " seconds.");
+ INFO("Goodbye!");
+
+ return EXIT_SUCCESS;
+}
diff --git a/src/projects/cclean/output.cpp b/src/projects/cclean/output.cpp
new file mode 100644
index 0000000..ff85f99
--- /dev/null
+++ b/src/projects/cclean/output.cpp
@@ -0,0 +1,82 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <iostream>
+#include <algorithm>
+#include <fstream>
+#include "output.hpp"
+#include "utils.hpp"
+
+namespace cclean_output {
+
+void print_n_times(std::ostream& output, char c, int n) {
+ for (int i = 0; i < n; ++i) {
+ output << c;
+ }
+}
+
+void print_alignment(std::ostream& output, const StripedSmithWaterman::Alignment &data,
+ const std::string& ref, const std::string& query,
+ const std::string& name, const std::string& database_name) {
+
+ output << "Alignment: input sequence (first line) " << name << " alignes "
+ << std::endl
+ << "sequence from database (last line) " << database_name << std::endl;
+
+ std::string aligned_query, aligned_ref;
+ cclean_utils::RestoreFromCigar(ref, query, aligned_ref, aligned_query, data);
+
+  // If the alignment starts deeper into the query than into the read, shift the
+  // read to the right so that the aligned regions line up.
+ int text_offset = data.ref_begin - data.query_begin < 0 ? data.query_begin
+ - data.ref_begin : 0;
+
+ // ref = read
+ print_n_times(output, ' ', text_offset);
+ output << ref << std::endl;
+ print_n_times(output, ' ', text_offset + data.ref_begin);
+ output << aligned_ref << std::endl;
+
+ // vertical dashes
+ print_n_times(output, ' ', text_offset + data.ref_begin);
+ for (int i = 0; i < (int)std::min(aligned_query.length(), aligned_ref.length()); ++i) {
+ aligned_query.at(i) == aligned_ref.at(i) ? output << "|" : output << "*";
+ }
+ output << std::endl;
+
+ // query = contamination
+ print_n_times(output, ' ', text_offset + data.ref_begin);
+ output << aligned_query << std::endl;
+ print_n_times(output, ' ', data.ref_begin - data.query_begin);
+ output << query << std::endl;
+ output << std::endl;
+ }
+
+void print_match(std::ostream& output, std::ostream& bed, std::map<std::string*,
+ std::vector<int>, Compare>& res, const std::string& name,
+ const std::string& seq, const std::string &db_name) {
+ for (auto it = res.begin(); it != res.end(); ++it) {
+ for (auto it_pos = it->second.begin(); it_pos != it->second.end(); ++it_pos) {
+
+ output << "Match: input sequence (first line) " << name << " matches "
+ << std::endl
+ << "sequence from database (2nd line) " << db_name << std::endl;
+
+ output << seq << std::endl;
+ print_n_times(output, ' ', *it_pos);
+ print_n_times(output, '|', it->first->length());
+ output << std::endl;
+ print_n_times(output, ' ', *it_pos);
+ output << *(it->first) << std::endl;
+ output << std::endl;
+
+ print_bad(bed, name, *it_pos, *it_pos + it->first->size());
+ }
+ }
+}
+//end of namespace
+}
diff --git a/src/projects/cclean/output.hpp b/src/projects/cclean/output.hpp
new file mode 100644
index 0000000..8266a45
--- /dev/null
+++ b/src/projects/cclean/output.hpp
@@ -0,0 +1,49 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef OUTPUT_HPP
+#define OUTPUT_HPP
+
+#include <string>
+#include <vector>
+#include <map>
+#include <io/read.hpp>
+#include <ostream>
+#include "comparator.hpp"
+#include "modules/ssw_cpp.h"
+
+namespace cclean_output {
+
+void print_n_times(std::ostream& output, char c, int n);
+
+void print_alignment(std::ostream& output,
+ const StripedSmithWaterman::Alignment & data,
+ const std::string& ref,
+ const std::string& query, const std::string& name,
+ const std::string& database_name);
+
+void print_match(std::ostream& output, std::ostream& bed, std::map<std::string*,
+ std::vector<int>, Compare>& res, const std::string& name,
+ const std::string& seq, const std::string &db_name);
+
+void print_bad(std::ostream& output, const std::string & name,
+ int start, int stop);
+
+inline void print_read(std::ostream& output, const Read &read) {
+ std::ofstream &stream =
+ reinterpret_cast<std::ofstream&>(output);
+ read.print(stream, Read::PHRED_OFFSET);
+}
+
+inline void print_bad(std::ostream& output, const std::string & name,
+ int start, int stop) {
+ output << name << "\t" << start << "\t" << stop << std::endl;
+}
+
+// end of namespace
+}
+#endif /* OUTPUT_H_ */
diff --git a/src/projects/cclean/running_modes.cpp b/src/projects/cclean/running_modes.cpp
new file mode 100644
index 0000000..73dcdfb
--- /dev/null
+++ b/src/projects/cclean/running_modes.cpp
@@ -0,0 +1,268 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "running_modes.hpp"
+
+#include <string>
+#include <unordered_map>
+#include <algorithm>
+
+#include "adapter_index.hpp"
+#include "output.hpp"
+#include "io/read_processor.hpp"
+#include "pipeline/library.hpp"
+#include "utils/logger/log_writers.hpp"
+#include "job_wrappers.hpp"
+#include "brute_force_clean.hpp"
+
+AbstractCclean *Cleaner::getCleaner(std::ofstream *outf_alig_debug,
+ std::ofstream *outf_bad_deb,
+ const std::string &db, WorkModeType mode,
+ unsigned mlen,
+ const cclean::AdapterIndex &index,
+ bool deb_info) {
+  AbstractCclean *cleaner = nullptr;  // Cleaner for reads, chosen by work mode
+ if (mode == SINGLE_END || mode == SINGLE_END_Q)
+ cleaner = new SimpleClean(*outf_alig_debug, *outf_bad_deb, db,
+ mode, mlen, index, deb_info);
+ if (mode == BRUTE_SIMPLE || mode == BRUTE_WITH_Q)
+ cleaner = new BruteForceClean(*outf_alig_debug, *outf_bad_deb, db,
+ mode, mlen, index.GetSeqs(), deb_info);
+ return cleaner;
+}
+
+void Cleaner::ProcessDataset() {
+  // Read options from the config
+ const std::string db = cfg::get().database;
+ const WorkModeType mode = getMode();
+
+ cclean::AdapterIndex index;
+ cclean::AdapterIndexBuilder().FillAdapterIndex(db, index);
+
+ const io::DataSet<> &dataset = cfg::get().dataset;
+ io::DataSet<> outdataset;
+  // Process the dataset, iterating over its libraries
+ for (auto it = dataset.library_begin(), et = dataset.library_end(); it != et; ++it) {
+ const io::SequencingLibrary<> &lib = *it;
+ io::SequencingLibrary<> outlib = lib;
+ outlib.clear();
+ // Iterating through paired reads in current library lib
+ for (auto I = lib.paired_begin(), E = lib.paired_end(); I != E; ++I) {
+ INFO("Correcting pair reads from " << I->first << " and " << I->second);
+
+ const std::string &file_name_l = I->first;
+ const std::string &file_name_r = I->second;
+ const std::string outcorl = getReadsFilename(cfg::get().output_working_dir,
+ file_name_l, "correct_l");
+ const std::string outcorr = getReadsFilename(cfg::get().output_working_dir,
+ file_name_r, "correct_r");
+ const std::string unpaired = getPureFilename(file_name_l) + "_" +
+ getPureFilename(file_name_r);
+ const std::string outcoru = getReadsFilename(cfg::get().output_working_dir,
+ unpaired, "correct_u");
+ const std::string outbadl = getReadsFilename(cfg::get().output_working_dir,
+ file_name_l, "bad");
+ const std::string outbadr = getReadsFilename(cfg::get().output_working_dir,
+ file_name_r, "bad");
+
+ std::ofstream ofcorl(outcorl.c_str());
+ std::ofstream ofbadl(outbadl.c_str());
+ std::ofstream ofcorr(outcorr.c_str());
+ std::ofstream ofbadr(outbadr.c_str());
+ std::ofstream ofunp (outcoru.c_str());
+
+ CorrectPairedReadFiles(index, file_name_l, file_name_r, &ofbadl, &ofcorl,
+ &ofbadr, &ofcorr, &ofunp, mode);
+ outlib.push_back_paired(outcorl, outcorr);
+ outlib.push_back_single(outcoru);
+ }
+
+ for (auto I = lib.single_begin(), E = lib.single_end(); I != E; ++I) {
+ INFO("Correcting single reads from " << *I);
+
+ const std::string reads_file_name = *I;
+ const std::string outcor = getReadsFilename(cfg::get().output_working_dir,
+ reads_file_name, "correct");
+ const std::string outbad = getReadsFilename(cfg::get().output_working_dir,
+ reads_file_name, "bad");
+
+ std::ofstream ofgood(outcor.c_str());
+ std::ofstream ofbad(outbad.c_str());
+
+ CorrectReadFile(index, reads_file_name, &ofgood, &ofbad, mode);
+ outlib.push_back_single(outcor);
+ }
+ outdataset.push_back(outlib);
+ }
+
+ cfg::get_writable().dataset = outdataset;
+}
+
+void Cleaner::CorrectReadFile(const cclean::AdapterIndex &index,
+ const std::string &fname, std::ofstream *outf_good,
+ std::ofstream *outf_bad, WorkModeType mode) {
+ const unsigned nthreads = cfg::get().nthreads;
+ const std::string db = cfg::get().database;
+ const unsigned mlen = cfg::get().minimum_lenght;
+ const size_t read_buffer_size = nthreads * cfg::get().buffer_size;
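+  // The buffer holds nthreads * buffer_size reads, i.e. buffer_size reads per
+  // worker thread per batch.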
+ std::vector<Read> reads(read_buffer_size);
+ std::vector<bool> res(read_buffer_size, false);
+
+ const bool deb_info = cfg::get().debug_information;
+ std::string bad_out_debug = "";
+ std::string aligned_out_debug = "";
+ if (deb_info) {
+    // Otherwise the ofstreams are not used, so there is no point in creating
+    // real files; they are constructed with empty file names instead.
+ bad_out_debug = getReadsFilename(cfg::get().output_working_dir,
+ fname, "debug.bad");
+ aligned_out_debug = getReadsFilename(cfg::get().output_working_dir,
+ fname, "debug.alig");
+ }
+ std::ofstream ofbad_deb(bad_out_debug.c_str());
+ std::ofstream ofalig_deb(aligned_out_debug.c_str());
+
+ unsigned buffer_no = 0;
+ unsigned count_bad = 0;
+ unsigned count_total = 0;
+
+ ireadstream irs(fname);
+ VERIFY(irs.is_open());
+
+ AbstractCclean *cleaner = getCleaner(&ofalig_deb, &ofbad_deb, db, mode, mlen,
+ index, deb_info);
+
+ while (!irs.eof()) {
+ unsigned buf_size = 0;
+ for (; buf_size < read_buffer_size && !irs.eof(); ++buf_size) {
+ irs >> reads[buf_size];
+ }
+ if(deb_info) INFO("Prepared batch " << buffer_no << " of "
+ << buf_size << " reads.");
+ count_bad += CorrectReadsBatch(cleaner, &res, &reads, buf_size, nthreads);
+ count_total += buf_size;
+ if (deb_info) INFO("Processed batch " << buffer_no);
+ for (size_t i = 0; i < buf_size; ++i) { // Here output reads in files
+ reads[i].print(*(res[i] ? outf_good : outf_bad), Read::PHRED_OFFSET);
+ }
+ if(deb_info) INFO("Written batch " << buffer_no);
+ ++buffer_no;
+ }
+
+ delete cleaner;
+  // Report summary statistics about the results
+ const double percent_val = static_cast<double>(count_total) / 100.0;
+ std::ostringstream percent_bad;
+ percent_bad << std::fixed << std::setprecision(2) <<
+ (static_cast<double>(count_bad) / percent_val);
+ INFO("Total proceed " + std::to_string(count_total) + ", " +
+ std::to_string(count_bad) + " reads (" + percent_bad.str() +
+ " percents of total) is bad.");
+}
+
+void Cleaner::CorrectPairedReadFiles(const cclean::AdapterIndex &index,
+ const std::string &fnamel,
+ const std::string &fnamer, std::ofstream *ofbadl,
+ std::ofstream *ofcorl, std::ofstream *ofbadr,
+ std::ofstream *ofcorr, std::ofstream *ofunp,
+ WorkModeType mode) {
+ const unsigned nthreads = cfg::get().nthreads;
+ const std::string db = cfg::get().database;
+ const unsigned mlen = cfg::get().minimum_lenght;
+ const size_t read_buffer_size = nthreads * cfg::get().buffer_size;
+
+ std::vector<Read> left_reads(read_buffer_size);
+ std::vector<Read> right_reads(read_buffer_size);
+ std::vector<bool> left_res(read_buffer_size, false);
+ std::vector<bool> right_res(read_buffer_size, false);
+
+ ireadstream irsl(fnamel);
+ ireadstream irsr(fnamer);
+ VERIFY(irsl.is_open());
+ VERIFY(irsr.is_open());
+
+ const bool deb_info = cfg::get().debug_information;
+ std::string bad_out_deb_l = "";
+ std::string aligned_out_deb_l = "";
+ std::string bad_out_deb_r = "";
+ std::string aligned_out_deb_r = "";
+ if (deb_info) {
+    // Otherwise the ofstreams are not used, so there is no point in creating
+    // real files; they are constructed with empty file names instead.
+ bad_out_deb_l = getReadsFilename(cfg::get().output_working_dir,
+ fnamel, "debug.bad");
+ aligned_out_deb_l = getReadsFilename(cfg::get().output_working_dir,
+ fnamel, "debug.alig");
+ bad_out_deb_r = getReadsFilename(cfg::get().output_working_dir,
+ fnamer, "debug.bad");
+ aligned_out_deb_r = getReadsFilename(cfg::get().output_working_dir,
+ fnamer, "debug.alig");
+ }
+ std::ofstream ofbad_deb_l(bad_out_deb_l.c_str());
+ std::ofstream ofalig_deb_l(aligned_out_deb_l.c_str());
+ std::ofstream ofbad_deb_r(bad_out_deb_r.c_str());
+ std::ofstream ofalig_deb_r(aligned_out_deb_r.c_str());
+
+ AbstractCclean *cleaner_l = getCleaner(&ofalig_deb_l, &ofbad_deb_l, db, mode,
+ mlen, index, deb_info);
+ AbstractCclean *cleaner_r = getCleaner(&ofalig_deb_r, &ofbad_deb_r, db, mode,
+ mlen, index, deb_info);
+ unsigned buffer_no = 0;
+ unsigned count_bad_l = 0;
+ unsigned count_bad_r = 0;
+ unsigned count_total = 0;
+
+ while (!irsl.eof() && !irsr.eof()) {
+ unsigned buf_size = 0;
+ for (; buf_size < read_buffer_size && !irsl.eof() &&
+ !irsr.eof(); ++buf_size) {
+ irsl >> left_reads[buf_size];
+ irsr >> right_reads[buf_size];
+ }
+ if(deb_info) INFO("Prepared batch " << buffer_no << " of " << buf_size
+ << " reads.");
+
+ count_bad_l += CorrectReadsBatch(cleaner_l, &left_res, &left_reads,
+ buf_size, nthreads);
+ count_bad_r += CorrectReadsBatch(cleaner_r, &right_res, &right_reads,
+ buf_size, nthreads);
+ count_total += buf_size;
+
+ if(deb_info) INFO("Processed batch " << buffer_no);
+ for (size_t i = 0; i < buf_size; ++i) {
+ if (left_res[i] && right_res[i]) {
+ left_reads[i].print(*ofcorl, Read::PHRED_OFFSET);
+ right_reads[i].print(*ofcorr, Read::PHRED_OFFSET);
+ }
+ else {
+ left_reads[i].print(*(left_res[i] ? ofunp : ofbadl),
+ Read::PHRED_OFFSET);
+ right_reads[i].print(*(right_res[i] ? ofunp : ofbadr),
+ Read::PHRED_OFFSET);
+ }
+ }
+ if(deb_info) INFO("Written batch " << buffer_no);
+ ++buffer_no;
+ }
+
+ delete cleaner_l;
+ delete cleaner_r;
+
+  // Report summary statistics about the results
+ const double percent_val = static_cast<double>(count_total) / 100.0;
+ std::ostringstream percent_bad_l;
+ std::ostringstream percent_bad_r;
+ percent_bad_l << std::fixed << std::setprecision(2) <<
+ (static_cast<double>(count_bad_l) / percent_val);
+ percent_bad_r << std::fixed << std::setprecision(2) <<
+ (static_cast<double>(count_bad_r) / percent_val);
+ INFO("Total proceed " + std::to_string(count_total) + ", " +
+ std::to_string(count_bad_l) + " left reads (" +
+ percent_bad_l.str() + " percents of total) is bad" + ", " +
+ std::to_string(count_bad_r) + " right reads (" +
+ percent_bad_r.str() + " percents of total) is bad.");
+}
diff --git a/src/projects/cclean/running_modes.hpp b/src/projects/cclean/running_modes.hpp
new file mode 100644
index 0000000..c2709db
--- /dev/null
+++ b/src/projects/cclean/running_modes.hpp
@@ -0,0 +1,93 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef RUNNING_MODES_HPP
+#define RUNNING_MODES_HPP
+
+#include <unordered_map>
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include "additional.cpp"
+#include "adapter_index.hpp"
+
+class Cleaner {
+
+ public:
+ static void ProcessDataset();
+ // Correct reads in a given file
+ static void CorrectReadFile(const cclean::AdapterIndex &index,
+ const std::string &fname,
+ std::ofstream *outf_good, std::ofstream *outf_bad,
+ WorkModeType mode);
+ // Correct reads in a given pair of files
+ static void CorrectPairedReadFiles(const cclean::AdapterIndex &index,
+ const std::string &fnamel,
+ const std::string &fnamer,
+ std::ofstream *ofbadl,
+ std::ofstream *ofcorl,
+ std::ofstream *ofbadr,
+ std::ofstream *ofcorr,
+ std::ofstream *ofunp,
+ WorkModeType mode);
+ // Parallel correction of batch of reads
+ static inline unsigned CorrectReadsBatch(AbstractCclean *cleaner,
+ std::vector<bool> *results,
+ std::vector<Read> *reads,
+ size_t buf_size, unsigned nthreads) {
+ unsigned bad = 0;
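+    // Note: bad is incremented concurrently by the worker threads, hence the
+    // reduction(+:bad) clause on the parallel loop below.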
+# pragma omp parallel for shared(reads, results) num_threads(nthreads) reduction(+:bad)
+ for (size_t i = 0; i < buf_size; ++i) {
+ bool ok;
+ (*reads)[i] = (*cleaner)((*reads)[i], &ok);
+ (*results)[i] = ok;
+ if (!ok) ++bad;
+ }
+ return bad;
+ }
+ // Get pure file name without extension
+ inline static std::string getPureFilename(const std::string &fname) {
+ std::string tmp = path::filename(fname);
+ std::string pure_file_name = "";
+ size_t pos = tmp.find(".fastq");
+ if (pos == std::string::npos)
+ pure_file_name = tmp;
+ else
+ pure_file_name = tmp.substr(0, pos);
+ return pure_file_name;
+ }
+ // Get filename for reads
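+  // e.g. getReadsFilename("/out", "/data/sample.fastq", "correct") yields
+  // "/out/sample.correct.fastq" (assuming path::filename() returns the basename;
+  // the paths here are purely illustrative).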
+ inline static std::string getReadsFilename(const std::string &dirprefix,
+ const std::string &fname,
+ const std::string &suffix) {
+ const std::string &pure_file_name = getPureFilename(fname);
+ return (dirprefix + "/" + pure_file_name + "." + suffix + ".fastq");
+ }
+  // Determine the work mode from the config data
+ inline static WorkModeType getMode() {
+ WorkModeType mode;
+ if (cfg::get().use_bruteforce) {
+ if (cfg::get().use_quality) mode = BRUTE_WITH_Q;
+ else mode = BRUTE_SIMPLE;
+ }
+ else {
+ if (cfg::get().use_quality) mode = SINGLE_END_Q;
+ else mode = SINGLE_END;
+ }
+ return mode;
+ }
+  // Create and return a cleaner appropriate for the given mode
+ inline static AbstractCclean* getCleaner(std::ofstream *outf_alig_debug,
+ std::ofstream *outf_bad_deb,
+ const std::string &db,
+ WorkModeType mode, unsigned mlen,
+ const cclean::AdapterIndex &index,
+ bool deb_info);
+
+};
+
+#endif /* RUNNING_MODES_H_ */
diff --git a/src/projects/cclean/utils.cpp b/src/projects/cclean/utils.cpp
new file mode 100644
index 0000000..a5f0fc1
--- /dev/null
+++ b/src/projects/cclean/utils.cpp
@@ -0,0 +1,136 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include <iostream>
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "utils.hpp"
+#include <ssw/ssw_cpp.h> // Striped Smith-Waterman aligner
+#include <io/read.hpp>
+#include "additional.cpp"
+
+namespace cclean_utils {
+
+inline std::string ReverseComplement(const std::string& read) {
+ std::map<char, char> reverse;
+ reverse['C'] = 'G';
+ reverse['G'] = 'C';
+ reverse['T'] = 'A';
+ reverse['A'] = 'T';
+ reverse['N'] = 'N';
+
+ std::vector<char> res;
+ for(int i = 0; i < (int) read.length(); ++i) {
+ res.push_back(reverse[read[i]]);
+ }
+
+ std::reverse(res.begin(), res.end());
+ return std::string(res.begin(), res.end());
+}
+
+double GetScoreWithQuality(const StripedSmithWaterman::Alignment &a,
+ const Quality &qual)
+{ // Try to compute a more realistic alignment score that takes read quality into account.
+  // Matches and mismatches are taken from the CIGAR alignment string below.
+ double score = 0.0;
+ int ref_pos = 0, query_pos = 0;
+ for (std::vector<uint32_t>::const_iterator it = a.cigar.begin();
+ it != a.cigar.end(); ++it) {
+
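+    // Each CIGAR element is packed BAM-style: operation length in the upper
+    // 28 bits, operation code in the lower 4 bits.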
+ int num = (*it & 0xFFFFFFF0) >> 4;
+ int op_code = *it & 0x0000000F;
+
+ switch (op_code) {
+ case 0: { //match
+ for (int i = 0; i < num; ++i, ++ref_pos, ++query_pos)
+ score += MatchScore;
+ break;
+ }
+ case 1: { //insert
+ for (int i = 0; i < num; ++i, ++query_pos)
+ score -= (double)qual[query_pos] / MismatchScore;
+ break;
+ }
+ case 2: { //del
+ for (int i = 0; i < num; ++i, ++ref_pos)
+ score -= (double)qual[query_pos] / MismatchScore;
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ return score;
+}
+
+Read CutRead(const Read &r, int start_pos, int end_pos) {
+ if(start_pos > end_pos) return r;
+ // Step 1: cutting read sequence
+ Read read = r;
+ std::string read_seq = read.getSequenceString();
+ std::string cuted_read_seq(std::string(read_seq, 0, start_pos) +
+ std::string(read_seq, end_pos + 1));
+ read.setSequence(cuted_read_seq.c_str());
+
+ // Step 2: cutting read quality string
+ std::string qual_string = read.getQuality().str();
+ if(qual_string.empty()) return read;
+ std::string cuted_qual_string(std::string(qual_string, 0, start_pos) +
+ std::string(qual_string, end_pos + 1));
+ read.setQuality(cuted_qual_string.c_str(), 0);
+ return read;
+}
+
+void RestoreFromCigar(const std::string& ref, const std::string& query,
+ std::string& out_ref, std::string& out_query,
+ const StripedSmithWaterman::Alignment& a) {
+
+ std::vector<char> aligned_ref, aligned_query;
+ int ref_pos = 0, query_pos = 0;
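+  // Walk the CIGAR and rebuild both aligned strings, writing '-' for gaps:
+  // insertions consume only the query, deletions only the reference.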
+ for (std::vector<uint32_t>::const_iterator it = a.cigar.begin();
+ it != a.cigar.end(); ++it) {
+ int num = (*it & 0xFFFFFFF0) >> 4;
+ int op_code = *it & 0x0000000F;
+
+ switch (op_code) {
+ case 0: { //match
+ for (int i = 0; i < num; ++i) {
+ aligned_ref.push_back(ref[a.ref_begin + ref_pos++]);
+ aligned_query.push_back(query[a.query_begin + query_pos++]);
+ }
+ break;
+ }
+ case 1: { //insert
+ for (int i = 0; i < num; ++i) {
+ aligned_ref.push_back('-');
+ aligned_query.push_back(query[a.query_begin + query_pos++]);
+ }
+ break;
+ }
+ case 2: { //del
+ for (int i = 0; i < num; ++i) {
+ aligned_ref.push_back(ref[a.ref_begin + ref_pos++]);
+ aligned_query.push_back('-');
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ }
+
+ out_ref = std::string(aligned_ref.begin(), aligned_ref.end());
+ out_query = std::string(aligned_query.begin(), aligned_query.end());
+}
+
+ // end of namespace cclean_utils
+}
diff --git a/src/projects/cclean/utils.hpp b/src/projects/cclean/utils.hpp
new file mode 100644
index 0000000..a71a200
--- /dev/null
+++ b/src/projects/cclean/utils.hpp
@@ -0,0 +1,58 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef UTILS_HPP
+#define UTILS_HPP
+
+#include <ssw/ssw_cpp.h> // Striped Smith-Waterman aligner
+#include <io/read.hpp>
+#include "additional.cpp"
+#include "running_modes.hpp"
+#include "adapter_index.hpp"
+
+namespace cclean_utils {
+
+std::string ReverseComplement(const std::string& read);
+
+std::unordered_map<std::string, std::string> ProcessArgs(int argc, char *argv[],
+ bool *ok, std::string *error);
+
+double GetScoreWithQuality(const StripedSmithWaterman::Alignment &a,
+ const Quality &qual);
+
+inline bool is_alignment_good(const StripedSmithWaterman::Alignment& a,
+ const std::string& sequence,
+ const std::string& query,
+ double aligned_part_fraction) {
+  // Check that a sufficiently large fraction of the adapter (query) is aligned
+  // (the commented-out clause below also required the alignment to touch a read edge).
+ return (std::min(a.query_end - a.query_begin + 1, a.ref_end - a.ref_begin + 1)
+ / (double) query.size() > aligned_part_fraction) /*&&
+ (a.ref_begin == 0 || a.ref_end == sequence.size() - 1)*/;
+}
+
+// Cut the read from the start to the end position of its best alignment with the adapter
+Read CutRead(const Read &r, int start_pos, int end_pos);
+void RestoreFromCigar(const std::string& ref, const std::string& query,
+ std::string& out_ref, std::string& out_query,
+ const StripedSmithWaterman::Alignment& a);
+
+inline double GetMismatches(const std::string &read, const std::string &adapter,
+ const StripedSmithWaterman::Alignment &a) {
+ std::string aligned_read;
+ std::string aligned_adapter;
+ RestoreFromCigar(read, adapter, aligned_read, aligned_adapter, a);
+ int size = (int)std::min(aligned_read.length(), aligned_adapter.length());
+ int mismatched_score = 0;
+ for (int i = 0; i < size; ++i) {
+ if (aligned_read[i] != aligned_adapter[i])
+ ++mismatched_score;
+ }
+ return static_cast<double>(mismatched_score);
+}
+// end of namespace
+}
+#endif /* UTILS_HPP */
diff --git a/src/projects/cclean/valid_kmer_generator.hpp b/src/projects/cclean/valid_kmer_generator.hpp
new file mode 100644
index 0000000..a03a9b3
--- /dev/null
+++ b/src/projects/cclean/valid_kmer_generator.hpp
@@ -0,0 +1,198 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#ifndef HAMMER_VALIDKMERGENERATOR_HPP_
+#define HAMMER_VALIDKMERGENERATOR_HPP_
+
+#include "io/read.hpp"
+#include "sequence/seq.hpp"
+
+#include <string>
+#include <vector>
+
+#include <cstdint>
+#include <cmath>
+
+/**
+ * This class is designed to iterate through valid k-mers in read.
+ * @example
+ * ValidKMerGenerator<2> gen(read, 4);
+ * while (gen.HasMore()) {
+ * MyTrickyFunction(gen.kmer());
+ * gen.Next();
+ * }
+ * or
+ * for (ValidKMerGenerator<2> gen(read, 2); gen.HasMore(); gen.Next()) {
+ * MyTrickyFunction(gen.kmer(), gen.pos(), gen.correct_probability());
+ * }
+ * @param kK k-mer length.
+ */
+template<uint32_t kK>
+class ValidKMerGenerator {
+ public:
+ /**
+ * @param read Read to generate k-mers from.
+ * @param bad_quality_threshold This class virtually cuts
+ * nucleotides with quality lower than the threshold from the ends of the
+ * read.
+ */
+ explicit ValidKMerGenerator(const Read &read,
+ uint32_t bad_quality_threshold = 2) {
+ Reset(read.getSequenceString().data(),
+ read.getQualityString().data(),
+ read.getSequenceString().size(),
+ bad_quality_threshold);
+ }
+ /**
+ * @param seq sequence to generate k-mers from.
+ * @param qual quality string
+ * @param bad_quality_threshold This class virtually cuts
+ * nucleotides with quality lower than the threshold from the ends of the
+ * read.
+ */
+ explicit ValidKMerGenerator(const char *seq, const char *qual,
+ size_t len,
+ uint32_t bad_quality_threshold = 2) {
+ Reset(seq, qual, len, bad_quality_threshold);
+ }
+
+ ValidKMerGenerator()
+ : kmer_(), seq_(0), qual_(0),
+ pos_(-1), end_(-1), len_(0),
+ correct_probability_(1), bad_quality_threshold_(2),
+ has_more_(false), first(true) {}
+
+ void Reset(const char *seq, const char *qual,
+ size_t len,
+ uint32_t bad_quality_threshold = 2) {
+ kmer_ = Seq<kK>();
+ seq_ = seq;
+ qual_ = qual;
+ pos_ = -1;
+ end_ = -1;
+ len_ = len;
+ correct_probability_ = 1.0;
+ bad_quality_threshold_ = bad_quality_threshold;
+ has_more_ = true;
+ first = true;
+
+ TrimBadQuality();
+ Next();
+ }
+
+ /**
+ * @result true if Next() succeed while generating new k-mer, false
+ * otherwise.
+ */
+ bool HasMore() const {
+ return has_more_;
+ }
+ /**
+ * @result last k-mer generated by Next().
+ */
+ const Seq<kK>& kmer() const {
+ return kmer_;
+ }
+ /**
+ * @result last k-mer position in initial read.
+ */
+ int pos() const {
+ return pos_;
+ }
+ /**
+ * @result probability that last generated k-mer is correct.
+ */
+ double correct_probability() const {
+ return correct_probability_;
+ }
+ /**
+ * This function reads the next k-mer from the read and sets has_more_ to
+ * false if none can be produced. You can access the k-mer just read with kmer().
+ */
+ void Next();
+ private:
+ void TrimBadQuality();
+ double Prob(uint8_t qual) {
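+    // Phred quality q corresponds to an error probability of 10^(-q/10);
+    // qualities below 3 are treated as a flat 0.75 error probability.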
+ return 1 - (qual < 3 ? 0.75 : pow(10.0, -(int)qual / 10.0));
+ }
+ uint32_t GetQual(uint32_t pos) {
+ if (pos >= len_) {
+ return 2;
+ } else {
+ return qual_[pos];
+ }
+ }
+ Seq<kK> kmer_;
+ const char* seq_;
+ const char* qual_;
+ size_t pos_;
+ size_t end_;
+ size_t len_;
+ double correct_probability_;
+ uint32_t bad_quality_threshold_;
+ bool has_more_;
+ bool first;
+
+ // Disallow copy and assign
+ ValidKMerGenerator(const ValidKMerGenerator&) = delete;
+ void operator=(const ValidKMerGenerator&) = delete;
+};
+
+template<uint32_t kK>
+void ValidKMerGenerator<kK>::TrimBadQuality() {
+ pos_ = 0;
+ if (qual_)
+ for (; pos_ < len_; ++pos_) {
+ if (GetQual(pos_) >= bad_quality_threshold_)
+ break;
+ }
+ end_ = len_;
+ if (qual_)
+ for (; end_ > pos_; --end_) {
+ if (GetQual(end_ - 1) >= bad_quality_threshold_)
+ break;
+ }
+}
+
+template<uint32_t kK>
+void ValidKMerGenerator<kK>::Next() {
+ if (pos_ + kK > end_) {
+ has_more_ = false;
+ } else if (first || !is_nucl(seq_[pos_ + kK - 1])) {
+ // in this case we have to look for new k-mer
+ correct_probability_ = 1.0;
+ uint32_t start_hypothesis = pos_;
+ uint32_t i = pos_;
+ for (; i < len_; ++i) {
+ if (i == kK + start_hypothesis) {
+ break;
+ }
+ if (qual_)
+ correct_probability_ *= Prob(GetQual(i));
+ if (!is_nucl(seq_[i])) {
+ start_hypothesis = i + 1;
+ correct_probability_ = 1.0;
+ }
+ }
+ if (i == kK + start_hypothesis) {
+ kmer_ = Seq<kK>(seq_ + start_hypothesis, 0, kK, /* raw */ true);
+ pos_ = start_hypothesis + 1;
+ } else {
+ has_more_ = false;
+ }
+ } else {
+ // good case we can just shift our previous answer
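+    // Rolling update: multiply in the probability of the incoming base at
+    // pos_ + kK - 1 and divide out the base at pos_ - 1 that just left the window.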
+ kmer_ = kmer_ << seq_[pos_ + kK - 1];
+ if (qual_) {
+ correct_probability_ *= Prob(GetQual(pos_ + kK - 1));
+ correct_probability_ /= Prob(GetQual(pos_ - 1));
+ }
+ ++pos_;
+ }
+ first = false;
+}
+#endif // HAMMER_VALIDKMERGENERATOR_HPP_
diff --git a/src/projects/corrector/CMakeLists.txt b/src/projects/corrector/CMakeLists.txt
index 0434323..4678d70 100644
--- a/src/projects/corrector/CMakeLists.txt
+++ b/src/projects/corrector/CMakeLists.txt
@@ -18,7 +18,7 @@ add_executable(corrector
config_struct.cpp
main.cpp)
-target_link_libraries(corrector input spades_modules ${COMMON_LIBRARIES})
+target_link_libraries(corrector input common_modules ${COMMON_LIBRARIES})
diff --git a/src/projects/corrector/config_struct.cpp b/src/projects/corrector/config_struct.cpp
index d799b7a..594bae8 100644
--- a/src/projects/corrector/config_struct.cpp
+++ b/src/projects/corrector/config_struct.cpp
@@ -7,7 +7,7 @@
#include "config_struct.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "utils/openmp_wrapper.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/YAMLTraits.h"
diff --git a/src/projects/corrector/contig_processor.cpp b/src/projects/corrector/contig_processor.cpp
index 7a90b62..8564d17 100644
--- a/src/projects/corrector/contig_processor.cpp
+++ b/src/projects/corrector/contig_processor.cpp
@@ -9,11 +9,11 @@
#include "config_struct.hpp"
#include "variants_table.hpp"
-#include "io/reads_io/ireader.hpp"
-#include "io/reads_io/osequencestream.hpp"
-#include "io/reads_io/file_reader.hpp"
+#include "io/reads/ireader.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "io/reads/file_reader.hpp"
#include "io/reads/single_read.hpp"
-#include "dev_support/path_helper.hpp"
+#include "utils/path_helper.hpp"
#include <boost/algorithm/string.hpp>
diff --git a/src/projects/corrector/contig_processor.hpp b/src/projects/corrector/contig_processor.hpp
index 0a46be4..a35db3b 100644
--- a/src/projects/corrector/contig_processor.hpp
+++ b/src/projects/corrector/contig_processor.hpp
@@ -15,10 +15,10 @@
#pragma once
#include "interesting_pos_processor.hpp"
#include "positional_read.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "utils/openmp_wrapper.h"
-#include <io/sam_io/sam_reader.hpp>
-#include <io/sam_io/read.hpp>
+#include <io/sam/sam_reader.hpp>
+#include <io/sam/read.hpp>
#include "pipeline/library.hpp"
#include <string>
diff --git a/src/projects/corrector/dataset_processor.cpp b/src/projects/corrector/dataset_processor.cpp
index 15fe997..20f3e1e 100644
--- a/src/projects/corrector/dataset_processor.cpp
+++ b/src/projects/corrector/dataset_processor.cpp
@@ -10,10 +10,10 @@
#include "contig_processor.hpp"
#include "config_struct.hpp"
-#include "io/reads_io/file_reader.hpp"
-#include "dev_support/path_helper.hpp"
-#include "io/reads_io/osequencestream.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "io/reads/file_reader.hpp"
+#include "utils/path_helper.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "utils/openmp_wrapper.h"
#include <boost/algorithm/string.hpp>
@@ -169,7 +169,7 @@ string DatasetProcessor::RunSingleBwa(const string &single, const size_t lib) {
return "";
}
string nthreads_str = to_string(nthreads_);
- string last_line = bwa_string + " mem "+ " -v 1 -t " + nthreads_str + " " + genome_screened + " " + single + " > " + path::screen_whitespaces(tmp_sam_filename);
+ string last_line = bwa_string + " mem "+ " -v 1 -t " + nthreads_str + " " + genome_screened + " " + path::screen_whitespaces(single) + " > " + path::screen_whitespaces(tmp_sam_filename);
INFO("Running bwa mem ...:" << last_line);
run_res = system(last_line.c_str());
if (run_res != 0) {
diff --git a/src/projects/corrector/dataset_processor.hpp b/src/projects/corrector/dataset_processor.hpp
index 397f5ed..2edf657 100644
--- a/src/projects/corrector/dataset_processor.hpp
+++ b/src/projects/corrector/dataset_processor.hpp
@@ -7,10 +7,10 @@
#pragma once
-#include "dev_support/path_helper.hpp"
+#include "utils/path_helper.hpp"
-#include "io/reads_io/file_reader.hpp"
-#include "dev_support/path_helper.hpp"
+#include "io/reads/file_reader.hpp"
+#include "utils/path_helper.hpp"
#include "pipeline/library.hpp"
diff --git a/src/projects/corrector/interesting_pos_processor.cpp b/src/projects/corrector/interesting_pos_processor.cpp
index 160f4a1..12358ef 100644
--- a/src/projects/corrector/interesting_pos_processor.cpp
+++ b/src/projects/corrector/interesting_pos_processor.cpp
@@ -8,7 +8,7 @@
#include "interesting_pos_processor.hpp"
#include "config_struct.hpp"
-#include "dev_support/logger/logger.hpp"
+#include "utils/logger/logger.hpp"
using namespace std;
diff --git a/src/projects/corrector/main.cpp b/src/projects/corrector/main.cpp
index 07f0ee0..ff6afa8 100644
--- a/src/projects/corrector/main.cpp
+++ b/src/projects/corrector/main.cpp
@@ -8,9 +8,9 @@
#include "dataset_processor.hpp"
#include "pipeline/config_struct.hpp"
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/log_writers.hpp"
#include "config_struct.hpp"
-#include "dev_support/segfault_handler.hpp"
+#include "utils/segfault_handler.hpp"
#include "version.hpp"
diff --git a/src/projects/dipspades/CMakeLists.txt b/src/projects/dipspades/CMakeLists.txt
index b60d4b8..cecc0b8 100644
--- a/src/projects/dipspades/CMakeLists.txt
+++ b/src/projects/dipspades/CMakeLists.txt
@@ -8,11 +8,11 @@
project(dipspades CXX)
add_executable(dipspades
- dipspades_config.cpp
- utils/files_utils.cpp
+ dipspades_config.cpp
+ utils/files_utils.cpp
main.cpp)
-target_link_libraries(dipspades spades_modules ${COMMON_LIBRARIES})
+target_link_libraries(dipspades common_modules ${COMMON_LIBRARIES})
if (SPADES_STATIC_BUILD)
set_target_properties(dipspades PROPERTIES LINK_SEARCH_END_STATIC 1)
diff --git a/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp b/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
index 4623fa0..445641f 100644
--- a/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
+++ b/src/projects/dipspades/consensus_contigs_constructor/consensus_contigs_constructor.hpp
@@ -7,7 +7,7 @@
#pragma once
-#include "io/reads_io/io_helper.hpp"
+#include "io/reads/io_helper.hpp"
#include "utils/element_printers.hpp"
#include "utils/files_utils.hpp"
@@ -25,7 +25,7 @@ namespace dipspades{
class ConsensusContigsConstructor {
conj_graph_pack &graph_pack_;
BaseHistogram<size_t> &bulge_len_hist_;
- NewExtendedSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> seq_mapper_;
+ BasicSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> seq_mapper_;
VertexPathIndex path_index_;
CorrectionResult correction_result_;
diff --git a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
index aa5047c..46b3080 100644
--- a/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
+++ b/src/projects/dipspades/consensus_contigs_constructor/contig_correctors/close_gaps_corrector.hpp
@@ -30,7 +30,7 @@ class CloseGapsCorrector : public AbstractContigCorrector{
if(i == gap_index[current_gap]){
VertexId start = g_.EdgeEnd(cur_edge);
VertexId end = g_.EdgeStart(path[i + 1]);
- auto dijkstra = DijkstraHelper<Graph>::CreateTargeredBoundedDijkstra(g_,
+ auto dijkstra = DijkstraHelper<Graph>::CreateTargetedBoundedDijkstra(g_,
end, dsp_cfg::get().pbr.max_bulge_nucls_len); //DijkstraHelper<Graph>::CreateBoundedDijkstra(g_, dsp_cfg::get().pbr.max_bulge_nucls_len);
dijkstra.Run(start);
if(dijkstra.DistanceCounted(end)){
diff --git a/src/projects/dipspades/dipspades.hpp b/src/projects/dipspades/dipspades.hpp
index 08c3ad9..2163350 100644
--- a/src/projects/dipspades/dipspades.hpp
+++ b/src/projects/dipspades/dipspades.hpp
@@ -6,8 +6,8 @@
//***************************************************************************
-#include "io/reads_io/splitting_wrapper.hpp"
-#include "algorithms/graph_construction.hpp"
+#include "io/reads/splitting_wrapper.hpp"
+#include "modules/graph_construction.hpp"
#include "pipeline/stage.hpp"
#include "dipspades_config.hpp"
diff --git a/src/projects/dipspades/dipspades_config.cpp b/src/projects/dipspades/dipspades_config.cpp
index 88545e8..deafb99 100644
--- a/src/projects/dipspades/dipspades_config.cpp
+++ b/src/projects/dipspades/dipspades_config.cpp
@@ -8,7 +8,7 @@
#include "dipspades_config.hpp"
#include "pipeline/config_common.hpp"
#include "utils/files_utils.hpp"
-#include "dev_support/path_helper.hpp"
+#include "utils/path_helper.hpp"
using namespace dipspades;
diff --git a/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp b/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
index c5c5f91..f064ede 100644
--- a/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
+++ b/src/projects/dipspades/haplotype_assembly/conservative_regions_searcher.hpp
@@ -7,7 +7,7 @@
#pragma once
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
#include "contig_separation_utils.hpp"
using namespace debruijn_graph;
@@ -20,7 +20,7 @@ class ConservativeRegionsSearcher{
SignedLabels signed_labels_;
ConservativeRegionStorage cons_reg_storage_;
- NewExtendedSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> mapper_;
+ BasicSequenceMapper<conj_graph_pack::graph_t, conj_graph_pack::index_t> mapper_;
map<int, MappingPath<EdgeId> > contig_map_path_;
typedef map<int, vector<int> > diff_labeled_contigs;
diff --git a/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp b/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
index 487e6fa..0969365 100644
--- a/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
+++ b/src/projects/dipspades/kmer_gluing/equal_sequence_gluer.hpp
@@ -122,7 +122,7 @@ public:
size_t cnt = 0;
for(auto it = graph_.SmartEdgeBegin(); !it.IsEnd(); ++it) {
Sequence nucls = graph_.EdgeNucls(*it);
- runtime_k::RtSeq kmer = nucls.start<runtime_k::RtSeq>(graph_.k() + 1) >> 'A';
+ RtSeq kmer = nucls.start<RtSeq>(graph_.k() + 1) >> 'A';
for(size_t i = graph_.k(); i < graph_.length(*it); i++) {
kmer = kmer << graph_.EdgeNucls(*it)[i];
if(!index_.contains(kmer)) {
diff --git a/src/projects/dipspades/main.cpp b/src/projects/dipspades/main.cpp
index 7c63a75..018283f 100644
--- a/src/projects/dipspades/main.cpp
+++ b/src/projects/dipspades/main.cpp
@@ -8,13 +8,11 @@
/*
* Assembler Main
*/
-#include "dev_support/logger/log_writers.hpp"
-
-#include "dev_support/segfault_handler.hpp"
-#include "dev_support/memory_limit.hpp"
-#include "dev_support/copy_file.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
+#include "utils/logger/log_writers.hpp"
+#include "utils/segfault_handler.hpp"
+#include "utils/memory_limit.hpp"
+#include "utils/copy_file.hpp"
#include "pipeline/graph_pack.hpp"
#include "stages/construction.hpp"
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp b/src/projects/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp
index ac97830..b16cefb 100644
--- a/src/projects/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp
+++ b/src/projects/dipspades/polymorphic_bulge_remover/bulge_paths_searcher.hpp
@@ -8,7 +8,7 @@
#pragma once
#include <vector>
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
#include "assembly_graph/paths/path_processor.hpp"
#include "dipspades_config.hpp"
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp b/src/projects/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp
index debe5e3..1466ed6 100644
--- a/src/projects/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp
+++ b/src/projects/dipspades/polymorphic_bulge_remover/complex_bulge_remover.hpp
@@ -19,7 +19,7 @@
#include "bulge_gluer.hpp"
#include "diploid_bulge_finder.hpp"
-#include "io/reads_io/splitting_wrapper.hpp"
+#include "io/reads/splitting_wrapper.hpp"
#include <stdlib.h>
#include <memory.h>
diff --git a/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp b/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
index 3b481b7..ccdb009 100644
--- a/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
+++ b/src/projects/dipspades/polymorphic_bulge_remover/polymorphic_bulge_remover.hpp
@@ -15,7 +15,7 @@
#include "visualization/visualization.hpp"
#include "assembly_graph/handlers/edges_position_handler.hpp"
#include "assembly_graph/components/graph_component.hpp"
-#include "algorithms/simplification/compressor.hpp"
+#include "modules/simplification/compressor.hpp"
using namespace debruijn_graph;
@@ -75,12 +75,12 @@ class PolymorphicBulgeRemover {
graph_pack_.EnsureDebugInfo();
make_dir(dsp_cfg::get().io.output_dir + "components/");
- omnigraph::DefaultLabeler<Graph> labeler(graph_pack_.g, graph_pack_.edge_pos);
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(graph_pack_.g, graph_pack_.edge_pos);
make_dir(dsp_cfg::get().io.output_dir + "components/" + component_dir + "/");
- omnigraph::visualization::WriteComponents(graph_pack_.g,
+ visualization::visualization_utils::WriteComponents(graph_pack_.g,
dsp_cfg::get().io.output_dir + "components/" + component_dir + "/",
omnigraph::ReliableSplitter<Graph>(graph_pack_.g),
- omnigraph::visualization::DefaultColorer(graph_pack_.g, Path<EdgeId>(), Path<EdgeId>()),
+ visualization::graph_colorer::DefaultColorer(graph_pack_.g, Path<EdgeId>(), Path<EdgeId>()),
labeler);
}
diff --git a/src/projects/dipspades/utils/edge_gluer.hpp b/src/projects/dipspades/utils/edge_gluer.hpp
index 7cc1e50..8fdd1aa 100644
--- a/src/projects/dipspades/utils/edge_gluer.hpp
+++ b/src/projects/dipspades/utils/edge_gluer.hpp
@@ -6,7 +6,7 @@
#pragma once
-#include "algorithms/dijkstra/neighbours_iterator.hpp"
+#include "assembly_graph/dijkstra/neighbours_iterator.hpp"
using namespace debruijn_graph;
diff --git a/src/projects/dipspades/utils/path_routines.hpp b/src/projects/dipspades/utils/path_routines.hpp
index a251496..9ad261b 100644
--- a/src/projects/dipspades/utils/path_routines.hpp
+++ b/src/projects/dipspades/utils/path_routines.hpp
@@ -6,7 +6,7 @@
//***************************************************************************
#pragma once
-#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/core/graph.hpp"
#include "pipeline/graph_pack.hpp"
using namespace debruijn_graph;
@@ -238,13 +238,14 @@ bool PathAdjacentRelatedEdges(Graph &g, vector<EdgeId> path, bool check_start =
for(auto e = path.begin(); e != path.end() - 1; e++)
if(VertexAdjacentRelatedEdges(g, g.EdgeEnd(*e)))
return true;
- if(path.size() != 0)
+ if(path.size() != 0) {
if(check_start)
if(VertexAdjacentRelatedEdges(g, g.EdgeStart(path[0])))
return true;
if(check_end)
if(VertexAdjacentRelatedEdges(g, g.EdgeEnd(path[path.size() - 1])))
return true;
+ }
return false;
}
diff --git a/src/projects/hammer/CMakeLists.txt b/src/projects/hammer/CMakeLists.txt
index 5f5277a..c0fe9c1 100644
--- a/src/projects/hammer/CMakeLists.txt
+++ b/src/projects/hammer/CMakeLists.txt
@@ -22,7 +22,7 @@ add_executable(hammer
# add_subdirectory(quake_count)
# add_subdirectory(gen_test_data)
-target_link_libraries(hammer input dev_support mph_index pipeline BamTools format ${COMMON_LIBRARIES})
+target_link_libraries(hammer input utils mph_index pipeline BamTools format ${COMMON_LIBRARIES})
if (SPADES_STATIC_BUILD)
set_target_properties(hammer PROPERTIES LINK_SEARCH_END_STATIC 1)
diff --git a/src/projects/hammer/config_struct_hammer.cpp b/src/projects/hammer/config_struct_hammer.cpp
index 37cd8ac..ba056b9 100644
--- a/src/projects/hammer/config_struct_hammer.cpp
+++ b/src/projects/hammer/config_struct_hammer.cpp
@@ -14,7 +14,7 @@
#include "config_struct_hammer.hpp"
#include "pipeline/config_common.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "utils/openmp_wrapper.h"
#include <boost/property_tree/ptree.hpp>
#include <string>
diff --git a/src/projects/hammer/hamcluster.cpp b/src/projects/hammer/hamcluster.cpp
index d1d2ff2..997ebd5 100644
--- a/src/projects/hammer/hamcluster.cpp
+++ b/src/projects/hammer/hamcluster.cpp
@@ -7,8 +7,8 @@
#include "hamcluster.hpp"
-#include "utils/adt/concurrent_dsu.hpp"
-#include "io/kmers_io/mmapped_reader.hpp"
+#include "common/adt/concurrent_dsu.hpp"
+#include "io/kmers/mmapped_reader.hpp"
#include "parallel_radix_sort.hpp"
#include "config_struct_hammer.hpp"
diff --git a/src/projects/hammer/hamcluster.hpp b/src/projects/hammer/hamcluster.hpp
index 30f5356..0db51f6 100644
--- a/src/projects/hammer/hamcluster.hpp
+++ b/src/projects/hammer/hamcluster.hpp
@@ -10,10 +10,10 @@
#include "kmer_stat.hpp"
#include "kmer_data.hpp"
-#include "io/kmers_io/mmapped_reader.hpp"
+#include "io/kmers/mmapped_reader.hpp"
-#include "dev_support/logger/logger.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "utils/logger/logger.hpp"
+#include "sequence/seq.hpp"
#include <iostream>
#include <vector>
diff --git a/src/projects/hammer/hammer_tools.cpp b/src/projects/hammer/hammer_tools.cpp
index 1fa2461..3a14777 100644
--- a/src/projects/hammer/hammer_tools.cpp
+++ b/src/projects/hammer/hammer_tools.cpp
@@ -5,13 +5,13 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "io/reads_io/ireadstream.hpp"
+#include "io/reads/ireadstream.hpp"
#include "valid_kmer_generator.hpp"
#include "globals.hpp"
#include "kmer_data.hpp"
#include "read_corrector.hpp"
-#include "io/kmers_io/mmapped_writer.hpp"
+#include "io/kmers/mmapped_writer.hpp"
#include <iostream>
#include <fstream>
diff --git a/src/projects/hammer/hammer_tools.hpp b/src/projects/hammer/hammer_tools.hpp
index 3ef9a6a..caac46d 100644
--- a/src/projects/hammer/hammer_tools.hpp
+++ b/src/projects/hammer/hammer_tools.hpp
@@ -14,11 +14,11 @@
#include <iomanip>
#include <fstream>
#include "io/reads/read.hpp"
-#include "io/reads_io/ireadstream.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "io/reads/ireadstream.hpp"
+#include "sequence/seq.hpp"
#include "globals.hpp"
#include "kmer_stat.hpp"
-#include "io/kmers_io/mmapped_reader.hpp"
+#include "io/kmers/mmapped_reader.hpp"
namespace hammer {
diff --git a/src/projects/hammer/kmer_cluster.cpp b/src/projects/hammer/kmer_cluster.cpp
index ff153c9..d6944b2 100644
--- a/src/projects/hammer/kmer_cluster.cpp
+++ b/src/projects/hammer/kmer_cluster.cpp
@@ -5,8 +5,8 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "io/reads_io/ireadstream.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "io/reads/ireadstream.hpp"
+#include "utils/openmp_wrapper.h"
#include "hammer_tools.hpp"
#include "hamcluster.hpp"
diff --git a/src/projects/hammer/kmer_data.cpp b/src/projects/hammer/kmer_data.cpp
index 22b2aac..dd730bd 100644
--- a/src/projects/hammer/kmer_data.cpp
+++ b/src/projects/hammer/kmer_data.cpp
@@ -6,17 +6,17 @@
//***************************************************************************
#include "kmer_data.hpp"
-#include "io/reads_io/read_processor.hpp"
+#include "io/reads/read_processor.hpp"
#include "valid_kmer_generator.hpp"
-#include "io/reads_io/ireadstream.hpp"
+#include "io/reads/ireadstream.hpp"
#include "config_struct_hammer.hpp"
-#include "data_structures/mph_index/kmer_index_builder.hpp"
+#include "utils/mph_index/kmer_index_builder.hpp"
-#include "io/kmers_io/kmer_iterator.hpp"
-#include "utils/adt/bf.hpp"
-#include "utils/adt/hll.hpp"
+#include "io/kmers/kmer_iterator.hpp"
+#include "common/adt/bf.hpp"
+#include "common/adt/hll.hpp"
using namespace hammer;
@@ -112,6 +112,8 @@ path::files_t HammerFilteringKMerSplitter::Split(size_t num_files) {
}
INFO("Total " << processed << " reads processed");
+ this->ClearBuffers();
+
return out;
}
diff --git a/src/projects/hammer/kmer_data.hpp b/src/projects/hammer/kmer_data.hpp
index 57fd1d2..ece0e53 100644
--- a/src/projects/hammer/kmer_data.hpp
+++ b/src/projects/hammer/kmer_data.hpp
@@ -9,8 +9,8 @@
#define __HAMMER_KMER_DATA_HPP__
#include "kmer_stat.hpp"
-#include "utils/adt/array_vector.hpp"
-#include "data_structures/mph_index/kmer_index.hpp"
+#include "common/adt/array_vector.hpp"
+#include "utils/mph_index/kmer_index.hpp"
#include <vector>
typedef KMerIndex<kmer_index_traits<hammer::KMer> > HammerKMerIndex;
diff --git a/src/projects/hammer/kmer_stat.hpp b/src/projects/hammer/kmer_stat.hpp
index 9501e5f..1c7284a 100644
--- a/src/projects/hammer/kmer_stat.hpp
+++ b/src/projects/hammer/kmer_stat.hpp
@@ -8,9 +8,9 @@
#ifndef HAMMER_KMERSTAT_HPP_
#define HAMMER_KMERSTAT_HPP_
-#include "dev_support/verify.hpp"
+#include "utils/verify.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include <folly/SmallLocks.h>
diff --git a/src/projects/hammer/main.cpp b/src/projects/hammer/main.cpp
index 18077e4..1bb0dca 100644
--- a/src/projects/hammer/main.cpp
+++ b/src/projects/hammer/main.cpp
@@ -20,15 +20,15 @@
#include "kmer_data.hpp"
#include "expander.hpp"
-#include "utils/adt/concurrent_dsu.hpp"
-#include "dev_support/segfault_handler.hpp"
-#include "io/reads_io/read_processor.hpp"
-#include "io/reads_io/ireadstream.hpp"
+#include "common/adt/concurrent_dsu.hpp"
+#include "utils/segfault_handler.hpp"
+#include "io/reads/read_processor.hpp"
+#include "io/reads/ireadstream.hpp"
-#include "dev_support/memory_limit.hpp"
+#include "utils/memory_limit.hpp"
-#include "dev_support/logger/logger.hpp"
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/logger.hpp"
+#include "utils/logger/log_writers.hpp"
#include "version.hpp"
diff --git a/src/projects/hammer/parallel_radix_sort.hpp b/src/projects/hammer/parallel_radix_sort.hpp
index 6a99911..2765afb 100644
--- a/src/projects/hammer/parallel_radix_sort.hpp
+++ b/src/projects/hammer/parallel_radix_sort.hpp
@@ -36,7 +36,7 @@
#ifndef PARALLEL_RADIX_SORT_H_
#define PARALLEL_RADIX_SORT_H_
-#include "dev_support/openmp_wrapper.h"
+#include "utils/openmp_wrapper.h"
#include <stdint.h>
#include <cstring>
diff --git a/src/projects/hammer/quake_correct/bithash.cpp b/src/projects/hammer/quake_correct/bithash.cpp
index 65d8203..a3b6f9b 100644
--- a/src/projects/hammer/quake_correct/bithash.cpp
+++ b/src/projects/hammer/quake_correct/bithash.cpp
@@ -6,7 +6,7 @@
//***************************************************************************
#include "bithash.h"
-#include "data_structures/sequence/nucl.hpp"
+#include "sequence/nucl.hpp"
#include <iostream>
#include <fstream>
#include <cstdlib>
diff --git a/src/projects/hammer/quake_count/quake_count.cpp b/src/projects/hammer/quake_count/quake_count.cpp
index 244e650..e04aa4e 100644
--- a/src/projects/hammer/quake_count/quake_count.cpp
+++ b/src/projects/hammer/quake_count/quake_count.cpp
@@ -38,7 +38,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_17.cpp b/src/projects/hammer/quake_count/quake_count_17.cpp
index 2771ea8..1a84fc9 100644
--- a/src/projects/hammer/quake_count/quake_count_17.cpp
+++ b/src/projects/hammer/quake_count/quake_count_17.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_19.cpp b/src/projects/hammer/quake_count/quake_count_19.cpp
index 8bc22ba..b23c711 100644
--- a/src/projects/hammer/quake_count/quake_count_19.cpp
+++ b/src/projects/hammer/quake_count/quake_count_19.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_21.cpp b/src/projects/hammer/quake_count/quake_count_21.cpp
index 24ed7f2..e3bf9b1 100644
--- a/src/projects/hammer/quake_count/quake_count_21.cpp
+++ b/src/projects/hammer/quake_count/quake_count_21.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_25.cpp b/src/projects/hammer/quake_count/quake_count_25.cpp
index 2160242..f52814c 100644
--- a/src/projects/hammer/quake_count/quake_count_25.cpp
+++ b/src/projects/hammer/quake_count/quake_count_25.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_29.cpp b/src/projects/hammer/quake_count/quake_count_29.cpp
index cdbd7cd..182910c 100644
--- a/src/projects/hammer/quake_count/quake_count_29.cpp
+++ b/src/projects/hammer/quake_count/quake_count_29.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_33.cpp b/src/projects/hammer/quake_count/quake_count_33.cpp
index 7e8cde1..ce44f6d 100644
--- a/src/projects/hammer/quake_count/quake_count_33.cpp
+++ b/src/projects/hammer/quake_count/quake_count_33.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
diff --git a/src/projects/hammer/quake_count/quake_count_37.cpp b/src/projects/hammer/quake_count/quake_count_37.cpp
index 2780c3e..529aae7 100644
--- a/src/projects/hammer/quake_count/quake_count_37.cpp
+++ b/src/projects/hammer/quake_count/quake_count_37.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_45.cpp b/src/projects/hammer/quake_count/quake_count_45.cpp
index 663bba3..3fab3bc 100644
--- a/src/projects/hammer/quake_count/quake_count_45.cpp
+++ b/src/projects/hammer/quake_count/quake_count_45.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_55.cpp b/src/projects/hammer/quake_count/quake_count_55.cpp
index c096b19..036a639 100644
--- a/src/projects/hammer/quake_count/quake_count_55.cpp
+++ b/src/projects/hammer/quake_count/quake_count_55.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_65.cpp b/src/projects/hammer/quake_count/quake_count_65.cpp
index 0ac0017..53b34b0 100644
--- a/src/projects/hammer/quake_count/quake_count_65.cpp
+++ b/src/projects/hammer/quake_count/quake_count_65.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/quake_count_75.cpp b/src/projects/hammer/quake_count/quake_count_75.cpp
index fb8de1d..3c32f6a 100644
--- a/src/projects/hammer/quake_count/quake_count_75.cpp
+++ b/src/projects/hammer/quake_count/quake_count_75.cpp
@@ -36,7 +36,7 @@
#include <iomanip>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "kmer_freq_info.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_count/valid_kmer_generator.hpp b/src/projects/hammer/quake_count/valid_kmer_generator.hpp
index 270c6e0..be42726 100644
--- a/src/projects/hammer/quake_count/valid_kmer_generator.hpp
+++ b/src/projects/hammer/quake_count/valid_kmer_generator.hpp
@@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
/**
* This class is designed to iterate through valid k-mers in read.
* @example
diff --git a/src/projects/hammer/quake_enhanced/count.cpp b/src/projects/hammer/quake_enhanced/count.cpp
index 32b6ecd..8d6bede 100644
--- a/src/projects/hammer/quake_enhanced/count.cpp
+++ b/src/projects/hammer/quake_enhanced/count.cpp
@@ -14,7 +14,7 @@
#include <unordered_map>
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "valid_kmer_generator.hpp"
#include "quake_enhanced/quake.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_enhanced/count/count.cpp b/src/projects/hammer/quake_enhanced/count/count.cpp
index eafe3cd..2ea1a8d 100644
--- a/src/projects/hammer/quake_enhanced/count/count.cpp
+++ b/src/projects/hammer/quake_enhanced/count/count.cpp
@@ -34,7 +34,7 @@
#include "logging.hpp"
#include "io/ireadstream.hpp"
#include "io/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include "valid_kmer_generator.hpp"
#define SUPPRESS_UNUSED(X) ((void) (X))
diff --git a/src/projects/hammer/quake_enhanced/filter_trusted_enh/main.cpp b/src/projects/hammer/quake_enhanced/filter_trusted_enh/main.cpp
index cbe54e7..80a8a68 100644
--- a/src/projects/hammer/quake_enhanced/filter_trusted_enh/main.cpp
+++ b/src/projects/hammer/quake_enhanced/filter_trusted_enh/main.cpp
@@ -10,7 +10,7 @@
#include <cstdio>
#include <string>
#include <unordered_map>
-#include "dev_support/logger/logger.hpp"
+#include "utils/logger/logger.hpp"
using std::string;
using std::unordered_map;
diff --git a/src/projects/hammer/valid_kmer_generator.hpp b/src/projects/hammer/valid_kmer_generator.hpp
index c4128c4..49cf71f 100644
--- a/src/projects/hammer/valid_kmer_generator.hpp
+++ b/src/projects/hammer/valid_kmer_generator.hpp
@@ -11,7 +11,7 @@
#include "globals.hpp"
#include "io/reads/read.hpp"
-#include "data_structures/sequence/seq.hpp"
+#include "sequence/seq.hpp"
#include <string>
#include <vector>
diff --git a/src/projects/ionhammer/CMakeLists.txt b/src/projects/ionhammer/CMakeLists.txt
index c78cbca..66a26ed 100644
--- a/src/projects/ionhammer/CMakeLists.txt
+++ b/src/projects/ionhammer/CMakeLists.txt
@@ -20,7 +20,7 @@ add_executable(ionhammer
seqeval/TreephaserLite.cpp
main.cpp)
-target_link_libraries(ionhammer input dev_support pipeline mph_index BamTools ${COMMON_LIBRARIES})
+target_link_libraries(ionhammer input utils pipeline mph_index BamTools ${COMMON_LIBRARIES})
if (SPADES_STATIC_BUILD)
set_target_properties(ionhammer PROPERTIES LINK_SEARCH_END_STATIC 1)
diff --git a/src/projects/ionhammer/HSeq.hpp b/src/projects/ionhammer/HSeq.hpp
index b6a3ad6..567f84f 100644
--- a/src/projects/ionhammer/HSeq.hpp
+++ b/src/projects/ionhammer/HSeq.hpp
@@ -8,7 +8,7 @@
#ifndef __HAMMER_HSEQ_HPP__
#define __HAMMER_HSEQ_HPP__
-#include "data_structures/sequence/nucl.hpp"
+#include "sequence/nucl.hpp"
#include <city/city.h>
#include <array>
diff --git a/src/projects/ionhammer/config_struct.cpp b/src/projects/ionhammer/config_struct.cpp
index d821d99..7701eef 100644
--- a/src/projects/ionhammer/config_struct.cpp
+++ b/src/projects/ionhammer/config_struct.cpp
@@ -7,7 +7,7 @@
#include "config_struct.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "utils/openmp_wrapper.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/YAMLTraits.h"
diff --git a/src/projects/ionhammer/err_helper_table.cpp b/src/projects/ionhammer/err_helper_table.cpp
index c283a5b..8e20c63 100644
--- a/src/projects/ionhammer/err_helper_table.cpp
+++ b/src/projects/ionhammer/err_helper_table.cpp
@@ -10,7 +10,7 @@
#include <fstream>
#include <istream>
-#include "dev_support/logger/logger.hpp"
+#include "utils/logger/logger.hpp"
namespace hammer {
namespace errHelper {
diff --git a/src/projects/ionhammer/err_helper_table.hpp b/src/projects/ionhammer/err_helper_table.hpp
index e24494d..342ff82 100644
--- a/src/projects/ionhammer/err_helper_table.hpp
+++ b/src/projects/ionhammer/err_helper_table.hpp
@@ -16,7 +16,7 @@
#include <cstdlib>
#include <cassert>
-#include "dev_support/logger/logger.hpp"
+#include "utils/logger/logger.hpp"
namespace hammer {
diff --git a/src/projects/ionhammer/expander.cpp b/src/projects/ionhammer/expander.cpp
index 14f4d98..acc7d3e 100644
--- a/src/projects/ionhammer/expander.cpp
+++ b/src/projects/ionhammer/expander.cpp
@@ -11,7 +11,7 @@
#include "kmer_data.hpp"
#include "valid_hkmer_generator.hpp"
-#include "io/reads_io/file_reader.hpp"
+#include "io/reads/file_reader.hpp"
#include <vector>
#include <cstring>
diff --git a/src/projects/ionhammer/hamcluster.cpp b/src/projects/ionhammer/hamcluster.cpp
index a54a66b..a905ddf 100644
--- a/src/projects/ionhammer/hamcluster.cpp
+++ b/src/projects/ionhammer/hamcluster.cpp
@@ -8,8 +8,8 @@
#include "hamcluster.hpp"
#include "hkmer_distance.hpp"
-#include "utils/adt/concurrent_dsu.hpp"
-#include "io/kmers_io/mmapped_reader.hpp"
+#include "common/adt/concurrent_dsu.hpp"
+#include "io/kmers/mmapped_reader.hpp"
#include <iostream>
#include <sstream>
diff --git a/src/projects/ionhammer/hamcluster.hpp b/src/projects/ionhammer/hamcluster.hpp
index 23b7015..17d9b60 100644
--- a/src/projects/ionhammer/hamcluster.hpp
+++ b/src/projects/ionhammer/hamcluster.hpp
@@ -9,9 +9,9 @@
#define HAMMER_SUBKMER_SORTER_HPP
#include "kmer_data.hpp"
-#include "io/kmers_io/mmapped_reader.hpp"
+#include "io/kmers/mmapped_reader.hpp"
-#include "dev_support/logger/logger.hpp"
+#include "utils/logger/logger.hpp"
#include "HSeq.hpp"
#include <iostream>
diff --git a/src/projects/ionhammer/kmer_data.cpp b/src/projects/ionhammer/kmer_data.cpp
index 9b82792..3ba9779 100644
--- a/src/projects/ionhammer/kmer_data.cpp
+++ b/src/projects/ionhammer/kmer_data.cpp
@@ -9,11 +9,11 @@
#include "config_struct.hpp"
#include "valid_hkmer_generator.hpp"
-#include "data_structures/mph_index/kmer_index_builder.hpp"
+#include "utils/mph_index/kmer_index_builder.hpp"
-#include "io/kmers_io/mmapped_writer.hpp"
-#include "io/reads_io/file_reader.hpp"
-#include "io/reads_io/read_processor.hpp"
+#include "io/kmers/mmapped_writer.hpp"
+#include "io/reads/file_reader.hpp"
+#include "io/reads/read_processor.hpp"
using namespace hammer;
@@ -87,6 +87,8 @@ path::files_t HammerKMerSplitter::Split(size_t num_files) {
}
INFO("Processed " << filler.processed() << " reads");
+ this->ClearBuffers();
+
return out;
}
diff --git a/src/projects/ionhammer/kmer_data.hpp b/src/projects/ionhammer/kmer_data.hpp
index 8afd216..e27458a 100644
--- a/src/projects/ionhammer/kmer_data.hpp
+++ b/src/projects/ionhammer/kmer_data.hpp
@@ -8,7 +8,7 @@
#ifndef __HAMMER_KMER_DATA_HPP__
#define __HAMMER_KMER_DATA_HPP__
-#include "data_structures/mph_index/kmer_index.hpp"
+#include "utils/mph_index/kmer_index.hpp"
#include "hkmer.hpp"
#include <vector>
diff --git a/src/projects/ionhammer/main.cpp b/src/projects/ionhammer/main.cpp
index cb3f35b..ab6fd5b 100644
--- a/src/projects/ionhammer/main.cpp
+++ b/src/projects/ionhammer/main.cpp
@@ -5,18 +5,18 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/log_writers.hpp"
-#include "io/reads_io/file_reader.hpp"
-#include "io/sam_io/bam_reader.hpp"
-#include "io/reads_io/paired_readers.hpp"
-#include "io/reads_io/osequencestream.hpp"
-#include "io/reads_io/read_processor.hpp"
+#include "io/reads/file_reader.hpp"
+#include "io/sam/bam_reader.hpp"
+#include "io/reads/paired_readers.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "io/reads/read_processor.hpp"
-#include "utils/adt/concurrent_dsu.hpp"
+#include "common/adt/concurrent_dsu.hpp"
-#include "dev_support/segfault_handler.hpp"
-#include "dev_support/memory_limit.hpp"
+#include "utils/segfault_handler.hpp"
+#include "utils/memory_limit.hpp"
#include "HSeq.hpp"
#include "kmer_data.hpp"
@@ -27,7 +27,7 @@
#include "expander.hpp"
#include "config_struct.hpp"
-#include "dev_support/openmp_wrapper.h"
+#include "utils/openmp_wrapper.h"
#include "version.hpp"
diff --git a/src/projects/ionhammer/read_corrector.hpp b/src/projects/ionhammer/read_corrector.hpp
index def12aa..e06df5b 100644
--- a/src/projects/ionhammer/read_corrector.hpp
+++ b/src/projects/ionhammer/read_corrector.hpp
@@ -35,7 +35,7 @@
#include <fstream>
#if 1
-#include "data_structures/sequence/nucl.hpp"
+#include "sequence/nucl.hpp"
#include <iostream>
#include <iomanip>
#endif
diff --git a/src/projects/ionhammer/subcluster.cpp b/src/projects/ionhammer/subcluster.cpp
index d5dc0a2..1b27e2f 100644
--- a/src/projects/ionhammer/subcluster.cpp
+++ b/src/projects/ionhammer/subcluster.cpp
@@ -10,7 +10,7 @@
#include "consensus.hpp"
#include "hkmer_distance.hpp"
#include "kmer_data.hpp"
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/log_writers.hpp"
#include <boost/numeric/ublas/matrix.hpp>
diff --git a/src/projects/mph_test/CMakeLists.txt b/src/projects/mph_test/CMakeLists.txt
index 7338861..270854f 100644
--- a/src/projects/mph_test/CMakeLists.txt
+++ b/src/projects/mph_test/CMakeLists.txt
@@ -12,7 +12,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(spades-kmercount
main.cpp)
-target_link_libraries(spades-kmercount spades_modules ${COMMON_LIBRARIES})
+target_link_libraries(spades-kmercount common_modules ${COMMON_LIBRARIES})
if (SPADES_STATIC_BUILD)
set_target_properties(spades-kmercount PROPERTIES LINK_SEARCH_END_STATIC 1)
diff --git a/src/projects/mph_test/main.cpp b/src/projects/mph_test/main.cpp
index c638f77..e5421c1 100644
--- a/src/projects/mph_test/main.cpp
+++ b/src/projects/mph_test/main.cpp
@@ -5,14 +5,13 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/logger/log_writers.hpp"
-#include "dev_support/segfault_handler.hpp"
-#include "data_structures/indices/perfect_hash_map.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
-#include "data_structures/mph_index/kmer_index_builder.hpp"
+#include "utils/logger/log_writers.hpp"
+#include "utils/segfault_handler.hpp"
+#include "utils/indices/perfect_hash_map.hpp"
+#include "utils/mph_index/kmer_index_builder.hpp"
-#include "io/reads_io/read_processor.hpp"
-#include "io/reads_io/io_helper.hpp"
+#include "io/reads/read_processor.hpp"
+#include "io/reads/io_helper.hpp"
#include "version.hpp"
@@ -31,15 +30,15 @@ void create_console_logger() {
attach_logger(lg);
}
-class SimplePerfectHashMap : public debruijn_graph::KeyIteratingMap<runtime_k::RtSeq, uint32_t> {
- using base = debruijn_graph::KeyIteratingMap<runtime_k::RtSeq, uint32_t>;
+class SimplePerfectHashMap : public debruijn_graph::KeyIteratingMap<RtSeq, uint32_t> {
+ using base = debruijn_graph::KeyIteratingMap<RtSeq, uint32_t>;
public:
SimplePerfectHashMap(size_t k, const std::string &workdir)
: base(k, workdir) {}
};
-class ParallelSortingSplitter : public KMerSortingSplitter<runtime_k::RtSeq> {
- using Seq = runtime_k::RtSeq;
+class ParallelSortingSplitter : public KMerSortingSplitter<RtSeq> {
+ using Seq = RtSeq;
std::vector<std::string> files_;
unsigned nthreads_;
@@ -67,7 +66,7 @@ class ParallelSortingSplitter : public KMerSortingSplitter<runtime_k::RtSeq> {
unsigned thread_id = omp_get_thread_num();
bool stop = false;
- runtime_k::RtSeq kmer = seq.start<runtime_k::RtSeq>(this->K_) >> 'A';
+ RtSeq kmer = seq.start<RtSeq>(this->K_) >> 'A';
for (size_t j = this->K_ - 1; j < seq.size(); ++j) {
kmer <<= seq[j];
stop |= splitter_.push_back_internal(kmer, thread_id);
@@ -110,6 +109,8 @@ class ParallelSortingSplitter : public KMerSortingSplitter<runtime_k::RtSeq> {
}
INFO("Total " << filler.processed() << " reads processed");
+ this->ClearBuffers();
+
return out;
}
};
@@ -169,7 +170,7 @@ int main(int argc, char* argv[]) {
for (const auto& s : input)
splitter.push_back(s);
}
- KMerDiskCounter<runtime_k::RtSeq> counter(workdir, splitter);
+ KMerDiskCounter<RtSeq> counter(workdir, splitter);
counter.CountAll(16, nthreads);
INFO("K-mer counting done, kmers saved to " << counter.GetFinalKMersFname());
} catch (std::string const &s) {
diff --git a/src/projects/mts/CMakeLists.txt b/src/projects/mts/CMakeLists.txt
new file mode 100644
index 0000000..1e06d4b
--- /dev/null
+++ b/src/projects/mts/CMakeLists.txt
@@ -0,0 +1,57 @@
+############################################################################
+# Copyright (c) 2015-2016 Saint Petersburg State University
+# Copyright (c) 2011-2014 Saint Petersburg Academic University
+# All Rights Reserved
+# See file LICENSE for details.
+############################################################################
+
+cmake_minimum_required(VERSION 2.8)
+
+project(kmer_count_filter CXX)
+
+include_directories(kmc_api)
+include_directories(${CMAKE_SOURCE_DIR}/include)
+include_directories(${EXT_DIR}/include)
+include_directories(${CMAKE_SOURCE_DIR}/debruijn)
+
+add_executable(kmer_multiplicity_counter
+ kmc_api/kmc_file.cpp
+ kmc_api/kmer_api.cpp
+ kmc_api/mmer.cpp
+ kmer_multiplicity_counter.cpp)
+
+target_link_libraries(kmer_multiplicity_counter common_modules utils input getopt_pp ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(kmer_multiplicity_counter PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+add_executable(prop_binning
+ propagate.cpp
+ read_binning.cpp
+ prop_binning.cpp)
+
+target_link_libraries(prop_binning common_modules nlopt BamTools ssw getopt_pp ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(prop_binning PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+add_executable(stats
+ stats.cpp)
+
+target_link_libraries(stats common_modules nlopt BamTools ssw getopt_pp ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(stats PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
+
+add_executable(contig_abundance_counter
+ contig_abundance_counter.cpp
+ contig_abundance.cpp)
+
+target_link_libraries(contig_abundance_counter common_modules nlopt BamTools ssw getopt_pp ${COMMON_LIBRARIES})
+
+if (SPADES_STATIC_BUILD)
+ set_target_properties(contig_abundance_counter PROPERTIES LINK_SEARCH_END_STATIC 1)
+endif()
diff --git a/src/projects/mts/Common.snake b/src/projects/mts/Common.snake
new file mode 100644
index 0000000..6cd6a50
--- /dev/null
+++ b/src/projects/mts/Common.snake
@@ -0,0 +1,69 @@
+configfile: "config.yaml"
+
+from itertools import chain
+from functools import partial
+import os.path
+
+from scripts.common import detect_reads
+
+#Config parameters
+IN = config["IN"]
+LOCAL_DIR = config["LOCAL_DIR"]
+SPADES = config.get("SPADES", LOCAL_DIR)
+SPADES_REASSEMBLY = config.get("SPADES_REASSEMBLY", LOCAL_DIR)
+BIN = config.get("BIN", os.path.join(LOCAL_DIR, "build/release/bin"))
+SCRIPTS = config.get("SCRIPTS", os.path.join(LOCAL_DIR, "src/projects/mts/scripts"))
+SOFT = config["SOFT"]
+K = int(config.get("K", 55))
+SMALL_K = int(config.get("SMALL_K", 21))
+MIN_CONTIG_LENGTH = int(config.get("MIN_CONTIG_LENGTH", 2000))
+THREADS = config.get("THREADS", 16)
+BINNER = config.get("BINNER", "canopy")
+
+#Autodetect samples and their reads
+SAMPLE_DIRS = set(glob_wildcards(os.path.join(IN, "{sample,sample\d+}"))[0])
+SAMPLE_COUNT = len(SAMPLE_DIRS)
+SAMPLES = list()
+for i in range(1, SAMPLE_COUNT + 1):
+ sample_name = "sample" + str(i)
+ if sample_name not in SAMPLE_DIRS:
+ raise WorkflowError("Samples must be consecutive; missing " + sample_name)
+ SAMPLES.append(sample_name)
+
+SAMPLE_READS = dict(map(lambda sample: (sample, detect_reads(os.path.join(IN, sample))), SAMPLES))
+
+#Group samples
+GROUP_SAMPLES = config.get("GROUPS", [])
+USED_SAMPLES = set(chain(*GROUP_SAMPLES))
+#TODO: double-check
+#Replace the wildcard group with unused samples
+if GROUP_SAMPLES and GROUP_SAMPLES[-1] == "*":
+ GROUP_SAMPLES[-1] = [sample for sample in SAMPLES if sample not in USED_SAMPLES]
+#Otherwise, add a single-sample group from the rest of the samples
+else:
+ for sample in SAMPLES:
+ if sample not in USED_SAMPLES:
+ GROUP_SAMPLES.append([sample])
+
+GROUPS = dict()
+group_id = 1
+for group in GROUP_SAMPLES:
+ if len(group) == 1:
+ key = group[0]
+ else:
+ key = "group" + str(group_id)
+ #SAMPLE_READS[key] = ["reads/{}/{}.fastq".format(key, dir) for dir in ["left", "right"]]
+ SAMPLE_READS[key] = ([SAMPLE_READS[s][0] for s in group], [SAMPLE_READS[s][1] for s in group])
+ group_id += 1
+ GROUPS[key] = group
+
+#Helpers for locating input files
+def sample_reads(dir, wildcards):
+ res = SAMPLE_READS[wildcards.sample][dir]
+ if res is str:
+ return [res]
+ else:
+ return res
+
+left_reads = partial(sample_reads, 0)
+right_reads = partial(sample_reads, 1)
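
The grouping logic in Common.snake above merges explicitly configured groups with singleton groups for the leftover samples and pools their left/right reads per group. A minimal standalone sketch of that behaviour, using hypothetical sample names and read paths (not part of this patch), is:

    from itertools import chain

    # Hypothetical inputs: three samples with paired reads.
    SAMPLES = ["sample1", "sample2", "sample3"]
    SAMPLE_READS = {s: ("in/%s/left.fastq" % s, "in/%s/right.fastq" % s) for s in SAMPLES}

    # GROUPS from config; a trailing "*" means "all samples not grouped so far".
    GROUP_SAMPLES = [["sample1", "sample2"], "*"]

    USED_SAMPLES = set(chain(*GROUP_SAMPLES))  # the "*" character lands here too, harmlessly
    if GROUP_SAMPLES and GROUP_SAMPLES[-1] == "*":
        GROUP_SAMPLES[-1] = [s for s in SAMPLES if s not in USED_SAMPLES]
    else:
        for s in SAMPLES:
            if s not in USED_SAMPLES:
                GROUP_SAMPLES.append([s])

    GROUPS = {}
    group_id = 1
    for group in GROUP_SAMPLES:
        key = group[0] if len(group) == 1 else "group" + str(group_id)
        # Pooled reads for the group: ([left files...], [right files...])
        SAMPLE_READS[key] = ([SAMPLE_READS[s][0] for s in group],
                             [SAMPLE_READS[s][1] for s in group])
        group_id += 1
        GROUPS[key] = group

    print(GROUPS)
    # {'group1': ['sample1', 'sample2'], 'sample3': ['sample3']}
    print(SAMPLE_READS["group1"][0])
    # ['in/sample1/left.fastq', 'in/sample2/left.fastq']
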
diff --git a/src/projects/mts/README b/src/projects/mts/README
new file mode 100644
index 0000000..5e8e6d3
--- /dev/null
+++ b/src/projects/mts/README
@@ -0,0 +1,21 @@
+1. Installing Snakemake
+If you have properly installed Python 3.3+, just
+> easy_install3 snakemake
+or
+> pip3 install snakemake
+In case you have to install Python 3 yourself, we recommend using the Miniconda Python 3 distribution.
+With Miniconda installed, you can issue
+> conda install -c bioconda snakemake
+
+2. Running MTS
+Make a directory for output, place config.yaml there, and configure it. Then run
+> snakemake --directory <output directory> --cores XX
+
+3. Gathering stats
+To render some interesting info, you need to specify some references in config:
+REFS: path
+or
+REFS: [path1, path2, ...]
+where path can be either a single reference or a folder with references.
+Then run the stats target manually:
+> snakemake --directory <output directory> stats_all
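
For step 2, the keys read by Common.snake indicate what config.yaml has to contain; only IN, LOCAL_DIR and SOFT are required, the rest fall back to defaults. A hedged sketch that writes such a minimal configuration with placeholder paths (requires PyYAML; adjust paths to your setup) is:

    import yaml  # PyYAML

    config = {
        "IN": "/path/to/samples",            # directory with sample1, sample2, ... subdirectories
        "LOCAL_DIR": "/path/to/spades/checkout",
        "SOFT": "/path/to/third-party/bin",  # kmc, cc.bin, ...
        "K": 55,
        "SMALL_K": 21,
        "MIN_CONTIG_LENGTH": 2000,
        "THREADS": 16,
        "BINNER": "canopy",
        # Optional, used only for stats: "REFS": [...], "QUAST": "/path/to/quast_dir"
    }

    with open("config.yaml", "w") as out:
        yaml.dump(config, out, default_flow_style=False)
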
diff --git a/src/projects/mts/Snakefile b/src/projects/mts/Snakefile
new file mode 100644
index 0000000..25553b4
--- /dev/null
+++ b/src/projects/mts/Snakefile
@@ -0,0 +1,175 @@
+include: "Common.snake"
+
+import os
+import os.path
+
+from scripts.common import dump_dict
+
+#Path to saves of necessary assembly stage
+SAVES = "K{0}/saves/01_before_repeat_resolution/graph_pack".format(K)
+
+onstart:
+ try:
+ os.mkdir("tmp")
+ except:
+ pass
+ print("Detected", SAMPLE_COUNT, "samples in", IN)
+ print("They form: ", GROUPS)
+
+# ---- Main pipeline -----------------------------------------------------------
+
+rule all:
+ input: dynamic("reassembly/{cag}.fasta")
+ message: "Dataset of {SAMPLE_COUNT} samples from {IN} has been processed."
+
+rule assemble:
+ input: left=left_reads, right=right_reads
+ output: "assembly/{sample}.fasta"
+ #TODO: remove this boilerplate
+ params: left=lambda w: " ".join(expand("-1 {r}", r=left_reads(w))),
+ right=lambda w: " ".join(expand("-2 {r}", r=right_reads(w))),
+ dir="assembly/{sample}"
+ log: "assembly/{sample}.log"
+ threads: THREADS
+ message: "Assembling {wildcards.sample} with SPAdes"
+ shell: "{SPADES}/spades.py --meta -m 400 -t {threads} {params.left} {params.right}"
+ " -o {params.dir} >{log} 2>&1 && "
+ "cp {params.dir}/scaffolds.fasta {output}"
+
+rule assemble_all:
+ input: expand("assembly/{sample}.fasta", sample=GROUPS)
+ message: "Assembled all samples"
+
+rule descriptions:
+ output: expand("profile/{sample}.desc", sample=SAMPLES)
+ message: "Generating sample descriptions"
+ run:
+ for sample in SAMPLES:
+ with open("profile/{}.desc".format(sample), "w") as out:
+ wildcards.sample = sample
+ print(left_reads(wildcards), file=out)
+ print(right_reads(wildcards), file=out)
+
+rule kmc:
+ input: "profile/{sample}.desc"
+ output: temp("tmp/{sample}.kmc_pre"), temp("tmp/{sample}.kmc_suf")
+ params: min_mult=2, tmp="tmp/{sample}_kmc", out="tmp/{sample}"
+ log: "profile/kmc_{sample}.log"
+ threads: THREADS
+ message: "Running kmc for {wildcards.sample}"
+ shell: "mkdir {params.tmp} && "
+ "{SOFT}/kmc -k{SMALL_K} -t{threads} -ci{params.min_mult} -cs65535"
+ " @{input} {params.out} {params.tmp} >{log} 2>&1 && "
+ "rm -rf {params.tmp}"
+
+rule multiplicities:
+ input: expand("tmp/{sample}.kmc_pre", sample=SAMPLES), expand("tmp/{sample}.kmc_suf", sample=SAMPLES)
+ output: "profile/kmers.kmm"
+ params: kmc_files=" ".join(expand("tmp/{sample}", sample=SAMPLES)), out="profile/kmers"
+ log: "profile/kmers.log"
+ message: "Gathering {SMALL_K}-mer multiplicities from all samples"
+ shell: "{BIN}/kmer_multiplicity_counter -n {SAMPLE_COUNT} -k {SMALL_K} -s 3"
+ " -f tmp -t {threads} -o {params.out} >{log} 2>&1 && "
+ "rm tmp/*.sorted"
+
+rule profile:
+ input: contigs="assembly/{sample,\w+\d+}.fasta", mpl="profile/kmers.kmm"
+ output: id="profile/{sample}.id", mpl="profile/{sample}.mpl", splits= "assembly/{sample}_splits.fasta"
+ log: "profile/{sample}.log"
+ message: "Counting contig abundancies for {wildcards.sample}"
+ shell: "{BIN}/contig_abundance_counter -k {SMALL_K} -w tmp -c {input.contigs}"
+ " -n {SAMPLE_COUNT} -m profile/kmers -o profile/{wildcards.sample}"
+ " -f {output.splits} -l {MIN_CONTIG_LENGTH} >{log} 2>&1"
+
+rule binning_pre:
+ input: expand("profile/{sample}.id", sample=GROUPS)
+ output: "binning/{binner}/profiles.in"
+ params: " ".join(list(GROUPS.keys()))
+ message: "Preparing input for {wildcards.binner}"
+ shell: "{SCRIPTS}/make_input.py -t {wildcards.binner} -d profile -o {output} {params}"
+
+rule canopy:
+ input: "binning/canopy/profiles.in"
+ output: out="binning/canopy/binning.out", prof="binning/canopy/bins.prof"
+ threads: THREADS
+ message: "Running canopy clustering"
+ shell: "{SOFT}/cc.bin -n {threads} -i {input} -o {output.out} -c {output.prof} >binning/canopy/canopy.log 2>&1"
+
+rule combine_splits:
+ input: expand("assembly/{sample}_splits.fasta", sample=GROUPS)
+ output: "assembly/samples_splits.fasta"
+ message: "Combine splitted contigs"
+ shell: "{SCRIPTS}/combine_contigs.py -r {input} > {output}"
+
+#FIXME what does gt1000 mean?
+rule concoct:
+ input: contigs=rules.combine_splits.output[0], profiles="binning/concoct/profiles.in"
+ output: out="binning/concoct/clustering_gt1000.csv"
+ params: "binning/concoct"
+ message: "Running CONCOCT clustering"
+ shell: "mkdir -p {params} && "
+ "set +u; source activate concoct_env; set -u && "
+ "concoct --composition_file {input.contigs} --coverage_file {input.profiles} -b {params}"
+
+binning_inputs = {"canopy": rules.canopy.output.out, "concoct": rules.concoct.output.out}
+
+rule binning_post:
+ input: binning_inputs[BINNER]
+ output: expand("annotation/{sample}.ann", sample=GROUPS)
+ message: "Preparing raw annotations"
+ shell: "{SCRIPTS}/parse_output.py -t {BINNER} -o annotation {input}"
+
+#Post-clustering pipeline
+rule read_binning:
+ input: contigs="assembly/{sample}.fasta", ann="annotation/{sample}.ann",
+ left=left_reads, right=right_reads
+ output: "propagation/{sample}_edges.ann"
+ params: saves=os.path.join("assembly/{sample}/", SAVES),
+ splits="assembly/{sample}_splits.fasta",
+ out="propagation/{sample}_edges",
+ group=lambda wildcards: GROUPS[wildcards.sample]
+ #left=" ".join(input.left), right=" ".join(input.right)
+ log: "binning/{sample}.log"
+ message: "Propagating annotation & binning reads for {wildcards.sample}"
+ shell:
+ "{BIN}/prop_binning -k {K} -s {params.saves} -c {input.contigs}"
+ " -n {params.group} -l {input.left} -r {input.right}"
+ " -a {input.ann} -f {params.splits} -o binning -d {params.out} >{log} 2>&1"
+
+#TODO: bin profiles for CONCOCT
+rule choose_samples:
+ input: binned=expand("propagation/{sample}_edges.ann", sample=GROUPS),
+ prof=rules.canopy.output.prof
+ output: dynamic("binning/{cag}/left.fastq"),
+ dynamic("binning/{cag}/right.fastq")
+ log: "binning/choose_samples.log"
+ message: "Choosing samples for all CAGs"
+ shell: "{SCRIPTS}/choose_samples.py {input.prof} binning/ >{log} 2>&1"
+
+rule reassembly_config:
+ input: "binning/{cag}/left.fastq"
+ output: "reassembly/{cag}.yaml"
+ message: "Generated config file for reassembly of {wildcards.cag}"
+ run:
+ with open(output[0], "w") as outfile:
+ conf = {"k": SMALL_K, "sample_cnt": SAMPLE_COUNT,
+ "kmer_mult": str(rules.multiplicities.params.out),
+ "bin": wildcards.cag, "bin_prof": str(rules.canopy.output.prof),
+ "edges_sqn": "profile/{}_edges.fasta".format(wildcards.cag),
+ "edges_mpl": "profile/{}_edges.mpl".format(wildcards.cag),
+ "edge_fragments_mpl": "profile/{}_edges_frag.mpl".format(wildcards.cag),
+ "frag_size": 10000, "min_len": 100}
+ dump_dict(conf, outfile)
+
+rule reassemble:
+ input: left="binning/{cag}/left.fastq", right="binning/{cag}/right.fastq",
+ config="reassembly/{cag}.yaml"
+ output: "reassembly/{cag}.fasta"
+ params: "reassembly/reassembly_{cag}"
+ log: "reassembly/reassembly_{cag}.log"
+ threads: THREADS
+ message: "Reassembling reads for {wildcards.cag}"
+ shell: "{SPADES_REASSEMBLY}/spades.py --meta -t {threads}"
+ " --pe1-1 {input.left} --pe1-2 {input.right} --pe1-ff"
+ " -o {params} --series-analysis {input.config} >{log} 2>&1 && "
+ "cp {params}/scaffolds.fasta {output}"
diff --git a/src/projects/mts/Stats.snake b/src/projects/mts/Stats.snake
new file mode 100644
index 0000000..5019433
--- /dev/null
+++ b/src/projects/mts/Stats.snake
@@ -0,0 +1,270 @@
+include: "Common.snake"
+
+import os
+import os.path
+
+import pandas
+from pandas import DataFrame
+
+from scripts.common import gather_refs, dump_dict
+
+#Additional config parameters
+try:
+ QUAST_DIR = config["QUAST"]
+ QUAST = os.path.join(QUAST_DIR, "quast.py")
+ METAQUAST = os.path.join(QUAST_DIR, "metaquast.py")
+except KeyError:
+ QUAST = "quast"
+ METAQUAST = "metaquast"
+
+#Autodetect bins
+CAGS, = glob_wildcards("binning/{cag,CAG\d+}/left.fastq")
+CAGS.sort()
+
+CAG_EDGES = [c + "_edges" for c in CAGS]
+
+#Detect references
+REFS = dict(gather_refs(config.get("REFS", [])))
+ALL_REFS = ",".join(path for path in REFS.values())
+
+FRAGMENT_NAMES_BY_TYPE = {"reassembly": CAG_EDGES,
+ "initial_assembly": list(GROUPS.keys())}
+
+def ref_path(wildcards):
+ return REFS[wildcards.ref]
+
+onstart:
+ try:
+ os.mkdir("tmp")
+ except:
+ pass
+ print("Detected", SAMPLE_COUNT, "samples in", IN)
+ if CAGS:
+ print("Detected good (abundant) CAGs:", " ".join(CAGS))
+ if REFS:
+ print("Detected references:", " ".join(REFS))
+
+#===============================================================================
+#---- Statistics section -------------------------------------------------------
+#===============================================================================
+
+#---- Single alignments for samples per reference -------------------------------
+#TODO: use alignments from meta version instead
+rule quast_all_samples:
+ input: ref_fn=ref_path, contigs=expand("assembly/{sample}.fasta", sample=GROUPS)
+ output: summary_tsv="stats/summary/q_{ref}.tsv", report="stats/initial_assembly/{ref}/report.txt"
+ params: "stats/initial_assembly/{ref}"
+ log: "stats/initial_assembly/{ref}/quast.log"
+ threads: THREADS
+ message: "Aligning all samples on {wildcards.ref}"
+ shell: "{QUAST} -t {threads} -R {input.ref_fn} {input.contigs} -o {params} >/dev/null 2>&1 && "
+ "cp {params}/report.tsv {output.summary_tsv}"
+
+rule quast_all_reassemblies:
+ input: ref=ref_path, fragments=expand("profile/{cag_edges}.fasta", cag_edges=CAG_EDGES)
+ output: "stats/reassembly/{ref}/report.txt"
+ params: "stats/reassembly/{ref}"
+ log: "stats/reassembly/{ref}/quast.log"
+ threads: THREADS
+ message: "Aligning all samples on {wildcards.ref}"
+ shell: "{QUAST} -t {threads} -R {input.ref} {input.fragments} -o {params} >/dev/null 2>&1 && "
+ "cp {params}/report.tsv {output}"
+
+#---- Contigs of interest ------------------------------------------------------
+rule filter_ref_alignments:
+ input: "{path}/report.txt"
+ output: "{path}/{fragments}.info"
+ params: "{path}/contigs_reports/nucmer_output/{fragments}.coords.filtered"
+ shell: "if [ -f {params} ] ; then {SCRIPTS}/filter_nucmer.py {params} {output} {MIN_CONTIG_LENGTH} 70 ; else touch {output} ; fi"
+
+#---- GF of combined sample ----------------------------------------------------
+#rule combine_filtered:
+# input: contigs=expand("assembly/{sample}.fasta", sample=GROUPS),
+# filters=expand("stats/{{ref}}/{sample}.cont", sample=GROUPS)
+# output: "stats/{ref}.fasta"
+# message: "Gathering all interesting contigs for {wildcards.ref} into a single assembly"
+# shell: "{SCRIPTS}/filter_contigs.py {SAMPLE_COUNT} {output} {input.contigs} {input.filters}"
+
+rule quast_combined:
+ input: ref=ref_path, contigs="stats/{ref}.fasta"
+ output: "stats/q_{ref}_all/report.tsv"
+ params: "stats/q_{ref}_all"
+ log: "stats/q_{ref}_all.log"
+ threads: THREADS
+ message: "Aligning combined sample on {wildcards.ref}"
+ shell: "{QUAST} -t {threads} -R {input.ref} {input.contigs} -o {params} >{log} 2>&1"
+
+# Run this
+rule quast_combined_all:
+ input: expand("stats/q_{ref}_all/report.tsv", ref=REFS)
+ message: "Calculated QUAST metrics on all combined samples"
+
+#---- Bins of interest ---------------------------------------------------------
+rule int_bins:
+ input: "annotation/{sample}.ann", "stats/{ref}/{sample}.info"
+ output: "stats/{ref}/{sample}.bin"
+ message: "Filtering interesting bins for {wildcards.sample} aligned to {wildcards.ref}"
+ shell: "{SCRIPTS}/filter_bins.py {input} > {output}"
+
+rule int_bins_all_samples:
+ input: expand("stats/{{ref}}/{sample}.bin", sample=GROUPS)
+ output: "stats/{ref}/total.bin"
+ message: "Gathering interesting bins for {wildcards.ref} from all samples"
+ run:
+ bins = set()
+ for in_fn in input:
+ with open(in_fn) as infile:
+ for line in infile:
+ bins.add(line)
+ with open(output[0], "w") as outfile:
+ for bin in bins:
+ print(bin, file=outfile)
+
+# Run this
+rule int_bins_all:
+ input: expand("stats/{ref}/total.bin", ref=REFS)
+ message: "Gathered all interesting bins"
+
+#---- GF per bin per reference -------------------------------------------------
+#Helper formatters for determining input files from different stages
+PROP = {"prelim": ("assembly/{}_splits.fasta", "annotation/{}.ann"),
+ "prop": ("propagation/{}_edges.fasta", "propagation/{}_edges.ann")}
+
+#TODO: split into different directories per sample
+rule split_bins:
+ input: lambda w: PROP[w.prop][0].format(w.sample),
+ lambda w: PROP[w.prop][1].format(w.sample)
+ output: touch("binning/{prop}/{sample}.log")
+ log: "binning/{prop}/split_{sample}.log"
+ params: "binning/{prop}"
+ message: "Splitting assembly of {wildcards.sample} between {wildcards.prop} bins"
+ shell: "{SCRIPTS}/split_bins.py {input} {params} >{log}"
+
+rule cat_binned_contigs:
+ input: expand("binning/{{prop}}/{sample}.log", sample=SAMPLES)
+ output: "binning/{prop}/{cag,CAG\d+}.fasta"
+ params: "`ls binning/{prop}/*-{cag}.fasta`"
+ message: "Combine binned contigs ({wildcards.prop}) for {wildcards.cag}"
+ shell: "cat {params} > {output}"
+
+#Two helpers for determining dependencies of QUAST targets.
+#For split contigs and reassemblies, we need only corresponding FASTA.
+#For combined contigs, we need to glue their split pieces first.
+def stats_input(wildcards):
+ if wildcards.stage == "reassembly":
+ return expand("reassembly/{cag}.fasta", cag=CAGS)
+ w_bin, w_prop = wildcards.stage.split("_", 2)
+ if w_bin == "split":
+ return expand("binning/{prop}/{sample}.log", prop=w_prop, sample=GROUPS)
+ elif w_bin == "bin":
+ return expand("binning/{prop}/{cag}.fasta", prop=w_prop, cag=CAGS)
+
+def stats_data(wildcards):
+ if wildcards.stage == "reassembly":
+ return "`ls reassembly/CAG*.fasta`"
+ w_bin, w_prop = wildcards.stage.split("_", 2)
+ masks = {"bin": "CAG*", "split": "*-CAG*"}
+ return "`ls binning/{}/{}.fasta`".format(w_prop, masks[w_bin])
+
+rule quast_stats:
+ input: stats_input
+ output: "stats/summary/gf_{stage}.tsv"
+ params: data=stats_data, out="stats/q_{stage}"
+ log: "stats/q_{stage}.log"
+ threads: THREADS
+ message: "Aligning {wildcards.stage} assemblies on all references"
+ shell: "{METAQUAST} -t {threads} -R {ALL_REFS} {params.data} -o {params.out} >{log} 2>&1 && "
+ "cp '{params.out}/summary/TSV/Genome_fraction_(%).tsv' {output}"
+
+# Run this AFTER 'all'
+rule stats_all:
+ input: expand("stats/summary/gf_{bin}_{prop}.tsv", bin=["bin"], prop=["prelim", "prop"]),
+ "stats/initial_assembly/total.cont"
+ message: "Gathered some numbers, deal with them."
+
+#---- Reassembly statistics ----------------------------------------------------
+
+# Run this AFTER 'reassembly_all'
+rule stats_reassembly:
+ input: "stats/summary/gf_reassembly.tsv",
+ "stats/reassembly/total.cont"
+ output: "stats/summary/reassembly.tsv"
+ params: "stats/q_reassembly"
+ message: "Gathered bins stats"
+ shell: "{SCRIPTS}/gather_stats.py {params} > {output}"
+
+#---- Propagator statistics ----------------------------------------------------
+rule prop_stats:
+ input: prelim="annotation/{sample}.ann", prop="annotation/{sample}_edges.ann",
+ contigs="assembly/{sample}.fasta", edges="assembly/{sample}_edges.fasta",
+ ref=REFS.values() #, bins="{sample}/{ref}.bin"
+ output: "stats/prop_{cag}/{sample}.tsv"
+ log: "stats/prop_{cag}/{sample}.log"
+ message: "Calculating propagation statistics for {wildcards.sample}"
+ shell: "{BIN}/stats -k {K} -s {wildcards.sample}/assembly/{SAVES} -r {input.ref}"
+ " -c {input.contigs} -a {input.prelim} -e {input.edges} -p {input.prop}"
+ " -b {wildcards.cag} -o {output} >{log}"
+
+# Run this
+rule prop_stats_all:
+ input: expand("stats/prop_{cag}/{sample}.tsv", sample=GROUPS, cag=CAGS)
+ message: "Calculated propagation statistics"
+
+#---- CheckM stats -------------------------------------------------------------
+rule checkm:
+ input: expand("reassembly/{cag}.fasta", cag=CAGS)
+ output: qa="stats/checkm/qa.tsv", tree_qa="stats/checkm/tree_qa.tsv"
+ params: dir="stats/checkm"
+ threads: THREADS
+ shell: "set +u; source activate concoct_env; set -u \n"
+ "checkm tree -x fasta reassembly {params.dir} \n"
+ "checkm tree_qa -o 2 --tab_table -f {output.tree_qa} {params.dir}\n"
+ "checkm lineage_set {params.dir} {params.dir}/lineage.ms\n"
+ "checkm analyze -x fasta {params.dir}/lineage.ms reassembly {params.dir}\n"
+ "checkm qa -o 2 --tab_table -f {output.qa} {params.dir}/lineage.ms {params.dir}"
+
+rule parse_checkm:
+ input: qa=rules.checkm.output.qa, tree_qa=rules.checkm.output.tree_qa
+ output: "stats/summary/checkm.tsv"
+ #shell: "{SCRIPTS}/parse_checkm.py {input.qa} {input.tree_qa} > {output}"
+ run:
+ table = pandas.read_table(input.qa, dtype="str")
+ tree_table = pandas.read_table(input.tree_qa, dtype="str", na_filter=False)
+ all_table = pandas.merge(table, tree_table, on="Bin Id")
+ res_table = all_table[["Bin Id", "Taxonomy (contained)", "Taxonomy (sister lineage)", "Genome size (Mbp)", "Completeness", "Contamination"]].copy()
+ def extract_taxon(taxonomy):
+ return str(taxonomy).split(";")[-1]
+ for column in ["Taxonomy (contained)", "Taxonomy (sister lineage)"]:
+ res_table[column] = res_table[column].apply(extract_taxon)
+ res_table.to_csv(output[0], index=False, sep="\t")
+
+#---- PCA ----------------------------------------------------------------------
+rule pca:
+ input: "profile/canopy.in", "profile/canopy.out", "{sample}.cont"
+ output: "stats/{sample}.png"
+ message: "Doing some visualization"
+ shell:
+ "Rscript {SCRIPTS}/pca.R {input} {output}"
+
+def fragments_info_by_assembly_type(wildcards):
+ frags=FRAGMENT_NAMES_BY_TYPE[wildcards.assembly_type]
+ return expand("stats/{assembly_type}/{ref}/{fragments}.info", assembly_type=wildcards.assembly_type, ref=wildcards.ref, fragments=frags)
+
+rule combine_fragments_info:
+ input: fragments_info_by_assembly_type
+ output: "stats/{assembly_type}/{ref}/ref.cont"
+ shell: "rm -rf {output}; for f in {input}; do name=$(basename $f .info); cat $f | sed 's/^/'$name'-/g' >> {output} ; done"
+
+rule combine_refs_info:
+ input: expand("stats/{{assembly_type}}/{ref}/ref.cont", ref=list(REFS.keys()))
+ output: "stats/{assembly_type}/total.cont"
+ run:
+ shell("rm -rf {output}")
+ for ref in REFS.keys():
+ shell("awk '{{print $0 \"\t{ref}\"}}' stats/{wildcards.assembly_type}/{ref}/ref.cont >> {output}")
+
+# Run this
+rule pca_total:
+ input: "binning/canopy/profiles.in", "binning/canopy/binning.out", "stats/total.cont"
+ output: "stats/summary/pca.png"
+ shell: "Rscript {SCRIPTS}/pca.R {input} {output}"
diff --git a/src/projects/mts/annotation.hpp b/src/projects/mts/annotation.hpp
new file mode 100644
index 0000000..fa9ccf8
--- /dev/null
+++ b/src/projects/mts/annotation.hpp
@@ -0,0 +1,310 @@
+//***************************************************************************
+//* Copyright (c) 2015-2016 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "utils/standard_base.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
+#include "io/reads/io_helper.hpp"
+#include "formats.hpp"
+
+namespace debruijn_graph {
+
+class AnnotationStream {
+ std::ifstream inner_stream_;
+ std::string line_;
+
+ ContigAnnotation Parse(const std::string& s) const {
+ ContigAnnotation annotation;
+ stringstream ss(s);
+ ss >> annotation.first;
+ string delim;
+ ss >> delim;
+ VERIFY(delim == ":");
+ while (true) {
+ bin_id bin;
+ ss >> bin;
+ if (ss.fail())
+ break;
+ annotation.second.push_back(bin);
+ }
+ return annotation;
+ }
+
+public:
+
+ AnnotationStream(const std::string& fn) : inner_stream_(fn) {
+ std::getline(inner_stream_, line_);
+ }
+
+ bool eof() const {
+ return inner_stream_.eof();
+ }
+
+ AnnotationStream& operator >>(ContigAnnotation& annotation) {
+ VERIFY(!inner_stream_.eof())
+
+ annotation = Parse(line_);
+ std::getline(inner_stream_, line_);
+ return *this;
+ }
+
+ void close() {
+ inner_stream_.close();
+ }
+};
+
+class AnnotationOutStream {
+ std::ofstream inner_stream_;
+public:
+
+ AnnotationOutStream(const std::string& fn) : inner_stream_(fn) {
+ }
+
+ AnnotationOutStream& operator <<(const ContigAnnotation& annotation) {
+ inner_stream_ << annotation.first;
+ string delim = " : ";
+ for (bin_id bin : annotation.second) {
+ inner_stream_ << delim << bin;
+ delim = " ";
+ }
+ inner_stream_ << endl;
+ return *this;
+ }
+
+ void close() {
+ inner_stream_.close();
+ }
+};
+
+class EdgeAnnotation {
+ const conj_graph_pack& gp_;
+ set<bin_id> bins_of_interest_;
+ map<EdgeId, set<bin_id>> edge_annotation_;
+
+ template<class BinCollection>
+ void InnerStickAnnotation(EdgeId e, const BinCollection& bins) {
+ edge_annotation_[e].insert(bins.begin(), bins.end());
+ }
+
+public:
+
+ EdgeAnnotation(const conj_graph_pack& gp,
+ const set<bin_id>& bins_of_interest) :
+ gp_(gp),
+ bins_of_interest_(bins_of_interest)
+ {
+ }
+
+ template<class BinCollection>
+ void StickAnnotation(EdgeId e, const BinCollection& bins) {
+ InnerStickAnnotation(e, bins);
+ InnerStickAnnotation(gp_.g.conjugate(e), bins);
+ }
+
+ void StickAnnotation(EdgeId e, const bin_id& bin) {
+ StickAnnotation(e, vector<bin_id>{bin});
+ }
+
+ template<class EdgeCollection>
+ void StickAnnotation(const EdgeCollection& edges, const bin_id& bin) {
+ for (EdgeId e : edges) {
+ StickAnnotation(e, bin);
+ }
+ }
+
+ vector<bin_id> Annotation(EdgeId e) const {
+ if (!edge_annotation_.count(e)) {
+ return {};
+ }
+ const auto& annotation = get(edge_annotation_, e);
+ return vector<bin_id>(annotation.begin(), annotation.end());
+ }
+
+ set<bin_id> RelevantBins(const vector<EdgeId>& path) const {
+ set<bin_id> answer;
+ for (EdgeId e : path) {
+ insert_all(answer, Annotation(e));
+ }
+ return answer;
+ }
+
+ set<EdgeId> EdgesOfBin(bin_id bin, size_t min_length = 0) const {
+ set<EdgeId> answer;
+ for (auto ann_pair : edge_annotation_) {
+ if (ann_pair.second.count(bin) &&
+ gp_.g.length(ann_pair.first) > min_length) {
+ answer.insert(ann_pair.first);
+ }
+ }
+ return answer;
+ }
+
+ size_t size() const {
+ return edge_annotation_.size();
+ }
+
+ const set<bin_id>& interesting_bins() const {
+ return bins_of_interest_;
+ }
+
+};
+
+class AnnotationFiller {
+ const conj_graph_pack& gp_;
+ set<bin_id> interesting_bins_;
+ shared_ptr<SequenceMapper<Graph>> mapper_;
+
+ vector<EdgeId> EdgesOfContig(const io::SingleRead& contig) const {
+ return mapper_->MapRead(contig).simple_path();
+ }
+
+ Bins FilterInteresting(const Bins& bins) const {
+ if (interesting_bins_.empty()) {
+ return bins;
+ } else {
+ Bins answer;
+ for (const bin_id& bin : bins) {
+ if (interesting_bins_.count(bin)) {
+ answer.push_back(bin);
+ }
+ }
+ return answer;
+ }
+ }
+
+ map<contig_id, std::set<bin_id>> LoadAnnotation(AnnotationStream& splits_annotation_stream) const {
+ map<contig_id, std::set<bin_id>> annotation_map;
+ INFO("Reading (split) contigs annotation");
+ ContigAnnotation contig_annotation;
+ size_t cnt = 0;
+ while (!splits_annotation_stream.eof()) {
+ splits_annotation_stream >> contig_annotation;
+ auto bins = FilterInteresting(contig_annotation.second);
+ if (!bins.empty()) {
+ insert_all(annotation_map[contig_annotation.first], bins);
+ }
+ ++cnt;
+ }
+ INFO(cnt << " records read; annotation available for " << annotation_map.size() << " splits");
+ return annotation_map;
+ };
+
+ void ProcessSplit(const io::SingleRead& split, std::set<bin_id> bins,
+ map<EdgeId, map<bin_id, size_t>>& coloring) const {
+ auto mapping_path = mapper_->MapRead(split);
+ for (size_t i = 0; i < mapping_path.size(); ++i) {
+ auto map_info = mapping_path[i];
+ MappingRange mr = map_info.second;
+ auto& bin_lens = coloring[map_info.first];
+ for (bin_id b : bins) {
+ bin_lens[b] += mr.mapped_range.size();
+ }
+ }
+ }
+
+ map<EdgeId, map<bin_id, size_t>> FillColorInfo(io::SingleStream& splits_stream,
+ const map<contig_id, std::set<bin_id>>& split_annotation) const {
+ INFO("Sticking annotation to edges");
+ map<EdgeId, map<bin_id, size_t>> answer;
+ io::SingleRead split;
+ while (!splits_stream.eof()) {
+ splits_stream >> split;
+ auto id = GetId(split);
+ auto bins = split_annotation.find(id);
+ if (bins != split_annotation.end() && !(bins->second.empty())) {
+ ProcessSplit(split, bins->second, answer);
+ //TODO think if it is overkill
+ ProcessSplit(!split, bins->second, answer);
+ }
+ }
+ INFO("Color info available for " << answer.size() << " edges");
+ return answer;
+ };
+
+ void FilterSpuriousInfo(map<EdgeId, map<bin_id, size_t>>& coloring) const {
+ for (auto& edge_info : coloring) {
+ size_t edge_len = gp_.g.length(edge_info.first);
+ for (auto color_it = edge_info.second.begin(); color_it != edge_info.second.end(); ) {
+ if (math::ls(double(color_it->second) / double(edge_len), 0.3)) {
+ edge_info.second.erase(color_it++);
+ } else {
+ ++color_it;
+ }
+ }
+ }
+ }
+
+ set<bin_id> GatherAllBins(const map<EdgeId, map<bin_id, size_t>>& coloring) const {
+ set<bin_id> answer;
+ for (const auto& edge_info : coloring) {
+ for (const auto& bin_info : edge_info.second) {
+ answer.insert(bin_info.first);
+ }
+ }
+ return answer;
+ }
+
+ set<bin_id> DetermineBins(const vector<EdgeId>& path,
+ const map<EdgeId, map<bin_id, size_t>>& coloring) const {
+ map<bin_id, size_t> path_colors;
+ size_t total_len = 0;
+ for (EdgeId e : path) {
+ size_t edge_len = gp_.g.length(e);
+ total_len += edge_len;
+ auto it = coloring.find(e);
+ if (it != coloring.end()) {
+ for (auto color_info : it->second) {
+ //TODO think carefully
+ path_colors[color_info.first] += edge_len; //color_info.second;
+ }
+ }
+ }
+ set<bin_id> answer;
+ for (auto color_info : path_colors) {
+ if (math::gr(double(color_info.second) / double(total_len), 0.3)) {
+ answer.insert(color_info.first);
+ }
+ }
+ return answer;
+ }
+
+public:
+
+ AnnotationFiller(const conj_graph_pack& gp,
+ const vector<bin_id>& interesting_bins) :
+ gp_(gp),
+ interesting_bins_(interesting_bins.begin(), interesting_bins.end()),
+ mapper_(MapperInstance(gp)) {
+ }
+
+ EdgeAnnotation operator() (io::SingleStream& contig_stream,
+ io::SingleStream& splits_stream,
+ AnnotationStream& splits_annotation_stream) {
+ INFO("Filling edge annotation");
+ INFO("Interesting bins " << interesting_bins_);
+
+ auto coloring = FillColorInfo(splits_stream, LoadAnnotation(splits_annotation_stream));
+ FilterSpuriousInfo(coloring);
+
+ EdgeAnnotation edge_annotation(gp_, interesting_bins_.empty() ? GatherAllBins(coloring) : interesting_bins_);
+
+ io::SingleRead contig;
+ while (!contig_stream.eof()) {
+ contig_stream >> contig;
+ auto path = mapper_->MapRead(contig).simple_path();
+ auto bins = DetermineBins(path, coloring);
+ for (EdgeId e : path) {
+ edge_annotation.StickAnnotation(e, bins);
+ }
+ }
+
+ INFO("Edge annotation filled. Annotated " << edge_annotation.size() << " edges.");
+ return edge_annotation;
+ }
+};
+}
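
AnnotationStream and AnnotationOutStream above fix the plain-text .ann format used throughout the pipeline: one record per line, a contig id, a ':' delimiter, then the bin ids it belongs to. A small Python sketch of a reader/writer for that format, handy for inspecting annotation files outside the C++ tools (helper names are ours), is:

    def read_ann(path):
        """Yield (contig_id, [bin_id, ...]) pairs from an .ann file."""
        with open(path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                # Mirror the VERIFY in AnnotationStream::Parse: second token must be ':'.
                assert len(fields) >= 2 and fields[1] == ":", "malformed line: " + line
                yield fields[0], fields[2:]

    def write_ann(path, annotations):
        """annotations: iterable of (contig_id, iterable of bin ids)."""
        with open(path, "w") as out:
            for contig, bins in annotations:
                out.write("{} : {}\n".format(contig, " ".join(bins)))

    # Round-trip on made-up ids:
    write_ann("example.ann", [("NODE_1", ["CAG1", "CAG7"]), ("NODE_2", ["CAG3"])])
    print(dict(read_ann("example.ann")))
    # {'NODE_1': ['CAG1', 'CAG7'], 'NODE_2': ['CAG3']}
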
diff --git a/src/projects/mts/config.yaml b/src/projects/mts/config.yaml
new file mode 100644
index 0000000..0150528
--- /dev/null
+++ b/src/projects/mts/config.yaml
@@ -0,0 +1,10 @@
+IN: "/Sid/snurk/mts/sim/data"
+SPADES: "~/Projects/mts/assembler/"
+QUAST: "python2 ~/opt/quast-3.2/metaquast.py"
+BIN: "~/Projects/mts/assembler/build/release/bin"
+SCRIPTS: "~/Projects/mts/assembler/src/projects/mts/scripts"
+SOFT: "/home/snurk/soft/"
+REF: "/Sid/snurk/mts/nielsen/ref.fasta"
+K: 55
+small_k: 21
+MIN_CONTIG_LENGTH: 2000
diff --git a/src/projects/mts/contig_abundance.cpp b/src/projects/mts/contig_abundance.cpp
new file mode 100644
index 0000000..ef00ce7
--- /dev/null
+++ b/src/projects/mts/contig_abundance.cpp
@@ -0,0 +1,176 @@
+#include "contig_abundance.hpp"
+#include "utils/indices/kmer_splitters.hpp"
+
+namespace debruijn_graph {
+
+size_t sample_cnt_ = 0;
+
+void SetSampleCount(size_t sample_cnt) {
+ sample_cnt_ = sample_cnt;
+}
+
+size_t SampleCount() {
+ return sample_cnt_;
+}
+
+MplVector SingleClusterAnalyzer::SampleMpls(const KmerProfiles& kmer_mpls, size_t sample) const {
+ MplVector answer;
+ answer.reserve(kmer_mpls.size());
+ for (const auto& kmer_mpl : kmer_mpls) {
+ answer.push_back(kmer_mpl[sample]);
+ }
+ return answer;
+}
+
+Mpl SingleClusterAnalyzer::SampleMedian(const KmerProfiles& kmer_mpls, size_t sample) const {
+ std::vector<Mpl> sample_mpls = SampleMpls(kmer_mpls, sample);
+
+ std::nth_element(sample_mpls.begin(), sample_mpls.begin() + sample_mpls.size()/2, sample_mpls.end());
+ return sample_mpls[sample_mpls.size()/2];
+}
+
+MplVector SingleClusterAnalyzer::MedianVector(const KmerProfiles& kmer_mpls) const {
+ VERIFY(kmer_mpls.size() != 0);
+ MplVector answer(SampleCount(), 0);
+ for (size_t i = 0; i < SampleCount(); ++i) {
+ answer[i] = SampleMedian(kmer_mpls, i);
+ }
+ return answer;
+}
+
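+//Coordinate-wise proximity check: sums |c[i] - v[i]| / c[i] over all samples (using 1 as the norm for zero
+//coordinates of the center) and requires the sum to stay below coord_vise_proximity_ times the number of
+//non-zero coordinates.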
+bool SingleClusterAnalyzer::AreClose(const KmerProfile& c, const KmerProfile& v) const {
+ //VERIFY(c.size() == v.size());
+ double sum = 0;
+ size_t non_zero_cnt = 0;
+ for (size_t i = 0; i < c.size(); ++i) {
+ double norm = 1.;
+ if (c[i] != 0) {
+ //norm = std::sqrt(double(c[i]));
+ norm = double(c[i]);
+ ++non_zero_cnt;
+ }
+ sum += std::abs(double(c[i]) - double(v[i])) / norm;
+ }
+ return math::ls(sum, coord_vise_proximity_ * double(non_zero_cnt));
+}
+
+KmerProfiles SingleClusterAnalyzer::CloseKmerMpls(const KmerProfiles& kmer_mpls, const KmerProfile& center) const {
+ KmerProfiles answer;
+ for (const auto& kmer_mpl : kmer_mpls) {
+ if (AreClose(center, kmer_mpl)) {
+ answer.push_back(kmer_mpl);
+ } else {
+ TRACE("Far kmer mpl " << PrintVector(kmer_mpl));
+ }
+ }
+ return answer;
+}
+
+boost::optional<AbundanceVector> SingleClusterAnalyzer::operator()(const KmerProfiles& kmer_mpls) const {
+ auto med = MedianVector(kmer_mpls);
+ return AbundanceVector(med.begin(), med.end());
+ //return boost::optional<AbundanceVector>(answer);
+ //MplVector center = MedianVector(kmer_mpls);
+ //auto locality = CloseKmerMpls(kmer_mpls, KmerProfile(center));
+
+ //for (size_t it_cnt = 0; it_cnt < MAX_IT; ++it_cnt) {
+ // DEBUG("Iteration " << it_cnt);
+ // DEBUG("Center is " << PrintVector(center));
+
+ // DEBUG("Locality size is " << locality.size()
+ // << " making " << (double(locality.size()) / double(kmer_mpls.size()))
+ // << " of total # points");
+
+ // double center_share = double(locality.size()) / double(kmer_mpls.size());
+ // if (math::ls(center_share, central_clust_share_)) {
+ // DEBUG("Detected central area contains too few k-mers: share " << center_share
+ // << " ; center size " << locality.size()
+ // << " ; total size " << kmer_mpls.size());
+ // return boost::none;
+ // }
+
+ // MplVector update = MedianVector(locality);
+ // DEBUG("Center update is " << PrintVector(update));
+
+ // if (center == update) {
+ // DEBUG("Old and new centers matched on iteration " << it_cnt);
+ // break;
+ // }
+
+ // center = update;
+ // locality = CloseKmerMpls(kmer_mpls, center);
+ //}
+
+ //return boost::optional<AbundanceVector>(MeanVector(locality, sample_cnt_));
+}
+
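+//Splits the sequence into maximal runs of valid nucleotides, dropping 'N's and any other non-nucleotide symbols.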
+vector<std::string> ContigAbundanceCounter::SplitOnNs(const std::string& seq) const {
+ vector<std::string> answer;
+ for (size_t i = 0; i < seq.size(); i++) {
+ size_t j = i;
+ while (j < seq.size() && is_nucl(seq[j])) {
+ j++;
+ }
+ if (j > i) {
+ answer.push_back(seq.substr(i, j - i));
+ i = j;
+ }
+ }
+ return answer;
+}
+
+void ContigAbundanceCounter::Init(const std::string& file_prefix) {
+ VERIFY(SampleCount() != 0);
+ INFO("Loading kmer index");
+ std::ifstream kmers_in(file_prefix + ".kmm", std::ios::binary);
+ kmer_mpl_.BinRead(kmers_in, file_prefix + ".kmm");
+
+ INFO("Loading kmer profiles data");
+ const size_t data_size = SampleCount() * kmer_mpl_.size();
+ mpl_data_.resize(data_size);
+ std::ifstream mpls_in(file_prefix + ".bpr", std::ios::binary);
+ mpls_in.read((char *)&mpl_data_[0], data_size * sizeof(Mpl));
+}
+
+boost::optional<AbundanceVector> ContigAbundanceCounter::operator()(
+ const std::string& s,
+ const std::string& /*name*/) const {
+ KmerProfiles kmer_mpls;
+
+ for (const auto& seq : SplitOnNs(s)) {
+ if (seq.size() < k_)
+ continue;
+
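+        //slide a k-mer window along the fragment: the initial back-shift by 'A' is undone by the first
+        //iteration (j = k_ - 1), so every k-mer of the fragment is queried against the multiplicity index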
+ auto kwh = kmer_mpl_.ConstructKWH(RtSeq(k_, seq));
+ kwh >>= 'A';
+
+ for (size_t j = k_ - 1; j < seq.size(); ++j) {
+ kwh <<= seq[j];
+ TRACE("Processing kmer " << kwh.key().str());
+ if (kmer_mpl_.valid(kwh)) {
+ TRACE("Valid");
+ KmerProfile prof(&mpl_data_[kmer_mpl_.get_value(kwh, inverter_)]);
+ kmer_mpls.push_back(prof);
+ //if (!name.empty()) {
+ // os << PrintVector(kmer_mpl_.get_value(kwh, inverter_), sample_cnt_) << std::endl;
+ //}
+ TRACE(PrintVector(prof));
+ } else {
+ TRACE("Invalid");
+ }
+ }
+ }
+
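+    //share of fragment k-mers that were found ("earmarked") in the multiplicity index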
+ double earmark_share = double(kmer_mpls.size()) / double(s.size() - k_ + 1);
+ DEBUG("Earmark k-mers: share " << earmark_share
+ << " # earmarks " << kmer_mpls.size()
+ << " ; total # " << (s.size() - k_ + 1));
+ if (math::ls(earmark_share, min_earmark_share_)) {
+ DEBUG("Too few earmarks");
+ return boost::none;
+ }
+
+ return cluster_analyzer_(kmer_mpls);
+}
+
+}
diff --git a/src/projects/mts/contig_abundance.hpp b/src/projects/mts/contig_abundance.hpp
new file mode 100644
index 0000000..fb5c9d7
--- /dev/null
+++ b/src/projects/mts/contig_abundance.hpp
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "pipeline/graph_pack.hpp"
+#include "utils/indices/perfect_hash_map_builder.hpp"
+
+namespace debruijn_graph {
+
+typedef uint16_t Mpl;
+typedef std::size_t Offset;
+static const Mpl INVALID_MPL = Mpl(-1);
+
+typedef typename std::vector<Mpl> MplVector;
+typedef typename std::vector<double> AbundanceVector;
+
+void SetSampleCount(size_t sample_cnt);
+size_t SampleCount();
+
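+//Non-owning view over SampleCount() multiplicity values stored in an external buffer
+//(e.g. the mpl_data_ storage of ContigAbundanceCounter below).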
+class KmerProfile {
+
+public:
+ typedef Mpl value_type;
+
+ KmerProfile(const value_type* ptr = nullptr):
+ ptr_(ptr) {
+ }
+
+ KmerProfile(const MplVector& vec):
+ ptr_(&vec.front()) {
+ }
+
+ size_t size() const {
+ return SampleCount();
+ }
+
+ Mpl operator[](size_t i) const {
+ VERIFY(i < size());
+ return ptr_[i];
+ }
+
+ const value_type* begin() const {
+ return ptr_;
+ }
+
+ const value_type* end() const {
+ return ptr_ + size();
+ }
+
+private:
+ const value_type* ptr_;
+};
+
+typedef std::vector<KmerProfile> KmerProfiles;
+
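+//Component-wise mean of a collection of per-sample coverage/multiplicity vectors.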
+template<class CovVecs>
+AbundanceVector MeanVector(const CovVecs& cov_vecs) {
+ VERIFY(cov_vecs.size() != 0);
+ size_t sample_cnt = cov_vecs.front().size();
+ AbundanceVector answer(sample_cnt, 0.);
+
+ for (const auto& cov_vec : cov_vecs) {
+ for (size_t i = 0; i < sample_cnt; ++i) {
+ answer[i] += double(cov_vec[i]);
+ }
+ }
+
+ for (size_t i = 0; i < sample_cnt; ++i) {
+ answer[i] /= double(cov_vecs.size());
+ }
+ return answer;
+}
+
+template<class AbVector>
+std::string PrintVector(const AbVector& mpl_vector) {
+ stringstream ss;
+ copy(mpl_vector.begin(), mpl_vector.end(),
+ ostream_iterator<typename AbVector::value_type>(ss, " "));
+ return ss.str();
+}
+
+class SingleClusterAnalyzer {
+ static const uint MAX_IT = 10;
+
+ double coord_vise_proximity_;
+ double central_clust_share_;
+
+ MplVector SampleMpls(const KmerProfiles& kmer_mpls, size_t sample) const;
+ Mpl SampleMedian(const KmerProfiles& kmer_mpls, size_t sample) const;
+ MplVector MedianVector(const KmerProfiles& kmer_mpls) const;
+ bool AreClose(const KmerProfile& c, const KmerProfile& v) const;
+ KmerProfiles CloseKmerMpls(const KmerProfiles& kmer_mpls, const KmerProfile& center) const;
+
+public:
+ SingleClusterAnalyzer(double coord_vise_proximity = 0.7,
+ double central_clust_share = 0.7) :
+ coord_vise_proximity_(coord_vise_proximity),
+ central_clust_share_(central_clust_share) {
+ }
+
+ boost::optional<AbundanceVector> operator()(const KmerProfiles& kmer_mpls) const;
+
+private:
+ DECL_LOGGER("SingleClusterAnalyzer");
+};
+
+class ContigAbundanceCounter {
+ typedef typename InvertableStoring::trivial_inverter<Offset> InverterT;
+
+ typedef KeyStoringMap<conj_graph_pack::seq_t,
+ Offset,
+ kmer_index_traits<conj_graph_pack::seq_t>,
+ InvertableStoring> IndexT;
+
+ unsigned k_;
+ SingleClusterAnalyzer cluster_analyzer_;
+ double min_earmark_share_;
+ IndexT kmer_mpl_;
+ InverterT inverter_;
+ std::vector<Mpl> mpl_data_;
+
+ void FillMplMap(const std::string& kmers_mpl_file);
+
+ vector<std::string> SplitOnNs(const std::string& seq) const;
+
+public:
+ ContigAbundanceCounter(unsigned k,
+ const SingleClusterAnalyzer& cluster_analyzer,
+ const std::string& work_dir,
+ double min_earmark_share = 0.7) :
+ k_(k),
+ cluster_analyzer_(cluster_analyzer),
+ min_earmark_share_(min_earmark_share),
+ kmer_mpl_(k_, work_dir) {
+ }
+
+ void Init(const std::string& kmer_mpl_file);
+
+ boost::optional<AbundanceVector> operator()(const std::string& s, const std::string& /*name*/ = "") const;
+
+private:
+ DECL_LOGGER("ContigAbundanceCounter");
+};
+
+}
diff --git a/src/projects/mts/contig_abundance_counter.cpp b/src/projects/mts/contig_abundance_counter.cpp
new file mode 100644
index 0000000..f2a2ba8
--- /dev/null
+++ b/src/projects/mts/contig_abundance_counter.cpp
@@ -0,0 +1,101 @@
+#include <array>
+#include <string>
+#include <iostream>
+#include "getopt_pp/getopt_pp.h"
+#include "io/reads/file_reader.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "pipeline/graphio.hpp"
+#include "logger.hpp"
+#include "formats.hpp"
+#include "contig_abundance.hpp"
+
+using namespace debruijn_graph;
+
+//Helper class to have scoped DEBUG()
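+//Splits every contig into fragments of split_length, estimates a per-sample abundance vector for each
+//sufficiently long fragment and writes fragment ids and abundance profiles to the corresponding output files.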
+class Runner {
+public:
+ static void Run(ContigAbundanceCounter& abundance_counter, size_t min_length_bound,
+ io::FileReadStream& contigs_stream, io::osequencestream& splits_os,
+ std::ofstream& id_out, std::ofstream& mpl_out) {
+ static const size_t split_length = 10000;
+ io::SingleRead full_contig;
+ while (!contigs_stream.eof()) {
+ contigs_stream >> full_contig;
+ DEBUG("Analyzing contig " << GetId(full_contig));
+
+ for (size_t i = 0; i < full_contig.size(); i += split_length) {
+ if (full_contig.size() - i < min_length_bound) {
+ DEBUG("Fragment shorter than min_length_bound " << min_length_bound);
+ break;
+ }
+
+ io::SingleRead contig = full_contig.Substr(i, std::min(i + split_length, full_contig.size()));
+ splits_os << contig;
+
+ contig_id id = GetId(contig);
+ DEBUG("Processing fragment # " << (i / split_length) << " with id " << id);
+
+ auto abundance_vec = abundance_counter(contig.GetSequenceString(), contig.name());
+
+ if (abundance_vec) {
+ stringstream ss;
+ copy(abundance_vec->begin(), abundance_vec->end(),
+ ostream_iterator<Mpl>(ss, " "));
+ DEBUG("Successfully estimated abundance of " << id << " : " << ss.str());
+
+ id_out << id << std::endl;
+ mpl_out << ss.str() << std::endl;
+ } else {
+ DEBUG("Failed to estimate abundance of " << id);
+ }
+ }
+ }
+ }
+private:
+ DECL_LOGGER("ContigAbundanceCounter");
+};
+
+int main(int argc, char** argv) {
+ using namespace GetOpt;
+
+ unsigned k;
+ size_t sample_cnt, min_length_bound;
+ std::string work_dir, contigs_path, splits_path;
+ std::string kmer_mult_fn, contigs_abundance_fn;
+
+ try {
+ GetOpt_pp ops(argc, argv);
+ ops.exceptions_all();
+ ops >> Option('k', k)
+ >> Option('w', work_dir)
+ >> Option('c', contigs_path)
+ >> Option('f', splits_path)
+ >> Option('n', sample_cnt)
+ >> Option('m', kmer_mult_fn)
+ >> Option('o', contigs_abundance_fn)
+ >> Option('l', min_length_bound, size_t(0));
+ } catch(GetOptEx &ex) {
+ std::cout << "Usage: contig_abundance_counter -k <K> -w <work_dir> -c <contigs path> "
+ "-n <sample cnt> -m <kmer multiplicities path> -f <splits_path> "
+ "-o <contigs abundance path> [-l <contig length bound> (default: 0)]" << std::endl;
+ exit(1);
+ }
+
+ //TmpFolderFixture fixture("tmp");
+ create_console_logger();
+
+ SetSampleCount(sample_cnt);
+ ContigAbundanceCounter abundance_counter(k, SingleClusterAnalyzer(), work_dir);
+ abundance_counter.Init(kmer_mult_fn);
+
+ io::FileReadStream contigs_stream(contigs_path);
+ io::osequencestream splits_os(splits_path);
+
+ std::ofstream id_out(contigs_abundance_fn + ".id");
+ std::ofstream mpl_out(contigs_abundance_fn + ".mpl");
+
+ Runner::Run(abundance_counter, min_length_bound,
+ contigs_stream, splits_os,
+ id_out, mpl_out);
+ return 0;
+}
diff --git a/src/projects/mts/formats.hpp b/src/projects/mts/formats.hpp
new file mode 100644
index 0000000..565e3b6
--- /dev/null
+++ b/src/projects/mts/formats.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "io/reads/single_read.hpp"
+
+namespace debruijn_graph {
+
+typedef std::string bin_id;
+typedef std::string contig_id;
+typedef std::vector<bin_id> Bins;
+typedef std::pair<contig_id, Bins> ContigAnnotation;
+
+inline contig_id GetId(const io::SingleRead& contig) {
+// std::string name = contig.name();
+// size_t pos = name.find("_ID_");
+// VERIFY(pos != std::string::npos);
+// size_t start = pos + 4;
+// VERIFY(start < name.size());
+// return name.substr(start, name.size() - start);
+ return contig.name();
+}
+
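+//Extracts the numeric id from SPAdes-style contig names of the form NODE_<id>_...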
+inline contig_id GetBaseId(const contig_id& id) {
+ size_t pos = id.find('_');
+ VERIFY(pos != string::npos && id.substr(0, pos) == "NODE");
+ size_t pos2 = id.find('_', pos + 1);
+ return id.substr(pos + 1, pos2 - pos - 1);
+}
+
+}
diff --git a/src/projects/mts/kmc_api/kmc_file.cpp b/src/projects/mts/kmc_api/kmc_file.cpp
new file mode 100644
index 0000000..c4c674c
--- /dev/null
+++ b/src/projects/mts/kmc_api/kmc_file.cpp
@@ -0,0 +1,1093 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+ Version: 2.2.0
+ Date : 2015-04-15
+*/
+
+#include "stdafx.h"
+#include "mmer.h"
+#include "kmc_file.h"
+#include <iostream>
+#include <tuple>
+
+
+uint64 CKMCFile::part_size = 1 << 25;
+
+
+// ----------------------------------------------------------------------------------
+// Open files *.kmc_pre & *.kmc_suf, read them to RAM, close files.
+// The file *.kmc_suf is opened for random access
+// IN : file_name - the name of kmer_counter's output
+// RET : true - if successful
+// ----------------------------------------------------------------------------------
+bool CKMCFile::OpenForRA(const std::string &file_name)
+{
+ uint64 size;
+ size_t result;
+
+ if (file_pre || file_suf)
+ return false;
+
+ if (!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP"))
+ return false;
+
+ ReadParamsFrom_prefix_file_buf(size);
+
+ fclose(file_pre);
+ file_pre = NULL;
+
+ if (!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS"))
+ return false;
+
+ sufix_file_buf = new uchar[size];
+ result = fread(sufix_file_buf, 1, size, file_suf);
+ if (result == 0)
+ return false;
+
+ fclose(file_suf);
+ file_suf = NULL;
+
+ is_opened = opened_for_RA;
+ prefix_index = 0;
+ sufix_number = 0;
+ return true;
+}
+
+//----------------------------------------------------------------------------------
+// Open files *.kmc_pre & *.kmc_suf, read *.kmc_pre to RAM, close *.kmc_pre
+// *.kmc_suf is buffered
+// IN : file_name - the name of kmer_counter's output
+// RET : true - if successful
+//----------------------------------------------------------------------------------
+bool CKMCFile::OpenForListing(const std::string &file_name)
+{
+ uint64 size;
+ size_t result;
+
+ if (is_opened)
+ return false;
+
+ if (file_pre || file_suf)
+ return false;
+
+ if (!OpenASingleFile(file_name + ".kmc_pre", file_pre, size, (char *)"KMCP"))
+ return false;
+
+ ReadParamsFrom_prefix_file_buf(size);
+ fclose(file_pre);
+ file_pre = NULL;
+
+ end_of_file = total_kmers == 0;
+
+ if (!OpenASingleFile(file_name + ".kmc_suf", file_suf, size, (char *)"KMCS"))
+ return false;
+
+ sufix_file_buf = new uchar[part_size];
+ result = fread(sufix_file_buf, 1, part_size, file_suf);
+ if (result == 0)
+ return false;
+
+ is_opened = opened_for_listing;
+ prefix_index = 0;
+ sufix_number = 0;
+ index_in_partial_buf = 0;
+ return true;
+}
+//----------------------------------------------------------------------------------
+CKMCFile::CKMCFile()
+{
+ file_pre = NULL;
+ file_suf = NULL;
+
+ prefix_file_buf = NULL;
+ sufix_file_buf = NULL;
+ signature_map = NULL;
+
+ is_opened = closed;
+ end_of_file = false;
+};
+//----------------------------------------------------------------------------------
+CKMCFile::~CKMCFile()
+{
+ if (file_pre)
+ fclose(file_pre);
+ if (file_suf)
+ fclose(file_suf);
+ if (prefix_file_buf)
+ delete[] prefix_file_buf;
+ if (sufix_file_buf)
+ delete[] sufix_file_buf;
+ if (signature_map)
+ delete[] signature_map;
+};
+//----------------------------------------------------------------------------------
+// Open a file, recognize its size and check its marker. Auxiliary function.
+// IN : file_name - the name of a file to open
+// RET : true - if successful
+//----------------------------------------------------------------------------------
+bool CKMCFile::OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[])
+{
+ char _marker[4];
+ size_t result;
+
+ if ((file_handler = my_fopen(file_name.c_str(), "rb")) == NULL)
+ return false;
+
+ my_fseek(file_handler, 0, SEEK_END);
+ size = my_ftell(file_handler); //the size of a whole file
+
+ my_fseek(file_handler, -4, SEEK_CUR);
+ result = fread(_marker, 1, 4, file_handler);
+ if (result == 0)
+ return false;
+
+ size = size - 4; //the size of the file without the terminal marker
+ if (strncmp(marker, _marker, 4) != 0)
+ {
+ fclose(file_handler);
+ file_handler = NULL;
+ return false;
+ }
+
+ rewind(file_handler);
+ result = fread(_marker, 1, 4, file_handler);
+ if (result == 0)
+ return false;
+
+ size = size - 4; //the size of the file without initial and terminal markers
+
+ if (strncmp(marker, _marker, 4) != 0)
+ {
+ fclose(file_handler);
+ file_handler = NULL;
+ return false;
+ }
+
+ return true;
+};
+//-------------------------------------------------------------------------------------
+// Recognize current parameters from the kmc database. Auxiliary function.
+// IN	: the size of the file *.kmc_pre, without initial and terminal markers
+// RET	: true - if successful
+//----------------------------------------------------------------------------------
+bool CKMCFile::ReadParamsFrom_prefix_file_buf(uint64 &size)
+{
+ size_t prev_pos = my_ftell(file_pre);
+ my_fseek(file_pre, -12, SEEK_END);
+ size_t result;
+
+ result = fread(&kmc_version, sizeof(uint32), 1, file_pre);
+	if (kmc_version != 0 && kmc_version != 0x200) //only these versions are supported, 0 = kmc1, 0x200 = kmc2
+ return false;
+ my_fseek(file_pre, prev_pos, SEEK_SET);
+
+ if (kmc_version == 0x200)
+ {
+ my_fseek(file_pre, -8, SEEK_END);
+
+ int64 header_offset;
+ header_offset = fgetc(file_pre);
+
+ size = size - 4; //file size without the size of header_offset (and without 2 markers)
+
+ my_fseek(file_pre, (0LL - (header_offset + 8)), SEEK_END);
+ result = fread(&kmer_length, 1, sizeof(uint32), file_pre);
+ result = fread(&mode, 1, sizeof(uint32), file_pre);
+ result = fread(&counter_size, 1, sizeof(uint32), file_pre);
+ result = fread(&lut_prefix_length, 1, sizeof(uint32), file_pre);
+ result = fread(&signature_len, 1, sizeof(uint32), file_pre);
+ result = fread(&min_count, 1, sizeof(uint32), file_pre);
+ original_min_count = min_count;
+ result = fread(&max_count, 1, sizeof(uint32), file_pre);
+ original_max_count = max_count;
+ result = fread(&total_kmers, 1, sizeof(uint64), file_pre);
+
+ signature_map_size = ((1 << (2 * signature_len)) + 1);
+ uint64 lut_area_size_in_bytes = size - (signature_map_size * sizeof(uint32)+header_offset + 8);
+ single_LUT_size = 1 << (2 * lut_prefix_length);
+ uint64 last_data_index = lut_area_size_in_bytes / sizeof(uint64);
+
+ rewind(file_pre);
+ my_fseek(file_pre, +4, SEEK_CUR);
+ prefix_file_buf_size = (lut_area_size_in_bytes + 8) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers)
+ prefix_file_buf = new uint64[prefix_file_buf_size];
+ result = fread(prefix_file_buf, 1, (size_t)(lut_area_size_in_bytes + 8), file_pre);
+ if (result == 0)
+ return false;
+ prefix_file_buf[last_data_index] = total_kmers + 1;
+
+ signature_map = new uint32[signature_map_size];
+ result = fread(signature_map, 1, signature_map_size * sizeof(uint32), file_pre);
+ if (result == 0)
+ return false;
+
+ sufix_size = (kmer_length - lut_prefix_length) / 4;
+
+ sufix_rec_size = sufix_size + counter_size;
+
+ return true;
+ }
+ else if (kmc_version == 0)
+ {
+ prefix_file_buf_size = (size - 4) / sizeof(uint64); //reads without 4 bytes of a header_offset (and without markers)
+ prefix_file_buf = new uint64[prefix_file_buf_size];
+ result = fread(prefix_file_buf, 1, (size_t)(size - 4), file_pre);
+ if (result == 0)
+ return false;
+
+ my_fseek(file_pre, -8, SEEK_END);
+
+ uint64 header_offset;
+ header_offset = fgetc(file_pre);
+
+ size = size - 4;
+
+ uint64 header_index = (size - header_offset) / sizeof(uint64);
+ uint64 last_data_index = header_index;
+
+ uint64 d = prefix_file_buf[header_index];
+
+ kmer_length = (uint32)d; //- kmer's length
+ mode = d >> 32; //- mode: 0 or 1
+
+ header_index++;
+ counter_size = (uint32)prefix_file_buf[header_index]; //- the size of a counter in bytes;
+ //- for mode 0 counter_size is 1, 2, 3, or 4
+ //- for mode = 1 counter_size is 4;
+		lut_prefix_length = prefix_file_buf[header_index] >> 32;	//- the number of prefix's symbols cut from kmers;
+ //- (kmer_length - lut_prefix_length) is divisible by 4
+
+ header_index++;
+ original_min_count = (uint32)prefix_file_buf[header_index]; //- the minimal number of kmer's appearances
+ min_count = original_min_count;
+ original_max_count = prefix_file_buf[header_index] >> 32; //- the maximal number of kmer's appearances
+ max_count = original_max_count;
+
+ header_index++;
+ total_kmers = prefix_file_buf[header_index]; //- the total number of kmers
+
+ prefix_file_buf[last_data_index] = total_kmers + 1;
+
+ sufix_size = (kmer_length - lut_prefix_length) / 4;
+
+ sufix_rec_size = sufix_size + counter_size;
+
+ return true;
+
+ }
+ return false;
+}
+
+//------------------------------------------------------------------------------------------
+// Check if kmer exists.
+// IN : kmer - kmer
+// OUT: count - kmer's counter if kmer exists
+// RET: true - if kmer exists
+//------------------------------------------------------------------------------------------
+bool CKMCFile::CheckKmer(CKmerAPI &kmer, float &count)
+{
+ uint32 int_counter;
+ if (CheckKmer(kmer, int_counter))
+ {
+ if (mode == 0)
+ count = (float)int_counter;
+ else
+ memcpy(&count, &int_counter, counter_size);
+ return true;
+ }
+ return false;
+}
+
+//------------------------------------------------------------------------------------------
+// Check if kmer exists.
+// IN : kmer - kmer
+// OUT: count - kmer's counter if kmer exists
+// RET: true - if kmer exists
+//------------------------------------------------------------------------------------------
+bool CKMCFile::CheckKmer(CKmerAPI &kmer, uint32 &count)
+{
+ if(is_opened != opened_for_RA)
+ return false;
+ if(end_of_file)
+ return false;
+
+ //recognize a prefix:
+ uint64 pattern_prefix_value = kmer.kmer_data[0];
+
+ uint32 pattern_offset = (sizeof(pattern_prefix_value)* 8) - (lut_prefix_length * 2) - (kmer.byte_alignment * 2);
+ int64 index_start = 0, index_stop = 0;
+
+ pattern_prefix_value = pattern_prefix_value >> pattern_offset; //complements with 0
+ if (pattern_prefix_value >= prefix_file_buf_size)
+ return false;
+
+ if (kmc_version == 0x200)
+ {
+ uint32 signature = kmer.get_signature(signature_len);
+ uint32 bin_start_pos = signature_map[signature];
+ bin_start_pos *= single_LUT_size;
+ //look into the array with data
+ index_start = *(prefix_file_buf + bin_start_pos + pattern_prefix_value);
+ index_stop = *(prefix_file_buf + bin_start_pos + pattern_prefix_value + 1) - 1;
+ }
+ else if (kmc_version == 0)
+ {
+ //look into the array with data
+ index_start = prefix_file_buf[pattern_prefix_value];
+ index_stop = prefix_file_buf[pattern_prefix_value + 1] - 1;
+ }
+
+ return BinarySearch(index_start, index_stop, kmer, count, pattern_offset);
+}
+
+//-----------------------------------------------------------------------------------------------
+// Check if end of file
+// RET: true - all kmers are listed
+//-----------------------------------------------------------------------------------------------
+bool CKMCFile::Eof(void)
+{
+ return end_of_file;
+}
+
+bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, float &count)
+{
+ uint32 int_counter;
+ if (ReadNextKmer(kmer, int_counter))
+ {
+ if (mode == 0)
+ count = (float)int_counter;
+ else
+ memcpy(&count, &int_counter, counter_size);
+ return true;
+ }
+ return false;
+
+}
+//-----------------------------------------------------------------------------------------------
+// Read next kmer
+// OUT: kmer - next kmer
+// OUT: count - kmer's counter
+// RET: true - if not EOF
+//-----------------------------------------------------------------------------------------------
+bool CKMCFile::ReadNextKmer(CKmerAPI &kmer, uint32 &count)
+{
+ uint64 prefix_mask = (1 << 2 * lut_prefix_length) - 1; //for kmc2 db
+
+ if(is_opened != opened_for_listing)
+ return false;
+ do
+ {
+ if(end_of_file)
+ return false;
+
+ if(sufix_number == prefix_file_buf[prefix_index + 1])
+ {
+ prefix_index++;
+
+ while (prefix_file_buf[prefix_index] == prefix_file_buf[prefix_index + 1])
+ prefix_index++;
+ }
+
+ uint32 off = (sizeof(prefix_index) * 8) - (lut_prefix_length * 2) - kmer.byte_alignment * 2;
+
+		uint64 temp_prefix = (prefix_index & prefix_mask) << off;	// shift prefix towards MSB. "& prefix_mask" necessary for kmc2 db format
+
+ kmer.kmer_data[0] = temp_prefix; // store prefix in an object CKmerAPI
+
+ for(uint32 i = 1; i < kmer.no_of_rows; i++)
+ kmer.kmer_data[i] = 0;
+
+ //read sufix:
+ uint32 row_index = 0;
+ uint64 suf = 0;
+
+ off = off - 8;
+
+ for(uint32 a = 0; a < sufix_size; a ++)
+ {
+ if(index_in_partial_buf == part_size)
+ Reload_sufix_file_buf();
+
+ suf = sufix_file_buf[index_in_partial_buf++];
+ suf = suf << off;
+ kmer.kmer_data[row_index] = kmer.kmer_data[row_index] | suf;
+
+ if (off == 0) //the end of a word in kmer_data
+ {
+ off = 56;
+ row_index++;
+ }
+ else
+ off -=8;
+ }
+
+ //read counter:
+ if(index_in_partial_buf == part_size)
+ Reload_sufix_file_buf();
+
+ count = sufix_file_buf[index_in_partial_buf++];
+
+ for(uint32 b = 1; b < counter_size; b++)
+ {
+ if(index_in_partial_buf == part_size)
+ Reload_sufix_file_buf();
+
+ uint32 aux = 0x000000ff & sufix_file_buf[index_in_partial_buf++];
+ aux = aux << 8 * ( b);
+ count = aux | count;
+ }
+
+ sufix_number++;
+
+ if(sufix_number == total_kmers)
+ end_of_file = true;
+
+ if (mode != 0)
+ {
+ float float_counter;
+ memcpy(&float_counter, &count, counter_size);
+ if ((float_counter < min_count) || (float_counter > max_count))
+ continue;
+ else
+ break;
+ }
+
+ }
+ while((count < min_count) || (count > max_count));
+
+ return true;
+}
+//-------------------------------------------------------------------------------
+// Reload the contents of the array "sufix_file_buf" for listing mode. Auxiliary function.
+//-------------------------------------------------------------------------------
+void CKMCFile::Reload_sufix_file_buf()
+{
+ fread (sufix_file_buf, 1, (size_t) part_size, file_suf);
+ index_in_partial_buf = 0;
+};
+//-------------------------------------------------------------------------------
+// Release memory and close files in case they were opened
+// RET: true - if files have been read
+//-------------------------------------------------------------------------------
+bool CKMCFile::Close()
+{
+ if(is_opened)
+ {
+ if(file_pre)
+ {
+ fclose(file_pre);
+ file_pre = NULL;
+ }
+ if(file_suf)
+ {
+ fclose(file_suf);
+ file_suf = NULL;
+ }
+
+ is_opened = closed;
+ end_of_file = false;
+ delete [] prefix_file_buf;
+ prefix_file_buf = NULL;
+ delete [] sufix_file_buf;
+ sufix_file_buf = NULL;
+ delete[] signature_map;
+ signature_map = NULL;
+
+ return true;
+ }
+ else
+ return false;
+};
+//----------------------------------------------------------------------------------
+// Set initial values to enable listing kmers from the beginning. Only in listing mode
+// RET: true - if a file has been opened for listing
+//----------------------------------------------------------------------------------
+bool CKMCFile::RestartListing(void)
+{
+ if(is_opened == opened_for_listing)
+ {
+
+ my_fseek ( file_suf , 4 , SEEK_SET );
+ fread (sufix_file_buf, 1, (size_t) part_size, file_suf);
+ prefix_index = 0;
+ sufix_number = 0;
+ index_in_partial_buf = 0;
+
+ end_of_file = total_kmers == 0;
+
+ return true;
+ }
+ return false;
+
+};
+//----------------------------------------------------------------------------------------
+// Set the minimal value for a counter. Kmers with counters below this threshold are ignored
+// IN : x - minimal value for a counter
+// RET : true - if successful
+//----------------------------------------------------------------------------------------
+bool CKMCFile::SetMinCount(uint32 x)
+{
+ if((original_min_count <= x) && (x < max_count))
+ {
+ min_count = x;
+ return true;
+ }
+ else
+ return false;
+}
+
+//----------------------------------------------------------------------------------------
+// Return a value of min_count. Kmers with counters below this threshold are ignored
+// RET : a value of min_count
+//----------------------------------------------------------------------------------------
+uint32 CKMCFile::GetMinCount(void)
+{
+ return min_count;
+};
+
+//----------------------------------------------------------------------------------------
+// Set the maximal value for a counter. Kmers with counters above this threshold are ignored
+// IN : x - maximal value for a counter
+// RET : true - if successful
+//----------------------------------------------------------------------------------------
+bool CKMCFile::SetMaxCount(uint32 x)
+{
+ if((original_max_count >= x) && (x > min_count))
+ {
+ max_count = x;
+ return true;
+ }
+ else
+ return false;
+}
+
+
+//----------------------------------------------------------------------------------------
+// Return a value of max_count. Kmers with counters above this threshold are ignored
+// RET : a value of max_count
+//----------------------------------------------------------------------------------------
+uint32 CKMCFile::GetMaxCount(void)
+{
+ return max_count;
+}
+
+//----------------------------------------------------------------------------------------
+// Set original (read from *.kmc_pre) values for min_count and max_count
+//----------------------------------------------------------------------------------------
+void CKMCFile::ResetMinMaxCounts(void)
+{
+ min_count = original_min_count;
+ max_count = original_max_count;
+}
+
+//----------------------------------------------------------------------------------------
+// Return the length of kmers
+// RET : the length of kmers
+//----------------------------------------------------------------------------------------
+uint32 CKMCFile::KmerLength(void)
+{
+ return kmer_length;
+}
+
+//----------------------------------------------------------------------------------------
+// Check if kmer exists
+// IN : kmer - kmer
+// RET : true if kmer exists
+//----------------------------------------------------------------------------------------
+bool CKMCFile::IsKmer(CKmerAPI &kmer)
+{
+ uint32 _count;
+ if(CheckKmer(kmer, _count))
+ return true;
+ else
+ return false;
+}
+
+//-----------------------------------------------------------------------------------------
+// Check the total number of kmers between current min_count and max_count
+// RET : total number of kmers or 0 if a database has not been opened
+//-----------------------------------------------------------------------------------------
+uint64 CKMCFile::KmerCount(void)
+{
+ if(is_opened)
+ if((min_count == original_min_count) && (max_count == original_max_count))
+ return total_kmers;
+ else
+ {
+ uint32 count;
+ uint32 int_counter;
+ uint64 aux_kmerCount = 0;
+
+ if(is_opened == opened_for_RA)
+ {
+ uchar *ptr = sufix_file_buf;
+
+ for(uint64 i = 0; i < total_kmers; i++)
+ {
+ ptr += sufix_size;
+ int_counter = *ptr;
+ ptr++;
+
+ for(uint32 b = 1; b < counter_size; b ++)
+ {
+ uint32 aux = 0x000000ff & *(ptr);
+ aux = aux << 8 * ( b);
+ int_counter = aux | int_counter;
+ ptr++;
+ }
+
+ if(mode == 0)
+ count = int_counter;
+ else
+ memcpy(&count, &int_counter, counter_size);
+
+ if((count >= min_count) && (count <= max_count))
+ aux_kmerCount++;
+ }
+ }
+ else //opened_for_listing
+ {
+ CKmerAPI kmer(kmer_length);
+ float count;
+ RestartListing();
+ for(uint64 i = 0; i < total_kmers; i++)
+ {
+ ReadNextKmer(kmer, count);
+ if((count >= min_count) && (count <= max_count))
+ aux_kmerCount++;
+ }
+ RestartListing();
+ }
+ return aux_kmerCount;
+ }
+ else
+ return 0 ;
+}
+//---------------------------------------------------------------------------------
+// Get current parameters from kmer_database
+// OUT : _kmer_length - the length of kmers
+// _mode - mode
+// _counter_size - the size of a counter in bytes
+// _lut_prefix_length - the number of prefix's symbols cut from kmers
+// _min_count - the minimal number of kmer's appearances
+// _max_count - the maximal number of kmer's appearances
+// _total_kmers - the total number of kmers
+// RET : true if kmer_database has been opened
+//---------------------------------------------------------------------------------
+bool CKMCFile::Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint32 &_max_count, uint64 &_total_kmers)
+{
+ if(is_opened)
+ {
+ _kmer_length = kmer_length;
+ _mode = mode;
+ _counter_size = counter_size;
+ _lut_prefix_length = lut_prefix_length;
+ if (kmc_version == 0x200)
+ _signature_len = signature_len;
+ else
+ _signature_len = 0; //for kmc1 there is no signature_len
+ _min_count = min_count;
+ _max_count = max_count;
+ _total_kmers = total_kmers;
+ return true;
+ }
+ return false;
+};
+
+
+//---------------------------------------------------------------------------------
+// Get counters from read
+// OUT	: counters - vector of counters for each k-mer in the read (of size read_len - kmer_len + 1); if a k-mer is invalid (i.e. contains 'N'), its counter is 0
+// IN	: read - the read sequence
+// RET	: true - if successful
+//---------------------------------------------------------------------------------
+bool CKMCFile::GetCountersForRead(const std::string& read, std::vector<uint32>& counters)
+{
+ if (is_opened != opened_for_RA)
+ return false;
+ if (kmc_version == 0x200)
+ return GetCountersForRead_kmc2(read, counters);
+ else if (kmc_version == 0)
+ return GetCountersForRead_kmc1(read, counters);
+ else
+		return false; //should never be here
+}
+
+//---------------------------------------------------------------------------------
+// Get counters from read
+// OUT	: counters - vector of counters for each k-mer in the read (of size read_len - kmer_len + 1); if a k-mer is invalid (i.e. contains 'N'), its counter is 0
+// IN	: read - the read sequence
+// RET	: true - if successful
+//---------------------------------------------------------------------------------
+bool CKMCFile::GetCountersForRead(const std::string& read, std::vector<float>& counters)
+{
+ if (is_opened != opened_for_RA)
+ return false;
+ std::vector<uint32> uint32_v;
+ if (GetCountersForRead(read, uint32_v))
+ {
+ counters.clear();
+ counters.resize(uint32_v.size());
+ if (mode == 0)
+ {
+ for (uint32 i = 0; i < uint32_v.size(); ++i)
+ counters[i] = static_cast<float>(uint32_v[i]);
+ }
+ else
+ {
+ for (uint32 i = 0; i < uint32_v.size(); ++i)
+ memcpy(&counters[i], &uint32_v[i], counter_size);
+ }
+
+ return true;
+ }
+ return false;
+}
+
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
+uint32 CKMCFile::count_for_kmer_kmc1(CKmerAPI& kmer)
+{
+ //recognize a prefix:
+
+ uint64 pattern_prefix_value = kmer.kmer_data[0];
+
+ uint32 pattern_offset = (sizeof(pattern_prefix_value)* 8) - (lut_prefix_length * 2) - (kmer.byte_alignment * 2);
+
+ pattern_prefix_value = pattern_prefix_value >> pattern_offset; //complements with 0
+ if (pattern_prefix_value >= prefix_file_buf_size)
+ return false;
+ //look into the array with data
+
+ int64 index_start = prefix_file_buf[pattern_prefix_value];
+ int64 index_stop = prefix_file_buf[pattern_prefix_value + 1] - 1;
+
+ uint32 counter = 0;
+ if (BinarySearch(index_start, index_stop, kmer, counter, pattern_offset))
+ return counter;
+ return 0;
+}
+
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
+uint32 CKMCFile::count_for_kmer_kmc2(CKmerAPI& kmer, uint32 bin_start_pos)
+{
+ //recognize a prefix:
+ uint64 pattern_prefix_value = kmer.kmer_data[0];
+
+ uint32 pattern_offset = (sizeof(pattern_prefix_value)* 8) - (lut_prefix_length * 2) - (kmer.byte_alignment * 2);
+
+ pattern_prefix_value = pattern_prefix_value >> pattern_offset; //complements with 0
+ if (pattern_prefix_value >= prefix_file_buf_size)
+ return false;
+ //look into the array with data
+
+ int64 index_start = *(prefix_file_buf + bin_start_pos + pattern_prefix_value);
+ int64 index_stop = *(prefix_file_buf + bin_start_pos + pattern_prefix_value + 1) - 1;
+
+ uint32 counter = 0;
+ if (BinarySearch(index_start, index_stop, kmer, counter, pattern_offset))
+ return counter;
+ return 0;
+}
+
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
+bool CKMCFile::GetCountersForRead_kmc1(const std::string& read, std::vector<uint32>& counters)
+{
+ uint32 read_len = static_cast<uint32>(read.length());
+ counters.resize(read.length() - kmer_length + 1);
+ std::string transformed_read = read;
+ for (char& c : transformed_read)
+ c = CKmerAPI::num_codes[(uchar)c];
+
+ uint32 i = 0;
+ CKmerAPI kmer(kmer_length);
+ uint32 pos = 0;
+
+ uint32 counters_pos = 0;
+
+ while (i + kmer_length - 1 < read_len)
+ {
+ bool contains_N = false;
+ while (i < read_len && pos < kmer_length)
+ {
+ if (CKmerAPI::num_codes[(uchar)read[i]] < 0)
+ {
+ pos = 0;
+ kmer.clear();
+ ++i;
+ uint32 wrong_kmers = MIN(i - counters_pos, static_cast<uint32>(counters.size()) - counters_pos);
+ fill_n(counters.begin() + counters_pos, wrong_kmers, 0);
+ counters_pos += wrong_kmers;
+ contains_N = true;
+ break;
+ }
+ else
+ kmer.insert2bits(pos++, CKmerAPI::num_codes[(uchar)read[i++]]);
+ }
+ if (contains_N)
+ continue;
+ if (pos == kmer_length)
+ {
+ counters[counters_pos++] = count_for_kmer_kmc1(kmer);
+ }
+ else
+ break;
+
+ while (i < read_len)
+ {
+ if (CKmerAPI::num_codes[(uchar)read[i]] < 0)
+ {
+ pos = 0;
+ break;
+ }
+ kmer.SHL_insert2bits(CKmerAPI::num_codes[(uchar)read[i++]]);
+ counters[counters_pos++] = count_for_kmer_kmc1(kmer);
+ }
+ }
+ if (counters_pos < counters.size())
+ {
+ fill_n(counters.begin() + counters_pos, counters.size() - counters_pos, 0);
+ counters_pos = static_cast<uint32>(counters.size());
+ }
+ return true;
+}
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
+bool CKMCFile::GetCountersForRead_kmc2(const std::string& read, std::vector<uint32>& counters)
+{
+	counters.resize(read.length() - kmer_length + 1);
+ std::string transformed_read = read;
+ for (char& c : transformed_read)
+ c = CKmerAPI::num_codes[(uchar)c];
+ uint32 i = 0;
+ uint32 len = 0; //length of super k-mer
+ uint32 signature_start_pos;
+ CMmer current_signature(signature_len), end_mmer(signature_len);
+
+ using super_kmers_t = std::vector<std::tuple<uint32, uint32, uint32>>;//start_pos, len, bin_no,
+ super_kmers_t super_kmers;
+
+ while (i + kmer_length - 1 < read.length())
+ {
+ bool contains_N = false;
+ //building first signature after 'N' or at the read beginning
+ for (uint32 j = 0; j < signature_len; ++j, ++i)
+ {
+ if (transformed_read[i] < 0)//'N'
+ {
+ contains_N = true;
+ break;
+ }
+ }
+		//the signature must be shorter than the k-mer, so if the signature contains 'N', the k-mer will also contain it
+ if (contains_N)
+ {
+ ++i;
+ continue;
+ }
+ len = signature_len;
+ signature_start_pos = i - signature_len;
+ current_signature.insert(transformed_read.c_str() + signature_start_pos);
+ end_mmer.set(current_signature);
+
+ for (; i < transformed_read.length(); ++i)
+ {
+ if (transformed_read[i] < 0)//'N'
+ {
+ if (len >= kmer_length)
+ {
+ super_kmers.push_back(std::make_tuple(i - len, len, signature_map[current_signature.get()]));
+ }
+ len = 0;
+ ++i;
+ break;
+ }
+ end_mmer.insert(transformed_read[i]);
+ if (end_mmer < current_signature)//signature at the end of current k-mer is lower than current
+ {
+ if (len >= kmer_length)
+ {
+ super_kmers.push_back(std::make_tuple(i - len, len, signature_map[current_signature.get()]));
+ len = kmer_length - 1;
+ }
+ current_signature.set(end_mmer);
+ signature_start_pos = i - signature_len + 1;
+ }
+ else if (end_mmer == current_signature)
+ {
+ current_signature.set(end_mmer);
+ signature_start_pos = i - signature_len + 1;
+ }
+ else if (signature_start_pos + kmer_length - 1 < i)//need to find new signature
+ {
+ super_kmers.push_back(std::make_tuple(i - len, len, signature_map[current_signature.get()]));
+ len = kmer_length - 1;
+ //looking for new signature
+ ++signature_start_pos;
+ //building first signature in current k-mer
+ end_mmer.insert(transformed_read.c_str() + signature_start_pos);
+ current_signature.set(end_mmer);
+ for (uint32 j = signature_start_pos + signature_len; j <= i; ++j)
+ {
+ end_mmer.insert(transformed_read[j]);
+ if (end_mmer <= current_signature)
+ {
+ current_signature.set(end_mmer);
+ signature_start_pos = j - signature_len + 1;
+ }
+ }
+ }
+ ++len;
+ }
+ }
+ if (len >= kmer_length)//last one in read
+ {
+ super_kmers.push_back(std::make_tuple(i - len, len, signature_map[current_signature.get()]));
+ }
+
+ uint32 counters_pos = 0;
+ if (super_kmers.empty())
+ {
+ fill_n(counters.begin(), counters.size(), 0);
+ return true;
+ }
+
+ CKmerAPI kmer(kmer_length);
+
+ uint32 last_end = 0;
+
+ //'N' somewhere in first k-mer
+ if (std::get<0>(super_kmers.front()) > 0)
+ {
+ fill_n(counters.begin(), std::get<0>(super_kmers.front()), 0);
+ last_end = std::get<0>(super_kmers.front());
+ counters_pos = std::get<0>(super_kmers.front());
+ }
+ for (auto& super_kmer : super_kmers)
+ {
+ //'N's between super k-mers
+ if (last_end < std::get<0>(super_kmer))
+ {
+ uint32 gap = std::get<0>(super_kmer) -last_end;
+ fill_n(counters.begin() + counters_pos, kmer_length + gap - 1, 0);
+ counters_pos += kmer_length + gap - 1;
+ }
+ last_end = std::get<0>(super_kmer) + std::get<1>(super_kmer);
+
+ kmer.clear();
+ kmer.from_binary(transformed_read.c_str() + std::get<0>(super_kmer));
+
+ uint32 bin_start_pos = std::get<2>(super_kmer) * single_LUT_size;
+ counters[counters_pos++] = count_for_kmer_kmc2(kmer, bin_start_pos);
+
+ for (uint32 i = std::get<0>(super_kmer) +kmer_length; i < std::get<0>(super_kmer) +std::get<1>(super_kmer); ++i)
+ {
+ kmer.SHL_insert2bits(transformed_read[i]);
+ counters[counters_pos++] = count_for_kmer_kmc2(kmer, bin_start_pos);
+ }
+ }
+ //'N's at the end of read
+ if (counters_pos < counters.size())
+ {
+ fill_n(counters.begin() + counters_pos, counters.size() - counters_pos, 0);
+ counters_pos = static_cast<uint32>(counters.size());
+ }
+
+ return true;
+}
+
+
+//---------------------------------------------------------------------------------
+// Auxiliary function.
+//---------------------------------------------------------------------------------
+bool CKMCFile::BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint32& counter, uint32 pattern_offset)
+{
+ uchar *sufix_byte_ptr = nullptr;
+ uint64 sufix = 0;
+
+ //sufix_offset is always 56
+	uint32 sufix_offset = 56;			// the offset of a sufix is for shifting the sufix towards MSB, to compare the sufix with a pattern
+ // Bytes of a pattern to search are always shifted towards MSB
+
+ uint32 row_index = 0; // the number of a current row in an array kmer_data
+
+ bool found = false;
+
+ while (index_start <= index_stop)
+ {
+ int64 mid_index = (index_start + index_stop) / 2;
+ sufix_byte_ptr = &sufix_file_buf[mid_index * sufix_rec_size];
+
+ uint64 pattern = 0;
+
+ pattern_offset = (lut_prefix_length + kmer.byte_alignment) * 2;
+
+ row_index = 0;
+ for (uint32 a = 0; a < sufix_size; a++) //check byte by byte
+ {
+ pattern = kmer.kmer_data[row_index];
+ pattern = pattern << pattern_offset;
+ pattern = pattern & 0xff00000000000000;
+
+ sufix = sufix_byte_ptr[a];
+ sufix = sufix << sufix_offset;
+
+ if (pattern != sufix)
+ break;
+
+ pattern_offset += 8;
+
+ if (pattern_offset == 64) //the end of a word
+ {
+ pattern_offset = 0;
+ row_index++;
+ }
+ }
+
+ if (pattern == sufix)
+ {
+ found = true;
+ break;
+ }
+ if (sufix < pattern)
+ index_start = mid_index + 1;
+ else
+ index_stop = mid_index - 1;
+ }
+
+ if (found)
+ {
+ sufix_byte_ptr += sufix_size;
+
+ counter = *sufix_byte_ptr;
+
+ for (uint32 b = 1; b < counter_size; b++)
+ {
+ uint32 aux = 0x000000ff & *(sufix_byte_ptr + b);
+
+ aux = aux << 8 * (b);
+ counter = aux | counter;
+ }
+ if (mode != 0)
+ {
+ float float_counter;
+ memcpy(&float_counter, &counter, counter_size);
+ return (float_counter >= min_count) && (float_counter <= max_count);
+ }
+ return (counter >= min_count) && (counter <= max_count);
+ }
+ return false;
+}
+
+
+// ***** EOF
diff --git a/src/projects/mts/kmc_api/kmc_file.h b/src/projects/mts/kmc_api/kmc_file.h
new file mode 100644
index 0000000..73676f9
--- /dev/null
+++ b/src/projects/mts/kmc_api/kmc_file.h
@@ -0,0 +1,141 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+ Version: 2.2.0
+ Date : 2015-04-15
+*/
+
+#ifndef _KMC_FILE_H
+#define _KMC_FILE_H
+
+#include "kmer_defs.h"
+#include "kmer_api.h"
+#include <string>
+#include <vector>
+
+class CKMCFile
+{
+ enum open_mode {closed, opened_for_RA, opened_for_listing};
+ open_mode is_opened;
+
+ bool end_of_file;
+
+ FILE *file_pre;
+ FILE *file_suf;
+
+ uint64* prefix_file_buf;
+ uint64 prefix_file_buf_size;
+	uint64 prefix_index;			// The current prefix's index in the array "prefix_file_buf", read from *.kmc_pre
+ uint32 single_LUT_size; // The size of a single LUT (in no. of elements)
+
+ uint32* signature_map;
+ uint32 signature_map_size;
+
+ uchar* sufix_file_buf;
+ uint64 sufix_number; // The sufix's number to be listed
+ uint64 index_in_partial_buf; // The current byte's number in an array "sufix_file_buf", for listing mode
+
+ uint32 kmer_length;
+ uint32 mode;
+ uint32 counter_size;
+ uint32 lut_prefix_length;
+ uint32 signature_len;
+ uint32 min_count;
+ uint32 max_count;
+ uint64 total_kmers;
+
+ uint32 kmc_version;
+ uint32 sufix_size; // sufix's size in bytes
+ uint32 sufix_rec_size; // sufix_size + counter_size
+
+ uint32 original_min_count;
+ uint32 original_max_count;
+
+	static uint64 part_size;		// the size of a block read into sufix_file_buf, in listing mode
+
+ bool BinarySearch(int64 index_start, int64 index_stop, const CKmerAPI& kmer, uint32& counter, uint32 pattern_offset);
+
+ // Open a file, recognize its size and check its marker. Auxiliary function.
+ bool OpenASingleFile(const std::string &file_name, FILE *&file_handler, uint64 &size, char marker[]);
+
+ // Recognize current parameters. Auxiliary function.
+ bool ReadParamsFrom_prefix_file_buf(uint64 &size);
+
+	// Reload the contents of the array "sufix_file_buf" for listing mode. Auxiliary function.
+ void Reload_sufix_file_buf();
+
+ // Implementation of GetCountersForRead for kmc1 database format
+ bool GetCountersForRead_kmc1(const std::string& read, std::vector<uint32>& counters);
+
+ // Implementation of GetCountersForRead for kmc2 database format
+ bool GetCountersForRead_kmc2(const std::string& read, std::vector<uint32>& counters);
+public:
+
+ CKMCFile();
+ ~CKMCFile();
+
+ // Open files *.kmc_pre & *.kmc_suf, read them to RAM, close files. *.kmc_suf is opened for random access
+ bool OpenForRA(const std::string &file_name);
+
+	// Open files *.kmc_pre & *.kmc_suf, read *.kmc_pre to RAM, *.kmc_suf is buffered
+ bool OpenForListing(const std::string& file_name);
+
+ // Return next kmer in CKmerAPI &kmer. Return its counter in float &count. Return true if not EOF
+ bool ReadNextKmer(CKmerAPI &kmer, float &count);
+
+ bool ReadNextKmer(CKmerAPI &kmer, uint32 &count);
+ // Release memory and close files in case they were opened
+ bool Close();
+
+	// Set the minimal value for a counter. Kmers with counters below this threshold are ignored
+	bool SetMinCount(uint32 x);
+
+	// Return a value of min_count. Kmers with counters below this threshold are ignored
+	uint32 GetMinCount(void);
+
+	// Set the maximal value for a counter. Kmers with counters above this threshold are ignored
+	bool SetMaxCount(uint32 x);
+
+	// Return a value of max_count. Kmers with counters above this threshold are ignored
+ uint32 GetMaxCount(void);
+
+ // Return the total number of kmers between min_count and max_count
+ uint64 KmerCount(void);
+
+ // Return the length of kmers
+ uint32 KmerLength(void);
+
+	// Set initial values to enable listing kmers from the beginning. Only in listing mode
+ bool RestartListing(void);
+
+ // Return true if all kmers are listed
+ bool Eof(void);
+
+ // Return true if kmer exists. In this case return kmer's counter in count
+ bool CheckKmer(CKmerAPI &kmer, float &count);
+
+ bool CheckKmer(CKmerAPI &kmer, uint32 &count);
+
+ // Return true if kmer exists
+ bool IsKmer(CKmerAPI &kmer);
+
+	// Set original (read from *.kmc_pre) values for min_count and max_count
+ void ResetMinMaxCounts(void);
+
+ // Get current parameters from kmer_database
+ bool Info(uint32 &_kmer_length, uint32 &_mode, uint32 &_counter_size, uint32 &_lut_prefix_length, uint32 &_signature_len, uint32 &_min_count, uint32 &_max_count, uint64 &_total_kmers);
+
+ // Get counters for all k-mers in read
+ bool GetCountersForRead(const std::string& read, std::vector<uint32>& counters);
+ bool GetCountersForRead(const std::string& read, std::vector<float>& counters);
+ private:
+ uint32 count_for_kmer_kmc1(CKmerAPI& kmer);
+ uint32 count_for_kmer_kmc2(CKmerAPI& kmer, uint32 bin_start_pos);
+};
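+
+// Minimal usage sketch (illustrative only, based on the declarations above):
+//   CKMCFile db;
+//   if (db.OpenForListing("kmers_db")) {       // opens kmers_db.kmc_pre and kmers_db.kmc_suf
+//       CKmerAPI kmer(db.KmerLength());
+//       uint32 count;
+//       while (db.ReadNextKmer(kmer, count))
+//           ;                                   // process kmer and count here
+//       db.Close();
+//   }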
+
+#endif
+
+// ***** EOF
diff --git a/src/projects/mts/kmc_api/kmer_api.cpp b/src/projects/mts/kmc_api/kmer_api.cpp
new file mode 100644
index 0000000..befd9fe
--- /dev/null
+++ b/src/projects/mts/kmc_api/kmer_api.cpp
@@ -0,0 +1,48 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz
+
+ Version: 2.2.0
+ Date : 2015-04-15
+*/
+
+
+#include "stdafx.h"
+#include "kmer_api.h"
+#include <vector>
+#include <math.h>
+
+using namespace std;
+
+const char CKmerAPI::char_codes[] = {'A','C', 'G', 'T'};
+char CKmerAPI::num_codes[];
+CKmerAPI::_si CKmerAPI::_init;
+uchar CKmerAPI::rev_comp_bytes_LUT[] = {
+ 0xff, 0xbf, 0x7f, 0x3f, 0xef, 0xaf, 0x6f, 0x2f, 0xdf, 0x9f, 0x5f, 0x1f, 0xcf, 0x8f, 0x4f, 0x0f,
+ 0xfb, 0xbb, 0x7b, 0x3b, 0xeb, 0xab, 0x6b, 0x2b, 0xdb, 0x9b, 0x5b, 0x1b, 0xcb, 0x8b, 0x4b, 0x0b,
+ 0xf7, 0xb7, 0x77, 0x37, 0xe7, 0xa7, 0x67, 0x27, 0xd7, 0x97, 0x57, 0x17, 0xc7, 0x87, 0x47, 0x07,
+ 0xf3, 0xb3, 0x73, 0x33, 0xe3, 0xa3, 0x63, 0x23, 0xd3, 0x93, 0x53, 0x13, 0xc3, 0x83, 0x43, 0x03,
+ 0xfe, 0xbe, 0x7e, 0x3e, 0xee, 0xae, 0x6e, 0x2e, 0xde, 0x9e, 0x5e, 0x1e, 0xce, 0x8e, 0x4e, 0x0e,
+ 0xfa, 0xba, 0x7a, 0x3a, 0xea, 0xaa, 0x6a, 0x2a, 0xda, 0x9a, 0x5a, 0x1a, 0xca, 0x8a, 0x4a, 0x0a,
+ 0xf6, 0xb6, 0x76, 0x36, 0xe6, 0xa6, 0x66, 0x26, 0xd6, 0x96, 0x56, 0x16, 0xc6, 0x86, 0x46, 0x06,
+ 0xf2, 0xb2, 0x72, 0x32, 0xe2, 0xa2, 0x62, 0x22, 0xd2, 0x92, 0x52, 0x12, 0xc2, 0x82, 0x42, 0x02,
+ 0xfd, 0xbd, 0x7d, 0x3d, 0xed, 0xad, 0x6d, 0x2d, 0xdd, 0x9d, 0x5d, 0x1d, 0xcd, 0x8d, 0x4d, 0x0d,
+ 0xf9, 0xb9, 0x79, 0x39, 0xe9, 0xa9, 0x69, 0x29, 0xd9, 0x99, 0x59, 0x19, 0xc9, 0x89, 0x49, 0x09,
+ 0xf5, 0xb5, 0x75, 0x35, 0xe5, 0xa5, 0x65, 0x25, 0xd5, 0x95, 0x55, 0x15, 0xc5, 0x85, 0x45, 0x05,
+ 0xf1, 0xb1, 0x71, 0x31, 0xe1, 0xa1, 0x61, 0x21, 0xd1, 0x91, 0x51, 0x11, 0xc1, 0x81, 0x41, 0x01,
+ 0xfc, 0xbc, 0x7c, 0x3c, 0xec, 0xac, 0x6c, 0x2c, 0xdc, 0x9c, 0x5c, 0x1c, 0xcc, 0x8c, 0x4c, 0x0c,
+ 0xf8, 0xb8, 0x78, 0x38, 0xe8, 0xa8, 0x68, 0x28, 0xd8, 0x98, 0x58, 0x18, 0xc8, 0x88, 0x48, 0x08,
+ 0xf4, 0xb4, 0x74, 0x34, 0xe4, 0xa4, 0x64, 0x24, 0xd4, 0x94, 0x54, 0x14, 0xc4, 0x84, 0x44, 0x04,
+ 0xf0, 0xb0, 0x70, 0x30, 0xe0, 0xa0, 0x60, 0x20, 0xd0, 0x90, 0x50, 0x10, 0xc0, 0x80, 0x40, 0x00
+};
+uint64 CKmerAPI::alignment_mask[] = {
+ 0xFFFFFFFFFFFFFFFFULL,
+ 0x3FFFFFFFFFFFFFFFULL,
+ 0x0FFFFFFFFFFFFFFFULL,
+ 0x03FFFFFFFFFFFFFFULL,
+ 0x00FFFFFFFFFFFFFFULL
+};
+
+// ***** EOF
diff --git a/src/projects/mts/kmc_api/kmer_api.h b/src/projects/mts/kmc_api/kmer_api.h
new file mode 100644
index 0000000..e652aa2
--- /dev/null
+++ b/src/projects/mts/kmc_api/kmer_api.h
@@ -0,0 +1,596 @@
+/*
+This file is a part of KMC software distributed under GNU GPL 3 licence.
+The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz
+
+Version: 2.2.0
+Date : 2015-04-15
+*/
+
+#ifndef _KMER_API_H
+#define _KMER_API_H
+
+
+#include "kmer_defs.h"
+#include <string>
+#include <iostream>
+#include "mmer.h"
+class CKMCFile;
+
+class CKmerAPI
+{
+protected:
+
+	uint64 *kmer_data;				// An array storing the kmer's data; 32 symbols fit into each 64-bit word
+									// Data are shifted so that the sufix's symbols start at a byte boundary
+
+
+	uint32 kmer_length;				// Kmer's length, in symbols
+	uchar byte_alignment;			// The number of "empty" symbols placed before the prefix so that the sufix's symbols start at a byte boundary
+
+	uint32 no_of_rows;				// The number of 64-bit words allocated for kmer_data
+
+ friend class CKMCFile;
+
+ //----------------------------------------------------------------------------------
+ inline void clear()
+ {
+ memset(kmer_data, 0, sizeof(*kmer_data) * no_of_rows);
+ }
+
+ //----------------------------------------------------------------------------------
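+	// insert2bits stores the 2-bit code val at symbol position pos, extract2bits reads it back;
+	// positions are offset by the byte_alignment padding placed before the prefix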
+ inline void insert2bits(uint32 pos, uchar val)
+ {
+ kmer_data[(pos + byte_alignment) >> 5] += (uint64)val << (62 - (((pos + byte_alignment) & 31) * 2));
+ }
+
+ inline uchar extract2bits(uint32 pos)
+ {
+ return (kmer_data[(pos + byte_alignment) >> 5] >> (62 - (((pos + byte_alignment) & 31) * 2))) & 3;
+ }
+ //----------------------------------------------------------------------------------
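+	// Shift the whole k-mer left by two bits across all 64-bit rows (carrying the top symbol of each row
+	// into the previous row) and insert the new symbol at the last position; the mask keeps the
+	// byte_alignment padding bits of the first row cleared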
+ inline void SHL_insert2bits(uchar val)
+ {
+ kmer_data[0] <<= 2;
+ if (byte_alignment)
+ {
+ uint64 mask = ~(((1ull << 2 * byte_alignment) - 1) << (64 - 2 * byte_alignment));
+ kmer_data[0] &= mask;
+ }
+ for (uint32 i = 1; i < no_of_rows; ++i)
+ {
+ kmer_data[i - 1] += kmer_data[i] >> 62;
+ kmer_data[i] <<= 2;
+ }
+ kmer_data[no_of_rows - 1] += (uint64)val << (62 - (((kmer_length - 1 + byte_alignment) & 31) * 2));
+ }
+ // ----------------------------------------------------------------------------------
+ inline void from_binary(const char* kmer)
+ {
+ clear();
+ for (uint32 i = 0; i < kmer_length; ++i)
+ insert2bits(i, kmer[i]);
+ }
+
+ // ----------------------------------------------------------------------------------
+ template<typename RandomAccessIterator>
+ inline void to_string_impl(RandomAccessIterator iter)
+ {
+ uchar *byte_ptr;
+ uchar c;
+ uchar temp_byte_alignment = byte_alignment;
+ uint32 cur_string_size = 0;
+ for (uint32 row_counter = 0; row_counter < no_of_rows; row_counter++)
+ {
+ byte_ptr = reinterpret_cast<uchar*>(&kmer_data[row_counter]);
+
+ byte_ptr += 7; // shift a pointer towards a MSB
+
+ for (uint32 i = 0; (i < kmer_length) && (i < 32); i += 4) // 32 symbols of any "row" in kmer_data
+ {
+ if ((i == 0) && temp_byte_alignment) // check if a byte_alignment placed before a prefix is to be skipped
+ temp_byte_alignment--;
+ else
+ {
+ c = 0xc0 & *byte_ptr; //11000000
+ c = c >> 6;
+ *(iter + cur_string_size++) = char_codes[c];
+ if (cur_string_size == kmer_length) break;
+ }
+
+ if ((i == 0) && temp_byte_alignment) // check if a byte_alignment placed before a prefix is to be skipped
+ temp_byte_alignment--;
+ else
+ {
+ c = 0x30 & *byte_ptr; //00110000
+ c = c >> 4;
+ *(iter + cur_string_size++) = char_codes[c];
+ if (cur_string_size == kmer_length) break;
+ }
+
+ if ((i == 0) && temp_byte_alignment) // check if a byte_alignment placed before a prefix is to be skipped
+ temp_byte_alignment--;
+ else
+ {
+ c = 0x0c & *byte_ptr; //00001100
+ c = c >> 2;
+ *(iter + cur_string_size++) = char_codes[c];
+ if (cur_string_size == kmer_length) break;
+ }
+ // no need to check byte alignment as its length is at most 3
+ c = 0x03 & *byte_ptr; //00000011
+ *(iter + cur_string_size++) = char_codes[c];
+ if (cur_string_size == kmer_length) break;
+
+ byte_ptr--;
+ }
+ }
+ }
+
+ // ----------------------------------------------------------------------------------
+ template<typename RandomAccessIterator>
+ inline bool from_string_impl(const RandomAccessIterator iter, uint32 len)
+ {
+ unsigned char c_char;
+ uchar c_binary;
+ uchar temp_byte_alignment;
+ if (kmer_length != len)
+ {
+ if (kmer_length && kmer_data)
+ delete[] kmer_data;
+
+ kmer_length = len;
+
+ if (kmer_length % 4)
+ byte_alignment = 4 - (kmer_length % 4);
+ else
+ byte_alignment = 0;
+
+
+ if (kmer_length != 0)
+ {
+ no_of_rows = (((kmer_length + byte_alignment) % 32) ? (kmer_length + byte_alignment) / 32 + 1 : (kmer_length + byte_alignment) / 32);
+ //no_of_rows = (int)ceil((double)(kmer_length + byte_alignment) / 32);
+ kmer_data = new uint64[no_of_rows];
+ //memset(kmer_data, 0, sizeof(*kmer_data) * no_of_rows);
+ }
+ }
+
+ memset(kmer_data, 0, sizeof(*kmer_data) * no_of_rows);
+ temp_byte_alignment = byte_alignment;
+ uint32 i = 0;
+ uint32 i_in_string = 0;
+ uchar *byte_ptr;
+
+ for (uint32 row_index = 0; row_index < no_of_rows; row_index++)
+ {
+ byte_ptr = reinterpret_cast<uchar*>(&kmer_data[row_index]);
+		byte_ptr += 7;	// move the pointer to the most significant byte
+
+ while (i < kmer_length)
+ {
+ if ((i_in_string == 0) && temp_byte_alignment) // check if a byte_alignment placed before a prefix is to be skipped
+ {
+ temp_byte_alignment--;
+ i++;
+ }
+ else
+ {
+ c_char = *(iter + i_in_string);
+ c_binary = num_codes[c_char];
+ c_binary = c_binary << 6; //11000000
+ *byte_ptr = *byte_ptr | c_binary;
+ i++;
+ i_in_string++;
+ if (i_in_string == kmer_length) break;
+ }
+
+ if ((i_in_string == 0) && temp_byte_alignment) // check if a byte_alignment placed before a prefix is to be skipped
+ {
+ temp_byte_alignment--;
+ i++;
+ }
+ else
+ {
+ c_char = *(iter + i_in_string);
+ c_binary = num_codes[c_char];
+ c_binary = c_binary << 4;
+ *byte_ptr = *byte_ptr | c_binary;
+ i++;
+ i_in_string++;
+ if (i_in_string == kmer_length) break;
+ }
+
+				//!!!if((i == 0) && temp_byte_alignment) // fix reported by Maciek Długosz: use i_in_string instead of i	// check if a byte_alignment placed before a prefix is to be skipped
+ if ((i_in_string == 0) && temp_byte_alignment) // check if a byte_alignment placed before a prefix is to be skipped
+ {
+ temp_byte_alignment--;
+ i++;
+ }
+ else
+ {
+ c_char = *(iter + i_in_string);
+ c_binary = num_codes[c_char];
+ c_binary = c_binary << 2;
+ *byte_ptr = *byte_ptr | c_binary;
+ i++;
+ i_in_string++;
+ if (i_in_string == kmer_length) break;
+ }
+
+ c_char = *(iter + i_in_string);
+ c_binary = num_codes[c_char];
+ *byte_ptr = *byte_ptr | c_binary;
+ i++;
+ i_in_string++;
+ if (i_in_string == kmer_length) break;
+
+ if (i % 32 == 0)
+ break; //check if a new "row" is to be started
+ byte_ptr--;
+ }
+ };
+ return true;
+ }
+public:
+ static const char char_codes[];
+ static char num_codes[256];
+ static uchar rev_comp_bytes_LUT[];
+ static uint64 alignment_mask[];
+ struct _si
+ {
+ _si()
+ {
+ for (int i = 0; i < 256; i++)
+ num_codes[i] = -1;
+ num_codes['A'] = num_codes['a'] = 0;
+ num_codes['C'] = num_codes['c'] = 1;
+ num_codes['G'] = num_codes['g'] = 2;
+ num_codes['T'] = num_codes['t'] = 3;
+ }
+ } static _init;
+
+
+// ----------------------------------------------------------------------------------
+// The constructor creates a k-mer with the number of symbols equal to length.
+// The array kmer_data has size ceil((length + byte_alignment) / 32)
+// IN	: length - the number of symbols in the k-mer
+// ----------------------------------------------------------------------------------
+ inline CKmerAPI(uint32 length = 0)
+ {
+ if(length)
+ {
+ if(length % 4)
+ byte_alignment = 4 - (length % 4);
+ else
+ byte_alignment = 0;
+
+ no_of_rows = (((length + byte_alignment) % 32) ? (length + byte_alignment) / 32 + 1 : (length + byte_alignment) / 32);
+ //no_of_rows = (int)ceil((double)(length + byte_alignment) / 32);
+ kmer_data = new uint64[no_of_rows];
+
+ memset(kmer_data, 0, sizeof(*kmer_data) * no_of_rows);
+ }
+ else
+ {
+ kmer_data = NULL;
+ no_of_rows = 0;
+ byte_alignment = 0;
+ }
+ kmer_length = length;
+ };
+//-----------------------------------------------------------------------
+// The destructor
+//-----------------------------------------------------------------------
+ inline ~CKmerAPI()
+ {
+ if (kmer_data != NULL)
+ delete [] kmer_data;
+ };
+
+//-----------------------------------------------------------------------
+// The copy constructor
+//-----------------------------------------------------------------------
+ inline CKmerAPI(const CKmerAPI &kmer)
+ {
+ kmer_length = kmer.kmer_length;
+ byte_alignment = kmer.byte_alignment;
+ no_of_rows = kmer.no_of_rows;
+
+ kmer_data = new uint64[no_of_rows];
+
+ for(uint32 i = 0; i < no_of_rows; i++)
+ kmer_data[i] = kmer.kmer_data[i];
+
+ };
+
+//-----------------------------------------------------------------------
+// The operator =
+//-----------------------------------------------------------------------
+ inline CKmerAPI& operator=(const CKmerAPI &kmer)
+ {
+ if(kmer.kmer_length != kmer_length)
+ {
+ if(kmer_length && kmer_data)
+ delete [] kmer_data;
+
+ kmer_length = kmer.kmer_length;
+ byte_alignment = kmer.byte_alignment;
+ no_of_rows = kmer.no_of_rows;
+
+ kmer_data = new uint64[no_of_rows];
+ }
+
+ for(uint32 i = 0; i < no_of_rows; i++)
+ kmer_data[i] = kmer.kmer_data[i];
+
+ return *this;
+ };
+
+//-----------------------------------------------------------------------
+// The operator ==
+//-----------------------------------------------------------------------
+ inline bool operator==(const CKmerAPI &kmer)
+ {
+ if(kmer.kmer_length != kmer_length)
+ return false;
+
+ for(uint32 i = 0; i < no_of_rows; i++)
+ if(kmer.kmer_data[i] != kmer_data[i])
+ return false;
+
+ return true;
+
+ };
+
+//-----------------------------------------------------------------------
+// Operator <. If the arguments differ in length, the result is undefined
+//-----------------------------------------------------------------------
+ inline bool operator<(const CKmerAPI &kmer)
+ {
+ if(kmer.kmer_length != kmer_length)
+ return false;
+
+ for(uint32 i = 0; i < no_of_rows; i++)
+ if(kmer.kmer_data[i] > kmer_data[i])
+ return true;
+ else
+ if(kmer.kmer_data[i] < kmer_data[i])
+ return false;
+
+ return false;
+ };
+
+//-----------------------------------------------------------------------
+// Returns the symbol of the k-mer at the indicated position (numbered from 0).
+// The symbol is returned as an ASCII character A/C/G/T
+// IN	: pos - the position of a symbol
+// RET	: symbol - the symbol at position pos
+//-----------------------------------------------------------------------
+ inline char get_asci_symbol(unsigned int pos)
+ {
+ if(pos >= kmer_length)
+ return 0;
+
+ uint32 current_row = (pos + byte_alignment) / 32;
+ uint32 current_pos = ((pos + byte_alignment) % 32) * 2;
+ uint64 mask = 0xc000000000000000 >> current_pos;
+ uint64 symbol = kmer_data[current_row] & mask;
+ symbol = symbol >> (64 - current_pos - 2);
+ return char_codes[symbol];
+
+ };
+
+ //-----------------------------------------------------------------------
+	// Returns the symbol of the k-mer at the indicated position (numbered from 0)
+	// The symbol is returned as a numerical value 0/1/2/3
+	// IN	: pos - the position of a symbol
+	// RET	: symbol - the symbol at position pos
+ //-----------------------------------------------------------------------
+ inline uchar get_num_symbol(unsigned int pos)
+ {
+ if (pos >= kmer_length)
+ return 0;
+
+ uint32 current_row = (pos + byte_alignment) / 32;
+ uint32 current_pos = ((pos + byte_alignment) % 32) * 2;
+ uint64 mask = 0xc000000000000000 >> current_pos;
+ uint64 symbol = kmer_data[current_row] & mask;
+ symbol = symbol >> (64 - current_pos - 2);
+ uchar* byte_ptr = reinterpret_cast<uchar*>(&symbol);
+ return *byte_ptr;
+
+ };
+
+ //-----------------------------------------------------------------------
+	// Converts the k-mer into a string (alphabet ACGT)
+ // RET : string kmer
+ //-----------------------------------------------------------------------
+ inline std::string to_string()
+ {
+ std::string string_kmer;
+ string_kmer.resize(kmer_length);
+ to_string_impl(string_kmer.begin());
+ return string_kmer;
+ };
+ //-----------------------------------------------------------------------
+	// Converts the k-mer into a string (alphabet ACGT). The function assumes that enough memory has been allocated
+ // OUT : str - string kmer.
+ //-----------------------------------------------------------------------
+ inline void to_string(char *str)
+ {
+ to_string_impl(str);
+ str[kmer_length] = '\0';
+ };
+
+ //-----------------------------------------------------------------------
+	// Converts the k-mer into a string (alphabet ACGT)
+ // OUT : str - string kmer
+ //-----------------------------------------------------------------------
+ inline void to_string(std::string &str)
+ {
+ str.resize(kmer_length);
+ to_string_impl(str.begin());
+ };
+
+ //-----------------------------------------------------------------------
+	// Converts a string over the alphabet ACGT into a CKmerAPI k-mer
+	// IN	: kmer_string	- a string over the alphabet ACGT
+	// RET	: true - if successful
+ //-----------------------------------------------------------------------
+ inline bool from_string(const char* kmer_string)
+ {
+ uint32 len = 0;
+ for (; kmer_string[len] != '\0' ; ++len)
+ {
+ if (num_codes[(uchar)kmer_string[len]] == -1)
+ return false;
+ }
+ return from_string_impl(kmer_string, len);
+ }
+
+ //-----------------------------------------------------------------------
+	// Converts a string over the alphabet ACGT into a CKmerAPI k-mer
+	// IN	: kmer_string	- a string over the alphabet ACGT
+	// RET	: true - if successful
+ //-----------------------------------------------------------------------
+ inline bool from_string(const std::string& kmer_string)
+ {
+ for (uint32 ii = 0; ii < kmer_string.size(); ++ii)
+ {
+ if (num_codes[(uchar)kmer_string[ii]] == -1)
+ return false;
+ }
+ return from_string_impl(kmer_string.begin(), static_cast<uint32>(kmer_string.length()));
+ }
+
+ //-----------------------------------------------------------------------
+ // Convert k-mer to its reverse complement
+ //-----------------------------------------------------------------------
+ inline bool reverse()
+ {
+ if (kmer_data == NULL)
+ {
+ return false;
+ }
+
+ // number of bytes used to store the k-mer in the 0-th row
+ const uint32 size_in_byte = ((kmer_length + byte_alignment) / 4) / no_of_rows;
+ uchar* byte1;
+ uchar* byte2;
+
+ if (no_of_rows == 1)
+ {
+ *kmer_data <<= 2 * byte_alignment;
+ byte1 = reinterpret_cast<uchar*>(kmer_data)+8 - size_in_byte;
+ byte2 = reinterpret_cast<uchar*>(kmer_data)+7;
+
+ for (uint32 i_bytes = 0; i_bytes < size_in_byte / 2; ++i_bytes)
+ {
+ unsigned char temp = rev_comp_bytes_LUT[*byte1];
+ *byte1 = rev_comp_bytes_LUT[*byte2];
+ *byte2 = temp;
+
+ ++byte1;
+ --byte2;
+ }
+
+ if (size_in_byte % 2)
+ {
+ *byte1 = rev_comp_bytes_LUT[*byte1];
+ }
+ }
+ else
+ {
+ for (uint32 i_rows = no_of_rows - 1; i_rows > 0; --i_rows)
+ {
+ kmer_data[i_rows] >>= 64 - 8 * size_in_byte - 2 * byte_alignment;
+
+ // more significant row
+ uint64 previous = kmer_data[i_rows - 1];
+ previous <<= 8 * size_in_byte + 2 * byte_alignment;
+ kmer_data[i_rows] |= previous;
+
+ byte1 = reinterpret_cast<uchar*>(kmer_data + i_rows);
+ byte2 = reinterpret_cast<uchar*>(kmer_data + i_rows) + 7;
+
+ for (int i_bytes = 0; i_bytes < 4; ++i_bytes)
+ {
+ unsigned char temp = rev_comp_bytes_LUT[*byte1];
+ *byte1 = rev_comp_bytes_LUT[*byte2];
+ *byte2 = temp;
+
+ ++byte1;
+ --byte2;
+ }
+ }
+
+ // clear less significant bits
+ kmer_data[0] >>= 64 - 8 * size_in_byte - 2 * byte_alignment;
+ kmer_data[0] <<= 64 - 8 * size_in_byte;
+
+ byte1 = reinterpret_cast<uchar*>(kmer_data)+8 - size_in_byte;
+ byte2 = reinterpret_cast<uchar*>(kmer_data)+7;
+
+ for (uint32 i_bytes = 0; i_bytes < size_in_byte / 2; ++i_bytes)
+ {
+ unsigned char temp = rev_comp_bytes_LUT[*byte1];
+ *byte1 = rev_comp_bytes_LUT[*byte2];
+ *byte2 = temp;
+
+ ++byte1;
+ --byte2;
+ }
+
+ if (size_in_byte % 2)
+ {
+ *byte1 = rev_comp_bytes_LUT[*byte1];
+ }
+
+ for (uint32 i_rows = 0; i_rows < no_of_rows / 2; ++i_rows)
+ {
+ std::swap(kmer_data[i_rows], kmer_data[no_of_rows - i_rows - 1]);
+ }
+ }
+
+ // clear alignment
+ *kmer_data &= alignment_mask[byte_alignment];
+
+ return true;
+ }
+
+//-----------------------------------------------------------------------
+// Computes the signature of an existing k-mer (the minimal canonical m-mer of length sig_len)
+// IN	: sig_len	- the length of the signature
+// RET : signature value
+//-----------------------------------------------------------------------
+ uint32 get_signature(uint32 sig_len)
+ {
+ uchar symb;
+ CMmer cur_mmr(sig_len);
+
+ for(uint32 i = 0; i < sig_len; ++i)
+ {
+ symb = get_num_symbol(i);
+ cur_mmr.insert(symb);
+ }
+ CMmer min_mmr(cur_mmr);
+ for (uint32 i = sig_len; i < kmer_length; ++i)
+ {
+ symb = get_num_symbol(i);
+ cur_mmr.insert(symb);
+
+ if (cur_mmr < min_mmr)
+ min_mmr = cur_mmr;
+ }
+ return min_mmr.get();
+ }
+
+
+};
+
+
+#endif
+
+// ***** EOF
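
A minimal usage sketch (not part of the patch) for the CKmerAPI class above; it assumes the kmc_api sources (including kmer_api.cpp, which defines the static lookup tables) are compiled and linked together, and that the header is reachable as kmc_api/kmer_api.h:

#include <iostream>
#include "kmc_api/kmer_api.h"

int main() {
    CKmerAPI kmer(21);                                // a 21-mer; internal buffer is zero-initialized
    if (!kmer.from_string("ACGTACGTACGTACGTACGTA"))   // non-ACGT characters are rejected
        return 1;
    std::cout << kmer.to_string() << std::endl;       // round-trips the input string
    kmer.reverse();                                   // in-place reverse complement
    std::cout << kmer.to_string() << std::endl;       // prints TACGTACGTACGTACGTACGT
    std::cout << kmer.get_signature(7) << std::endl;  // minimal canonical 7-mer (signature)
    return 0;
}
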
diff --git a/src/projects/mts/kmc_api/kmer_defs.h b/src/projects/mts/kmc_api/kmer_defs.h
new file mode 100644
index 0000000..4a88d60
--- /dev/null
+++ b/src/projects/mts/kmc_api/kmer_defs.h
@@ -0,0 +1,54 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Sebastian Deorowicz and Agnieszka Debudaj-Grabysz
+
+ Version: 2.2.0
+ Date : 2015-04-15
+*/
+
+
+#ifndef _KMER_DEFS_H
+#define _KMER_DEFS_H
+
+#define KMC_VER "2.2.0"
+#define KMC_DATE "2015-04-15"
+
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+
+#ifndef WIN32
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <math.h>
+ #include <string.h>
+
+ #define _TCHAR char
+ #define _tmain main
+
+ #define my_fopen fopen
+ #define my_fseek fseek
+ #define my_ftell ftell
+
+
+ #include <stdio.h>
+ #include <algorithm>
+ #include <iostream>
+ using namespace std;
+
+#else
+ #define my_fopen fopen
+ #define my_fseek _fseeki64
+ #define my_ftell _ftelli64
+#endif
+ //typedef unsigned char uchar;
+
+ typedef int int32;
+ typedef unsigned int uint32;
+ typedef long long int64;
+ typedef unsigned long long uint64;
+ typedef unsigned char uchar;
+#endif
+
+// ***** EOF
diff --git a/src/projects/mts/kmc_api/mmer.cpp b/src/projects/mts/kmc_api/mmer.cpp
new file mode 100644
index 0000000..ed3ea11
--- /dev/null
+++ b/src/projects/mts/kmc_api/mmer.cpp
@@ -0,0 +1,49 @@
+#include "stdafx.h"
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+ Version: 2.2.0
+ Date : 2015-04-15
+*/
+
+#include "../kmc_api/mmer.h"
+
+
+uint32 CMmer::norm5[];
+uint32 CMmer::norm6[];
+uint32 CMmer::norm7[];
+uint32 CMmer::norm8[];
+
+CMmer::_si CMmer::_init;
+
+
+//--------------------------------------------------------------------------
+CMmer::CMmer(uint32 _len)
+{
+ switch (_len)
+ {
+ case 5:
+ norm = norm5;
+ break;
+ case 6:
+ norm = norm6;
+ break;
+ case 7:
+ norm = norm7;
+ break;
+ case 8:
+ norm = norm8;
+ break;
+ default:
+ break;
+ }
+ len = _len;
+ mask = (1 << _len * 2) - 1;
+ str = 0;
+}
+
+//--------------------------------------------------------------------------
+
diff --git a/src/projects/mts/kmc_api/mmer.h b/src/projects/mts/kmc_api/mmer.h
new file mode 100644
index 0000000..79187f8
--- /dev/null
+++ b/src/projects/mts/kmc_api/mmer.h
@@ -0,0 +1,182 @@
+/*
+ This file is a part of KMC software distributed under GNU GPL 3 licence.
+ The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
+
+ Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
+
+ Version: 2.2.0
+ Date : 2015-04-15
+*/
+
+#ifndef _MMER_H
+#define _MMER_H
+#include "kmer_defs.h"
+
+// *************************************************************************
+// *************************************************************************
+
+
+class CMmer
+{
+ uint32 str;
+ uint32 mask;
+ uint32 current_val;
+ uint32* norm;
+ uint32 len;
+ static uint32 norm5[1 << 10];
+ static uint32 norm6[1 << 12];
+ static uint32 norm7[1 << 14];
+ static uint32 norm8[1 << 16];
+
+ static bool is_allowed(uint32 mmer, uint32 len)
+ {
+ if ((mmer & 0x3f) == 0x3f) // TTT suffix
+ return false;
+ if ((mmer & 0x3f) == 0x3b) // TGT suffix
+ return false;
+		if ((mmer & 0x3c) == 0x3c)	// TT* suffix
+ return false;
+
+ for (uint32 j = 0; j < len - 3; ++j)
+ if ((mmer & 0xf) == 0) // AA inside
+ return false;
+ else
+ mmer >>= 2;
+
+ if (mmer == 0) // AAA prefix
+ return false;
+ if (mmer == 0x04) // ACA prefix
+ return false;
+ if ((mmer & 0xf) == 0) // *AA prefix
+ return false;
+
+ return true;
+ }
+
+ friend class CSignatureMapper;
+ struct _si
+ {
+ static uint32 get_rev(uint32 mmer, uint32 len)
+ {
+ uint32 rev = 0;
+ uint32 shift = len*2 - 2;
+ for(uint32 i = 0 ; i < len ; ++i)
+ {
+ rev += (3 - (mmer & 3)) << shift;
+ mmer >>= 2;
+ shift -= 2;
+ }
+ return rev;
+ }
+
+
+
+ static void init_norm(uint32* norm, uint32 len)
+ {
+ uint32 special = 1 << len * 2;
+ for(uint32 i = 0 ; i < special ; ++i)
+ {
+ uint32 rev = get_rev(i, len);
+ uint32 str_val = is_allowed(i, len) ? i : special;
+ uint32 rev_val = is_allowed(rev, len) ? rev : special;
+ norm[i] = MIN(str_val, rev_val);
+ }
+ }
+
+ _si()
+ {
+ init_norm(norm5, 5);
+ init_norm(norm6, 6);
+ init_norm(norm7, 7);
+ init_norm(norm8, 8);
+ }
+
+ }static _init;
+public:
+ CMmer(uint32 _len);
+ inline void insert(uchar symb);
+ inline uint32 get() const;
+ inline bool operator==(const CMmer& x);
+ inline bool operator<(const CMmer& x);
+ inline void clear();
+ inline bool operator<=(const CMmer& x);
+ inline void set(const CMmer& x);
+ inline void insert(const char* seq);
+
+};
+
+
+
+//--------------------------------------------------------------------------
+inline void CMmer::insert(uchar symb)
+{
+ str <<= 2;
+ str += symb;
+ str &= mask;
+
+ current_val = norm[str];
+}
+
+//--------------------------------------------------------------------------
+inline uint32 CMmer::get() const
+{
+ return current_val;
+}
+
+//--------------------------------------------------------------------------
+inline bool CMmer::operator==(const CMmer& x)
+{
+ return current_val == x.current_val;
+}
+
+//--------------------------------------------------------------------------
+inline bool CMmer::operator<(const CMmer& x)
+{
+ return current_val < x.current_val;
+}
+
+//--------------------------------------------------------------------------
+inline void CMmer::clear()
+{
+ str = 0;
+}
+
+//--------------------------------------------------------------------------
+inline bool CMmer::operator<=(const CMmer& x)
+{
+ return current_val <= x.current_val;
+}
+
+//--------------------------------------------------------------------------
+inline void CMmer::set(const CMmer& x)
+{
+ str = x.str;
+ current_val = x.current_val;
+}
+
+//--------------------------------------------------------------------------
+inline void CMmer::insert(const char* seq)
+{
+ switch (len)
+ {
+ case 5:
+ str = (seq[0] << 8) + (seq[1] << 6) + (seq[2] << 4) + (seq[3] << 2) + (seq[4]);
+ break;
+ case 6:
+ str = (seq[0] << 10) + (seq[1] << 8) + (seq[2] << 6) + (seq[3] << 4) + (seq[4] << 2) + (seq[5]);
+ break;
+ case 7:
+ str = (seq[0] << 12) + (seq[1] << 10) + (seq[2] << 8) + (seq[3] << 6) + (seq[4] << 4 ) + (seq[5] << 2) + (seq[6]);
+ break;
+ case 8:
+ str = (seq[0] << 14) + (seq[1] << 12) + (seq[2] << 10) + (seq[3] << 8) + (seq[4] << 6) + (seq[5] << 4) + (seq[6] << 2) + (seq[7]);
+ break;
+ default:
+ break;
+ }
+
+ current_val = norm[str];
+}
+
+
+#endif
\ No newline at end of file
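
A small sketch (not part of the patch) of how the CMmer class above is used; it assumes mmer.cpp is linked in and the header is reachable as kmc_api/mmer.h. Symbols are fed as 2-bit codes (A=0, C=1, G=2, T=3), and get() returns the normalized value: the smaller of the m-mer and its reverse complement, with disallowed signatures mapped to a sentinel by is_allowed().

#include <iostream>
#include "kmc_api/mmer.h"

int main() {
    const uchar acgtaca[] = {0, 1, 2, 3, 0, 1, 0};    // ACGTACA as 2-bit codes
    CMmer mmer(7);                                    // uses the precomputed norm7 table
    for (uchar c : acgtaca)
        mmer.insert(c);                               // shift one symbol in at a time
    std::cout << mmer.get() << std::endl;             // canonical (normalized) 7-mer code
    return 0;
}
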
diff --git a/src/projects/mts/kmc_api/stdafx.h b/src/projects/mts/kmc_api/stdafx.h
new file mode 100644
index 0000000..e7d6ecf
--- /dev/null
+++ b/src/projects/mts/kmc_api/stdafx.h
@@ -0,0 +1,4 @@
+#include <cstdio>
+#include <algorithm>
+#include <iostream>
+using namespace std;
diff --git a/src/projects/mts/kmer_multiplicity_counter.cpp b/src/projects/mts/kmer_multiplicity_counter.cpp
new file mode 100644
index 0000000..37d4a62
--- /dev/null
+++ b/src/projects/mts/kmer_multiplicity_counter.cpp
@@ -0,0 +1,256 @@
+#include <string>
+#include <vector>
+#include <set>
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <memory>
+#include <algorithm>
+#include <libcxx/sort.hpp>
+#include <boost/optional/optional.hpp>
+#include "getopt_pp/getopt_pp.h"
+#include "kmc_api/kmc_file.h"
+//#include "omp.h"
+#include "io/kmers/mmapped_reader.hpp"
+#include "utils/path_helper.hpp"
+#include "utils/simple_tools.hpp"
+#include "utils/indices/perfect_hash_map_builder.hpp"
+#include "utils/indices/kmer_splitters.hpp"
+#include "logger.hpp"
+
+using std::string;
+using std::vector;
+
+const string KMER_PARSED_EXTENSION = ".bin";
+const string KMER_SORTED_EXTENSION = ".sorted";
+
+class KmerMultiplicityCounter {
+
+ size_t k_, sample_cnt_;
+ std::string file_prefix_;
+
+ //TODO: get rid of intermediate .bin file
+ string ParseKmc(const string& filename) {
+ CKMCFile kmcFile;
+ kmcFile.OpenForListing(filename);
+ CKmerAPI kmer((unsigned int) k_);
+ uint32 count;
+ std::string parsed_filename = filename + KMER_PARSED_EXTENSION;
+ std::ofstream output(parsed_filename, std::ios::binary);
+ while (kmcFile.ReadNextKmer(kmer, count)) {
+ RtSeq seq(k_, kmer.to_string());
+ seq.BinWrite(output);
+ seq_element_type tmp = count;
+ output.write((char*) &(tmp), sizeof(seq_element_type));
+ }
+ output.close();
+ return parsed_filename;
+ }
+
+ string SortKmersCountFile(const string& filename) {
+ MMappedRecordArrayReader<seq_element_type> ins(filename, RtSeq::GetDataSize(k_) + 1, false);
+ libcxx::sort(ins.begin(), ins.end(), array_less<seq_element_type>());
+ std::string sorted_filename = filename + KMER_SORTED_EXTENSION;
+ std::ofstream out(sorted_filename);
+ out.write((char*) ins.data(), ins.data_size());
+ out.close();
+ remove(filename.c_str());
+ return sorted_filename;
+ }
+
+ bool ReadKmerWithCount(std::ifstream& infile, std::pair<RtSeq, uint32>& res) {
+ RtSeq seq(res.first.size());
+ if (!seq.BinRead(infile)) {
+ return false;
+ }
+ seq_element_type tmp;
+ infile.read((char*) &tmp, sizeof(seq_element_type));
+ res = {seq, (uint32) tmp};
+ return true;
+ }
+
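+    // Merges the per-sample KMC databases: each database is parsed into a
+    // binary (k-mer, count) file, sorted, and the sorted files are combined
+    // with an n-way merge. Whenever the current minimal k-mer occurs in at
+    // least all_min samples, it is written to <file_prefix_>.kmer and its
+    // per-sample counts (space-separated) to <file_prefix_>.mpl.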
+ void FilterCombinedKmers(const std::vector<string>& files, size_t all_min) {
+ size_t n = files.size();
+ vector<std::unique_ptr<ifstream>> infiles;
+ infiles.reserve(n);
+ for (auto fn : files) {
+ INFO("Processing " << fn);
+ auto parsed = ParseKmc(fn);
+ auto sorted = SortKmersCountFile(parsed);
+ infiles.emplace_back(new std::ifstream(sorted));
+ }
+ vector<std::pair<RtSeq, uint32>> top_kmer(n, {RtSeq(k_), 0});
+ vector<bool> alive(n, false);
+
+ for (size_t i = 0; i < n; i++) {
+ alive[i] = ReadKmerWithCount(*infiles[i], top_kmer[i]);
+ }
+
+ std::ofstream output_kmer(file_prefix_ + ".kmer", std::ios::binary);
+ std::ofstream output_cnt(file_prefix_ + ".mpl");
+
+ RtSeq::less3 kmer_less;
+ while (true) {
+ boost::optional<RtSeq> min_kmer;
+ size_t cnt_min = 0;
+ for (size_t i = 0; i < n; ++i) {
+ if (alive[i]) {
+ RtSeq& cur_kmer = top_kmer[i].first;
+ if (!min_kmer || kmer_less(cur_kmer, *min_kmer)) {
+ min_kmer = cur_kmer;
+ cnt_min = 0;
+ }
+ if (cur_kmer == *min_kmer) {
+ cnt_min++;
+ }
+ }
+ }
+ if (!min_kmer) {
+ break;
+ }
+ if (cnt_min >= all_min) {
+ std::vector<uint32> cnt_vector(n, 0);
+ min_kmer.get().BinWrite(output_kmer);
+ for (size_t i = 0; i < n; ++i) {
+ if (alive[i] && top_kmer[i].first == *min_kmer) {
+ cnt_vector[i] += top_kmer[i].second;
+ }
+ }
+ string delim = "";
+ for (auto cnt : cnt_vector) {
+ output_cnt << delim << cnt;
+ delim = " ";
+ }
+ output_cnt << std::endl;
+ }
+ for (size_t i = 0; i < n; ++i) {
+ if (alive[i] && top_kmer[i].first == *min_kmer) {
+ alive[i] = ReadKmerWithCount(*infiles[i], top_kmer[i]);
+ }
+ }
+ }
+ }
+
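+    // Builds a perfect-hash k-mer index over the k-mers emitted by
+    // FilterCombinedKmers: each k-mer is mapped to an offset into a flat
+    // buffer holding sample_cnt multiplicities per k-mer. The index is
+    // serialized to <file_prefix_>.kmm and the buffer to <file_prefix_>.bpr.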
+ void BuildKmerIndex(size_t sample_cnt, const std::string& workdir, size_t nthreads) {
+ INFO("Initializing kmer profile index");
+
+ //TODO: extract into a common header
+ typedef size_t Offset;
+ typedef uint16_t Mpl;
+ using namespace debruijn_graph;
+
+ KeyStoringMap<RtSeq, Offset, kmer_index_traits<RtSeq>, InvertableStoring>
+ kmer_mpl(k_, workdir);
+ InvertableStoring::trivial_inverter<Offset> inverter;
+
+ static const size_t read_buffer_size = 0; //FIXME some buffer size
+ DeBruijnKMerKMerSplitter<StoringTypeFilter<InvertableStoring>>
+ splitter(kmer_mpl.workdir(), k_, k_, true, read_buffer_size);
+
+ //TODO: get rid of temporary .mker & .mpl files
+ splitter.AddKMers(file_prefix_ + ".kmer");
+
+ KMerDiskCounter<RtSeq> counter(kmer_mpl.workdir(), splitter);
+
+ BuildIndex(kmer_mpl, counter, 16, nthreads);
+
+ INFO("Kmer profile fill start");
+ //We must allocate the whole buffer for all profiles at once
+ //to avoid pointer invalidation after possible vector resize
+ const size_t data_size = sample_cnt * kmer_mpl.size();
+
+ std::vector<Mpl> mpl_data;
+ mpl_data.reserve(data_size);
+ INFO("Allocated buffer of " << data_size << " elements");
+ std::ifstream kmers_in(file_prefix_ + ".kmer", std::ios::binary);
+ std::ifstream kmers_mpl_in(file_prefix_ + ".mpl");
+ while (true) {
+ RtSeq kmer(k_);
+ kmer.BinRead(kmers_in);
+ if (kmers_in.fail()) {
+ break;
+ }
+
+// VERIFY(kmer_str.length() == k_);
+// conj_graph_pack::seq_t kmer(k_, kmer_str.c_str());
+// kmer = gp_.kmer_mapper.Substitute(kmer);
+
+ Offset offset = mpl_data.size();
+ for (size_t i = 0; i < sample_cnt; ++i) {
+ Mpl mpl;
+ kmers_mpl_in >> mpl;
+ VERIFY(!kmers_mpl_in.fail());
+ mpl_data.push_back(mpl);
+ }
+ //Double-check we haven't invalidated vector views
+ VERIFY(mpl_data.size() <= data_size);
+
+ auto kwh = kmer_mpl.ConstructKWH(kmer);
+ VERIFY(kmer_mpl.valid(kwh));
+ kmer_mpl.put_value(kwh, offset, inverter);
+ }
+
+ std::ofstream map_file(file_prefix_ + ".kmm", std::ios_base::binary | std::ios_base::out);
+ kmer_mpl.BinWrite(map_file);
+
+ std::ofstream mpl_file(file_prefix_ + ".bpr", std::ios_base::binary | std::ios_base::out);
+ mpl_file.write((const char *)&mpl_data[0], mpl_data.size() * sizeof(Mpl));
+
+ INFO("Kmer profile fill finish");
+ }
+
+public:
+ KmerMultiplicityCounter(size_t k, std::string file_prefix):
+ k_(k), file_prefix_(std::move(file_prefix)) {
+ }
+
+ void CombineMultiplicities(const vector<string>& input_files, size_t min_samples, const string& work_dir, size_t nthreads = 1) {
+ FilterCombinedKmers(input_files, min_samples);
+ BuildKmerIndex(input_files.size(), work_dir, nthreads);
+ }
+private:
+ DECL_LOGGER("KmerMultiplicityCounter");
+};
+
+void PrintUsageInfo() {
+ std::cout << "Usage: kmer_multiplicity_counter [options] -f files_dir" << std::endl;
+ std::cout << "Options:" << std::endl;
+ std::cout << "-k - kmer length" << std::endl;
+ std::cout << "-n - sample count" << std::endl;
+ std::cout << "-o - output file prefix" << std::endl;
+ std::cout << "-t - number of threads (default: 1)" << std::endl;
+ std::cout << "-s - minimal number of samples to contain kmer" << std::endl;
+ std::cout << "files_dir must contain two files (.kmc_pre and .kmc_suf) with kmer multiplicities for each sample from 1 to n" << std::endl;
+}
+
+int main(int argc, char *argv[]) {
+ using namespace GetOpt;
+ create_console_logger();
+
+ size_t k, sample_cnt, min_samples, nthreads;
+ string output, work_dir;
+
+ try {
+ GetOpt_pp ops(argc, argv);
+ ops.exceptions_all();
+ ops >> Option('k', k)
+ >> Option('n', sample_cnt)
+ >> Option('s', min_samples)
+ >> Option('o', output)
+ >> Option('t', "threads", nthreads, size_t(1))
+ >> Option('f', work_dir)
+ ;
+ } catch(GetOptEx &ex) {
+ PrintUsageInfo();
+ exit(1);
+ }
+
+ std::vector<string> input_files;
+ for (size_t i = 1; i <= sample_cnt; ++i) {
+ input_files.push_back(work_dir + "/sample" + ToString(i));
+ }
+
+ KmerMultiplicityCounter kmcounter(k, output);
+ kmcounter.CombineMultiplicities(input_files, min_samples, work_dir, nthreads);
+ return 0;
+}
diff --git a/src/projects/mts/log.properties b/src/projects/mts/log.properties
new file mode 100644
index 0000000..3a7d6e2
--- /dev/null
+++ b/src/projects/mts/log.properties
@@ -0,0 +1,10 @@
+default=INFO
+
+#SingleClusterAnalyzer=TRACE
+#ContigAbundanceCounter=TRACE
+
+#EdgeAnnotationPropagator=TRACE
+#ConnectingPathPropagator=TRACE
+#ContigPropagator=TRACE
+#TipPropagator=TRACE
+#AnnotationChecker=TRACE
diff --git a/src/projects/mts/logger.hpp b/src/projects/mts/logger.hpp
new file mode 100644
index 0000000..a8d2b02
--- /dev/null
+++ b/src/projects/mts/logger.hpp
@@ -0,0 +1,11 @@
+#include "utils/logger/log_writers.hpp"
+
+void create_console_logger() {
+ using namespace logging;
+
+ string log_props_file = "log.properties";
+
+ logger *lg = create_logger(path::FileExists(log_props_file) ? log_props_file : "");
+ lg->add_writer(std::make_shared<console_writer>());
+ attach_logger(lg);
+}
diff --git a/src/projects/mts/mts.py b/src/projects/mts/mts.py
new file mode 100755
index 0000000..b80f145
--- /dev/null
+++ b/src/projects/mts/mts.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python
+from __future__ import (print_function)
+
+import argparse
+import subprocess
+import sys
+import os
+import os.path
+import shutil
+
+#copied from http://stackoverflow.com/questions/431684/how-do-i-cd-in-python/13197763#13197763
+class cd:
+ """Context manager for changing the current working directory"""
+ def __init__(self, newPath):
+ self.newPath = os.path.expanduser(newPath)
+
+ def __enter__(self):
+ self.savedPath = os.getcwd()
+ os.chdir(self.newPath)
+
+ def __exit__(self, etype, value, traceback):
+ os.chdir(self.savedPath)
+
+parser = argparse.ArgumentParser(description="MTS - Metagenomic Time Series")
+
+parser.add_argument("--threads", "-t", type=int, default=8, help="Number of threads")
+parser.add_argument("dir", type=str, help="Output directory")
+parser.add_argument("--config", "-c", type=str, default="", help="config.yaml to be copied to the directory (unnecessary if config.yaml is already there)")
+parser.add_argument("--stats", "-s", action="store_true", help="Calculate stats (when the REFS parameter in config.yaml is provided)")
+parser.add_argument("--reuse-assemblies", action="store_true", help="Use existing assemblies (put them in the corresponding folders)")
+parser.add_argument("--verbose", "-v", action="store_true", help="Increase verbosity level")
+
+args = parser.parse_args()
+
+exec_dir=os.path.dirname(os.path.realpath(sys.argv[0]))
+LOCAL_DIR = os.path.realpath(os.path.join(exec_dir, "../../../"))
+
+base_params = ["snakemake", "--directory", os.path.realpath(args.dir), "--cores", str(args.threads), "--config", "LOCAL_DIR" + "=" + LOCAL_DIR]
+
+if args.verbose:
+ base_params.extend(["-p", "--verbose"])
+
+if args.config:
+ if os.path.exists(os.path.join(args.dir, "config.yaml")):
+ print("Config path specified, but config.yaml already exists in output folder " + args.dir)
+ sys.exit(239)
+
+if not os.path.exists(args.dir):
+ os.makedirs(args.dir)
+
+print("Output folder set to " + args.dir)
+
+if args.config:
+ print("Copying config from " + args.config)
+ shutil.copy(args.config, args.dir)
+
+with cd(exec_dir):
+ def call_snake(extra_params=[]):
+ subprocess.check_call(base_params + extra_params, stdout=sys.stdout, stderr=sys.stderr)
+
+ print("Step #1 - Assembly")
+ if args.reuse_assemblies:
+ call_snake(["assemble_all", "--touch"])
+
+ call_snake()
+
+ if args.stats:
+ print("Step #2a - Assembly statistics")
+ call_snake(["--snakefile", "Stats.snake", "stats_all"])
+
+ print("Step #2b - Reassembly statistics")
+ call_snake(["--snakefile", "Stats.snake", "stats_reassembly"])
+
diff --git a/src/projects/mts/prop_binning.cpp b/src/projects/mts/prop_binning.cpp
new file mode 100644
index 0000000..0df9038
--- /dev/null
+++ b/src/projects/mts/prop_binning.cpp
@@ -0,0 +1,128 @@
+//***************************************************************************
+//* Copyright (c) 2015-2016 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "getopt_pp/getopt_pp.h"
+#include "io/reads/io_helper.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "pipeline/graphio.hpp"
+#include "logger.hpp"
+#include "read_binning.hpp"
+#include "propagate.hpp"
+#include "visualization/position_filler.hpp"
+
+using namespace debruijn_graph;
+
+std::string add_suffix(const std::string& path, const std::string& suffix) {
+ auto ext = path::extension(path);
+ return path.substr(0, path.length() - ext.length()) + suffix + ext;
+}
+
+void DumpEdgesAndAnnotation(const Graph& g,
+ const EdgeAnnotation& edge_annotation,
+ const string& out_edges,
+ const string& out_annotation) {
+ INFO("Dumping edges to " << out_edges << "; their annotation to " << out_annotation);
+ io::osequencestream oss(out_edges);
+ AnnotationOutStream annotation_out(out_annotation);
+ for (auto it = g.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ io::SingleRead edge_read("NODE_" + ToString(g.int_id(e)),
+ g.EdgeNucls(e).str());
+ oss << edge_read;
+ auto relevant_bins = edge_annotation.Annotation(e);
+ if (!relevant_bins.empty()) {
+ annotation_out << ContigAnnotation(GetId(edge_read),
+ vector<bin_id>(relevant_bins.begin(), relevant_bins.end()));
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+ using namespace GetOpt;
+
+ //TmpFolderFixture fixture("tmp");
+ create_console_logger();
+
+ size_t k;
+ string saves_path, contigs_path, splits_path, annotation_path;
+ vector<string> sample_names, left_reads, right_reads;
+ string out_root, propagation_dump;
+ vector<bin_id> bins_of_interest;
+ bool no_binning;
+ try {
+ GetOpt_pp ops(argc, argv);
+ ops.exceptions_all();
+ ops >> Option('k', k)
+ >> Option('s', saves_path)
+ >> Option('c', contigs_path)
+ >> Option('f', splits_path)
+ >> Option('a', annotation_path)
+ >> Option('n', sample_names)
+ >> Option('l', left_reads)
+ >> Option('r', right_reads)
+ >> Option('o', out_root)
+ >> Option('d', propagation_dump, "")
+ >> Option('b', bins_of_interest, {})
+ >> OptionPresent('p', no_binning);
+ } catch(GetOptEx &ex) {
+ cout << "Usage: prop_binning -k <K> -s <saves path> -c <contigs path> -f <splits path> "
+ "-a <binning annotation> -n <sample names> -l <left reads> -r <right reads> -o <output root> "
+ "[-d <propagation info dump>] [-p to disable binning] [-b <bins of interest>*]" << endl;
+ exit(1);
+ }
+
+ for (const auto& bin_id : bins_of_interest) {
+ VERIFY_MSG(bin_id.find_last_of(',') == std::string::npos, "Specify bins of interest via space, not comma");
+ }
+
+ conj_graph_pack gp(k, "tmp", 1);
+ gp.kmer_mapper.Attach();
+
+ INFO("Load graph and clustered paired info from " << saves_path);
+ graphio::ScanWithClusteredIndices(saves_path, gp, gp.clustered_indices);
+
+ //Propagation stage
+ INFO("Using contigs from " << contigs_path);
+ io::FileReadStream contigs_stream(contigs_path);
+ io::FileReadStream split_stream(splits_path);
+
+ AnnotationStream annotation_in(annotation_path);
+
+ AnnotationFiller filler(gp, bins_of_interest);
+ EdgeAnnotation edge_annotation = filler(contigs_stream, split_stream, annotation_in);
+
+ INFO("Propagation launched");
+ AnnotationPropagator propagator(gp);
+ propagator.Run(contigs_stream, edge_annotation);
+ INFO("Propagation finished");
+
+ if (!propagation_dump.empty()) {
+ INFO("Dumping propagation info to " << propagation_dump);
+ DumpEdgesAndAnnotation(gp.g, edge_annotation,
+ propagation_dump + ".fasta",
+ propagation_dump + ".ann");
+ }
+
+ if (no_binning) {
+ INFO("Binning was disabled with -p flag");
+ return 0;
+ }
+ //Binning stage
+// contigs_stream.reset();
+// INFO("Using propagated annotation from " << propagated_path);
+// AnnotationStream binning_stream(propagated_path);
+ for (size_t i = 0; i < sample_names.size(); ++i) {
+ ContigBinner binner(gp, edge_annotation, out_root, sample_names[i]);
+ INFO("Initializing binner for " << sample_names[i]);
+ auto paired_stream = io::PairedEasyStream(left_reads[i], right_reads[i], false, 0);
+ INFO("Running binner on " << left_reads[i] << " and " << right_reads[i]);
+ binner.Run(*paired_stream);
+ binner.close();
+ }
+
+ return 0;
+}
diff --git a/src/projects/mts/propagate.cpp b/src/projects/mts/propagate.cpp
new file mode 100644
index 0000000..be650e8
--- /dev/null
+++ b/src/projects/mts/propagate.cpp
@@ -0,0 +1,331 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/simple_tools.hpp"
+
+//#include "pipeline/graphio.hpp"
+#include "pipeline/graph_pack.hpp"
+//#include "io/reads_io/file_reader.hpp"
+#include "modules/simplification/tip_clipper.hpp"
+#include "propagate.hpp"
+#include "visualization.hpp"
+
+namespace debruijn_graph {
+static const size_t EDGE_LENGTH_THRESHOLD = 2000;
+
+//FIXME 2kb edge length threshold might affect tip propagator in undesired way
+class EdgeAnnotationPropagator {
+ const conj_graph_pack& gp_;
+ const string name_;
+ size_t edge_length_threshold_;
+
+protected:
+ const conj_graph_pack& gp() const {
+ return gp_;
+ }
+
+ const Graph& g() const {
+ return gp_.g;
+ }
+
+ virtual set<EdgeId> PropagateEdges(const set<EdgeId>& edges) const = 0;
+
+public:
+ EdgeAnnotationPropagator(const conj_graph_pack& gp,
+ const string& name,
+ size_t edge_length_threshold = EDGE_LENGTH_THRESHOLD) :
+ gp_(gp),
+ name_(name),
+ edge_length_threshold_(edge_length_threshold) {}
+
+ const std::string& name() const {
+ return name_;
+ }
+
+ std::map<bin_id, set<EdgeId>> Propagate(EdgeAnnotation& edge_annotation) const {
+ std::map<bin_id, set<EdgeId>> answer;
+ DEBUG("Propagating with propagator: " << name_);
+ for (bin_id bin : edge_annotation.interesting_bins()) {
+ DEBUG("Processing bin " << bin << " with propagator: " << name_);
+ auto init_edges = edge_annotation.EdgesOfBin(bin, edge_length_threshold_);
+ DEBUG("Initial edge cnt " << init_edges.size() << " (edge length threshold " << edge_length_threshold_ << ")");
+ auto raw_propagated = PropagateEdges(init_edges);
+ set<EdgeId> propagated;
+ std::set_difference(raw_propagated.begin(), raw_propagated.end(),
+ init_edges.begin(), init_edges.end(),
+ std::inserter(propagated, propagated.end()));
+ answer[bin] = std::move(propagated);
+ }
+ DEBUG("Finished propagating with propagator: " << name_);
+ return answer;
+ }
+
+ virtual ~EdgeAnnotationPropagator() {}
+private:
+ DECL_LOGGER("EdgeAnnotationPropagator");
+};
+
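+//Propagates a bin along connecting paths: for every annotated edge, paths from
+//its end to the start vertices of the other annotated edges are searched
+//(bounded by path_length_threshold_ and path_edge_cnt_), and all edges lying
+//on such paths are added to the bin.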
+class ConnectingPathPropagator : public EdgeAnnotationPropagator {
+ size_t path_length_threshold_;
+ size_t path_edge_cnt_;
+ const EdgeAnnotation& debug_annotation_;
+
+ bin_id DetermineBin(const set<EdgeId>& edges) const {
+ map<bin_id, size_t> cnt_map;
+ for (EdgeId e : edges) {
+ for (auto b : debug_annotation_.Annotation(e)) {
+ cnt_map[b]++;
+ }
+ }
+ bin_id candidate = "";
+ for (auto cnt_el : cnt_map) {
+ if (cnt_el.second > edges.size() / 2) {
+ if (candidate.empty())
+ candidate = cnt_el.first;
+ else
+ return "";
+ }
+ }
+ return candidate;
+ }
+
+ bool BadPath(const vector<EdgeId>& path, bin_id base_bin) const {
+ size_t cnt = 0;
+ for (EdgeId e : path) {
+ if (g().length(e) < 2000)
+ continue;
+ auto ann = debug_annotation_.Annotation(e);
+ if (!ann.empty() &&
+ std::find(ann.begin(), ann.end(), base_bin) == ann.end()) {
+ cnt++;
+ }
+ }
+ return cnt > 0;
+ }
+
+ set<VertexId> CollectEdgeStarts(const set<EdgeId>& edges) const {
+ set<VertexId> answer;
+ for (EdgeId e : edges) {
+ answer.insert(g().EdgeStart(e));
+ }
+ return answer;
+ }
+
+ set<EdgeId> PropagateEdges(const set<EdgeId>& edges) const override {
+ //static size_t pic_cnt = 0;
+ bin_id bin = DetermineBin(edges);
+ if (!bin.empty()) {
+ DEBUG("Bin determined as " << bin);
+ } else {
+ DEBUG("Failed to determine bin");
+ }
+ set<EdgeId> answer;
+ set<VertexId> starts = CollectEdgeStarts(edges);
+ for (EdgeId e : edges) {
+ PathProcessor<Graph> path_searcher(g(), g().EdgeEnd(e), path_length_threshold_);
+ for (VertexId v : starts) {
+ auto callback = AdapterCallback<Graph>([&](const vector<EdgeId>& path) {
+ //if (pic_cnt < 10) {
+ //if (BadPath(path, bin)) {
+ // auto to_draw = path;
+ // to_draw.insert(to_draw.begin(), e);
+ // PrintAnnotatedAlongPath(gp(), to_draw, debug_annotation_, "/home/snurk/tmp/pics/pic_" + ToString(++pic_cnt) + "_");
+ //}
+ //}
+ insert_all(answer, path);
+ }, true);
+ TRACE("Launching path search between edge " << g().str(e) << " and vertex "
+ << g().str(v) << " with length bound " << path_length_threshold_);
+ path_searcher.Process(v, 0, path_length_threshold_, callback, path_edge_cnt_);
+ }
+ }
+ return answer;
+ }
+
+public:
+ ConnectingPathPropagator(const conj_graph_pack& gp,
+ size_t path_length_threshold,
+ size_t path_edge_cnt,
+ const EdgeAnnotation& ann) :
+ EdgeAnnotationPropagator(gp, "ConnectingPath"),
+ path_length_threshold_(path_length_threshold),
+ path_edge_cnt_(path_edge_cnt),
+ debug_annotation_(ann) {}
+
+private:
+ DECL_LOGGER("ConnectingPathPropagator");
+};
+
+//FIXME make threshold coverage-aware
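+//Adds every edge connected to an annotated edge by a clustered paired-info
+//point of weight at least weight_threshold_.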
+class PairedInfoPropagator : public EdgeAnnotationPropagator {
+ omnigraph::de::DEWeight weight_threshold_;
+ set<EdgeId> PropagateEdges(const set<EdgeId>& edges) const override {
+ set<EdgeId> answer;
+ for (EdgeId e1 : edges) {
+ DEBUG("Searching for paired neighbours of " << g().str(e1));
+ for (const auto& index : gp().clustered_indices)
+ for (auto i : index.Get(e1))
+ for (auto point : i.second)
+ if (math::ge(point.weight, weight_threshold_)) {
+ DEBUG("Adding (" << g().str(e1) << "," << g().str(i.first) << "); " << point);
+ answer.insert(i.first);
+ }
+ }
+ return answer;
+ }
+public:
+ PairedInfoPropagator(const conj_graph_pack& gp, omnigraph::de::DEWeight threshold):
+ EdgeAnnotationPropagator(gp, "PairedInfo"), weight_threshold_(threshold) {}
+private:
+ DECL_LOGGER("PairedInfoPropagator");
+};
+
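+//Maps the input contigs back to the graph; whenever a contig path shares an
+//edge with the bin, all edges of that contig path are added to the bin.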
+class ContigPropagator : public EdgeAnnotationPropagator {
+public:
+ ContigPropagator(const conj_graph_pack& gp,
+ io::SingleStream& contigs) :
+ EdgeAnnotationPropagator(gp, "ContigPropagator"),
+ contigs_(contigs),
+ mapper_(MapperInstance(gp))
+ {}
+protected:
+ set<EdgeId> PropagateEdges(const set<EdgeId>& edges) const override {
+ contigs_.reset();
+ set<EdgeId> answer;
+ io::SingleRead contig;
+ while (!contigs_.eof()) {
+ contigs_ >> contig;
+ auto edges_of_contig = mapper_->MapRead(contig).simple_path();
+ for (EdgeId e : edges_of_contig) {
+ if (edges.count(e)) {
+ DEBUG("Edge " << gp().g.str(e) << " belongs to the contig #" <<
+ contig.name() << " of " << edges_of_contig.size() << " edges");
+ insert_all(answer, edges_of_contig);
+ break;
+ }
+ }
+ }
+ return answer;
+ }
+
+private:
+ io::SingleStream& contigs_;
+ shared_ptr<SequenceMapper<Graph>> mapper_;
+
+ DECL_LOGGER("ContigPropagator");
+};
+
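+//For each vertex joining two edges of the same bin, adds incident edges that
+//satisfy the tip condition but are not yet annotated with the bin.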
+class TipPropagator : public EdgeAnnotationPropagator {
+
+public:
+ TipPropagator(const conj_graph_pack& gp) :
+ EdgeAnnotationPropagator(gp, "TipPropagator"), tipper_(gp.g) {}
+
+protected:
+ set<EdgeId> PropagateEdges(const set<EdgeId>& edges) const override {
+ set<EdgeId> answer;
+ for (EdgeId e1 : edges) {
+ auto v = g().EdgeEnd(e1);
+ auto neighbours = g().OutgoingEdges(v);
+ auto e2_it = std::find_if(neighbours.begin(), neighbours.end(), [&](EdgeId e2){return edges.count(e2);});
+ if (e2_it == neighbours.end()) {
+ TRACE(e1.int_id() << " has no neighbours from the same bin");
+ continue;
+ }
+ TRACE("Finding tips between " << e1.int_id() << " and " << e2_it->int_id());
+ for (EdgeId posTip : g().IncidentEdges(v)) {
+ if (edges.count(posTip))
+ continue;
+ TRACE("Checking " << posTip.int_id() << "...");
+ if (tipper_.Check(posTip)) {
+ TRACE("A tip is found!");
+ answer.insert(posTip);
+ }
+ }
+ }
+ return answer;
+ }
+
+private:
+ TipCondition<Graph> tipper_;
+ DECL_LOGGER("TipPropagator");
+};
+
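+//Diagnostic helper: counts long propagated edges that already carry an
+//annotation different from the target bin.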
+class AnnotationChecker {
+ const Graph& g_;
+ const EdgeAnnotation& edge_annotation_;
+ size_t edge_length_threshold_;
+public:
+ AnnotationChecker(const Graph& g,
+ const EdgeAnnotation& edge_annotation,
+ size_t edge_length_threshold = EDGE_LENGTH_THRESHOLD) :
+ g_(g),
+ edge_annotation_(edge_annotation),
+ edge_length_threshold_(edge_length_threshold) {
+ }
+
+ size_t Check(bin_id bin, const set<EdgeId>& propagated_edges) {
+ DEBUG("Checking edges to be annotated with " << bin);
+ size_t answer = 0;
+ for (EdgeId e : propagated_edges) {
+ if (g_.length(e) < edge_length_threshold_)
+ continue;
+ auto ann = edge_annotation_.Annotation(e);
+ for (auto b : ann) {
+ if (b != bin) {
+ DEBUG("Edge " << g_.str(e) << " already was annotated as " << b);
+ ++answer;
+ break;
+ }
+ }
+ }
+ return answer;
+ }
+
+private:
+ DECL_LOGGER("AnnotationChecker");
+};
+
+void AnnotationPropagator::Run(io::SingleStream& /*contigs*/,
+ EdgeAnnotation& edge_annotation
+ /*const string& annotation_out_fn*/) {
+ std::vector<std::shared_ptr<EdgeAnnotationPropagator>> propagator_pipeline {
+ std::make_shared<ConnectingPathPropagator>(gp_, 8000, 10, edge_annotation),
+ std::make_shared<TipPropagator>(gp_),
+ std::make_shared<PairedInfoPropagator>(gp_, 10.)};//,
+// std::make_shared<ContigPropagator>(gp_, contigs)};//,
+// std::make_shared<ConnectingPathPropagator>(gp_, 8000, 10, edge_annotation),
+// std::make_shared<ContigPropagator>(gp_, contigs),
+// std::make_shared<TipPropagator>(gp_)};
+
+ AnnotationChecker checker(gp_.g, edge_annotation);
+
+ for (const auto& bin_id : edge_annotation.interesting_bins()) {
+ size_t problem_cnt = checker.Check(bin_id, edge_annotation.EdgesOfBin(bin_id, EDGE_LENGTH_THRESHOLD));
+ DEBUG("Bin " << bin_id << " had " << problem_cnt << " problems");
+ }
+
+ for (auto prop_ptr : propagator_pipeline) {
+ DEBUG("Propagating with: " << prop_ptr->name());
+ auto propagation_map = prop_ptr->Propagate(edge_annotation);
+
+ DEBUG("Extending " << propagation_map.size() << " bins after propagation with: " << prop_ptr->name());
+ for (const auto& bin_prop : propagation_map) {
+ const auto& bin_id = bin_prop.first;
+ const auto& edges = bin_prop.second;
+ DEBUG("Extending bin " << bin_id << " with "
+ << edges.size() << " edges and their conjugates");
+ size_t problem_cnt = checker.Check(bin_id, edges);
+ DEBUG("Propagation of bin " << bin_id << " with " << prop_ptr->name()
+                      << " led to " << problem_cnt << " binning problems");
+ edge_annotation.StickAnnotation(edges, bin_id);
+ }
+ DEBUG("Applied bin extensions from propagator " << prop_ptr->name());
+ }
+}
+
+}
diff --git a/src/projects/mts/propagate.hpp b/src/projects/mts/propagate.hpp
new file mode 100644
index 0000000..1c3ce0f
--- /dev/null
+++ b/src/projects/mts/propagate.hpp
@@ -0,0 +1,29 @@
+//***************************************************************************
+//* Copyright (c) 2015-2016 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "io/reads/single_read.hpp"
+#include "io/reads/io_helper.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "annotation.hpp"
+
+namespace debruijn_graph {
+
+class AnnotationPropagator {
+ const conj_graph_pack& gp_;
+
+public:
+ AnnotationPropagator(const conj_graph_pack& gp) :
+ gp_(gp) {
+ }
+
+ void Run(io::SingleStream& contigs, EdgeAnnotation& edge_annotation);
+
+private:
+ DECL_LOGGER("AnnotationChecker");
+};
+
+}
diff --git a/src/projects/mts/read_binning.cpp b/src/projects/mts/read_binning.cpp
new file mode 100644
index 0000000..ac2dea2
--- /dev/null
+++ b/src/projects/mts/read_binning.cpp
@@ -0,0 +1,90 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "utils/simple_tools.hpp"
+#include "utils/logger/log_writers.hpp"
+
+#include "pipeline/graphio.hpp"
+#include "io/reads/file_reader.hpp"
+#include "read_binning.hpp"
+
+namespace debruijn_graph {
+
+set<bin_id> ContigBinner::RelevantBins(const io::SingleRead& r) const {
+ return edge_annotation_.RelevantBins(mapper_->MapRead(r).simple_path());
+}
+
+void ContigBinner::Init(bin_id bin) {
+ string out_dir = out_root_ + "/" + ToString(bin) + "/";
+ path::make_dirs(out_dir);
+ out_streams_.insert(make_pair(bin, make_shared<io::OPairedReadStream>(out_dir + sample_name_ + "_1.fastq",
+ out_dir + sample_name_ + "_2.fastq")));
+}
+
+void ContigBinner::Run(io::PairedStream& paired_reads) {
+ io::PairedRead paired_read;
+ while (!paired_reads.eof()) {
+ paired_reads >> paired_read;
+ set<bin_id> bins;
+ insert_all(bins, RelevantBins(paired_read.first()));
+ insert_all(bins, RelevantBins(paired_read.second()));
+ for (auto bin : bins) {
+ if (out_streams_.find(bin) == out_streams_.end()) {
+ Init(bin);
+ }
+ (*(out_streams_[bin])) << paired_read;
+ }
+ }
+}
+
+};
+
+//todo make it take dataset info
+/*
+int main(int argc, char** argv) {
+ using namespace debruijn_graph;
+
+ if (argc < 9) {
+ cout << "Usage: read_binning <K> <saves path> <contigs path> <contigs binning info> "
+ "<left reads> <right reads> <output root> <sample name> (<bins of interest>)*" << endl;
+ exit(1);
+ }
+
+ //TmpFolderFixture fixture("tmp");
+ create_console_logger();
+ size_t k = lexical_cast<size_t>(argv[1]);
+ string saves_path = argv[2];
+ string contigs_path = argv[3];
+ string contigs_binning_path = argv[4];
+ string left_reads = argv[5];
+ string right_reads = argv[6];
+ string out_root = argv[7];
+ string sample_name = argv[8];
+
+ std::vector<bin_id> bins_of_interest;
+ for (int i = 9; i < argc; ++i) {
+ bins_of_interest.push_back(argv[i]);
+ }
+
+ conj_graph_pack gp(k, "tmp", 0);
+ gp.kmer_mapper.Attach();
+ INFO("Load graph from " << saves_path);
+ graphio::ScanGraphPack(saves_path, gp);
+
+ ContigBinner binner(gp, bins_of_interest);
+
+ auto contigs_stream_ptr = make_shared<io::FileReadStream>(contigs_path);
+ AnnotationStream binning_stream(contigs_binning_path);
+
+ binner.Init(out_root, sample_name, *contigs_stream_ptr, binning_stream);
+
+ auto paired_stream = io::PairedEasyStream(left_reads, right_reads, false, 0);
+ binner.Run(*paired_stream);
+ binner.close();
+ return 0;
+}
+*/
diff --git a/src/projects/mts/read_binning.hpp b/src/projects/mts/read_binning.hpp
new file mode 100644
index 0000000..87aeadd
--- /dev/null
+++ b/src/projects/mts/read_binning.hpp
@@ -0,0 +1,92 @@
+//***************************************************************************
+//* Copyright (c) 2015-2016 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+#pragma once
+
+#include "annotation.hpp"
+#include "io/reads/io_helper.hpp"
+
+namespace io {
+
+class OSingleReadStream {
+ std::ofstream os_;
+
+public:
+ OSingleReadStream(const std::string& fn) :
+ os_(fn) {
+ }
+
+ OSingleReadStream& operator<<(const SingleRead& read) {
+ os_ << "@" << read.name() << std::endl;
+ os_ << read.GetSequenceString() << std::endl;
+ os_ << "+" << std::endl;
+ os_ << read.GetPhredQualityString() << std::endl;
+ return *this;
+ }
+
+ void close() {
+ os_.close();
+ }
+};
+
+class OPairedReadStream {
+ OSingleReadStream l_os_;
+ OSingleReadStream r_os_;
+
+public:
+ OPairedReadStream(const std::string& l_fn, const std::string& r_fn) :
+ l_os_(l_fn), r_os_(r_fn) {
+ }
+
+ OPairedReadStream& operator<<(const PairedRead& read) {
+ l_os_ << read.first();
+ r_os_ << read.second();
+ return *this;
+ }
+
+ void close() {
+ l_os_.close();
+ r_os_.close();
+ }
+};
+
+}
+
+namespace debruijn_graph {
+
+class ContigBinner {
+ const conj_graph_pack& gp_;
+ const EdgeAnnotation& edge_annotation_;
+ std::string out_root_;
+ std::string sample_name_;
+ shared_ptr<SequenceMapper<Graph>> mapper_;
+
+ map<bin_id, std::shared_ptr<io::OPairedReadStream>> out_streams_;
+
+ set<bin_id> RelevantBins(const io::SingleRead& r) const;
+
+ void Init(bin_id bin);
+
+public:
+ ContigBinner(const conj_graph_pack& gp,
+ const EdgeAnnotation& edge_annotation,
+ const std::string& out_root,
+ const std::string& sample_name) :
+ gp_(gp),
+ edge_annotation_(edge_annotation),
+ out_root_(out_root),
+ sample_name_(sample_name),
+ mapper_(MapperInstance(gp)) {
+ }
+
+ void Run(io::PairedStream& paired_reads);
+
+ void close() {
+ out_streams_.clear();
+ }
+};
+
+}
diff --git a/src/modules/empty.cpp b/src/projects/mts/scripts/__init__.py
similarity index 100%
rename from src/modules/empty.cpp
rename to src/projects/mts/scripts/__init__.py
diff --git a/src/projects/mts/scripts/calc_kmers_mpl.py b/src/projects/mts/scripts/calc_kmers_mpl.py
new file mode 100755
index 0000000..26382cf
--- /dev/null
+++ b/src/projects/mts/scripts/calc_kmers_mpl.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+import os
+import argparse
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="Kmers mpl filter")
+ parser.add_argument("-om", "--one-min", default=3, type=int, help="min kmer mpl in one sample")
+ parser.add_argument("-am", "--all-min", default=3, type=int, help="min kmer mpl in all samples")
+ parser.add_argument("-kl", "--kmer-len", default=31, type=int, help="kmer length")
+ parser.add_argument("samples_dir", help="directory with samples")
+ parser.add_argument("output", help="output files prefix")
+ args = parser.parse_args()
+ return args
+
+def calc_mpl(args):
+ if not os.path.exists(args.samples_dir):
+ os.makedirs(args.samples_dir)
+
+ files = [f for f in os.listdir(args.samples_dir) if os.path.isfile(os.path.join(args.samples_dir, f))]
+
+ cmd = "/home/toxa31/work/algorithmic-biology/assembler/src/kmer_count_filter/kmer_count_filter -kl {} -one-min {} -all-min {}".format(
+ args.kmer_len, args.one_min, args.all_min)
+
+ for f in files:
+ cmd = cmd + " " + args.samples_dir + "/" + f
+
+ cmd = cmd + " " + args.output
+
+ print(cmd)
+
+ os.system(cmd)
+
+def main():
+ args = parse_args()
+ calc_mpl(args)
+
+main()
\ No newline at end of file
diff --git a/src/projects/mts/scripts/canopy_launch.sh b/src/projects/mts/scripts/canopy_launch.sh
new file mode 100755
index 0000000..5f17acc
--- /dev/null
+++ b/src/projects/mts/scripts/canopy_launch.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+if [ "$#" -lt 3 ]; then
+ echo "Usage: script.sh <canopy.in> <canopy.out> <canopy.prof> [thread_cnt = 4]"
+ exit
+fi
+
+thread_cnt=4
+if [ "$#" -ge 4 ]; then
+ thread_cnt=$4
+fi
+
+/home/snurk/soft/mgs-canopy-algorithm/src/cc.bin -n $thread_cnt -i $1 -o $2 -c $3 #--max_canopy_dist 0.1 --max_close_dist 0.4 --max_merge_dist 0.05 --min_step_dist 0.01 --max_num_canopy_walks 3 --stop_fraction 1 --canopy_size_stats_file stat --filter_min_obs 1 --filter_max_dominant_obs 1.0
+
+#/home/snurk/soft/canopy/cc.bin -n 32 -i $1 -o bin_canopy -c prof_canopy --max_canopy_dist 0.1 --max_close_dist 0.4 --max_merge_dist 0.05 --min_step_dist 0.01 --max_num_canopy_walks 3 --stop_fraction 1 --canopy_size_stats_file stat --filter_min_obs 1 --filter_max_dominant_obs 1.0
+
+#/home/ygorshkov/Projects/canopy/cc.bin -n 32 -i canopy_mod.in -o bin_canopy -c prof_canopy --max_canopy_dist 0.1 --max_close_dist 0.4 --max_merge_dist 0.1 --min_step_dist 0.005 --max_num_canopy_walks 5 --stop_fraction 1 --canopy_size_stats_file stat
diff --git a/src/projects/mts/scripts/choose_samples.py b/src/projects/mts/scripts/choose_samples.py
new file mode 100755
index 0000000..cd58c54
--- /dev/null
+++ b/src/projects/mts/scripts/choose_samples.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+from __future__ import (print_function)
+
+import glob
+from operator import itemgetter
+from os import path
+import subprocess
+import sys
+
+if len(sys.argv) < 3:
+ print("Usage: choose_samples.py <canopy.prof> <binning dir> [CAGS+]")
+ exit(1)
+
+PROF = sys.argv[1]
+DIR = sys.argv[2]
+CAGS = None
+if len(sys.argv) > 3:
+ CAGS = set(sys.argv[3:])
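+#Sample selection heuristic: only samples with abundance >= MIN_ABUNDANCE (and
+#existing binned reads) are considered; the least abundant single sample reaching
+#DESIRED_ABUNDANCE is preferred, otherwise samples are added from the most
+#abundant down until DESIRED_ABUNDANCE is collected. CAGs whose collected
+#abundance stays below MIN_TOTAL_ABUNDANCE are skipped.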
+DESIRED_ABUNDANCE = 50
+MIN_ABUNDANCE = 4
+MIN_TOTAL_ABUNDANCE = 20
+
+#Assuming that samples are enumerated consecutively from 1 to N
+with open(PROF) as input:
+ for line in input:
+ params = line.split()
+ CAG = params[0]
+ if CAGS and CAG not in CAGS:
+ continue
+        profile = list(map(float, params[1:]))
+
+ print("Profile of", CAG, ":", profile)
+
+ weighted_profile = list((i, ab)
+ for i, ab in enumerate(profile) if ab >= MIN_ABUNDANCE and path.exists("{}/{}/sample{}_1.fastq".format(DIR, CAG, i + 1)))
+ weighted_profile.sort(key = itemgetter(1))
+
+ sum = 0
+ samples = []
+ #If we have overabundant samples, use the least.
+ try:
+ i = next(x for x, _ in weighted_profile if profile[x] >= DESIRED_ABUNDANCE)
+ sum = profile[i]
+ samples = [i + 1]
+ except StopIteration:
+ #If there isn't any, collect from samples, starting from the largest
+ for i, _ in reversed(weighted_profile):
+ sum += profile[i]
+ samples.append(i + 1)
+ if sum >= DESIRED_ABUNDANCE:
+ break
+
+ print("Chosen samples are", samples, "with total mean abundance", sum)
+ if sum < MIN_TOTAL_ABUNDANCE:
+ print(CAG, "is too scarce; skipping")
+ continue
+
+ for suf, name in [("1", "left"), ("2", "right")]:
+ reads = ["{}/{}/sample{}_{}.fastq".format(DIR, CAG, sample, suf) for sample in samples]
+ with open("{}/{}/{}.fastq".format(DIR, CAG, name), "w") as output:
+ subprocess.check_call(["cat"] + reads, stdout=output)
diff --git a/src/projects/mts/scripts/combine_contigs.py b/src/projects/mts/scripts/combine_contigs.py
new file mode 100755
index 0000000..16b448f
--- /dev/null
+++ b/src/projects/mts/scripts/combine_contigs.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+from __future__ import print_function
+import sys
+import os
+import re
+from Bio import SeqIO
+
+replace = False
+
+if sys.argv[1] == "-r":
+ replace = True
+ files = sys.argv[2:]
+else:
+ files = sys.argv[1:]
+
+sample_re = re.compile(r"sample\d+")
+
+output = sys.stdout
+
+for file in files:
+ sample = sample_re.search(file).group(0)
+ for seq in SeqIO.parse(file, "fasta"):
+ seq_id = seq.id
+ if replace:
+ seq_id = seq_id.replace(",", "~")
+ seq.id = sample + "-" + seq_id
+ seq.description = ""
+ SeqIO.write(seq, output, "fasta")
diff --git a/src/projects/mts/scripts/common.py b/src/projects/mts/scripts/common.py
new file mode 100644
index 0000000..4146665
--- /dev/null
+++ b/src/projects/mts/scripts/common.py
@@ -0,0 +1,121 @@
+from __future__ import print_function
+try:
+ from future_builtins import zip
+except ImportError:
+ pass
+
+import os
+import os.path
+try:
+ import yaml
+ def load_dict(input):
+ return yaml.load(input)
+ def dump_dict(dict, output):
+ yaml.dump(dict, output)
+except ImportError:
+ def load_dict(input):
+ def load_pairs():
+ for line in input:
+ params = line.split(":", 2)
+ yield (params[0].strip(), params[1].strip())
+ return dict(load_pairs())
+ def dump_dict(dict, output):
+ for k, v in dict.items():
+ print(k, ": ", v, sep="", file=output)
+
+FASTA_EXTS = {".fasta", ".fa", ".fna", ".fsa", ".fastq", ".fastq.gz", ".fq", ".fq.gz", ".fna.gz"}
+def gather_paths(path, basename=False):
+ for filename in os.listdir(path):
+ name = os.path.basename(filename)
+ for ext in FASTA_EXTS:
+ if not name.endswith(ext):
+ continue
+ filepath = os.path.join(path, filename)
+ if basename:
+ yield (name[0:-len(ext)], filepath)
+ else:
+ yield filepath
+
+def detect_reads(dir):
+ return sorted(list(gather_paths(dir)))[:2]
+
+#Autodetect references
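+#Accepts a directory, a single FASTA file, a @-prefixed description file, or a list of these,
+#and yields (reference name, path) pairs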
+def gather_refs(data):
+ if type(data) is list:
+ for path in data:
+ for ref in gather_refs(path):
+ yield ref
+ else:
+ if data.startswith("@"):
+ with open(data[1:]) as input:
+ for ref in load_dict(input).items():
+ yield ref
+ elif os.path.isdir(data):
+ for ref in gather_paths(data, True):
+ yield ref
+ else:
+ yield (os.path.splitext(os.path.basename(data))[0], data)
+
+def get_id(internal_id, sample):
+ res = internal_id.split("_", 2)[1]
+ return sample + "-" + res
+
+def load_annotation(file, normalize=True):
+ res = dict()
+ sample, _ = os.path.splitext(os.path.basename(file))
+ with open(file) as input:
+ for line in input:
+ info = line.split(" : ")
+ id = get_id(info[0], sample) if normalize else info[0]
+ bins = info[1].split()
+ if id in res:
+ res[id].update(bins)
+ else:
+ res[id] = set(bins)
+ return res
+
+class Row:
+ def __init__(self, data, colnames):
+ self.data = data
+ self.colnames = colnames
+
+ def __getitem__(self, index):
+ return self.data[self.colnames[index]]
+
+class Table:
+ def __init__(self):
+ self.data = []
+ self.colnames = None
+ self.rownames = None
+ self.rows = 0
+
+ @staticmethod
+ def read(filepath, sep="\t", headers=False):
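+        # With headers=True the first line holds the column names and the first field of
+        # every following row is the row name; otherwise cells are addressed by 0-based position.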
+ res = Table()
+ with open(filepath) as input:
+ for line in input:
+ params = line.strip("\n").split(sep)
+ if not res.colnames:
+ res.rownames = dict()
+ if headers:
+ res.colnames = dict(zip(params[1:], range(len(params))))
+ continue
+ else:
+ res.colnames = dict((i, i) for i in range(len(params)))
+ if headers:
+ res.rownames[params[0]] = res.rows
+ res.data.append(params[1:])
+ else:
+ res.rownames[res.rows] = res.rows
+ res.data.append(params)
+ res.rows += 1
+ return res
+
+ def __getitem__(self, index):
+ return Row(self.data[self.rownames[index]], self.colnames)
+
+ def zip_with(self, other, method):
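+        # Calls method(rowname, colname, own_value, other_value) for every cell,
+        # looking the same row and column up in the other table.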
+ for rowname, i in self.rownames.items():
+ for colname, j in self.colnames.items():
+ other_cell = other.data[other.rownames[rowname]][other.colnames[colname]]
+ method(rowname, colname, self.data[i][j], other_cell)
diff --git a/src/projects/mts/scripts/filter_nucmer.py b/src/projects/mts/scripts/filter_nucmer.py
new file mode 100755
index 0000000..eae66a1
--- /dev/null
+++ b/src/projects/mts/scripts/filter_nucmer.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+from __future__ import print_function
+
+import re
+import sys
+from os import path
+
+def print_usage():
+    print("For a sample assembly aligned to a reference, outputs only the contigs whose alignments cover more than <threshold> percent of their total length, along with that percentage.")
+    print("Usage: filter_nucmer.py <nucmer coords filtered> <output file> <length> <threshold>")
+    print("Parameters:")
+    print("<length> is the minimal contig length")
+    print("<threshold> is the minimal total alignment of a contig (0-100%)")
+
+if len(sys.argv) != 5:
+ print_usage()
+ sys.exit(1)
+
+nucmer_output_fn = sys.argv[1]
+output_fn = sys.argv[2]
+min_length = int(sys.argv[3])
+threshold = float(sys.argv[4])
+
+if not path.exists(nucmer_output_fn):
+ print("File {} doesn't exist".format(nucmer_output_fn))
+ sys.exit(2)
+
+with open(nucmer_output_fn, "r") as nucmer_output:
+ with open(output_fn, "w") as output:
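+        # Parses filtered nucmer coords lines: group 1 is the aligned length credited to
+        # the contig, groups 2-3 come from its NODE_<id>_length_<len> name.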
+        align_data = re.compile(r"\d+ \d+ \| \d+ \d+ \| \d+ (\d+) \| [\d.]+ \| [^ ]+ NODE_(\d+)_length_(\d+)")
+ contig = None
+ contig_len = 0
+ align_len = 0
+ def process_contig():
+ per = 100.0 * align_len / contig_len
+ if per > threshold and contig_len >= min_length:
+ print("{}\t{}\t{}".format(contig, contig_len, per), file=output)
+ return align_len
+ return 0
+ for line in nucmer_output:
+ res = align_data.search(line)
+ if res is None:
+ continue
+ new_contig = res.group(2)
+ if contig != new_contig:
+ if contig is not None:
+ process_contig()
+ contig = new_contig
+ contig_len = int(res.group(3))
+ align_len = 0
+ #Assuming that all alignments of the same contig are consequent
+ align_len += int(res.group(1))
+ #Print the last contig separately
+ process_contig()
diff --git a/src/projects/mts/scripts/gather_stats.py b/src/projects/mts/scripts/gather_stats.py
new file mode 100755
index 0000000..a65c1a5
--- /dev/null
+++ b/src/projects/mts/scripts/gather_stats.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+
+import pandas
+from pandas import DataFrame
+
+from math import isnan
+import os.path
+import sys
+
+quast_dir = sys.argv[1]
+
+res_table = DataFrame(columns=["bin", "ref", "GF", "purity", "NGA50", "misassemblies"])
+gf_table = pandas.read_table(os.path.join(quast_dir, "summary", "TSV", "Genome_fraction_(%).tsv"), dtype=str).set_index("Assemblies")
+gfs = gf_table.apply(pandas.to_numeric, errors="coerce")
+best_ref = gfs.apply(lambda col: col.idxmax())
+
+for bin, ref in best_ref.iteritems():
+ if type(ref) is float:
+ row = {"bin": bin, "GF": "-", "ref": "unknown", "purity": "-", "NGA50": "-", "misassemblies": "-"}
+ else:
+ all_stats = pandas.read_table(os.path.join(quast_dir, "runs_per_reference", ref, "report.tsv"), index_col=0)
+ col = all_stats.get(bin)
+ purity = 100 - float(col["Unaligned length"]) / float(col["Total length"]) * 100
+ row = {"bin": bin, "GF": col["Genome fraction (%)"], "ref": ref, "purity": "{0:.2f}".format(purity),
+ "NGA50": col["NGA50"], "misassemblies": col["# misassemblies"]}
+ res_table = res_table.append(row, ignore_index=True)
+
+res_table.to_csv(sys.stdout, index=False, sep="\t")
diff --git a/src/projects/mts/scripts/gen_samples.py b/src/projects/mts/scripts/gen_samples.py
new file mode 100755
index 0000000..f975b73
--- /dev/null
+++ b/src/projects/mts/scripts/gen_samples.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+from __future__ import print_function
+
+import argparse
+import os
+import os.path
+import random
+import shutil
+import subprocess
+import sys
+from common import gather_refs, dump_dict
+from scipy.stats import expon
+
+def gen_profile(args):
+    if args.distribution == "uni":
+        def rand():
+            return random.randint(0, args.scale)
+ elif args.distribution == "exp":
+ def rand():
+ return int(expon.rvs(scale=args.scale))
+
+ refs = dict(gather_refs(args.references))
+ if args.dump_desc:
+ with open(args.dump_desc, "w") as desc:
+ dump_dict(refs, desc)
+ for ref in refs:
+ print(ref, end=" ")
+ for _ in range(args.samples):
+ print(rand(), end=" ")
+ print()
+
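+#Each profile line is "<ref name> <abundance in sample 1> ... <abundance in sample N>",
+#as produced by gen_profile (the "prof" subcommand)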
+def gen_samples(args):
+ refs = dict(gather_refs(args.references.split(",")))
+ try:
+ os.mkdir(args.out_dir)
+ except OSError:
+ pass
+
+ read_len = args.read_length
+ adj_qual = "2" * read_len + "\n"
+
+ with open(args.profile) as input:
+ first_line = True
+ for line in input:
+ params = line.split()
+ ref_name = params[0]
+ ref_path = refs.get(ref_name)
+ if not ref_path:
+ print("Warning: no reference provided for", ref_name)
+ continue
+ for i, abundance in enumerate(map(int, params[1:]), start=1):
+ ref_len = os.stat(ref_path).st_size
+ reads = ref_len * abundance // read_len
+ print("Generating", reads, "reads for subsample", i, "of", ref_name)
+ sample_dir = os.path.join(args.out_dir, "sample" + str(i))
+ if first_line:
+ shutil.rmtree(sample_dir, ignore_errors=True)
+ subprocess.check_call(["mkdir", "-p", sample_dir])
+
+ temp_1 = sample_dir + ".tmp.r1.fastq"
+ temp_2 = sample_dir + ".tmp.r2.fastq"
+ subprocess.check_call(["wgsim", "-N", str(reads), "-r", "0", "-1", str(read_len), "-2", str(read_len), "-d", "300", "-s", "10", "-e", "{:.2f}".format(args.error_rate), "-S", str(i), ref_path, temp_1, temp_2], stdout=subprocess.DEVNULL)
+
+ print("Merging temporary files")
+ for temp, out in [(temp_1, os.path.join(sample_dir, "r1.fastq")), (temp_2, os.path.join(sample_dir, "r2.fastq"))]:
+ with open(temp) as input, open(out, "a") as output:
+ for line in input:
+ if line.startswith("IIIII"): #TODO: remove this hack
+ output.write(adj_qual)
+ else:
+ output.write(line)
+ os.remove(temp)
+ print()
+ first_line = False
+
+parser = argparse.ArgumentParser(description="Metagenomic Time Series Simulator")
+parser.add_argument("--references", "-r", type=str, help="Comma-separated list of references, or a directory with them, or a desc file with reference paths prepended with @", required=True)
+subparsers = parser.add_subparsers()
+
+gen_profile_args = subparsers.add_parser("prof", help="Generate a profile for the reference set")
+gen_profile_args.add_argument("--dump-desc", "-d", type=str, help="Dump description file with reference paths")
+gen_profile_args.add_argument("--samples", "-n", type=int, help="Sample count", default=1)
+gen_profile_args.add_argument("--scale", "-s", type=int, help="Distribution scale", default=20)
+gen_profile_args.add_argument("--distribution", "-t", choices=["uni", "exp"], help="Distribution type", default="uni")
+gen_profile_args.set_defaults(func=gen_profile)
+
+gen_samples_args = subparsers.add_parser("gen", help="Generate reads using a profile")
+gen_samples_args.add_argument("--out-dir", "-o", type=str, help="Output directory. Will be totally overwritten!")
+gen_samples_args.add_argument("--read-length", "-l", type=int, help="Read length", default=100)
+gen_samples_args.add_argument("--error-rate", "-e", type=float, help="Base error rate", default=0)
+gen_samples_args.add_argument("profile", type=str, help="File with reference profiles")
+gen_samples_args.set_defaults(func=gen_samples)
+
+args = parser.parse_args()
+args.func(args)
diff --git a/src/projects/mts/scripts/make_input.py b/src/projects/mts/scripts/make_input.py
new file mode 100755
index 0000000..ae6984c
--- /dev/null
+++ b/src/projects/mts/scripts/make_input.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+from __future__ import print_function
+try:
+ from itertools import izip as zip
+except ImportError:
+ pass
+
+import argparse
+import os
+import sys
+
+parser = argparse.ArgumentParser(description="Binner input formatter")
+parser.add_argument("--type", "-t", type=str, help="Binner type (canopy or concoct)", default="canopy")
+parser.add_argument("--output", "-o", type=str, help="Output file")
+parser.add_argument("--dir", "-d", type=str, help="Directory with profiles (pairs of .id .mpl files)")
+parser.add_argument("samples", type=str, nargs="+", help="Sample names")
+
+args = parser.parse_args()
+
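+# Canopy input is "contig_id mpl_1 mpl_2 ..." lines without a header;
+# CONCOCT input is a tab-separated table with a header and one coverage column per sample.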
+class CanopyFormatter:
+ def __init__(self):
+ pass
+
+ def header(self, file, samples):
+ pass
+
+ def profile(self, file, contig, profile):
+        print(contig, profile, file=file)
+
+class ConcoctFormatter:
+ def __init__(self):
+ pass
+
+ def header(self, file, samples):
+        print("\t".join(["contig"] + ["cov_mean_" + sample for sample in samples]), file=file)
+
+    def profile(self, file, contig, profile):
+        print(contig.replace(",", "~"), profile.replace(" ", "\t"), sep="\t", file=file)
+
+formatters = {"canopy": CanopyFormatter(), "concoct": ConcoctFormatter()}
+formatter = formatters[args.type]
+
+with open(args.output, "w") as out:
+ formatter.header(out, args.samples)
+ for sample in args.samples:
+ id_file = "{}/{}.id".format(args.dir, sample)
+ mpl_file = "{}/{}.mpl".format(args.dir, sample)
+
+ print("Processing abundances from %s" % id_file)
+
+ with open(id_file, "r") as ctg_id, open(mpl_file, "r") as ctg_mpl:
+ for cid, cmpl in zip(ctg_id, ctg_mpl):
+ formatter.profile(out, sample + "-" + cid.strip(), cmpl.strip())
diff --git a/src/projects/mts/scripts/make_points_matrix.py b/src/projects/mts/scripts/make_points_matrix.py
new file mode 100755
index 0000000..875462b
--- /dev/null
+++ b/src/projects/mts/scripts/make_points_matrix.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
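+# Randomly subsamples roughly 1/25 of the binned contigs and writes a CSV of
+# "<cluster id>,<profile values>..." rows for plotting.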
+import random
+
+ctg = open("canopy/contigs.in", "r")
+ctr = open("canopy/clusters.out", "r")
+
+out = open("canopy/points_matrix.csv", "w")
+
+ctg_to_ctr = dict()
+
+while True:
+ s = ctr.readline().strip()
+    if s == "":
+ break
+ a = s.split()
+ ctr_id = a[0][3:]
+
+    if random.randint(1, 25) == 1:
+ ctg_to_ctr[a[1]] = ctr_id
+
+while True:
+ s = ctg.readline().strip()
+ if s == "":
+ break
+
+ a = s.split()
+    if a[0] in ctg_to_ctr:
+ out.write(ctg_to_ctr[a[0]])
+ for x in a[1:]:
+ out.write("," + x)
+
+ out.write("\n")
+
+out.close()
\ No newline at end of file
diff --git a/src/projects/mts/scripts/parse_output.py b/src/projects/mts/scripts/parse_output.py
new file mode 100755
index 0000000..17c44bd
--- /dev/null
+++ b/src/projects/mts/scripts/parse_output.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+from __future__ import print_function
+
+import argparse
+import os.path
+
+argparser = argparse.ArgumentParser(description="Binner output formatter")
+argparser.add_argument("--type", "-t", type=str, help="Binner type (canopy or concoct)", default="canopy")
+argparser.add_argument("--output", "-o", type=str, help="Output directory with annotations")
+argparser.add_argument("input", type=str, help="File with binning info")
+
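+# A Parser accumulates {sample: {contig: [bin ids]}} from binner output lines;
+# the result is written below as one "<sample>.ann" file per sample.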
+class Parser:
+ def __init__(self):
+ self.samples_annotation = dict()
+
+ def add(self, line):
+ sample_contig, bin_id = self.parse(line)
+ sample_contig = sample_contig.split('-', 1)
+ sample = sample_contig[0]
+ contig = sample_contig[1]
+ if sample not in self.samples_annotation:
+ self.samples_annotation[sample] = dict()
+
+ annotation = self.samples_annotation[sample]
+ if contig not in annotation:
+ annotation[contig] = list()
+
+ annotation[contig].append(bin_id)
+
+class CanopyParser(Parser):
+ def parse(self, line):
+ annotation_str = line.split()
+ bin_id = annotation_str[0].strip()
+ sample_contig = annotation_str[1].strip()
+ return (sample_contig, bin_id)
+
+class ConcoctParser(Parser):
+ def parse(self, line):
+ annotation_str = line.split(",", 1)
+ bin_id = annotation_str[1].strip()
+ sample_contig = annotation_str[0].replace("~", ",")
+ return (sample_contig, bin_id)
+
+parsers = {"canopy": CanopyParser(), "concoct": ConcoctParser()}
+
+args = argparser.parse_args()
+parser = parsers[args.type]
+
+with open(args.input, "r") as input_file:
+ for line in input_file:
+ parser.add(line)
+
+for sample, annotation in parser.samples_annotation.items():
+ with open(os.path.join(args.output, sample + ".ann"), "w") as sample_out:
+
+ for contig in annotation:
+ print(contig, ":", " ".join(annotation[contig]), file=sample_out)
diff --git a/src/projects/mts/scripts/pca.R b/src/projects/mts/scripts/pca.R
new file mode 100644
index 0000000..1d41f86
--- /dev/null
+++ b/src/projects/mts/scripts/pca.R
@@ -0,0 +1,77 @@
+library(stringr)
+
+format_ids <- function(table) {
+ table$contig <- paste0(str_extract(table$contig, "\\w+\\d+-"), str_replace(str_extract(table$contig, "NODE_\\d+"), "NODE_", ""))
+ unique(table)
+}
+
+load_binning <- function(canopy_in, canopy_out) {
+ data <- read.table(canopy_in)
+ names(data) <- c('contig', sapply(seq(1, dim(data)[2]-1, 1),
+ function(x) {paste('mlt', x, sep='')}))
+ data <- format_ids(data)
+ binned <- read.table(canopy_out)
+ names(binned) <- c('clust', 'contig')
+ binned <- format_ids(binned)
+ merge(x=data, y=binned, by='contig')
+}
+
+load_clusters <- function(canopy_in, canopy_out, int_contigs) {
+ data <- load_binning(canopy_in, canopy_out)
+ if (missing(int_contigs)) {
+ pieces <- split(data, data$clust)[1:10]
+ lims <- lapply(pieces, function(x) head(x, 50))
+ do.call(rbind, c(lims, list(make.row.names=FALSE)))
+ } else {
+ interesting <- read.table(int_contigs)
+ names(interesting) <- c('contig', 'length', 'alignment', 'ref')
+ droplevels(merge(x=data, y=interesting, by='contig'))
+ }
+}
+
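+# PCA over the multiplicity (mlt*) columns only; cluster and contig columns are ignored.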
+do_prc <- function(clusters) {
+ prcomp(~ ., data = clusters[, grep('mlt', colnames(clusters))])
+}
+
+print_clusters <- function(pr, clust, image) {
+ if (!missing(image))
+ png(filename=image, width=1024, height=768)
+ lev <- levels(factor(clust))
+ cols <- 1:length(lev)
+ #layout(rbind(1,2), heights=c(7,1))
+ plot(pr$x, col = as.numeric(clust), xlim=c(-100, 200), ylim=c(-50,50))
+ a <- split(as.data.frame(pr$x), clust)
+ for (l in lev) {
+ x <- a[[l]]
+ text(median(x$PC1), median(x$PC2), l)
+ }
+ legend("center", "bottom", legend=lev, col=cols, pch=1)
+  if (!missing(image))
+    dev.off()
+}
+
+#For debugging
+local_data <- function() {
+ clusters <- load_clusters("/Volumes/Chihua-Sid/mts/out/sample9.in",
+ "/Volumes/Chihua-Sid/mts/out/sample9.out",
+ "/Volumes/Chihua-Sid/mts/out/70p_3.log")
+
+ prc_data <- do_prc(clusters)
+ print_clusters(prc_data, clusters$clust)
+ prc_data
+}
+
+args <- commandArgs(trailingOnly = TRUE)
+in_fn <- args[1]
+out_fn <- args[2]
+if (length(args) < 4) {
+ image_out <- args[3]
+ clusters <- load_clusters(in_fn, out_fn)
+} else {
+ cont_fn <- args[3]
+ image_out <- args[4]
+ clusters <- load_clusters(in_fn, out_fn, cont_fn)
+}
+
+print(clusters[1:10,])
+prc_data <- do_prc(clusters)
+print_clusters(prc_data, clusters$clust, image_out)
diff --git a/src/projects/mts/scripts/ref_stats.sh b/src/projects/mts/scripts/ref_stats.sh
new file mode 100755
index 0000000..59dcb8d
--- /dev/null
+++ b/src/projects/mts/scripts/ref_stats.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
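+# Runs MetaQUAST for all assemblies against the references, then collects per-reference
+# lists of aligned contigs longer than CTG_LENGTH_THR (IDs rewritten to <sample>-...)
+# into <out_dir>/ref_summary.txt.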
+if [ "$#" -lt 3 ]; then
+ echo "Usage: identify.sh <assemblies_folder> <refs_folder> <out_dir>"
+ exit 1
+fi
+
+CTG_LENGTH_THR=5000
+process_cnt=4
+thread_cnt=8
+assemblies_folder=$1
+refs_folder=$2
+#canopy_out=$3
+out_dir=$3
+
+folder=$out_dir/metaquast
+
+export LC_ALL=C
+mkdir -p $out_dir
+
+~/git/quast/metaquast.py --debug -R $refs_folder -o $out_dir/metaquast $assemblies_folder/*.fasta
+
+#awk ' {print $2,$1} ' $canopy_out | sort > $folder/clusters.txt
+
+rm -rf $out_dir/ref_summary.txt
+
+for ref in $refs_folder/*.fasta ; do
+ echo "Processing reference $ref"
+ ref_name=$(basename "$ref")
+ ref_name="${ref_name%.*}"
+
+ rm -rf $out_dir/${ref_name}.ctgs
+
+ #for sample in $assemblies_out_dir/sample9.fasta ; do
+ for sample in $assemblies_folder/*.fasta ; do
+ sample_name=$(basename "$sample")
+ sample_name="${sample_name%.*}"
+ aligned=$out_dir/metaquast/quast_corrected_input/${sample_name}_to_${ref_name}.fasta
+ ~/git/ngs_scripts/contig_length_filter.py $CTG_LENGTH_THR $aligned $out_dir/long.fasta.tmp
+ ~/git/ngs_scripts/contig_info.py $out_dir/long.fasta.tmp $out_dir/ctg.info.tmp
+ sed_command="s/ID_/${sample_name}-/g"
+ grep -Eo "ID_.*$" $out_dir/ctg.info.tmp | sed -e $sed_command >> $out_dir/${ref_name}.ctgs
+ rm $out_dir/long.fasta.tmp
+ rm $out_dir/ctg.info.tmp
+ done
+
+ sed 's/$/ '"${ref_name}"'/g' $out_dir/${ref_name}.ctgs >> $out_dir/ref_summary.txt
+
+ #sort $out_dir/${ref_name}.ctgs.tmp > $out_dir/${ref_name}.ctgs
+
+ #join $out_dir/${ref_name}.ctgs $out_dir/clusters.txt | awk ' { print $2 } ' | sort | uniq -c | sort -nr | head -10
+
+ #join $out_dir/${ref_name}.ctgs $out_dir/clusters.txt > $out_dir/join.txt
+ #awk ' { print $2 } ' $out_dir/join.txt | sort | uniq -c | sort -nr | head -10
+
+ report=$out_dir/metaquast/runs_per_reference/$ref_name/report.txt
+
+ grep "Assembly" $report
+ grep "Genome fraction" $report
+done
+
+#rm -rf $out_dir
+echo "Finished"
diff --git a/src/projects/mts/scripts/split_bins.py b/src/projects/mts/scripts/split_bins.py
new file mode 100755
index 0000000..dea8914
--- /dev/null
+++ b/src/projects/mts/scripts/split_bins.py
@@ -0,0 +1,30 @@
+#!/usr/bin/python
+from __future__ import print_function
+
+import os
+from os import path
+import sys
+from Bio import SeqIO
+import common
+import subprocess
+
+def print_usage():
+ print("Usage: split_bins.py <contigs> <binning info> <output directory>")
+
+if len(sys.argv) < 4:
+    print_usage()
+    sys.exit(1)
+
+contigs = sys.argv[1]
+sample, _ = path.splitext(path.basename(contigs))
+out_dir = sys.argv[3]
+
+binning = common.load_annotation(sys.argv[2], False)
+
+subprocess.call("rm -f {}/{}-*.fasta".format(out_dir, sample), shell=True)
+
+cags = set()
+for seq in SeqIO.parse(contigs, "fasta"):
+ seq_id = seq.id
+ seq.id = sample + "-" + seq_id
+ #seq.id = common.get_id(seq.id, sample)
+ seq.description = ""
+ for cag in binning.get(seq_id, []):
+ with open(path.join(out_dir, "{}-{}.fasta".format(sample, cag)), "a") as output:
+ SeqIO.write(seq, output, "fasta")
diff --git a/src/projects/mts/stats.cpp b/src/projects/mts/stats.cpp
new file mode 100644
index 0000000..603da47
--- /dev/null
+++ b/src/projects/mts/stats.cpp
@@ -0,0 +1,194 @@
+/*
+ * stats.cpp
+ *
+ * Created on: 3 Dec 2015
+ * Author: idmit
+ */
+
+#include "pipeline/graphio.hpp"
+#include "pipeline/graph_pack.hpp"
+#include "utils/simple_tools.hpp"
+#include "utils/path_helper.hpp"
+#include "utils/logger/log_writers.hpp"
+#include "math/xmath.h"
+#include <iostream>
+#include <vector>
+#include "io/reads/multifile_reader.hpp"
+#include "io/reads/splitting_wrapper.hpp"
+#include "io/reads/modifying_reader_wrapper.hpp"
+#include "io/reads/vector_reader.hpp"
+#include "io/reads/file_reader.hpp"
+#include "annotation.hpp"
+#include "visualization.hpp"
+#include "visualization/position_filler.hpp"
+#include "modules/simplification/tip_clipper.hpp"
+#include "getopt_pp/getopt_pp.h"
+
+using namespace debruijn_graph;
+
+io::SingleRead ReadSequence(io::SingleStream& reader) {
+ VERIFY(!reader.eof());
+ io::SingleRead read;
+ reader >> read;
+ return read;
+}
+
+io::SingleRead ReadGenome(const string& genome_path) {
+ path::CheckFileExistenceFATAL(genome_path);
+ auto genome_stream_ptr = std::make_shared<io::FileReadStream>(genome_path);
+ return ReadSequence(*genome_stream_ptr);
+}
+
+EdgeAnnotation LoadAnnotation(const conj_graph_pack& gp,
+ const vector<bin_id>& bins_of_interest,
+ io::SingleStream& contigs_stream,
+ io::SingleStream& splits_stream,
+ const string& annotation_path) {
+ AnnotationFiller filler(gp, bins_of_interest);
+ AnnotationStream annotation_stream(annotation_path);
+ return filler(contigs_stream, splits_stream, annotation_stream);
+}
+
+class BinnedInfo : public pair<size_t, size_t> {
+public:
+ BinnedInfo(): pair(0, 0) {}
+};
+
+void add_edge_info(BinnedInfo& info, size_t edge_length) {
+ ++info.first;
+ info.second += edge_length;
+}
+
+ostream& operator<<(ostream& str, const BinnedInfo& info) {
+ str << info.first << "\t" << info.second;
+ return str;
+}
+
+void create_console_logger() {
+ logging::logger *log = logging::create_logger("", logging::L_INFO);
+ log->add_writer(std::make_shared<logging::console_writer>());
+ logging::attach_logger(log);
+}
+
+int main(int argc, char** argv) {
+ create_console_logger();
+
+ using namespace GetOpt;
+
+ size_t k;
+ string saves_path, contigs_path, splits_path, edges_path;
+ vector<string> genomes_path;
+ string annotation_in_fn, prop_annotation_in_fn;
+ string table_fn, graph_dir;
+ vector<bin_id> bins_of_interest;
+
+ try {
+ GetOpt_pp ops(argc, argv);
+ ops.exceptions_all();
+ ops >> Option('k', k)
+ >> Option('s', saves_path)
+ >> Option('r', genomes_path)
+ >> Option('c', contigs_path)
+ >> Option('f', splits_path)
+ >> Option('a', annotation_in_fn)
+ >> Option('e', edges_path)
+ >> Option('p', prop_annotation_in_fn)
+ >> Option('o', table_fn)
+ //>> Option('d', graph_dir, "")
+ >> Option('b', bins_of_interest, {})
+ ;
+ } catch(GetOptEx &ex) {
+ cout << "Usage: stats -k <K> -s <saves path> -r <genomes path>+ "
+ "-f <splits_path> -c <contigs_path> -a <init binning info> -e <edges_path> -p <propagated binning info> "
+ "-o <stats table> [-d <graph directory> (currently disabled)] [-b (<bins of interest>)+]"
+ << endl;
+ exit(1);
+ }
+ //TmpFolderFixture fixture("tmp");
+
+ conj_graph_pack gp(k, "tmp", 0);
+ gp.kmer_mapper.Attach();
+ INFO("Load graph from " << saves_path);
+ graphio::ScanGraphPack(saves_path, gp);
+ gp.edge_pos.Attach();
+
+ ofstream output(table_fn);
+
+ output << "Reference\t"
+ << "Aligned edges\tAlignment length\t"
+ << "Binned edges\tBinned length\t"
+ << "Unbinned edges\tUnbinned length\t"
+ << "Pre-binned edges\tPre-binned length\t"
+ << "Propagated edges\tPropagated length" << endl;
+
+    for (const auto& genome_path : genomes_path) {
+ auto ref_name = path::basename(genome_path);
+ io::SingleRead genome = ReadGenome(genome_path);
+
+ visualization::position_filler::FillPos(gp, genome_path, "", true);
+
+ io::FileReadStream contigs_stream(contigs_path);
+ io::FileReadStream splits_stream(splits_path);
+ EdgeAnnotation edge_annotation = LoadAnnotation(
+ gp, bins_of_interest, contigs_stream,
+ splits_stream, annotation_in_fn);
+
+ io::FileReadStream edges_stream(edges_path);
+ io::FileReadStream edges_stream2(edges_path);
+ EdgeAnnotation prop_edge_annotation = LoadAnnotation(
+ gp, bins_of_interest,
+ edges_stream, edges_stream2,
+ prop_annotation_in_fn);
+
+ shared_ptr<SequenceMapper<Graph>> mapper(MapperInstance(gp));
+
+ BinnedInfo pre_binned_info, prop_binned_info, binned_info,
+ unbinned_info, total_info;
+
+ auto genome_graph_path = mapper->MapRead(genome);
+ std::set<EdgeId> unbinned_edges;
+
+ gp.EnsurePos();
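+        // Walk the reference path and classify each edge: pre-binned if the initial
+        // annotation covers it, propagated if only the propagated annotation does,
+        // and unbinned if neither (each unbinned edge is counted once).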
+ for (size_t i = 0; i < genome_graph_path.size(); ++i) {
+ EdgeId e = genome_graph_path[i].first;
+ auto range = genome_graph_path[i].second.mapped_range;
+ add_edge_info(total_info, gp.g.length(e));
+ if (edge_annotation.Annotation(e).empty()) {
+ if (prop_edge_annotation.Annotation(e).empty()) {
+                    // Only the propagated annotation needs to be checked here
+ if (unbinned_edges.count(e) == 0) {
+ unbinned_edges.insert(e);
+ add_edge_info(unbinned_info, range.size());
+ /*std::cout << e.int_id() << "\t"
+ << gp.g.length(e) << "\t"
+ << range.size() << std::endl;*/
+ if (!graph_dir.empty()) {
+ std::string dot_export_path =
+ graph_dir + "/" + ref_name + "/" + std::to_string(e.int_id()) + ".dot";
+ PrintColoredAnnotatedGraphAroundEdge(
+ gp, e, prop_edge_annotation, dot_export_path);
+ }
+ }
+ } else {
+ DEBUG(e.int_id() << " was propagated\n");
+ add_edge_info(prop_binned_info, gp.g.length(e));
+ add_edge_info(binned_info, gp.g.length(e));
+ }
+ } else {
+ add_edge_info(pre_binned_info, gp.g.length(e));
+ if (prop_edge_annotation.Annotation(e).empty()) {
+ WARN(e.int_id() << " was lost during propagation\n");
+ } else {
+ add_edge_info(binned_info, gp.g.length(e));
+ }
+ }
+ }
+
+ output << ref_name << "\t"
+ << total_info << "\t"
+ << binned_info << "\t"
+ << unbinned_info << "\t"
+ << pre_binned_info << "\t"
+ << prop_binned_info << endl;
+ }
+}
diff --git a/src/projects/mts/test.py b/src/projects/mts/test.py
new file mode 100755
index 0000000..8c0c19f
--- /dev/null
+++ b/src/projects/mts/test.py
@@ -0,0 +1,205 @@
+#!/usr/bin/python
+from __future__ import print_function
+
+import argparse
+import os
+import os.path
+import re
+import shutil
+import sys
+import subprocess
+from traceback import print_exc
+import yaml
+
+from scripts.common import Table
+
+#Log class, use it, not print
+class Log:
+ text = ""
+
+ def log(self, s):
+ self.text += s + "\n"
+ print(s)
+
+ def warn(self, s):
+ msg = "WARNING: " + s
+ self.text += msg + "\n"
+        sys.stdout.write(msg + "\n")
+ sys.stdout.flush()
+
+ def err(self, s):
+ msg = "ERROR: " + s + "\n"
+ self.text += msg
+ sys.stdout.write(msg)
+ sys.stdout.flush()
+
+ def print_log(self):
+ print(self.text)
+
+ def get_log(self):
+ return self.text
+
+log = Log()
+
+# Taken from teamcity.py
+# Compile SPAdes
+def compile_spades(args, dataset_info, working_dir):
+ if not args.cfg_compilation:
+        log.log("Forced to use current SPAdes build, will not compile SPAdes")
+ elif 'spades_compile' not in dataset_info.__dict__ or dataset_info.spades_compile:
+ comp_params = ' '
+ if 'compilation_params' in dataset_info.__dict__:
+ comp_params = " ".join(dataset_info.compilation_params)
+
+ bin_dir = 'build_spades'
+ if not os.path.exists(bin_dir):
+ os.makedirs(bin_dir)
+ os.chdir(bin_dir)
+
+ #Compilation
+ err_code = os.system('cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=' + working_dir + ' ' + os.path.join(working_dir, 'src') + comp_params)
+ err_code = err_code | os.system('make -j 16')
+ err_code = err_code | os.system('make install')
+
+ os.chdir(working_dir)
+
+ if err_code != 0:
+ # Compile from the beginning if failed
+ shutil.rmtree('bin', True)
+ shutil.rmtree('build_spades', True)
+ return os.system('./spades_compile.sh ' + comp_params)
+ return 0
+
+def compile_mts(workdir):
+ #if not args.cfg_compilation:
+ # log.log("Forced to use current build, will not compile");
+ # return 0
+ os.chdir(workdir)
+ ecode = subprocess.call("./prepare_cfg")
+ if ecode != 0:
+ return ecode
+ return subprocess.call(["make", "-C", "build/release/projects/mts"])
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--config", "-c", help="Config template")
+ parser.add_argument("dir", help="Output directory")
+ parser.add_argument("--saves", "-s", type=str)
+ parser.add_argument("--no-clean", action="store_true")
+ parser.add_argument("--etalons", "-e", type=str, help="Directory of GF etalons")
+ args = parser.parse_args()
+ return args
+
+def prepare_config(args, workdir):
+ with open(os.path.join(args.config)) as template:
+ params = yaml.load(template)
+ params["BIN"] = os.path.join(workdir, "build/release/bin")
+ params["SCRIPTS"] = os.path.join(workdir, "src/projects/mts/scripts")
+ with open(os.path.join(args.dir, "config.yaml"), "w") as config:
+ config.write(yaml.dump(params))
+
+def run_mts(args, workdir):
+ if not args.no_clean:
+ shutil.rmtree(args.dir, True)
+ if not os.path.exists(args.dir):
+ os.mkdir(args.dir)
+ prepare_config(args, workdir)
+ mts_args = ["./mts.py", "--stats", args.dir]
+ if args.saves:
+        log.log("Copying saves from " + args.saves)
+ for saves_dir in ["assembly", "reassembly"]:
+ full_dir = os.path.join(args.saves, saves_dir)
+ if os.path.isdir(full_dir):
+ #shutil.copytree(os.path.join(args.saves, saves_dir), os.path.join(args.dir, saves_dir))
+ os.symlink(full_dir, os.path.join(args.dir, saves_dir))
+ else:
+ log.warn("No " + saves_dir + " dir provided; skipping")
+ #Don't touch symlinked assemblies because it may corrupt other runs with the same dependencies
+ #mts_args.append("--reuse-assemblies")
+ os.chdir(os.path.join(workdir, "src/projects/mts"))
+ return subprocess.call(mts_args)
+
+def check_etalons(args, workdir):
+ class mut:
+ res = 0
+
+    re_num = re.compile(r"-?\d+(?:\.\d+)?")
+ def read_cell(str):
+ maybe_num = re_num.search(str)
+ if not maybe_num:
+ return 0
+ return float(maybe_num.group(0))
+
+ #TODO: more configurable? Ideally, to set custom threshold for each cell
+
+ #Margin values should stay close to margin, otherwise it's a false pos/neg
+ pos_threshold = 95
+ neg_threshold = 5
+ #For the rest ("floating" clusters we're unsure of), we allow broader +/- margin
+ threshold = 10
+
+ def compare_gf(ref, cag, val1, val2):
+ log.log("Comparing {} in {}: {} vs {}".format(cag, ref, val1, val2))
+ et_val = read_cell(val1)
+ est_val = read_cell(val2)
+ lower = pos_threshold if et_val > pos_threshold else max(0, et_val - threshold)
+ upper = neg_threshold if et_val < neg_threshold else min(100, et_val + threshold)
+ if est_val < lower:
+ log.err("GF of {} in {} = {}% is less than expected {:.2f}%".format(cag, ref, est_val, lower))
+ mut.res = 7
+ elif est_val > upper:
+ log.err("GF of {} in {} = {}% is higher than expected {:.2f}%".format(cag, ref, est_val, upper))
+ mut.res = 7
+
+ for file in os.listdir(args.etalons):
+ etalon = os.path.join(args.etalons, file)
+ estimated = os.path.join(args.dir, "stats", "summary", file)
+ log.log("Trying to compare " + etalon + " and " + estimated)
+ if not os.path.isfile(estimated):
+ log.warn("No table provided for " + file)
+ continue
+ try:
+ log.log("Loading " + etalon)
+ et_table = Table.read(etalon, headers=True)
+ log.log("Loading " + estimated)
+ est_table = Table.read(estimated, headers=True)
+ log.log("Comparing GF for " + file)
+ et_table.zip_with(est_table, compare_gf)
+ except:
+ log.err("Cannot load {}".format(file))
+ raise
+ return mut.res
+
+if __name__ == "__main__":
+ try:
+ sys.stderr = sys.stdout
+ args = parse_args()
+ workdir = os.getcwd()
+ ecode = 0
+
+ #compile
+ #if compile_spades(args, dataset_info, working_dir) != 0:
+ # log.err("SPAdes compilation finished abnormally with exit code " + str(ecode))
+ # sys.exit(3)
+
+ ecode = compile_mts(workdir)
+ if ecode != 0:
+ log.err("MTS compilation finished abnormally with exit code " + str(ecode))
+ sys.exit(3)
+
+ ecode = run_mts(args, workdir)
+ if ecode != 0:
+ log.err("Error while running MTS: " + str(ecode))
+
+ if args.etalons:
+ ecode = check_etalons(args, workdir)
+
+ sys.exit(ecode)
+
+ except SystemExit:
+ raise
+
+ except:
+        log.err("The following unexpected error occurred during the run:")
+ print_exc()
+ sys.exit(239)
diff --git a/src/projects/mts/visualization.hpp b/src/projects/mts/visualization.hpp
new file mode 100644
index 0000000..8ab87b5
--- /dev/null
+++ b/src/projects/mts/visualization.hpp
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "visualization/graph_colorer.hpp"
+#include "visualization/visualization_utils.hpp"
+
+namespace debruijn_graph {
+
+template <class Graph>
+class AnnotatedGraphColorer
+ : public visualization::graph_colorer::GraphColorer<Graph> {
+
+ EdgeAnnotation annotation_;
+ std::map<bin_id, std::string> color_map_;
+
+public:
+ AnnotatedGraphColorer(const EdgeAnnotation& annotation)
+ : annotation_(annotation) {
+ std::vector<std::string> preset_colors({"red", "blue", "yellow", "orange", "purple", "pink"});
+ VERIFY(annotation_.interesting_bins().size() <= preset_colors.size());
+ size_t i = 0;
+ for (const auto& b_id : annotation_.interesting_bins()) {
+ color_map_[b_id] = preset_colors[i];
+ ++i;
+ }
+ }
+
+ string GetValue(typename Graph::VertexId) const { return "black"; }
+
+ string GetValue(typename Graph::EdgeId edge) const {
+ if (annotation_.Annotation(edge).empty()) {
+ return "black";
+ }
+ vector<std::string> colors;
+ auto ann = annotation_.Annotation(edge);
+ std::ostringstream ss;
+ std::transform(ann.begin(), ann.end(), ostream_iterator<string>(ss, ":"), [&](bin_id b){
+ return get(color_map_, b);
+ });
+ return ss.str();
+ }
+
+};
+
+void PrintColoredAnnotatedGraphAroundEdge(const conj_graph_pack& gp,
+ const EdgeId& edge,
+ const EdgeAnnotation& annotation,
+ const string& output_filename) {
+ //std::cout << output_filename << std::endl;
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ auto colorer_ptr =
+ std::make_shared<AnnotatedGraphColorer<Graph>>(annotation);
+ GraphComponent<Graph> component = omnigraph::EdgeNeighborhood(gp.g, edge, 100, 10000);
+ visualization::visualization_utils::WriteComponent<Graph>(component, output_filename, colorer_ptr, labeler);
+}
+
+void PrintAnnotatedAlongPath(const conj_graph_pack& gp,
+ const vector<EdgeId>& path,
+ const EdgeAnnotation& annotation,
+ const string& output_prefix) {
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ auto colorer_ptr =
+ std::make_shared<AnnotatedGraphColorer<Graph>>(annotation);
+ visualization::visualization_utils::WriteComponentsAlongPath<Graph>(gp.g, path, output_prefix, colorer_ptr, labeler);
+}
+
+}
\ No newline at end of file
diff --git a/src/projects/online_vis/CMakeLists.txt b/src/projects/online_vis/CMakeLists.txt
index d020b82..f662879 100644
--- a/src/projects/online_vis/CMakeLists.txt
+++ b/src/projects/online_vis/CMakeLists.txt
@@ -24,11 +24,11 @@ include_directories(./drawing_commands)
include_directories(${CMAKE_SOURCE_DIR}/debruijn)
if (READLINE_FOUND AND CURSES_FOUND)
- target_link_libraries(online_vis spades_modules nlopt format ${COMMON_LIBRARIES} ${READLINE_LIBRARY} ${CURSES_NCURSES_LIBRARY})
+ target_link_libraries(online_vis common_modules nlopt format ${COMMON_LIBRARIES} ${READLINE_LIBRARY} ${CURSES_NCURSES_LIBRARY})
elseif (READLINE_FOUND)
- target_link_libraries(online_vis spades_modules nlopt format ${COMMON_LIBRARIES} ${READLINE_LIBRARY})
+ target_link_libraries(online_vis common_modules nlopt format ${COMMON_LIBRARIES} ${READLINE_LIBRARY})
else()
- target_link_libraries(online_vis spades_modules nlopt format ${COMMON_LIBRARIES})
+ target_link_libraries(online_vis common_modules nlopt format ${COMMON_LIBRARIES})
endif()
if (DEFINED static_build)
diff --git a/src/projects/online_vis/debruijn_environment.hpp b/src/projects/online_vis/debruijn_environment.hpp
index 0bd7e3a..9886b25 100644
--- a/src/projects/online_vis/debruijn_environment.hpp
+++ b/src/projects/online_vis/debruijn_environment.hpp
@@ -25,7 +25,7 @@ class DebruijnEnvironment : public Environment {
GraphElementFinder<Graph> element_finder_;
std::shared_ptr<MapperClass> mapper_;
FillerClass filler_;
- omnigraph::DefaultLabeler<Graph> labeler_;
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler_;
debruijn_graph::ReadPathFinder<Graph> path_finder_;
ColoringClass coloring_;
//CompositeLabeler<Graph> labeler_;
@@ -87,7 +87,7 @@ class DebruijnEnvironment : public Environment {
DEBUG("Colorer done");
Path<EdgeId> path1 = mapper_->MapSequence(gp_.genome.GetSequence()).path();
Path<EdgeId> path2 = mapper_->MapSequence(!gp_.genome.GetSequence()).path();
- coloring_ = omnigraph::visualization::DefaultColorer(gp_.g, path1, path2);
+ coloring_ = visualization::graph_colorer::DefaultColorer(gp_.g, path1, path2);
ResetPositions();
}
@@ -193,7 +193,7 @@ class DebruijnEnvironment : public Environment {
return filler_;
}
- omnigraph::GraphLabeler<Graph>& labeler() {
+ visualization::graph_labeler::GraphLabeler<Graph>& labeler() {
return labeler_;
}
diff --git a/src/projects/online_vis/drawing_commands/draw_contig_command.hpp b/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
index 37b90b9..0db8d64 100644
--- a/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_contig_command.hpp
@@ -10,7 +10,7 @@
#include "../environment.hpp"
#include "../command.hpp"
#include "../errors.hpp"
-#include "io/reads_io/wrapper_collection.hpp"
+#include "io/reads/wrapper_collection.hpp"
namespace online_visualization {
class DrawContigCommand : public DrawingCommand {
diff --git a/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp b/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
index c3b2011..9b8ef4f 100644
--- a/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_missasemblies.hpp
@@ -9,7 +9,7 @@
#include "../environment.hpp"
#include "../command.hpp"
#include "../errors.hpp"
-#include "io/reads_io/wrapper_collection.hpp"
+#include "io/reads/wrapper_collection.hpp"
namespace online_visualization {
class DrawMisassemblies : public DrawingCommand {
@@ -179,7 +179,7 @@ public:
string file = args[1];
- FillPos(curr_env.graph_pack(), file, "miss", true);
+ visualization::position_filler::FillPos(curr_env.graph_pack(), file, "miss", true);
cout << "All contigs are mapped" << endl;
diff --git a/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp b/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp
index 68ae311..d719cf8 100644
--- a/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_polymorphic_regions.hpp
@@ -10,7 +10,7 @@
#include "../environment.hpp"
#include "../command.hpp"
#include "../errors.hpp"
-#include "io/reads_io/wrapper_collection.hpp"
+#include "io/reads/wrapper_collection.hpp"
namespace online_visualization {
@@ -32,16 +32,15 @@ class DrawPolymorphicRegions : public DrawingCommand {
verticesToAdd.push_back(curr_env.graph().EdgeEnd(edge));
}
}
- GraphComponent<Graph> polymorphicComponent(curr_env.graph(), verticesToAdd.begin(), verticesToAdd.end());
- return polymorphicComponent;
+ return GraphComponent<Graph>::FromVertices(curr_env.graph(), verticesToAdd);
}
void DrawPicture(DebruijnEnvironment& curr_env, Sequence& genome) const {
size_t windowSize = 500;
for(size_t i = 0; i < genome.size() - windowSize - 1 - curr_env.k_value(); ++i)
{
- runtime_k::RtSeq firstKmer = genome.Subseq(i).start<runtime_k::RtSeq>(curr_env.k_value() + 1);
- runtime_k::RtSeq secondKmer = genome.Subseq(i + windowSize).start<runtime_k::RtSeq>(curr_env.k_value() + 1);
+ RtSeq firstKmer = genome.Subseq(i).start<RtSeq>(curr_env.k_value() + 1);
+ RtSeq secondKmer = genome.Subseq(i + windowSize).start<RtSeq>(curr_env.k_value() + 1);
firstKmer = curr_env.kmer_mapper().Substitute(firstKmer);
secondKmer = curr_env.kmer_mapper().Substitute(secondKmer);
pair<EdgeId, size_t> positionFirst = curr_env.index().get(firstKmer);
@@ -80,8 +79,12 @@ class DrawPolymorphicRegions : public DrawingCommand {
if(polymorphicRegion.e_size() > 5)
{
- visualization::WriteComponentSinksSources(polymorphicRegion, curr_env.folder() + "/" + ToString(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot", visualization::DefaultColorer(curr_env.graph()),
- curr_env.labeler());
+ using namespace visualization::visualization_utils;
+ WriteComponentSinksSources(polymorphicRegion,
+ curr_env.folder() + "/" +
+ ToString(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot",
+ visualization::graph_colorer::DefaultColorer(curr_env.graph()),
+ curr_env.labeler());
INFO("Component is written to " + curr_env.folder() + ToString(curr_env.graph().int_id(*polymorphicRegion.vertices().begin())) + ".dot");
}
diff --git a/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp b/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
index 23c69ed..2eb6ead 100644
--- a/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_poorly_assembled.hpp
@@ -10,9 +10,9 @@
#include "../environment.hpp"
#include "../command.hpp"
#include "../errors.hpp"
-#include "io/reads_io/wrapper_collection.hpp"
+#include "io/reads/wrapper_collection.hpp"
#include <boost/algorithm/string.hpp>
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
#include <boost/algorithm/string/predicate.hpp>
@@ -120,7 +120,7 @@ public:
class UnresolvedPrinter : public RepeatProcessor {
void DrawGap(DebruijnEnvironment& curr_env, const vector<EdgeId>& path, string filename, string /*label*/ = "") const {
- omnigraph::visualization::WriteComponentsAlongPath<Graph>(curr_env.graph(), path, filename, curr_env.coloring(), curr_env.labeler());
+ visualization::visualization_utils::WriteComponentsAlongPath<Graph>(curr_env.graph(), path, filename, curr_env.coloring(), curr_env.labeler());
LOG("The pictures is written to " << filename);
}
diff --git a/src/projects/online_vis/drawing_commands/draw_position_command.hpp b/src/projects/online_vis/drawing_commands/draw_position_command.hpp
index 51e792b..19bbe6e 100644
--- a/src/projects/online_vis/drawing_commands/draw_position_command.hpp
+++ b/src/projects/online_vis/drawing_commands/draw_position_command.hpp
@@ -17,7 +17,7 @@
namespace online_visualization {
class DrawPositionCommand : public DrawingCommand {
private:
- void DrawPicture(DebruijnEnvironment& curr_env, runtime_k::RtSeq kmer, string label = "") const {
+ void DrawPicture(DebruijnEnvironment& curr_env, RtSeq kmer, string label = "") const {
kmer = curr_env.kmer_mapper().Substitute(kmer);
if (!curr_env.index().contains(kmer)) {
cout << "No corresponding graph location " << endl;
@@ -72,7 +72,7 @@ namespace online_visualization {
}
if (CheckPositionBounds(position, genome.size(), curr_env.k_value())) {
- DrawPicture(curr_env, genome.Subseq(position).start<runtime_k::RtSeq>(curr_env.k_value() + 1), args[1]);
+ DrawPicture(curr_env, genome.Subseq(position).start<RtSeq>(curr_env.k_value() + 1), args[1]);
}
}
diff --git a/src/projects/online_vis/drawing_commands/drawing_command.hpp b/src/projects/online_vis/drawing_commands/drawing_command.hpp
index c825b7e..4fcba92 100644
--- a/src/projects/online_vis/drawing_commands/drawing_command.hpp
+++ b/src/projects/online_vis/drawing_commands/drawing_command.hpp
@@ -11,7 +11,7 @@
#include "../command.hpp"
#include "../errors.hpp"
#include "../argument_list.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
#include "io/reads/single_read.hpp"
namespace online_visualization {
@@ -28,7 +28,7 @@ protected:
//linkstream << curr_env.folder_ << "/" << curr_env.file_name_base_ << "_latest.dot";
//EdgePosGraphLabeler<Graph> labeler(curr_env.graph(), gp_.edge_pos);
omnigraph::GraphComponent<Graph> component = VertexNeighborhood(curr_env.graph(), vertex, curr_env.max_vertices_, curr_env.edge_length_bound_);
- omnigraph::visualization::WriteComponent<Graph>(component, file_name, curr_env.coloring_, curr_env.labeler());
+ visualization::visualization_utils::WriteComponent<Graph>(component, file_name, curr_env.coloring_, curr_env.labeler());
//WriteComponents <Graph> (curr_env.graph(), splitter, linkstream.str(), *DefaultColorer(curr_env.graph(), curr_env.coloring_), curr_env.labeler());
LOG("The picture is written to " << file_name);
@@ -42,7 +42,7 @@ protected:
string directory = namestream.str();
make_dir(directory);
namestream << label << "_";
- omnigraph::visualization::WriteComponentsAlongPath<Graph>(curr_env.graph(), path, namestream.str(), curr_env.coloring_, curr_env.labeler());
+ visualization::visualization_utils::WriteComponentsAlongPath<Graph>(curr_env.graph(), path, namestream.str(), curr_env.coloring_, curr_env.labeler());
LOG("The pictures is written to " << directory);
curr_env.picture_counter_++;
@@ -61,7 +61,7 @@ protected:
make_dir(namestream.str());
namestream << label;
make_dir(namestream.str());
- omnigraph::visualization::WriteSizeLimitedComponents<Graph>(curr_env.graph(), namestream.str(), omnigraph::ConnectedSplitter<Graph>(curr_env.graph()), curr_env.coloring_, curr_env.labeler(), min_size, max_size, 10000000);
+ visualization::visualization_utils::WriteSizeLimitedComponents<Graph>(curr_env.graph(), namestream.str(), omnigraph::ConnectedSplitter<Graph>(curr_env.graph()), curr_env.coloring_, curr_env.labeler(), min_size, max_size, 10000000);
LOG("The pictures is written to " << namestream.str());
curr_env.picture_counter_++;
}
diff --git a/src/projects/online_vis/drawing_commands/show_position_command.hpp b/src/projects/online_vis/drawing_commands/show_position_command.hpp
index eb9daa1..f957b39 100644
--- a/src/projects/online_vis/drawing_commands/show_position_command.hpp
+++ b/src/projects/online_vis/drawing_commands/show_position_command.hpp
@@ -17,7 +17,7 @@
namespace online_visualization {
class ShowPositionCommand : public DrawingCommand {
private:
- int ShowPicture(DebruijnEnvironment& curr_env, runtime_k::RtSeq kmer, string label = "") const {
+ int ShowPicture(DebruijnEnvironment& curr_env, RtSeq kmer, string label = "") const {
kmer = curr_env.kmer_mapper().Substitute(kmer);
if (!curr_env.index().contains(kmer)) {
FireNoCorrespondingGraphLocation(label);
@@ -70,7 +70,7 @@ namespace online_visualization {
genome = !genome;
}
if (CheckPositionBounds(position, genome.size(), curr_env.k_value())) {
- int result = ShowPicture(curr_env, genome.Subseq(position).start<runtime_k::RtSeq>(curr_env.k_value() + 1), args[1]);
+ int result = ShowPicture(curr_env, genome.Subseq(position).start<RtSeq>(curr_env.k_value() + 1), args[1]);
if (result)
FireGenericError("Something is wrong");
}
diff --git a/src/projects/online_vis/environment.hpp b/src/projects/online_vis/environment.hpp
index 8f6a05a..ff2eaaf 100644
--- a/src/projects/online_vis/environment.hpp
+++ b/src/projects/online_vis/environment.hpp
@@ -14,11 +14,11 @@
namespace online_visualization {
-typedef debruijn_graph::NewExtendedSequenceMapper<debruijn_graph::Graph, Index> MapperClass;
-typedef debruijn_graph::PosFiller<Graph> FillerClass;
+typedef debruijn_graph::BasicSequenceMapper<debruijn_graph::Graph, Index> MapperClass;
+typedef visualization::position_filler::PosFiller<Graph> FillerClass;
typedef debruijn_graph::KmerMapper<Graph> KmerMapperClass;
typedef omnigraph::GraphElementFinder<Graph> ElementFinder;
-typedef shared_ptr<omnigraph::visualization::GraphColorer<Graph>> ColoringClass;
+typedef shared_ptr<visualization::graph_colorer::GraphColorer<Graph>> ColoringClass;
class Environment : private boost::noncopyable {
protected:
diff --git a/src/projects/online_vis/main.cpp b/src/projects/online_vis/main.cpp
index 2a7d08a..7684637 100644
--- a/src/projects/online_vis/main.cpp
+++ b/src/projects/online_vis/main.cpp
@@ -9,15 +9,15 @@
#include "vis_logger.hpp"
#include "standard_vis.hpp"
-#include "dev_support/segfault_handler.hpp"
-#include "dev_support/stacktrace.hpp"
+#include "utils/segfault_handler.hpp"
+#include "utils/stacktrace.hpp"
#include "pipeline/config_struct.hpp"
-#include "io/reads_io/io_helper.hpp"
-#include "dev_support/simple_tools.hpp"
+#include "io/reads/io_helper.hpp"
+#include "utils/simple_tools.hpp"
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
-#include "dev_support/memory_limit.hpp"
+#include "utils/memory_limit.hpp"
#include "io/dataset_support/read_converter.hpp"
#include "debruijn_online_visualizer.hpp"
diff --git a/src/projects/online_vis/online_visualizer.hpp b/src/projects/online_vis/online_visualizer.hpp
index 551a9f3..2d6e337 100644
--- a/src/projects/online_vis/online_visualizer.hpp
+++ b/src/projects/online_vis/online_visualizer.hpp
@@ -13,7 +13,7 @@
#include "command.hpp"
#include "loaded_environments.hpp"
#include "environment.hpp"
-#include "dev_support/autocompletion.hpp"
+#include "utils/autocompletion.hpp"
//#include "all_commands.hpp"
#include "base_commands.hpp"
diff --git a/src/projects/online_vis/position_commands/fill_position_command.hpp b/src/projects/online_vis/position_commands/fill_position_command.hpp
index 604f926..28c3ea3 100644
--- a/src/projects/online_vis/position_commands/fill_position_command.hpp
+++ b/src/projects/online_vis/position_commands/fill_position_command.hpp
@@ -50,7 +50,7 @@ namespace online_visualization {
string name = args[1];
string file = args[2];
- FillPos(curr_env.graph_pack(), file, name, true);
+ visualization::position_filler::FillPos(curr_env.graph_pack(), file, name, true);
}
};
}
diff --git a/src/projects/online_vis/processing_commands.hpp b/src/projects/online_vis/processing_commands.hpp
index 6d1a620..a9ca0b4 100644
--- a/src/projects/online_vis/processing_commands.hpp
+++ b/src/projects/online_vis/processing_commands.hpp
@@ -43,7 +43,7 @@ private:
length = curr_env.edge_length_bound();
}
- pred::TypedPredicate<EdgeId> condition = LengthUpperBound<Graph>(curr_env.graph(), length);
+ func::TypedPredicate<EdgeId> condition = LengthUpperBound<Graph>(curr_env.graph(), length);
if (args.size() > 2 && (args[2] == "Y" || args[2] == "y")) {
cout << "Trying to activate genome quality condition" << endl;
if (curr_env.genome().size() == 0) {
@@ -60,7 +60,7 @@ private:
}
debruijn::simplification::SimplifInfoContainer info(debruijn_graph::config::pipeline_type::base);
info.set_chunk_cnt(10);
- debruijn::simplification::TipClipperInstance(curr_env.graph(), condition, info, (omnigraph::HandlerF<Graph>)nullptr)->Run();
+ debruijn::simplification::TipClipperInstance(curr_env.graph(), condition, info, (omnigraph::EdgeRemovalHandlerF<Graph>)nullptr)->Run();
}
};
}
diff --git a/src/projects/online_vis/standard_vis.hpp b/src/projects/online_vis/standard_vis.hpp
index 68fde86..a2626e1 100644
--- a/src/projects/online_vis/standard_vis.hpp
+++ b/src/projects/online_vis/standard_vis.hpp
@@ -8,7 +8,7 @@
#pragma once
#include "pipeline/graph_pack.hpp"
-#include "dev_support/standard_base.hpp"
+#include "utils/standard_base.hpp"
#include <readline/readline.h>
#include <readline/history.h>
diff --git a/src/projects/online_vis/statistics_commands/junction_sequence_command.hpp b/src/projects/online_vis/statistics_commands/junction_sequence_command.hpp
index ee1f5fd..331bc50 100644
--- a/src/projects/online_vis/statistics_commands/junction_sequence_command.hpp
+++ b/src/projects/online_vis/statistics_commands/junction_sequence_command.hpp
@@ -10,8 +10,8 @@
#include "../environment.hpp"
#include "../command.hpp"
#include "../errors.hpp"
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
-#include "assembly_graph/graph_alignment/sequence_mapper.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
#include "assembly_graph/paths/path_utils.hpp"
namespace online_visualization {
diff --git a/src/projects/online_vis/statistics_commands/print_contigs_stats.hpp b/src/projects/online_vis/statistics_commands/print_contigs_stats.hpp
index eaf3485..41ed613 100644
--- a/src/projects/online_vis/statistics_commands/print_contigs_stats.hpp
+++ b/src/projects/online_vis/statistics_commands/print_contigs_stats.hpp
@@ -10,7 +10,7 @@
#include "../environment.hpp"
#include "../command.hpp"
#include "../errors.hpp"
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
namespace online_visualization {
class PrintContigsStatsCommand : public LocalCommand<DebruijnEnvironment> {
diff --git a/src/projects/online_vis/vis_logger.hpp b/src/projects/online_vis/vis_logger.hpp
index 42bd6a7..a0c0dbe 100644
--- a/src/projects/online_vis/vis_logger.hpp
+++ b/src/projects/online_vis/vis_logger.hpp
@@ -5,7 +5,7 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/log_writers.hpp"
#undef INFO
#define INFO(message) \
@@ -16,13 +16,13 @@
#define LOG(message) \
{ \
- std::cout << message << endl; \
+ std::cout << message << std::endl; \
} \
//#define trace(message) LOG_MSG(logging::L_TRACE, message)
#define debug(print, message) \
{ \
if (print) { \
- std::cout << message << std::endl; \
+ std::cout << message << std::endl; \
} \
}
diff --git a/src/projects/scaffold_correction/CMakeLists.txt b/src/projects/scaffold_correction/CMakeLists.txt
index 45e47b8..3f0f591 100644
--- a/src/projects/scaffold_correction/CMakeLists.txt
+++ b/src/projects/scaffold_correction/CMakeLists.txt
@@ -9,7 +9,7 @@ project(moleculo CXX)
add_executable(scaffold_correction
main.cpp)
-target_link_libraries(scaffold_correction spades_modules ${COMMON_LIBRARIES})
+target_link_libraries(scaffold_correction common_modules ${COMMON_LIBRARIES})
if (SPADES_STATIC_BUILD)
set_target_properties(scaffold_correction PROPERTIES LINK_SEARCH_END_STATIC 1)
diff --git a/src/projects/scaffold_correction/main.cpp b/src/projects/scaffold_correction/main.cpp
index 9086e90..56eca8d 100644
--- a/src/projects/scaffold_correction/main.cpp
+++ b/src/projects/scaffold_correction/main.cpp
@@ -7,14 +7,13 @@
/*
* Assembler Main
*/
-#include "dev_support/logger/log_writers.hpp"
-
-#include "dev_support/segfault_handler.hpp"
-#include "dev_support/stacktrace.hpp"
-#include "dev_support/memory_limit.hpp"
-#include "dev_support/copy_file.hpp"
-#include "dev_support/perfcounter.hpp"
-#include "data_structures/sequence/runtime_k.hpp"
+#include "utils/logger/log_writers.hpp"
+
+#include "utils/segfault_handler.hpp"
+#include "utils/stacktrace.hpp"
+#include "utils/memory_limit.hpp"
+#include "utils/copy_file.hpp"
+#include "utils/perfcounter.hpp"
#include "scaffold_correction.hpp"
#include "pipeline/config_struct.hpp"
diff --git a/src/projects/scaffold_correction/scaffold_correction.hpp b/src/projects/scaffold_correction/scaffold_correction.hpp
index 0237e6b..7f056aa 100644
--- a/src/projects/scaffold_correction/scaffold_correction.hpp
+++ b/src/projects/scaffold_correction/scaffold_correction.hpp
@@ -5,16 +5,16 @@
//***************************************************************************
#pragma once
-#include "io/reads_io/osequencestream.hpp"
-#include "io/reads_io/file_reader.hpp"
+#include "io/reads/osequencestream.hpp"
+#include "io/reads/file_reader.hpp"
#include "pipeline/stage.hpp"
#include "pipeline/graph_pack.hpp"
#include "assembly_graph/paths/path_processor.hpp"
#include "stages/construction.hpp"
#include "pipeline/config_struct.hpp"
-#include "algorithms/dijkstra/dijkstra_algorithm.hpp"
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
+#include "assembly_graph/dijkstra/dijkstra_algorithm.hpp"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
namespace scaffold_correction {
typedef debruijn_graph::ConjugateDeBruijnGraph Graph;
@@ -229,14 +229,13 @@ namespace spades {
public:
typedef debruijn_graph::config::debruijn_config::scaffold_correction Config;
private:
- size_t k_;
std::string output_file_;
const Config &config_;
public:
- ScaffoldCorrectionStage(size_t k, string output_file,
+ ScaffoldCorrectionStage(string output_file,
const Config &config) :
AssemblyStage("ScaffoldCorrection", "scaffold_correction"),
- k_(k), output_file_(output_file), config_(config) {
+ output_file_(output_file), config_(config) {
}
vector<Sequence> CollectScaffoldParts(const io::SingleRead &scaffold) const {
@@ -324,7 +323,7 @@ namespace spades {
cfg::get().load_from,
cfg::get().output_saves});
manager.add(new debruijn_graph::Construction())
- .add(new ScaffoldCorrectionStage(cfg::get().K, cfg::get().output_dir + "corrected_scaffolds.fasta", *cfg::get().sc_cor));
+ .add(new ScaffoldCorrectionStage(cfg::get().output_dir + "corrected_scaffolds.fasta", *cfg::get().sc_cor));
INFO("Output directory: " << cfg::get().output_dir);
conj_gp.kmer_mapper.Attach();
manager.run(conj_gp, cfg::get().entry_point.c_str());
diff --git a/src/projects/spades/CMakeLists.txt b/src/projects/spades/CMakeLists.txt
index f245266..e8f4743 100644
--- a/src/projects/spades/CMakeLists.txt
+++ b/src/projects/spades/CMakeLists.txt
@@ -14,11 +14,13 @@ add_executable(spades main.cpp
second_phase_setup.cpp
distance_estimation.cpp
repeat_resolving.cpp
- pacbio_aligning.cpp
- chromosome_removal.cpp)
-
+ contig_output_stage.cpp
+ hybrid_aligning.cpp
+ chromosome_removal.cpp
+ ../mts/contig_abundance.cpp)
+
target_include_directories(spades PRIVATE ${EXT_DIR}/include/ConsensusCore)
-target_link_libraries(spades ConsensusCore spades_modules nlopt BamTools ssw ${COMMON_LIBRARIES})
+target_link_libraries(spades ConsensusCore common_modules nlopt BamTools ssw ${COMMON_LIBRARIES})
if (SPADES_STATIC_BUILD)
set_target_properties(spades PROPERTIES LINK_SEARCH_END_STATIC 1)
diff --git a/src/projects/spades/chromosome_removal.cpp b/src/projects/spades/chromosome_removal.cpp
index f2282d5..fdedc68 100644
--- a/src/projects/spades/chromosome_removal.cpp
+++ b/src/projects/spades/chromosome_removal.cpp
@@ -6,11 +6,13 @@
#include "assembly_graph/graph_support/contig_output.hpp"
#include "stages/simplification_pipeline/graph_simplification.hpp"
-#include "algorithms/simplification/ec_threshold_finder.hpp"
-#include "assembly_graph/graph_core/basic_graph_stats.hpp"
-
+#include "modules/simplification/ec_threshold_finder.hpp"
+#include "assembly_graph/core/basic_graph_stats.hpp"
#include "chromosome_removal.hpp"
+#include "math/xmath.h"
+
+
namespace debruijn_graph {
@@ -90,16 +92,8 @@ size_t ChromosomeRemoval::CalculateComponentSize(EdgeId e, Graph &g_) {
double ChromosomeRemoval::RemoveLongGenomicEdges(conj_graph_pack &gp, size_t long_edge_bound, double coverage_limits, double external_chromosome_coverage){
INFO("Removing of long chromosomal edges started");
- vector <pair<double, size_t> > coverages;
- size_t total_len = 0, short_len = 0, cur_len = 0;
- for (auto iter = gp.g.ConstEdgeBegin(); ! iter.IsEnd(); ++iter){
- if (gp.g.length(*iter) > cfg::get().pd->edge_length_for_median) {
- coverages.push_back(make_pair(gp.g.coverage(*iter), gp.g.length(*iter)));
- total_len += gp.g.length(*iter);
- } else {
- short_len += gp.g.length(*iter);
- }
- }
+ CoverageUniformityAnalyzer coverage_analyzer(gp.g, long_edge_bound);
+ size_t total_len = coverage_analyzer.TotalLongEdgeLength();
if (total_len == 0) {
if (external_chromosome_coverage < 1.0) {
WARN("plasmid detection failed, not enough long edges");
@@ -109,29 +103,17 @@ double ChromosomeRemoval::RemoveLongGenomicEdges(conj_graph_pack &gp, size_t lon
}
return 0;
}
- std::sort(coverages.begin(), coverages.end());
- size_t i = 0;
- while (cur_len < total_len/2 && i <coverages.size()) {
- cur_len += coverages[i].second;
- i++;
- }
double median_long_edge_coverage;
if (external_chromosome_coverage < 1.0) {
- median_long_edge_coverage = coverages[i-1].first;
- INFO ("genomic coverage is "<< median_long_edge_coverage << " calculated of length " << size_t (double(total_len) * 0.5));
- size_t outsiders_length = 0;
- for (size_t j = 0; j < coverages.size(); j++) {
- if ( coverages[j].first >= median_long_edge_coverage * (1 + coverage_limits) || coverages[j].first <= median_long_edge_coverage * (1 - coverage_limits)) {
- outsiders_length += coverages[j].second;
- }
- }
- if (outsiders_length * 5 > total_len) {
- WARN ("More than 20% of long edges have coverage significantly different from median (total " << size_t (double(outsiders_length) * 0.5) <<" of "<< size_t (double(total_len) * 0.5) << " bases).");
+ median_long_edge_coverage = coverage_analyzer.CountMedianCoverage();
+ double fraction = coverage_analyzer.UniformityFraction(coverage_limits, median_long_edge_coverage);
+ if (math::gr(0.8, fraction)) {
+ WARN ("More than 20% of long edges have coverage significantly different from median (total " << size_t ((1-fraction) * 0.5 * double(total_len)) <<" of "<< size_t (double(total_len) * 0.5) << " bases).");
WARN ("In most cases it means that either read coverage is uneven or significant contamination is present - both of these two cases make plasmidSPAdes' results unreliable");
WARN ("However, that situation may still be OK if you expect to see large plasmids in your dataset, so plasmidSPAdes will continue to work");
} else {
- INFO(size_t(double(outsiders_length)/ double(total_len) * 100) << "% of bases from long edges have coverage significantly different from median");
+ INFO(size_t((1 - fraction) * 100) << "% of bases from long edges have coverage significantly different from median");
}
for (auto iter = gp.g.ConstEdgeBegin(); ! iter.IsEnd(); ++iter) {
if (long_component_.find(*iter) == long_component_.end()) {
@@ -167,8 +149,7 @@ void ChromosomeRemoval::PlasmidSimplify(conj_graph_pack &gp, size_t long_edge_bo
DEBUG("Simplifying graph for plasmid project");
size_t iteration_count = 10;
for (size_t i = 0; i < iteration_count; i++) {
- //pred::TypedPredicate<typename Graph::EdgeId> condition = make_shared<LengthUpperBound<Graph>>(gp.g, long_edge_bound) ;
- omnigraph::EdgeRemovingAlgorithm<Graph> tc(gp.g, pred::And(DeadEndCondition<Graph>(gp.g), LengthUpperBound<Graph>(gp.g, long_edge_bound)),
+ omnigraph::EdgeRemovingAlgorithm<Graph> tc(gp.g, func::And(DeadEndCondition<Graph>(gp.g), LengthUpperBound<Graph>(gp.g, long_edge_bound)),
removal_handler, true);
tc.Run();
}
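
The hunks above replace the inline median/outlier computation in RemoveLongGenomicEdges with the new CoverageUniformityAnalyzer. Reconstructed from the removed code, and purely as an illustration (this sketch is not part of the commit, and the analyzer's actual interface may differ), the helper appears to compute a length-weighted median coverage over long edges and the fraction of long-edge length whose coverage stays within median * (1 ± coverage_limits):

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // (coverage, length) pairs for edges longer than the configured bound.
    using LongEdges = std::vector<std::pair<double, std::size_t>>;

    // Length-weighted median coverage, mirroring the removed loop above.
    double MedianLongEdgeCoverage(LongEdges edges) {
        std::sort(edges.begin(), edges.end());          // sorts by coverage
        std::size_t total_len = 0;
        for (const auto &e : edges) total_len += e.second;
        std::size_t cur_len = 0;
        for (const auto &e : edges) {
            cur_len += e.second;
            if (cur_len >= total_len / 2) return e.first;
        }
        return 0.0;
    }

    // Fraction of long-edge length whose coverage lies strictly between
    // median * (1 - limits) and median * (1 + limits).
    double UniformityFraction(const LongEdges &edges, double limits, double median) {
        std::size_t total_len = 0, uniform_len = 0;
        for (const auto &e : edges) {
            total_len += e.second;
            if (e.first > median * (1 - limits) && e.first < median * (1 + limits))
                uniform_len += e.second;
        }
        return total_len ? double(uniform_len) / double(total_len) : 0.0;
    }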
diff --git a/src/projects/spades/chromosome_removal.hpp b/src/projects/spades/chromosome_removal.hpp
index f5e2cf9..77eb078 100644
--- a/src/projects/spades/chromosome_removal.hpp
+++ b/src/projects/spades/chromosome_removal.hpp
@@ -7,7 +7,8 @@
#pragma once
#include "pipeline/stage.hpp"
-#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/core/graph.hpp"
+#include "assembly_graph/graph_support/coverage_uniformity_analyzer.hpp"
namespace debruijn_graph {
diff --git a/src/projects/spades/contig_output_stage.cpp b/src/projects/spades/contig_output_stage.cpp
new file mode 100644
index 0000000..15f71be
--- /dev/null
+++ b/src/projects/spades/contig_output_stage.cpp
@@ -0,0 +1,55 @@
+//***************************************************************************
+//* Copyright (c) 2015-2017 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "contig_output_stage.hpp"
+#include "assembly_graph/paths/bidirectional_path_io/bidirectional_path_output.hpp"
+
+namespace debruijn_graph {
+
+void ContigOutput::run(conj_graph_pack &gp, const char*) {
+ auto output_dir = cfg::get().output_dir;
+
+ OutputContigs(gp.g, output_dir + "before_rr", false);
+ OutputContigsToFASTG(gp.g, output_dir + "assembly_graph", gp.components);
+
+ if (output_paths_ && gp.contig_paths.size() != 0) {
+ DefaultContigCorrector<ConjugateDeBruijnGraph> corrector(gp.g);
+ DefaultContigConstructor<ConjugateDeBruijnGraph> constructor(gp.g, corrector);
+
+ auto name_generator = path_extend::MakeContigNameGenerator(cfg::get().mode, gp);
+ path_extend::ContigWriter writer(gp.g, constructor, gp.components, name_generator);
+
+ bool output_broken_scaffolds = cfg::get().pe_params.param_set.scaffolder_options.enabled &&
+ cfg::get().use_scaffolder &&
+ cfg::get().co.obs_mode != config::output_broken_scaffolds::none;
+
+ if (output_broken_scaffolds) {
+ int min_gap = 0;
+ if (cfg::get().co.obs_mode == config::output_broken_scaffolds::break_all) {
+ min_gap = 1;
+ } else if (cfg::get().co.obs_mode == config::output_broken_scaffolds::break_gaps) {
+ min_gap = int(gp.g.k());
+ } else {
+ WARN("Unsupported contig output mode");
+ }
+
+ path_extend::ScaffoldBreaker breaker(min_gap);
+ path_extend::PathContainer broken_scaffolds;
+ breaker.Break(gp.contig_paths, broken_scaffolds);
+ writer.OutputPaths(broken_scaffolds, output_dir + cfg::get().co.contigs_name);
+ }
+
+ writer.OutputPaths(gp.contig_paths, output_dir + cfg::get().co.scaffolds_name);
+
+ OutputContigsToGFA(gp.g, gp.contig_paths, cfg::get().output_dir + "assembly_graph");
+ } else {
+ OutputContigs(gp.g, output_dir + "simplified_contigs", cfg::get().use_unipaths);
+ OutputContigs(gp.g, output_dir + cfg::get().co.contigs_name, false);
+ }
+}
+
+}
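
The broken-scaffolds branch above chooses min_gap = 1 for break_all and min_gap = k for break_gaps before handing the paths to ScaffoldBreaker. As an analogy only (the real ScaffoldBreaker splits paths in the assembly graph by their estimated gap lengths, not sequence strings), a standalone sketch of the intended effect, with runs of 'N' standing in for gaps:

    #include <cstddef>
    #include <string>
    #include <vector>

    // Split a scaffold wherever a run of 'N' of length >= min_gap occurs;
    // min_gap = 1 breaks at every gap, larger thresholds keep short gaps.
    // Assumes min_gap >= 1 (the 'none' mode never reaches the breaker above).
    std::vector<std::string> BreakScaffold(const std::string &scaffold, std::size_t min_gap) {
        std::vector<std::string> contigs;
        std::string current;
        std::size_t n_run = 0;
        for (char c : scaffold) {
            if (c == 'N') {
                ++n_run;
                continue;
            }
            if (n_run >= min_gap) {
                if (!current.empty()) {
                    contigs.push_back(current);     // close the contig at a long gap
                    current.clear();
                }
            } else if (n_run > 0) {
                current.append(n_run, 'N');         // keep short gaps inside the contig
            }
            n_run = 0;
            current.push_back(c);
        }
        if (!current.empty())
            contigs.push_back(current);
        return contigs;
    }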
diff --git a/src/projects/spades/contig_output_stage.hpp b/src/projects/spades/contig_output_stage.hpp
new file mode 100644
index 0000000..753a0b6
--- /dev/null
+++ b/src/projects/spades/contig_output_stage.hpp
@@ -0,0 +1,29 @@
+//***************************************************************************
+//* Copyright (c) 2015-2017 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+
+class ContigOutput : public spades::AssemblyStage {
+private:
+ bool output_paths_;
+public:
+ ContigOutput(bool output_paths = true)
+ : AssemblyStage("Contig Output", "contig_output"), output_paths_(output_paths) { }
+
+ void load(conj_graph_pack &, const std::string &, const char *) { }
+
+ void save(const conj_graph_pack &, const std::string &, const char *) const { }
+
+ void run(conj_graph_pack &gp, const char *);
+};
+
+}
\ No newline at end of file
diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp
index ed6ebf2..1950e85 100644
--- a/src/projects/spades/distance_estimation.cpp
+++ b/src/projects/spades/distance_estimation.cpp
@@ -41,8 +41,7 @@ void estimate_with_estimator(const Graph &graph,
cfg::get().amb_de.relative_length_threshold,
cfg::get().amb_de.relative_seq_threshold);
PairInfoFilter<Graph>(amb_de_checker).Filter(clustered_index);
- }
- else
+ } else
PairInfoFilter<Graph>(checker).Filter(clustered_index);
// filter.Filter(clustered_index);
DEBUG("Info Filtered");
@@ -126,8 +125,7 @@ void estimate_distance(conj_graph_pack& gp,
} else
weight_function = UnityFunction;
-// PairInfoWeightFilter<Graph> filter(gp.g, config.de.filter_threshold);
- PairInfoWeightChecker<Graph> checker(gp.g, config.de.filter_threshold);
+ PairInfoWeightChecker<Graph> checker(gp.g, config.de.clustered_filter_threshold);
INFO("Weight Filter Done");
@@ -228,7 +226,7 @@ void DistanceEstimation::run(conj_graph_pack &gp, const char*) {
}
if (!cfg::get().preserve_raw_paired_index) {
INFO("Clearing raw paired index");
- gp.paired_indices[i].Clear();
+ gp.paired_indices[i].clear();
}
}
}
diff --git a/src/projects/spades/gap_closer.cpp b/src/projects/spades/gap_closer.cpp
index e311945..0445846 100644
--- a/src/projects/spades/gap_closer.cpp
+++ b/src/projects/spades/gap_closer.cpp
@@ -7,19 +7,16 @@
#include "gap_closer.hpp"
#include "assembly_graph/stats/picture_dump.hpp"
-#include "algorithms/simplification/compressor.hpp"
+#include "modules/simplification/compressor.hpp"
#include "io/dataset_support/read_converter.hpp"
#include <stack>
namespace debruijn_graph {
-template<class Graph, class SequenceMapper>
class GapCloserPairedIndexFiller {
private:
- typedef typename Graph::EdgeId EdgeId;
- typedef typename Graph::VertexId VertexId;
const Graph &graph_;
- const SequenceMapper &mapper_;
+ const SequenceMapper<Graph> &mapper_;
size_t CorrectLength(Path<EdgeId> path, size_t idx) const {
size_t answer = graph_.length(path[idx]);
@@ -143,13 +140,13 @@ private:
INFO("Merging paired indices");
for (auto &index: buffer_pi) {
paired_index.Merge(index);
- index.Clear();
+ index.clear();
}
}
public:
- GapCloserPairedIndexFiller(const Graph &graph, const SequenceMapper &mapper)
+ GapCloserPairedIndexFiller(const Graph &graph, const SequenceMapper<Graph> &mapper)
: graph_(graph), mapper_(mapper) { }
/**
@@ -167,11 +164,7 @@ public:
};
-template<class Graph, class SequenceMapper>
class GapCloser {
-public:
- typedef std::function<bool(const Sequence &)> SequenceCheckF;
-private:
typedef typename Graph::EdgeId EdgeId;
typedef typename Graph::VertexId VertexId;
@@ -183,22 +176,6 @@ private:
const int init_gap_val_;
const omnigraph::de::DEWeight weight_threshold_;
- SequenceMapper mapper_;
- std::unordered_set<runtime_k::RtSeq> new_kmers_;
-
- bool CheckNoKmerClash(const Sequence &s) {
- runtime_k::RtSeq kmer(k_ + 1, s);
- kmer >>= 'A';
- for (size_t i = k_; i < s.size(); ++i) {
- kmer <<= s[i];
- if (new_kmers_.count(kmer)) {
- return false;
- }
- }
- std::vector<EdgeId> path = mapper_.MapSequence(s).simple_path();
- return path.empty();
- }
-
std::vector<size_t> DiffPos(const Sequence &s1, const Sequence &s2) const {
VERIFY(s1.size() == s2.size());
std::vector<size_t> answer;
@@ -259,67 +236,40 @@ private:
: long_seq.Subseq(long_seq.size() - short_seq.size()) == short_seq;
}
- void AddEdge(VertexId start, VertexId end, const Sequence &s) {
- runtime_k::RtSeq kmer(k_ + 1, s);
- kmer >>= 'A';
- for (size_t i = k_; i < s.size(); ++i) {
- kmer <<= s[i];
- new_kmers_.insert(kmer);
- new_kmers_.insert(!kmer);
- }
- g_.AddEdge(start, end, s);
- }
-
- bool CorrectLeft(EdgeId first, EdgeId second, int overlap, const vector<size_t> &diff_pos) {
+ void CorrectLeft(EdgeId first, EdgeId second, int overlap, const vector<size_t> &diff_pos) {
DEBUG("Can correct first with sequence from second.");
Sequence new_sequence = g_.EdgeNucls(first).Subseq(g_.length(first) - overlap + diff_pos.front(),
g_.length(first) + k_ - overlap)
+ g_.EdgeNucls(second).First(k_);
DEBUG("Checking new k+1-mers.");
- if (CheckNoKmerClash(new_sequence)) {
- DEBUG("Check ok.");
- DEBUG("Splitting first edge.");
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(first, g_.length(first) - overlap + diff_pos.front());
- first = split_res.first;
- tips_paired_idx_.Remove(split_res.second);
- DEBUG("Adding new edge.");
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
- AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
- new_sequence);
- return true;
- } else {
- DEBUG("Check fail.");
- DEBUG("Filled k-mer already present in graph");
- return false;
- }
- return false;
+ DEBUG("Check ok.");
+ DEBUG("Splitting first edge.");
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(first, g_.length(first) - overlap + diff_pos.front());
+ first = split_res.first;
+ tips_paired_idx_.Remove(split_res.second);
+ DEBUG("Adding new edge.");
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
+ g_.AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
+ new_sequence);
}
- bool CorrectRight(EdgeId first, EdgeId second, int overlap, const vector<size_t> &diff_pos) {
+ void CorrectRight(EdgeId first, EdgeId second, int overlap, const vector<size_t> &diff_pos) {
DEBUG("Can correct second with sequence from first.");
Sequence new_sequence =
g_.EdgeNucls(first).Last(k_) + g_.EdgeNucls(second).Subseq(overlap, diff_pos.back() + 1 + k_);
DEBUG("Checking new k+1-mers.");
- if (CheckNoKmerClash(new_sequence)) {
- DEBUG("Check ok.");
- DEBUG("Splitting second edge.");
- pair<EdgeId, EdgeId> split_res = g_.SplitEdge(second, diff_pos.back() + 1);
- second = split_res.second;
- tips_paired_idx_.Remove(split_res.first);
- DEBUG("Adding new edge.");
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
- VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
-
- AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
- new_sequence);
- return true;
- } else {
- DEBUG("Check fail.");
- DEBUG("Filled k-mer already present in graph");
- return false;
- }
- return false;
+ DEBUG("Check ok.");
+ DEBUG("Splitting second edge.");
+ pair<EdgeId, EdgeId> split_res = g_.SplitEdge(second, diff_pos.back() + 1);
+ second = split_res.second;
+ tips_paired_idx_.Remove(split_res.first);
+ DEBUG("Adding new edge.");
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeEnd(first)), true));
+ VERIFY(MatchesEnd(new_sequence, g_.VertexNucls(g_.EdgeStart(second)), false));
+
+ g_.AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second),
+ new_sequence);
}
bool HandlePositiveHammingDistanceCase(EdgeId first, EdgeId second, int overlap) {
@@ -327,9 +277,11 @@ private:
vector<size_t> diff_pos = DiffPos(g_.EdgeNucls(first).Last(overlap),
g_.EdgeNucls(second).First(overlap));
if (CanCorrectLeft(first, overlap, diff_pos)) {
- return CorrectLeft(first, second, overlap, diff_pos);
+ CorrectLeft(first, second, overlap, diff_pos);
+ return true;
} else if (CanCorrectRight(second, overlap, diff_pos)) {
- return CorrectRight(first, second, overlap, diff_pos);
+ CorrectRight(first, second, overlap, diff_pos);
+ return true;
} else {
DEBUG("Can't correct tips due to the graph structure");
return false;
@@ -347,15 +299,10 @@ private:
//old code
Sequence edge_sequence = g_.EdgeNucls(first).Last(k_)
+ g_.EdgeNucls(second).Subseq(overlap, k_);
- if (CheckNoKmerClash(edge_sequence)) {
- DEBUG("Gap filled: Gap size = " << k_ - overlap << " Result seq "
- << edge_sequence.str());
- AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second), edge_sequence);
- return true;
- } else {
- DEBUG("Filled k-mer already present in graph");
- return false;
- }
+ DEBUG("Gap filled: Gap size = " << k_ - overlap << " Result seq "
+ << edge_sequence.str());
+ g_.AddEdge(g_.EdgeEnd(first), g_.EdgeStart(second), edge_sequence);
+ return true;
}
bool ProcessPair(EdgeId first, EdgeId second) {
@@ -439,7 +386,6 @@ public:
GapCloser(Graph &g, omnigraph::de::PairedInfoIndexT<Graph> &tips_paired_idx,
size_t min_intersection, double weight_threshold,
- const SequenceMapper &mapper,
size_t hamming_dist_bound = 0 /*min_intersection_ / 5*/)
: g_(g),
k_((int) g_.k()),
@@ -447,9 +393,7 @@ public:
min_intersection_(min_intersection),
hamming_dist_bound_(hamming_dist_bound),
init_gap_val_(-10),
- weight_threshold_(weight_threshold),
- mapper_(mapper),
- new_kmers_() {
+ weight_threshold_(weight_threshold) {
VERIFY(min_intersection_ < g_.k());
DEBUG("weight_threshold=" << weight_threshold_);
DEBUG("min_intersect=" << min_intersection_);
@@ -462,19 +406,17 @@ private:
template<class Streams>
void CloseGaps(conj_graph_pack &gp, Streams &streams) {
- typedef NewExtendedSequenceMapper<Graph, Index> Mapper;
auto mapper = MapperInstance(gp);
- GapCloserPairedIndexFiller<Graph, Mapper> gcpif(gp.g, *mapper);
+ GapCloserPairedIndexFiller gcpif(gp.g, *mapper);
PairedIndexT tips_paired_idx(gp.g);
gcpif.FillIndex(tips_paired_idx, streams);
- GapCloser<Graph, Mapper> gap_closer(gp.g, tips_paired_idx,
- cfg::get().gc.minimal_intersection, cfg::get().gc.weight_threshold,
- *mapper);
+ GapCloser gap_closer(gp.g, tips_paired_idx,
+ cfg::get().gc.minimal_intersection, cfg::get().gc.weight_threshold);
gap_closer.CloseShortGaps();
}
void GapClosing::run(conj_graph_pack &gp, const char *) {
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
printer(config::info_printer_pos::before_first_gap_closer);
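
The rewrite above removes the CheckNoKmerClash guard (together with the mapper and new_kmers_ bookkeeping) from GapCloser, so corrected tips and filled gaps are now added without that check. For reference, a standalone paraphrase of what the removed guard did, using std::string in place of runtime_k::RtSeq; the original additionally required that mapping the candidate sequence back onto the graph yields an empty path, and tracked reverse complements as well, both omitted here:

    #include <cstddef>
    #include <string>
    #include <unordered_set>

    // True if none of the (k+1)-mers of s were already introduced by an earlier
    // gap closure; the removed code rejected candidate sequences failing this.
    bool NoKmerClash(const std::string &s, std::size_t k,
                     const std::unordered_set<std::string> &new_kmers) {
        if (s.size() < k + 1)
            return true;
        for (std::size_t i = 0; i + k + 1 <= s.size(); ++i) {
            if (new_kmers.count(s.substr(i, k + 1)))
                return false;
        }
        return true;
    }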
diff --git a/src/projects/spades/gap_closing.hpp b/src/projects/spades/gap_closing.hpp
new file mode 100644
index 0000000..182f055
--- /dev/null
+++ b/src/projects/spades/gap_closing.hpp
@@ -0,0 +1,74 @@
+#pragma once
+
+#include "assembly_graph/core/graph.hpp"
+#include "assembly_graph/graph_support/basic_vertex_conditions.hpp"
+#include "assembly_graph/graph_support/edge_removal.hpp"
+#include "modules/simplification/compressor.hpp"
+
+namespace debruijn_graph {
+
+namespace gap_closing {
+typedef omnigraph::GapDescription<Graph> GapDescription;
+
+class GapJoiner {
+ Graph& g_;
+ omnigraph::EdgeRemover<Graph> edge_remover_;
+ bool add_flanks_;
+
+ EdgeId ClipEnd(EdgeId e, size_t pos) {
+ VERIFY(pos > 0);
+ VERIFY(omnigraph::TerminalVertexCondition<Graph>(g_).Check(g_.EdgeEnd(e)));
+ VERIFY(e != g_.conjugate(e));
+ if (pos == g_.length(e)) {
+ return e;
+ } else {
+ auto split_res = g_.SplitEdge(e, pos);
+ edge_remover_.DeleteEdge(split_res.second);
+ return split_res.first;
+ }
+ }
+
+ EdgeId ClipStart(EdgeId e, size_t pos) {
+ return g_.conjugate(ClipEnd(g_.conjugate(e), g_.length(e) - pos));
+ }
+
+ EdgeId AddEdge(VertexId v1, VertexId v2, const Sequence& gap_seq) {
+ if (!add_flanks_) {
+ VERIFY_MSG(g_.VertexNucls(v1) == gap_seq.Subseq(0, g_.k()),
+ g_.VertexNucls(v1) << " not equal " << gap_seq.Subseq(0, g_.k()));
+ VERIFY_MSG(g_.VertexNucls(v2) == gap_seq.Subseq(gap_seq.size() - g_.k()),
+ g_.VertexNucls(v2) << " not equal " << gap_seq.Subseq(gap_seq.size() - g_.k()));
+ return g_.AddEdge(v1, v2, gap_seq);
+ } else {
+ DEBUG("Adding gap seq " << gap_seq);
+ DEBUG("Between vertices " << g_.VertexNucls(v1) << " and " << g_.VertexNucls(v2));
+ return g_.AddEdge(v1, v2, g_.VertexNucls(v1) + gap_seq + g_.VertexNucls(v2));
+ }
+ }
+
+public:
+ GapJoiner(Graph& g, bool add_flanks = false) :
+ g_(g),
+ edge_remover_(g),
+ add_flanks_(add_flanks) {
+ }
+
+ EdgeId operator() (const GapDescription& gap, bool compress = true) {
+ VERIFY(gap.start != gap.end && gap.start != g_.conjugate(gap.end));
+ DEBUG("Processing gap " << gap.str(g_));
+ EdgeId start = ClipEnd(gap.start, gap.edge_gap_start_position);
+ EdgeId end = ClipStart(gap.end, gap.edge_gap_end_position);
+ EdgeId new_edge = AddEdge(g_.EdgeEnd(start), g_.EdgeStart(end), gap.gap_seq);
+
+ if (compress) {
+ return omnigraph::Compressor<Graph>(g_).CompressVertexEdgeId(g_.EdgeStart(new_edge));
+ } else {
+ return new_edge;
+ }
+ }
+private:
+ DECL_LOGGER("GapJoiner");
+};
+
+}
+}
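
In the new GapJoiner, ClipStart is implemented via ClipEnd on the conjugate edge. Purely to illustrate that symmetry on plain strings (ignoring the k-mer flanks that graph edges carry; not code from this commit):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <string>

    static std::string ReverseComplement(std::string s) {
        std::reverse(s.begin(), s.end());
        for (char &c : s) {
            switch (c) {
                case 'A': c = 'T'; break;
                case 'T': c = 'A'; break;
                case 'C': c = 'G'; break;
                case 'G': c = 'C'; break;
            }
        }
        return s;
    }

    // Keep the first pos characters (analogue of splitting and deleting the tail).
    static std::string ClipEnd(const std::string &s, std::size_t pos) {
        return s.substr(0, pos);
    }

    // Analogue of GapJoiner::ClipStart: clip the end of the conjugate at
    // length - pos, then conjugate back.
    static std::string ClipStart(const std::string &s, std::size_t pos) {
        return ReverseComplement(ClipEnd(ReverseComplement(s), s.size() - pos));
    }

    int main() {
        const std::string e = "ACGTTGCA";
        assert(ClipStart(e, 3) == e.substr(3));  // both drop the first 3 characters
        return 0;
    }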
diff --git a/src/projects/spades/hybrid_aligning.cpp b/src/projects/spades/hybrid_aligning.cpp
new file mode 100644
index 0000000..ffdd915
--- /dev/null
+++ b/src/projects/spades/hybrid_aligning.cpp
@@ -0,0 +1,462 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#include "modules/alignment/pacbio/pac_index.hpp"
+#include "hybrid_gap_closer.hpp"
+#include "modules/alignment/long_read_mapper.hpp"
+#include "modules/alignment/short_read_mapper.hpp"
+#include "io/reads/wrapper_collection.hpp"
+#include "assembly_graph/stats/picture_dump.hpp"
+#include "hybrid_aligning.hpp"
+#include "pair_info_count.hpp"
+#include "io/reads/multifile_reader.hpp"
+
+namespace debruijn_graph {
+
+namespace gap_closing {
+
+//TODO standard aligner badly needs spurious match filtering
+class GapTrackingListener : public SequenceMapperListener {
+ const Graph& g_;
+ GapStorage& gap_storage_;
+ const GapStorage empty_storage_;
+ vector<GapStorage> buffer_storages_;
+
+ const GapDescription INVALID_GAP;
+
+ boost::optional<Sequence> Subseq(const io::SingleRead& read, size_t start, size_t end) const {
+ DEBUG("Requesting subseq of read length " << read.size() << " from " << start << " to " << end);
+ VERIFY(end > start);
+ //if (end == start) {
+ // DEBUG("Returning empty sequence");
+ // return boost::make_optional(Sequence());
+ //}
+ auto subread = read.Substr(start, end);
+ if (subread.IsValid()) {
+ DEBUG("Gap seq valid. Length " << subread.size());
+ return boost::make_optional(subread.sequence());
+ } else {
+ DEBUG("Gap seq invalid. Length " << subread.size());
+ DEBUG("sequence: " << subread.GetSequenceString());
+ return boost::none;
+ }
+ }
+
+ boost::optional<Sequence> Subseq(const io::SingleReadSeq& read, size_t start, size_t end) const {
+ return boost::make_optional(read.sequence().Subseq(start, end));
+ }
+
+ template<class ReadT>
+ GapDescription
+ CreateDescription(const ReadT& read, size_t seq_start, size_t seq_end,
+ EdgeId left, size_t left_offset,
+ EdgeId right, size_t right_offset) const {
+ VERIFY(left_offset > 0 && right_offset < g_.length(right));
+
+ DEBUG("Creating gap description");
+
+ //trying to shift on the left edge
+ if (seq_start >= seq_end) {
+ //+1 is a trick to avoid empty gap sequences
+ size_t overlap = seq_start - seq_end + 1;
+ DEBUG("Overlap of size " << overlap << " detected. Fixing.");
+ size_t left_shift = std::min(overlap, left_offset - 1);
+ VERIFY(seq_start >= left_shift);
+ seq_start -= left_shift;
+ left_offset -= left_shift;
+ }
+ //trying to shift on the right edge
+ if (seq_start >= seq_end) {
+ //+1 is a trick to avoid empty gap sequences
+ size_t overlap = seq_start - seq_end + 1;
+ DEBUG("Overlap of size " << overlap << " remained. Fixing.");
+ size_t right_shift = std::min(overlap, g_.length(right) - right_offset - 1);
+ VERIFY(seq_end + right_shift <= read.size());
+ seq_end += right_shift;
+ right_offset += right_shift;
+ }
+
+ if (seq_start < seq_end) {
+ auto gap_seq = Subseq(read, seq_start, seq_end);
+ if (gap_seq) {
+ DEBUG("Gap info successfully created");
+ return GapDescription(left, right,
+ *gap_seq,
+ left_offset,
+ right_offset);
+ } else {
+ DEBUG("Something wrong with read subsequence");
+ }
+ } else {
+ size_t overlap = seq_start - seq_end + 1;
+ DEBUG("Failed to fix overlap of size " << overlap);
+ }
+ return INVALID_GAP;
+ }
+
+ template<class ReadT>
+ vector<GapDescription> InferGaps(const ReadT& read,
+ const MappingPath<EdgeId>& mapping) const {
+ TerminalVertexCondition<Graph> tip_condition(g_);
+ DEBUG("Inferring gaps")
+ VERIFY(!mapping.empty());
+ vector<GapDescription> answer;
+ for (size_t i = 0; i < mapping.size() - 1; ++i) {
+ EdgeId e1 = mapping.edge_at(i);
+ EdgeId e2 = mapping.edge_at(i + 1);
+
+ //sorry, loops and other special cases
+ if (e1 != e2 && e1 != g_.conjugate(e2)
+ && e1 != g_.conjugate(e1) && e2 != g_.conjugate(e2)
+ && tip_condition.Check(g_.EdgeEnd(e1))
+ && tip_condition.Check(g_.EdgeStart(e2))) {
+
+ MappingRange mr1 = mapping.mapping_at(i);
+ MappingRange mr2 = mapping.mapping_at(i + 1);
+ DEBUG("Creating description from mapping ranges " << mr1 << " and " << mr2);
+ size_t seq_start = mr1.initial_range.end_pos + g_.k();
+ size_t seq_end = mr2.initial_range.start_pos;
+
+ auto gap = CreateDescription(read, seq_start, seq_end,
+ e1, mr1.mapped_range.end_pos,
+ e2, mr2.mapped_range.start_pos);
+
+ if (gap != INVALID_GAP) {
+ answer.push_back(gap);
+ }
+ }
+ }
+ return answer;
+ }
+
+ template<class ReadT>
+ void InnerProcessRead(size_t thread_index, const ReadT& read, const MappingPath<EdgeId>& mapping) {
+ DEBUG("Processing read");
+ if (!mapping.empty()) {
+ for (const auto& gap: InferGaps(read, mapping)) {
+ DEBUG("Adding gap info " << gap.str(g_));
+ buffer_storages_[thread_index].AddGap(gap);
+ }
+ } else {
+ DEBUG("Mapping was empty");
+ }
+ DEBUG("Read processed");
+ }
+
+public:
+
+ //ALERT passed path_storage should be empty!
+ GapTrackingListener(const Graph& g,
+ GapStorage& gap_storage) :
+ g_(g), gap_storage_(gap_storage), empty_storage_(gap_storage) {
+ VERIFY(empty_storage_.size() == 0);
+ }
+
+ void StartProcessLibrary(size_t threads_count) override {
+ for (size_t i = 0; i < threads_count; ++i) {
+ buffer_storages_.push_back(empty_storage_);
+ }
+ }
+
+ void StopProcessLibrary() override {
+ //FIXME put this code into ancestor
+ for (size_t i = 0; i < buffer_storages_.size(); ++i) {
+ MergeBuffer(i);
+ }
+ buffer_storages_.clear();
+ }
+
+ void MergeBuffer(size_t thread_index) override {
+ DEBUG("Merge buffer " << thread_index << " with size " << buffer_storages_[thread_index].size());
+ gap_storage_.AddStorage(buffer_storages_[thread_index]);
+ buffer_storages_[thread_index].clear();
+ DEBUG("Now size " << gap_storage_.size());
+ }
+
+ void ProcessSingleRead(size_t thread_index,
+ const io::SingleRead& read,
+ const MappingPath<EdgeId>& mapping) override {
+ InnerProcessRead(thread_index, read, mapping);
+ }
+
+ void ProcessSingleRead(size_t thread_index,
+ const io::SingleReadSeq& read,
+ const MappingPath<EdgeId>& mapping) override {
+ InnerProcessRead(thread_index, read, mapping);
+ }
+
+ void ProcessPairedRead(size_t,
+ const io::PairedReadSeq&,
+ const MappingPath<EdgeId>&,
+ const MappingPath<EdgeId>&) override {
+ //nothing to do
+ }
+
+ void ProcessPairedRead(size_t,
+ const io::PairedRead&,
+ const MappingPath<EdgeId>&,
+ const MappingPath<EdgeId>&) override {
+ //nothing to do
+ }
+
+private:
+ DECL_LOGGER("GapTrackingListener");
+};
+
+bool IsNontrivialAlignment(const vector<vector<EdgeId>>& aligned_edges) {
+ for (size_t j = 0; j < aligned_edges.size(); j++)
+ if (aligned_edges[j].size() > 1)
+ return true;
+ return false;
+}
+
+io::SingleStreamPtr GetReadsStream(const io::SequencingLibrary<config::DataSetData>& lib) {
+ io::ReadStreamList<io::SingleRead> streams;
+ for (const auto& reads : lib.single_reads())
+ //do we need input_file function here?
+ //TODO add decent support for N-s?
+ streams.push_back(make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(reads)));
+ return io::MultifileWrap(streams);
+}
+
+class PacbioAligner {
+ const pacbio::PacBioMappingIndex<Graph>& pac_index_;
+ PathStorage<Graph>& path_storage_;
+ GapStorage& gap_storage_;
+ pacbio::StatsCounter stats_;
+ const PathStorage<Graph> empty_path_storage_;
+ const GapStorage empty_gap_storage_;
+ const size_t read_buffer_size_;
+
+ void ProcessReadsBatch(const std::vector<io::SingleRead>& reads, size_t thread_cnt) {
+ vector<PathStorage<Graph>> long_reads_by_thread(thread_cnt,
+ empty_path_storage_);
+ vector<GapStorage> gaps_by_thread(thread_cnt,
+ empty_gap_storage_);
+ vector<pacbio::StatsCounter> stats_by_thread(thread_cnt);
+
+ size_t longer_500 = 0;
+ size_t aligned = 0;
+ size_t nontrivial_aligned = 0;
+
+ #pragma omp parallel for reduction(+: longer_500, aligned, nontrivial_aligned)
+ for (size_t i = 0; i < reads.size(); ++i) {
+ size_t thread_num = omp_get_thread_num();
+ Sequence seq(reads[i].sequence());
+ auto current_read_mapping = pac_index_.GetReadAlignment(seq);
+ for (const auto& gap : current_read_mapping.gaps)
+ gaps_by_thread[thread_num].AddGap(gap);
+
+ const auto& aligned_edges = current_read_mapping.main_storage;
+ for (const auto& path : aligned_edges)
+ long_reads_by_thread[thread_num].AddPath(path, 1, true);
+
+ //counting stats:
+ for (const auto& path : aligned_edges)
+ stats_by_thread[thread_num].path_len_in_edges[path.size()]++;
+
+ if (seq.size() > 500) {
+ longer_500++;
+ if (aligned_edges.size() > 0) {
+ aligned++;
+ stats_by_thread[thread_num].seeds_percentage[
+ size_t(floor(double(current_read_mapping.seed_num) * 1000.0
+ / (double) seq.size()))]++;
+
+ if (IsNontrivialAlignment(aligned_edges)) {
+ nontrivial_aligned++;
+ }
+ }
+ }
+ }
+
+ INFO("Read batch of size: " << reads.size() << " processed; "
+ << longer_500 << " of them longer than 500; among long reads aligned: "
+ << aligned << "; paths of more than one edge received: "
+ << nontrivial_aligned);
+
+ for (size_t i = 0; i < thread_cnt; i++) {
+ path_storage_.AddStorage(long_reads_by_thread[i]);
+ gap_storage_.AddStorage(gaps_by_thread[i]);
+ stats_.AddStorage(stats_by_thread[i]);
+ }
+ }
+
+public:
+ PacbioAligner(const pacbio::PacBioMappingIndex<Graph>& pac_index,
+ PathStorage<Graph>& path_storage,
+ GapStorage& gap_storage,
+ size_t read_buffer_size = 50000) :
+ pac_index_(pac_index),
+ path_storage_(path_storage),
+ gap_storage_(gap_storage),
+ empty_path_storage_(path_storage),
+ empty_gap_storage_(gap_storage),
+ read_buffer_size_(read_buffer_size) {
+ VERIFY(empty_path_storage_.size() == 0);
+ VERIFY(empty_gap_storage_.size() == 0);
+ }
+
+ void operator()(io::SingleStream& read_stream, size_t thread_cnt) {
+ size_t n = 0;
+ size_t buffer_no = 0;
+ while (!read_stream.eof()) {
+ std::vector<io::SingleRead> read_buffer;
+ read_buffer.reserve(read_buffer_size_);
+ io::SingleRead read;
+ for (size_t buf_size = 0; buf_size < read_buffer_size_ && !read_stream.eof(); ++buf_size) {
+ read_stream >> read;
+ read_buffer.push_back(std::move(read));
+ }
+ INFO("Prepared batch " << buffer_no << " of " << read_buffer.size() << " reads.");
+ DEBUG("master thread number " << omp_get_thread_num());
+ ProcessReadsBatch(read_buffer, thread_cnt);
+ ++buffer_no;
+ n += read_buffer.size();
+ INFO("Processed " << n << " reads");
+ }
+ }
+
+ const pacbio::StatsCounter& stats() const {
+ return stats_;
+ }
+};
+
+void PacbioAlignLibrary(const conj_graph_pack& gp,
+ const io::SequencingLibrary<config::DataSetData>& lib,
+ PathStorage<Graph>& path_storage,
+ GapStorage& gap_storage,
+ size_t thread_cnt) {
+ INFO("Aligning library with Pacbio aligner");
+
+ INFO("Using seed size: " << cfg::get().pb.pacbio_k);
+
+ //initializing index
+ pacbio::PacBioMappingIndex<Graph> pac_index(gp.g,
+ cfg::get().pb.pacbio_k,
+ cfg::get().K,
+ cfg::get().pb.ignore_middle_alignment,
+ cfg::get().output_dir,
+ cfg::get().pb);
+
+ PacbioAligner aligner(pac_index, path_storage, gap_storage);
+
+ auto stream = GetReadsStream(lib);
+ aligner(*stream, thread_cnt);
+
+ INFO("For library of " << (lib.is_long_read_lib() ? "long reads" : "contigs") << " :");
+ aligner.stats().report();
+ INFO("PacBio aligning finished");
+}
+
+void CloseGaps(conj_graph_pack& gp, bool rtype,
+ const GapStorage& gap_storage,
+ size_t min_weight) {
+ INFO("Closing gaps with long reads");
+
+ HybridGapCloser::ConsensusF consensus_f;
+ if (rtype) {
+ consensus_f = &PoaConsensus;
+ } else {
+ consensus_f = [=](const vector<string>& gap_seqs) {
+ return TrivialConsenus(gap_seqs, cfg::get().pb.max_contigs_gap_length);
+ };
+ }
+
+ HybridGapCloser gap_closer(gp.g, gap_storage,
+ min_weight, consensus_f,
+ cfg::get().pb.long_seq_limit);
+ auto replacement = gap_closer();
+
+ for (size_t j = 0; j < cfg::get().ds.reads.lib_count(); j++) {
+ gp.single_long_reads[j].ReplaceEdges(replacement);
+ }
+
+ INFO("Closing gaps with long reads finished");
+}
+}
+using namespace gap_closing;
+
+bool ShouldAlignWithPacbioAligner(io::LibraryType lib_type) {
+ return lib_type == io::LibraryType::UntrustedContigs ||
+ lib_type == io::LibraryType::PacBioReads ||
+ lib_type == io::LibraryType::SangerReads ||
+ lib_type == io::LibraryType::NanoporeReads; //||
+// lib_type == io::LibraryType::TSLReads;
+}
+
+void HybridLibrariesAligning::run(conj_graph_pack& gp, const char*) {
+ using namespace omnigraph;
+
+ bool make_additional_saves = parent_->saves_policy().make_saves_;
+ for (size_t lib_id = 0; lib_id < cfg::get().ds.reads.lib_count(); ++lib_id) {
+ if (cfg::get().ds.reads[lib_id].is_hybrid_lib()) {
+ INFO("Hybrid library detected: #" << lib_id);
+
+ const auto& lib = cfg::get().ds.reads[lib_id];
+ bool rtype = lib.is_long_read_lib();
+
+ auto& path_storage = gp.single_long_reads[lib_id];
+ GapStorage gap_storage(gp.g);
+
+ if (ShouldAlignWithPacbioAligner(lib.type())) {
+ //TODO put alternative alignment right here
+ PacbioAlignLibrary(gp, lib,
+ path_storage, gap_storage,
+ cfg::get().max_threads);
+ } else {
+ gp.EnsureBasicMapping();
+ GapTrackingListener mapping_listener(gp.g, gap_storage);
+ INFO("Processing reads from hybrid library " << lib_id);
+
+ //FIXME make const
+ auto& reads = cfg::get_writable().ds.reads[lib_id];
+
+ SequenceMapperNotifier notifier(gp);
+ //FIXME pretty awful, would be much better if listeners were shared ptrs
+ LongReadMapper read_mapper(gp.g, gp.single_long_reads[lib_id],
+ ChooseProperReadPathExtractor(gp.g, reads.type()));
+
+ notifier.Subscribe(lib_id, &mapping_listener);
+ notifier.Subscribe(lib_id, &read_mapper);
+
+ auto mapper_ptr = ChooseProperMapper(gp, reads);
+ //FIXME think of N's proper handling
+ auto single_streams = single_easy_readers(reads, false,
+ /*map_paired*/false, /*handle Ns*/false);
+
+ notifier.ProcessLibrary(single_streams, lib_id, *mapper_ptr);
+ cfg::get_writable().ds.reads[lib_id].data().single_reads_mapped = true;
+
+ INFO("Finished processing long reads from lib " << lib_id);
+ gp.index.Detach();
+ }
+
+ if (make_additional_saves) {
+ INFO("Producing additional saves");
+ path_storage.DumpToFile(cfg::get().output_saves + "long_reads_before_rep.mpr",
+ map<EdgeId, EdgeId>(), /*min_stats_cutoff*/rtype ? 1 : 0, true);
+ gap_storage.DumpToFile(cfg::get().output_saves + "gaps.mpr");
+ }
+
+ INFO("Padding gaps");
+ size_t min_gap_quantity = rtype ? cfg::get().pb.pacbio_min_gap_quantity
+ : cfg::get().pb.contigs_min_gap_quantity;
+
+ INFO("Min gap weight set to " << min_gap_quantity);
+ gap_storage.PrepareGapsForClosure(min_gap_quantity, /*max flank length*/500);
+
+ gap_closing::CloseGaps(gp, rtype, gap_storage, min_gap_quantity);
+ }
+ }
+
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
+ printer(config::info_printer_pos::final_gap_closed);
+}
+
+}
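
CreateDescription above repairs mappings that overlap on the read before building a gap description: it first pulls the gap start left along the left edge, then pushes the gap end right along the right edge, always keeping at least one flanking base (the "+1 trick" in the comments). A standalone sketch of just that arithmetic; names are hypothetical and the real code also validates the extracted subsequence:

    #include <algorithm>
    #include <cstddef>

    struct GapCoords {
        std::size_t seq_start, seq_end;   // gap interval on the read
        std::size_t left_offset;          // gap start offset on the left edge
        std::size_t right_offset;         // gap end offset on the right edge
    };

    // Preconditions, as VERIFY'd in the original: left_offset > 0 and
    // right_offset < right_edge_len. Returns true if a non-empty gap remains.
    bool FixOverlap(GapCoords &c, std::size_t right_edge_len) {
        if (c.seq_start >= c.seq_end) {                 // shift on the left edge
            std::size_t overlap = c.seq_start - c.seq_end + 1;
            std::size_t shift = std::min(overlap, c.left_offset - 1);
            c.seq_start -= shift;
            c.left_offset -= shift;
        }
        if (c.seq_start >= c.seq_end) {                 // still overlapping: shift right
            std::size_t overlap = c.seq_start - c.seq_end + 1;
            std::size_t shift = std::min(overlap, right_edge_len - c.right_offset - 1);
            c.seq_end += shift;
            c.right_offset += shift;
        }
        return c.seq_start < c.seq_end;
    }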
diff --git a/src/projects/spades/hybrid_aligning.hpp b/src/projects/spades/hybrid_aligning.hpp
new file mode 100644
index 0000000..d29d694
--- /dev/null
+++ b/src/projects/spades/hybrid_aligning.hpp
@@ -0,0 +1,23 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "pipeline/stage.hpp"
+
+namespace debruijn_graph {
+
+class HybridLibrariesAligning : public spades::AssemblyStage {
+public:
+ HybridLibrariesAligning()
+ : AssemblyStage("Hybrid Aligning", "hybrid_aligning") {
+ }
+ void run(conj_graph_pack &gp, const char*);
+};
+
+}
+
diff --git a/src/projects/spades/hybrid_gap_closer.hpp b/src/projects/spades/hybrid_gap_closer.hpp
new file mode 100644
index 0000000..0443715
--- /dev/null
+++ b/src/projects/spades/hybrid_gap_closer.hpp
@@ -0,0 +1,743 @@
+//***************************************************************************
+//* Copyright (c) 2015 Saint Petersburg State University
+//* Copyright (c) 2011-2014 Saint Petersburg Academic University
+//* All Rights Reserved
+//* See file LICENSE for details.
+//***************************************************************************
+
+#pragma once
+
+#include "assembly_graph/core/graph.hpp"
+#include "modules/alignment/sequence_mapper.hpp"
+#include "ConsensusCore/Poa/PoaConfig.hpp"
+#include "ConsensusCore/Poa/PoaConsensus.hpp"
+#include "gap_closing.hpp"
+
+#include <algorithm>
+#include <fstream>
+
+namespace debruijn_graph {
+namespace gap_closing {
+typedef vector<GapDescription> GapInfos;
+
+typedef pair<EdgeId, EdgeId> EdgePair;
+inline EdgePair Conjugate(const Graph& g, EdgePair ep) {
+ return EdgePair(g.conjugate(ep.second), g.conjugate(ep.first));
+}
+
+inline bool IsCanonical(const Graph& g, const EdgePair& ep) {
+ return ep <= Conjugate(g, ep);
+}
+
+inline bool IsCanonical(const Graph& g, EdgeId a, EdgeId b) {
+ return IsCanonical(g, EdgePair(a,b));
+}
+
+inline bool IsCanonical(const Graph& g, EdgeId e) {
+ return e <= g.conjugate(e);
+}
+
+inline EdgePair GetCanonical(const Graph& g, const EdgePair& ep) {
+ return IsCanonical(g, ep) ? ep : Conjugate(g, ep);
+}
+
+class GapStorage {
+public:
+ typedef typename GapInfos::const_iterator gap_info_it;
+ typedef std::pair<gap_info_it, gap_info_it> info_it_pair;
+private:
+ typedef std::function<bool (gap_info_it, gap_info_it)> CandidatesPred;
+ typedef std::function<bool (const EdgePair&)> EdgePairPred;
+ typedef std::function<bool (const GapDescription&)> DescriptionPred;
+ typedef std::set<EdgePair> ConnectionSet;
+
+ const Graph& g_;
+
+ map<EdgeId, GapInfos> inner_index_;
+ vector<EdgeId> index_;
+
+ DECL_LOGGER("GapStorage");
+
+ void HiddenAddGap(const GapDescription& p) {
+ inner_index_[p.start].push_back(p);
+ }
+
+ size_t FillIndex() {
+ VERIFY(index_.empty());
+ index_.reserve(inner_index_.size());
+ set<EdgeId> tmp;
+ for (const auto& kv : inner_index_) {
+ index_.push_back(kv.first);
+ }
+ return index_.size();
+ }
+
+ typename std::vector<GapDescription>::iterator
+ const_iterator_cast(std::vector<GapDescription> &v,
+ typename std::vector<GapDescription>::const_iterator iter) const {
+ return v.begin() + (iter - v.cbegin());
+ }
+
+ //Function should return true if corresponding part of the index should be removed
+ void FilterByCandidates(const CandidatesPred &filter_f) {
+ for (auto it = inner_index_.begin(); it != inner_index_.end(); ) {
+ vector<GapDescription>& gaps = it->second;
+ auto ep_ranges = EdgePairGaps(gaps);
+
+ auto copy_dest = gaps.begin();
+ for (const info_it_pair& ep_gaps : ep_ranges) {
+ if (filter_f(ep_gaps.first, ep_gaps.second)) {
+ DEBUG("Erasing candidates between " << g_.int_id(ep_gaps.first->start) << " and "
+ << g_.int_id(ep_gaps.first->end));
+ } else {
+ if (copy_dest == const_iterator_cast(gaps, ep_gaps.first)) {
+ copy_dest = const_iterator_cast(gaps, ep_gaps.second);
+ } else {
+ copy_dest = std::move(ep_gaps.first, ep_gaps.second, copy_dest);
+ }
+ }
+ }
+ if (copy_dest == gaps.begin()) {
+ inner_index_.erase(it++);
+ } else {
+ gaps.erase(copy_dest, gaps.end());
+ ++it;
+ }
+ }
+ }
+
+ void FilterByEdgePair(const EdgePairPred &filter_f) {
+ FilterByCandidates([=](gap_info_it info_start, gap_info_it /*info_end*/) {
+ return filter_f(EdgePair(info_start->start, info_start->end));
+ });
+ }
+
+ void FilterByDescription(const DescriptionPred &filter_f) {
+ for (auto it = inner_index_.begin(); it != inner_index_.end(); ) {
+ vector<GapDescription>& gaps = it->second;
+ auto res_it = std::remove_if(gaps.begin(), gaps.end(), filter_f);
+ if (res_it == gaps.begin()) {
+ inner_index_.erase(it++);
+ } else {
+ gaps.erase(res_it, gaps.end());
+ ++it;
+ }
+ }
+ }
+
+ vector<EdgeId> SecondEdges(const GapInfos& edge_gaps) const {
+ vector<EdgeId> jump_edges;
+ for (auto it_pair : EdgePairGaps(edge_gaps)) {
+ jump_edges.push_back(it_pair.first->end);
+ }
+ return jump_edges;
+ };
+
+ ConnectionSet GetAllConnections() const {
+ ConnectionSet answer;
+ for (const auto& e_gaps : inner_index_) {
+ EdgeId e1 = e_gaps.first;
+ for (EdgeId e2: SecondEdges(e_gaps.second)) {
+ EdgePair ep(e1, e2);
+ answer.insert(ep);
+ answer.insert(Conjugate(g_, ep));
+ }
+ }
+ return answer;
+ };
+
+ //outputs set of transitively-redundant CANONICAL connections
+ ConnectionSet DetectTransitive() const {
+ auto all_connections = GetAllConnections();
+ ConnectionSet answer;
+ for (auto it = all_connections.begin(), end_it = all_connections.end(); it != end_it; ) {
+ EdgeId left = it->first;
+ vector<EdgeId> right_options;
+ auto inner_it = it;
+ for (; inner_it != end_it && inner_it->first == left; ++inner_it) {
+ right_options.push_back(inner_it->second);
+ }
+
+ for (size_t i = 0; i < right_options.size(); ++i) {
+ for (size_t j = 0; j < right_options.size(); ++j) {
+ if (i == j)
+ continue;
+ if (all_connections.count(EdgePair(right_options[i], right_options[j]))) {
+ //TODO should we add sanity checks that other edges of the triangle are not there?
+ answer.insert(GetCanonical(g_, EdgePair(left, right_options[j])));
+ DEBUG("pair " << g_.int_id(left) << "," << g_.int_id(right_options[j])
+ << " is ignored because of edge between "
+ << g_.int_id(right_options[i]));
+ }
+ }
+ }
+ it = inner_it;
+ }
+ return answer;
+ }
+
+ std::set<EdgeId> AmbiguouslyExtending() const {
+ std::set<EdgeId> answer;
+ std::set<EdgeId> left_edges;
+ for (const auto& e_gaps : inner_index_) {
+ EdgeId e1 = e_gaps.first;
+ for (EdgeId e2: SecondEdges(e_gaps.second)) {
+ if (!left_edges.insert(e1).second) {
+ answer.insert(e1);
+ }
+ if (!left_edges.insert(g_.conjugate(e2)).second) {
+ answer.insert(g_.conjugate(e2));
+ }
+ }
+ }
+ return answer;
+ }
+
+ void FilterIndex(size_t min_weight, size_t max_flank) {
+ DEBUG("Filtering by maximal allowed flanking length " << max_flank);
+ FilterByDescription([=](const GapDescription &gap) {
+ return gap.edge_gap_start_position + max_flank < g_.length(gap.start)
+ || gap.edge_gap_end_position > max_flank;
+ });
+
+ DEBUG("Filtering by weight " << min_weight);
+ FilterByCandidates([=](gap_info_it info_start, gap_info_it info_end) {
+ auto cnt = std::distance(info_start, info_end);
+ VERIFY(cnt > 0);
+ return size_t(cnt) < min_weight;
+ });
+
+
+ DEBUG("Filtering transitive gaps");
+ ConnectionSet transitive_ignore = DetectTransitive();
+
+ FilterByEdgePair([&](const EdgePair &ep) {
+ VERIFY(IsCanonical(g_, ep));
+ return transitive_ignore.count(ep);
+ });
+
+ DEBUG("Filtering ambiguous situations");
+ std::set<EdgeId> ambiguously_extending = AmbiguouslyExtending();
+ FilterByEdgePair([&](const EdgePair &ep) {
+ return ambiguously_extending.count(ep.first) ||
+ ambiguously_extending.count(g_.conjugate(ep.second));
+ });
+ }
+
+public:
+
+ GapStorage(const Graph& g)
+ : g_(g) {
+ }
+
+ const map<EdgeId, GapInfos>& inner_index() const {
+ return inner_index_;
+ };
+
+ EdgeId operator[](size_t i) const {
+ return index_.at(i);
+ }
+
+ size_t size() const {
+ return index_.size();
+ }
+
+ void AddGap(const GapDescription& p) {
+ if (IsCanonical(g_, p.start, p.end)) {
+ HiddenAddGap(p);
+ } else {
+ HiddenAddGap(p.conjugate(g_));
+ }
+ }
+
+ void AddStorage(const GapStorage& to_add) {
+ const auto& idx = to_add.inner_index_;
+ for (auto iter = idx.begin(); iter != idx.end(); ++iter)
+ inner_index_[iter->first].insert(inner_index_[iter->first].end(), iter->second.begin(), iter->second.end());
+ }
+
+ void clear() {
+ GapStorage empty(g_);
+ std::swap(inner_index_, empty.inner_index_);
+ std::swap(index_, empty.index_);
+ }
+
+ void DumpToFile(const string filename) const {
+ ofstream filestr(filename);
+ for (const auto& e_gaps : inner_index_) {
+ EdgeId e = e_gaps.first;
+ auto gaps = e_gaps.second;
+ DEBUG(g_.int_id(e) << " " << gaps.size());
+ filestr << g_.int_id(e) << " " << gaps.size() << endl;
+ std::sort(gaps.begin(), gaps.end());
+ for (const auto& gap : gaps) {
+ filestr << gap.str(g_);
+ }
+ filestr << endl;
+ }
+ }
+
+// void LoadFromFile(const string s) {
+// FILE* file = fopen((s).c_str(), "r");
+// int res;
+// char ss[5000];
+// map<int, EdgeId> tmp_map;
+// for (auto iter = g.ConstEdgeBegin(); !iter.IsEnd(); ++iter) {
+// tmp_map[g.int_id(*iter)] = *iter;
+// }
+// while (!feof(file)) {
+// int first_id, second_id, first_ind, second_ind;
+// int size;
+// res = fscanf(file, "%d %d\n", &first_id, &size);
+// VERIFY(res == 2);
+// for (int i = 0; i < size; i++) {
+// res = fscanf(file, "%d %d\n", &first_id, &first_ind);
+// VERIFY(res == 2);
+// res = fscanf(file, "%d %d\n", &second_id, &second_ind);
+// VERIFY(res == 2);
+// res = fscanf(file, "%s\n", ss);
+// VERIFY(res == 1);
+// GapDescription<Graph> gap(tmp_map[first_id], tmp_map[second_id], Sequence(ss), first_ind, second_ind);
+// this->AddGap(gap);
+// }
+// }
+// }
+
+ //edge_gaps must be sorted
+ vector<info_it_pair> EdgePairGaps(const GapInfos& edge_gaps) const {
+ vector<info_it_pair> answer;
+ auto ep_start = edge_gaps.begin();
+ for (auto it = ep_start; it != edge_gaps.end(); ++it) {
+ if (it->end != ep_start->end) {
+ answer.push_back({ep_start, it});
+ ep_start = it;
+ }
+ }
+ answer.push_back({ep_start, edge_gaps.end()});
+ return answer;
+ };
+
+ void PrepareGapsForClosure(size_t min_weight, size_t max_flank) {
+ for (auto& e_gaps : inner_index_) {
+ auto& gaps = e_gaps.second;
+ std::sort(gaps.begin(), gaps.end());
+ }
+ DEBUG("Raw extensions available for " << inner_index_.size() << " edges");
+
+ FilterIndex(min_weight, max_flank);
+ DEBUG("Filtered extensions available for " << inner_index_.size() << " edges");
+ FillIndex();
+ }
+};
+
+inline string PoaConsensus(const vector<string>& gap_seqs) {
+ const ConsensusCore::PoaConsensus* pc = ConsensusCore::PoaConsensus::FindConsensus(
+ gap_seqs,
+ ConsensusCore::PoaConfig::GLOBAL_ALIGNMENT);
+ return pc->Sequence();
+}
+
+inline string TrivialConsenus(const vector<string>& gap_seqs, size_t max_length) {
+ VERIFY(!gap_seqs.empty());
+ return gap_seqs.front().length() < max_length ? gap_seqs.front() : "";
+}
+
+/*Keys are actual edges of the graph, values are original edges*/
+/*In general many-to-many relationship*/
+class EdgeFateTracker : omnigraph::GraphActionHandler<Graph> {
+ map<EdgeId, set<EdgeId>> storage_;
+
+ void FillRelevant(EdgeId e, set<EdgeId>& relevant) const {
+ auto it = storage_.find(e);
+ if (it != storage_.end()) {
+ //one of novel edges
+ relevant.insert(it->second.begin(), it->second.end());
+ } else {
+ //one of original edges
+ relevant.insert(e);
+ }
+ }
+
+public:
+ EdgeFateTracker(const Graph& g) :
+ omnigraph::GraphActionHandler<Graph>(g, "EdgeFateTracker") {
+ }
+
+ void HandleAdd(EdgeId e) override {
+ if (!storage_.count(e))
+ storage_[e] = {};
+ }
+
+ void HandleDelete(EdgeId e) override {
+ storage_.erase(e);
+ }
+
+ void HandleMerge(const vector<EdgeId>& old_edges, EdgeId new_edge) override {
+ set<EdgeId> relevant_records;
+ for (EdgeId e : old_edges) {
+ FillRelevant(e, relevant_records);
+ }
+ storage_[new_edge] = relevant_records;
+ }
+
+ void HandleGlue(EdgeId /*new_edge*/, EdgeId /*edge1*/, EdgeId /*edge2*/) override {
+ VERIFY(false);
+ }
+
+ void HandleSplit(EdgeId old_edge, EdgeId new_edge_1,
+ EdgeId new_edge_2) override {
+ set<EdgeId> relevant_records;
+ FillRelevant(old_edge, relevant_records);
+ storage_[new_edge_1] = relevant_records;
+ storage_[new_edge_2] = relevant_records;
+ }
+
+ map<EdgeId, EdgeId> Old2NewMapping() const {
+ map<EdgeId, EdgeId> old_2_new;
+ for (const auto& new_2_olds : storage_) {
+ for (EdgeId e : new_2_olds.second) {
+ VERIFY(!old_2_new.count(e));
+ old_2_new[e] = new_2_olds.first;
+ }
+ }
+ return old_2_new;
+ }
+
+};
+
+class MultiGapJoiner {
+ typedef map<EdgeId, pair<size_t, size_t>> SplitInfo;
+
+ Graph& g_;
+ GapJoiner inner_joiner_;
+
+ bool CheckGapsValidity(const vector<GapDescription>& gaps) const {
+ vector<GapDescription> answer;
+ return std::all_of(gaps.begin(), gaps.end(), [&](const GapDescription &gap) {
+ return IsCanonical(g_, gap.start, gap.end) && gap.start != gap.end && gap.start != g_.conjugate(gap.end);
+ });
+ }
+
+ void Add(size_t idx, EdgeId e, size_t pos, SplitInfo& primary, SplitInfo& secondary) const {
+ SplitInfo* storage = &primary;
+ if (!IsCanonical(g_, e)) {
+ e = g_.conjugate(e);
+ pos = g_.length(e) - pos;
+ storage = &secondary;
+ }
+ VERIFY(!storage->count(e));
+ storage->insert(make_pair(e, make_pair(idx, pos)));
+ }
+
+ vector<EdgeId> EdgesNeedingSplit(const SplitInfo& left_split_info, const SplitInfo& right_split_info) const {
+ vector<EdgeId> answer;
+ for (EdgeId e : key_set(left_split_info))
+ if (right_split_info.count(e))
+ answer.push_back(e);
+ return answer;
+ }
+
+ size_t ArtificialSplitPos(size_t left_split, size_t right_split) const {
+ if (right_split < left_split + 2) {
+ DEBUG("Artificial split impossible");
+ return -1ul;
+ }
+ return (left_split + right_split) / 2;
+ }
+
+ bool Update(EdgeId& e, size_t& gap_pos, EdgePair split_orig_ep, EdgePair split_res, bool gap_start) const {
+ EdgeId split_orig = split_orig_ep.first;
+ if (e == split_orig_ep.second) {
+ split_orig = split_orig_ep.second;
+ split_res = Conjugate(g_, split_res);
+ }
+ if (e == split_orig) {
+ if (gap_start) {
+ e = split_res.second;
+ gap_pos = gap_pos - g_.length(split_res.first);
+ } else {
+ e = split_res.first;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ void UpdateGap(GapDescription& gap, EdgePair split_orig, EdgePair split_res) const {
+ bool u1 = Update(gap.start, gap.edge_gap_start_position, split_orig, split_res, true);
+ bool u2 = Update(gap.end, gap.edge_gap_end_position, split_orig, split_res, false);
+ VERIFY(u1 != u2);
+ }
+
+ bool CheckInsert(EdgeId e, set<EdgeId>& used_edges) const {
+ return used_edges.insert(e).second;
+ }
+
+ bool CheckInsert(const vector<EdgeId> edges, set<EdgeId>& used_edges) const {
+ for (EdgeId e : edges) {
+ if (!CheckInsert(e, used_edges)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ std::set<EdgeId> RelevantEdges(const GapDescription& gap) const {
+ std::set<EdgeId> answer;
+ answer.insert(gap.start);
+ answer.insert(g_.conjugate(gap.start));
+ answer.insert(gap.end);
+ answer.insert(g_.conjugate(gap.end));
+ return answer;
+ }
+
+ bool CheckGaps(const vector<GapDescription>& gaps) const {
+ set<EdgeId> used_edges;
+ for (const auto& gap : gaps) {
+ const auto relevant = RelevantEdges(gap);
+ //TODO check the semantics of all_of
+ if (!std::all_of(relevant.begin(), relevant.end(), [&](const EdgeId& e) {
+ return used_edges.insert(e).second;
+ })) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ vector<GapDescription> ArtificialSplitAndGapUpdate(vector<GapDescription> canonical_gaps) const {
+ SplitInfo left_split_pos;
+ SplitInfo right_split_pos;
+ for (size_t i = 0; i < canonical_gaps.size(); ++i) {
+ const auto& gap = canonical_gaps[i];
+ DEBUG("Processing gap " << gap.str(g_));
+ Add(i, gap.start, gap.edge_gap_start_position, right_split_pos, left_split_pos);
+ Add(i, gap.end, gap.edge_gap_end_position, left_split_pos, right_split_pos);
+ }
+
+ set<size_t> to_ignore;
+
+ for (EdgeId e : EdgesNeedingSplit(left_split_pos, right_split_pos)) {
+ size_t artificial_split_pos = ArtificialSplitPos(left_split_pos[e].second, right_split_pos[e].second);
+ if (artificial_split_pos == -1ul) {
+ to_ignore.insert(left_split_pos[e].first);
+ to_ignore.insert(right_split_pos[e].first);
+ } else {
+ DEBUG("Splitting edge " << g_.str(e) << " at pos " << artificial_split_pos);
+ DEBUG("Will update gap " << canonical_gaps[left_split_pos[e].first].str(g_) << " and " << canonical_gaps[right_split_pos[e].first].str(g_));
+ EdgePair ep(e, g_.conjugate(e));
+ auto split_res = g_.SplitEdge(e, artificial_split_pos);
+ UpdateGap(canonical_gaps[left_split_pos[e].first], ep, split_res);
+ UpdateGap(canonical_gaps[right_split_pos[e].first], ep, split_res);
+ }
+ }
+
+ vector<GapDescription> updated_gaps;
+ updated_gaps.reserve(canonical_gaps.size());
+ for (size_t i = 0; i < canonical_gaps.size(); ++i) {
+ if (!to_ignore.count(i)) {
+ updated_gaps.push_back(canonical_gaps[i]);
+ }
+ }
+
+ VERIFY(CheckGaps(updated_gaps));
+ return updated_gaps;
+ };
+
+public:
+ MultiGapJoiner(Graph& g) : g_(g), inner_joiner_(g, true) {
+ }
+
+ //Resulting graph should be condensed
+ void operator()(const vector<GapDescription>& gaps) {
+ size_t closed_gaps = 0;
+ VERIFY_MSG(CheckGapsValidity(gaps), "Gap check failed");
+ for (const auto& gap : ArtificialSplitAndGapUpdate(gaps)) {
+ inner_joiner_(gap, /*condense*/false);
+ ++closed_gaps;
+ }
+ INFO("Closed " << closed_gaps << " gaps");
+ }
+private:
+ DECL_LOGGER("MultiGapJoiner");
+};
+
+class HybridGapCloser {
+public:
+ typedef std::function<string (const vector<string>&)> ConsensusF;
+private:
+ typedef RtSeq Kmer;
+ typedef typename GapStorage::gap_info_it gap_info_it;
+
+ DECL_LOGGER("HybridGapCloser");
+
+ Graph& g_;
+ const GapStorage& storage_;
+ const size_t min_weight_;
+ ConsensusF consensus_;
+ const size_t long_seq_limit_;
+ const size_t max_consensus_reads_;
+
+ const GapDescription INVALID_GAP;
+
+ string PrintLengths(const vector<string>& gap_seqs) const {
+ stringstream ss;
+ for (const auto& gap_v : gap_seqs)
+ ss << gap_v.length() << " ";
+ return ss.str();
+ }
+
+ GapDescription ConstructConsensus(EdgeId start,
+ EdgeId end,
+ size_t edge_gap_start_position,
+ size_t edge_gap_end_position,
+ const vector<string>& gap_variants) const {
+ DEBUG(gap_variants.size() << " gap closing variants, lengths: " << PrintLengths(gap_variants));
+ DEBUG("var size original " << gap_variants.size());
+ vector<string> new_gap_variants(gap_variants.begin(), gap_variants.end());
+ new_gap_variants.resize(std::min(max_consensus_reads_, gap_variants.size()));
+ auto s = consensus_(new_gap_variants);
+ DEBUG("consenus for " << g_.int_id(start)
+ << " and " << g_.int_id(end)
+ << " found: '" << s << "'");
+ return GapDescription(start, end,
+ Sequence(s),
+ edge_gap_start_position, edge_gap_end_position);
+ }
+
+ //all gaps guaranteed to correspond to a single edge pair
+ GapInfos PadGaps(gap_info_it start, gap_info_it end) const {
+ size_t start_min = std::numeric_limits<size_t>::max();
+ size_t end_max = 0;
+ size_t long_seqs = 0;
+ size_t short_seqs = 0;
+ for (auto it = start; it != end; ++it) {
+ const auto& gap = *it;
+ if (gap.gap_seq.size() > long_seq_limit_)
+ long_seqs++;
+ else
+ short_seqs++;
+
+ start_min = std::min(start_min, gap.edge_gap_start_position);
+ end_max = std::max(end_max, gap.edge_gap_end_position);
+ }
+
+ const bool exclude_long_seqs = (short_seqs >= min_weight_ && short_seqs > long_seqs);
+
+ GapInfos answer;
+ for (auto it = start; it != end; ++it) {
+ const auto& gap = *it;
+
+ if (exclude_long_seqs && gap.gap_seq.size() > long_seq_limit_)
+ continue;
+
+ string s = g_.EdgeNucls(gap.start).Subseq(start_min + g_.k(), gap.edge_gap_start_position + g_.k()).str();
+ s += gap.gap_seq.str();
+ s += g_.EdgeNucls(gap.end).Subseq(gap.edge_gap_end_position, end_max).str();
+ answer.push_back(GapDescription(gap.start, gap.end, Sequence(s), start_min, end_max));
+ }
+ return answer;
+ }
+
+ GapDescription ConstructConsensus(gap_info_it start_it, gap_info_it end_it) const {
+ DEBUG("Considering extension " << g_.str(start_it->end));
+ size_t cur_len = end_it - start_it;
+
+ //low weight connections filtered earlier
+ VERIFY(cur_len >= min_weight_);
+
+ auto padded_gaps = PadGaps(start_it, end_it);
+ //all start and end positions are equal here
+ if (padded_gaps.size() < min_weight_) {
+ DEBUG("Connection weight too low after padding");
+ return INVALID_GAP;
+ }
+
+ vector<string> gap_variants;
+ std::transform(padded_gaps.begin(), padded_gaps.end(), std::back_inserter(gap_variants),
+ [](const GapDescription& gap) {
+ return gap.gap_seq.str();
+ });
+
+ //for (auto it = start_it; it != end_it; ++it) {
+ // VERIFY(it->start == start_it->start);
+ // VERIFY(it->end == start_it->end);
+ // VERIFY(it->edge_gap_start_position == start_it->edge_gap_start_position);
+ // VERIFY(it->edge_gap_end_position == start_it->edge_gap_end_position);
+ //}
+ auto padded_gap = padded_gaps.front();
+
+ return ConstructConsensus(padded_gap.start, padded_gap.end,
+ padded_gap.edge_gap_start_position,
+ padded_gap.edge_gap_end_position,
+ gap_variants);
+ }
+
+ GapDescription ConstructConsensus(EdgeId e) const {
+ DEBUG("Constructing consensus for edge " << g_.str(e));
+ vector<GapDescription> closures;
+ for (const auto& edge_pair_gaps : storage_.EdgePairGaps(get(storage_.inner_index(), e))) {
+ auto consensus = ConstructConsensus(edge_pair_gaps.first, edge_pair_gaps.second);
+ if (consensus != INVALID_GAP) {
+ closures.push_back(consensus);
+ }
+ }
+
+ if (closures.size() == 1) {
+ DEBUG("Found unique extension " << closures.front().str(g_));
+ return closures.front();
+ }
+
+ if (closures.size() > 1) {
+ DEBUG("Non-unique extension");
+ }
+ return INVALID_GAP;
+ }
+
+ vector<GapDescription> ConstructConsensus() const {
+ vector<vector<GapDescription>> closures_by_thread(omp_get_max_threads());
+
+ # pragma omp parallel for
+ for (size_t i = 0; i < storage_.size(); i++) {
+ EdgeId e = storage_[i];
+ size_t thread_num = omp_get_thread_num();
+ GapDescription gap = ConstructConsensus(e);
+ if (gap != INVALID_GAP) {
+ closures_by_thread[thread_num].push_back(gap);
+ }
+ }
+
+ vector<GapDescription> closures;
+ for (auto& new_per_thread : closures_by_thread) {
+ std::copy(new_per_thread.begin(), new_per_thread.end(), std::back_inserter(closures));
+ new_per_thread.clear();
+ }
+ return closures;
+ }
+
+public:
+ HybridGapCloser(Graph& g, const GapStorage& storage,
+ size_t min_weight, ConsensusF consensus,
+ size_t long_seq_limit,
+ size_t max_consensus_reads = 20)
+ : g_(g), storage_(storage),
+ min_weight_(min_weight),
+ consensus_(consensus),
+ long_seq_limit_(long_seq_limit),
+ max_consensus_reads_(max_consensus_reads) {
+ }
+
+ map<EdgeId, EdgeId> operator()() {
+ EdgeFateTracker fate_tracker(g_);
+ MultiGapJoiner gap_joiner(g_);
+
+ gap_joiner(ConstructConsensus());
+
+ CompressAllVertices(g_, true, /*chunk_cnt*/100);
+ return fate_tracker.Old2NewMapping();
+ }
+
+};
+
+}
+}
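For orientation, here is a minimal usage sketch of the interface defined above, under assumptions: an existing non-const Graph g, a GapStorage gap_storage already filled with gap candidates, and a placeholder consensus functor standing in for the real consensus routine (the names g, gap_storage and naive_consensus are illustrative and not from this patch):

    // Any callable matching ConsensusF, i.e. string(const vector<string>&), will do;
    // this placeholder simply returns the first gap-closing variant.
    auto naive_consensus = [](const vector<string>& variants) -> string {
        return variants.empty() ? string() : variants.front();
    };

    HybridGapCloser gap_closer(g, gap_storage,
                               /*min_weight*/2, naive_consensus,
                               /*long_seq_limit*/400);
    // Closes all sufficiently supported gaps, condenses the graph and
    // returns how original edges map onto edges of the modified graph.
    map<EdgeId, EdgeId> old_to_new = gap_closer();

Internally this drives MultiGapJoiner over the consensus gap descriptions, exactly as operator() above does.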
diff --git a/src/projects/spades/launch.hpp b/src/projects/spades/launch.hpp
index 7d3eb40..91df3b9 100644
--- a/src/projects/spades/launch.hpp
+++ b/src/projects/spades/launch.hpp
@@ -19,18 +19,40 @@
#include "second_phase_setup.hpp"
#include "repeat_resolving.hpp"
#include "distance_estimation.hpp"
-#include "pacbio_aligning.hpp"
+#include "hybrid_aligning.hpp"
#include "chromosome_removal.hpp"
+#include "series_analysis.hpp"
#include "pipeline/stage.hpp"
+#include "contig_output_stage.hpp"
namespace spades {
+inline bool MetaCompatibleLibraries() {
+ const auto& libs = cfg::get().ds.reads;
+ if (libs[0].type() != io::LibraryType::PairedEnd)
+ return false;
+ if (libs.lib_count() > 2)
+ return false;
+ if (libs.lib_count() == 2 &&
+ libs[1].type() != io::LibraryType::TSLReads &&
+ libs[1].type() != io::LibraryType::PacBioReads && libs[1].type() != io::LibraryType::NanoporeReads)
+ return false;
+ return true;
+}
+
+inline bool HybridLibrariesPresent() {
+ for (size_t lib_id = 0; lib_id < cfg::get().ds.reads.lib_count(); ++lib_id)
+ if (cfg::get().ds.reads[lib_id].is_hybrid_lib())
+ return true;
+ return false;
+}
+
void assemble_genome() {
INFO("SPAdes started");
- if (cfg::get().mode == debruijn_graph::config::pipeline_type::meta &&
- (cfg::get().ds.reads.lib_count() != 1 || cfg::get().ds.reads[0].type() != io::LibraryType::PairedEnd)) {
- ERROR("Sorry, current version of metaSPAdes can work with single library only (paired-end only).");
- exit(239);
+ if (cfg::get().mode == debruijn_graph::config::pipeline_type::meta && !MetaCompatibleLibraries()) {
+ ERROR("Sorry, current version of metaSPAdes can work either with single library (paired-end only) "
+ "or in paired-end + TSLR mode.");
+ exit(239);
}
INFO("Starting from stage: " << cfg::get().entry_point);
@@ -53,7 +75,6 @@ void assemble_genome() {
cfg::get().flanking_range,
cfg::get().pos.max_mapping_gap,
cfg::get().pos.max_gap_diff);
-
if (cfg::get().need_mapping) {
INFO("Will need read mapping, kmer mapper will be attached");
conj_gp.kmer_mapper.Attach();
@@ -79,40 +100,35 @@ void assemble_genome() {
SPAdes.add(new debruijn_graph::PairInfoCount(true))
.add(new debruijn_graph::DistanceEstimation(true))
.add(new debruijn_graph::RepeatResolution(true))
+ .add(new debruijn_graph::ContigOutput())
.add(new debruijn_graph::SecondPhaseSetup());
SPAdes.add(new debruijn_graph::Simplification());
}
+ if (!cfg::get().series_analysis.empty())
+ SPAdes.add(new debruijn_graph::SeriesAnalysis());
+
if (cfg::get().pd) {
SPAdes.add(new debruijn_graph::ChromosomeRemoval());
}
- //begin pacbio
- bool run_pacbio = false;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].is_pacbio_alignable()) {
- run_pacbio = true;
- break;
- }
+ if (HybridLibrariesPresent()) {
+ SPAdes.add(new debruijn_graph::HybridLibrariesAligning());
}
- if (run_pacbio) {
- //currently not integrated with two step rr process
- VERIFY(!two_step_rr);
- SPAdes.add(new debruijn_graph::PacBioAligning());
- }
- //not a handler, no graph modification allowed after PacBioAligning stage!
- //end pacbio
-
- SPAdes.add(new debruijn_graph::PairInfoCount())
+
+ //No graph modification allowed after HybridLibrariesAligning stage!
+
+ SPAdes.add(new debruijn_graph::ContigOutput(false))
+ .add(new debruijn_graph::PairInfoCount())
.add(new debruijn_graph::DistanceEstimation())
.add(new debruijn_graph::RepeatResolution());
-
-
} else {
- SPAdes.add(new debruijn_graph::ContigOutput());
+ SPAdes.add(new debruijn_graph::ContigOutput(false));
}
+ SPAdes.add(new debruijn_graph::ContigOutput());
+
SPAdes.run(conj_gp, cfg::get().entry_point.c_str());
// For informing spades.py about estimated params
diff --git a/src/projects/spades/main.cpp b/src/projects/spades/main.cpp
index a14d4fa..e162e2e 100644
--- a/src/projects/spades/main.cpp
+++ b/src/projects/spades/main.cpp
@@ -8,12 +8,12 @@
/*
* Assembler Main
*/
-#include "dev_support/logger/log_writers.hpp"
+#include "utils/logger/log_writers.hpp"
-#include "dev_support/memory_limit.hpp"
-#include "dev_support/segfault_handler.hpp"
+#include "utils/memory_limit.hpp"
+#include "utils/segfault_handler.hpp"
#include "launch.hpp"
-#include "dev_support/copy_file.hpp"
+#include "utils/copy_file.hpp"
#include "version.hpp"
void load_config(const vector<string>& cfg_fns) {
diff --git a/src/projects/spades/mismatch_correction.cpp b/src/projects/spades/mismatch_correction.cpp
index d19ffb2..dd181ad 100644
--- a/src/projects/spades/mismatch_correction.cpp
+++ b/src/projects/spades/mismatch_correction.cpp
@@ -5,7 +5,7 @@
//* See file LICENSE for details.
//***************************************************************************
-#include <algorithms/mismatch_shall_not_pass.hpp>
+#include <modules/mismatch_shall_not_pass.hpp>
#include "mismatch_correction.hpp"
#include "io/dataset_support/read_converter.hpp"
@@ -21,7 +21,7 @@ void MismatchCorrection::run(conj_graph_pack &gp, const char*) {
if (dataset.reads[i].is_mismatch_correctable())
libs.push_back(i);
}
- auto streams = single_binary_readers_for_libs(dataset, libs, true, true);
+ auto streams = io::single_binary_readers_for_libs(dataset, libs, true, true);
size_t corrected = MismatchShallNotPass<conj_graph_pack, io::SingleReadSeq>(gp, 2).ParallelStopAllMismatches(streams, 1);
INFO("Corrected " << corrected << " nucleotides");
}
diff --git a/src/projects/spades/pacbio_aligning.cpp b/src/projects/spades/pacbio_aligning.cpp
deleted file mode 100644
index 974251f..0000000
--- a/src/projects/spades/pacbio_aligning.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#include "assembly_graph/graph_alignment/pacbio/pac_index.hpp"
-#include "assembly_graph/graph_alignment/pacbio/pacbio_gap_closer.hpp"
-#include "assembly_graph/graph_alignment/long_read_storage.hpp"
-#include "io/reads_io/wrapper_collection.hpp"
-#include "assembly_graph/stats/picture_dump.hpp"
-#include "pacbio_aligning.hpp"
-
-namespace debruijn_graph {
-
-void ProcessReadsBatch(conj_graph_pack &gp,
- std::vector<io::SingleRead>& reads,
- pacbio::PacBioMappingIndex<ConjugateDeBruijnGraph>& pac_index,
- PathStorage<Graph>& long_reads, pacbio::GapStorage<Graph>& gaps,
- size_t buf_size, int n, size_t min_gap_quantity, pacbio::StatsCounter& stats) {
- vector<PathStorage<Graph> > long_reads_by_thread(cfg::get().max_threads,
- PathStorage<Graph>(gp.g));
- vector<pacbio::GapStorage<Graph> > gaps_by_thread(cfg::get().max_threads,
- pacbio::GapStorage<Graph>(gp.g, min_gap_quantity,cfg::get().pb.long_seq_limit));
- vector<pacbio::StatsCounter> stats_by_thread(cfg::get().max_threads);
-
- size_t longer_500 = 0;
- size_t aligned = 0;
- size_t nontrivial_aligned = 0;
-
-# pragma omp parallel for shared(reads, long_reads_by_thread, pac_index, n, aligned, nontrivial_aligned)
- for (size_t i = 0; i < buf_size; ++i) {
- if (i % 1000 == 0) {
- DEBUG("thread number " << omp_get_thread_num());
- }
- size_t thread_num = omp_get_thread_num();
- Sequence seq(reads[i].sequence());
-# pragma omp atomic
- n++;
- auto current_read_mapping = pac_index.GetReadAlignment(seq);
- auto aligned_edges = current_read_mapping.main_storage;
- auto gaps = current_read_mapping.gaps;
- for (auto iter = gaps.begin(); iter != gaps.end(); ++iter)
- gaps_by_thread[thread_num].AddGap(*iter, true);
-
- for (auto iter = aligned_edges.begin(); iter != aligned_edges.end(); ++iter)
- long_reads_by_thread[thread_num].AddPath(*iter, 1, true);
- //counting stats:
- for (auto iter = aligned_edges.begin(); iter != aligned_edges.end(); ++iter) {
- stats_by_thread[thread_num].path_len_in_edges[iter->size()]++;
- }
-# pragma omp critical
- {
-// INFO(current_read_mapping.seed_num);
- if (seq.size() > 500) {
- longer_500++;
- if (aligned_edges.size() > 0) {
- aligned++;
- stats_by_thread[thread_num].seeds_percentage[size_t(
- floor(double(current_read_mapping.seed_num) * 1000.0 / (double) seq.size()))]++;
- for (size_t j = 0; j < aligned_edges.size(); j++) {
- if (aligned_edges[j].size() > 1) {
- nontrivial_aligned++;
- break;
- }
- }
- }
- }
- }
-# pragma omp critical
- {
- VERBOSE_POWER(n, " reads processed");
- }
- }
- INFO("Read batch of size: " << buf_size << " processed; "<< longer_500 << " of them longer than 500; among long reads aligned: " << aligned << "; paths of more than one edge received: " << nontrivial_aligned );
-
- for (size_t i = 0; i < cfg::get().max_threads; i++) {
- long_reads.AddStorage(long_reads_by_thread[i]);
- gaps.AddStorage(gaps_by_thread[i]);
- stats.AddStorage(stats_by_thread[i]);
- }
-}
-
-void align_pacbio(conj_graph_pack &gp, int lib_id, bool make_additional_saves) {
- io::ReadStreamList<io::SingleRead> streams;
- for (const auto& reads : cfg::get().ds.reads[lib_id].single_reads())
- //do we need input_file function here?
- streams.push_back(make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(reads)));
-
- //make_shared<io::FixingWrapper>(make_shared<io::FileReadStream>(file));
- // auto pacbio_read_stream = single_easy_reader(cfg::get().ds.reads[lib_id],
-// false, false);
-
-// io::ReadStreamList<io::SingleRead> streams(pacbio_read_stream);
- // pacbio_read_stream.release();
- int n = 0;
- PathStorage<Graph>& long_reads = gp.single_long_reads[lib_id];
- pacbio::StatsCounter stats;
- size_t min_gap_quantity = 2;
- size_t rtype = 0;
- bool consensus_gap_closing = false;
- if (cfg::get().ds.reads[lib_id].type() == io::LibraryType::PacBioReads ||
- cfg::get().ds.reads[lib_id].type() == io::LibraryType::SangerReads ||
- cfg::get().ds.reads[lib_id].type() == io::LibraryType::NanoporeReads) {
- min_gap_quantity = cfg::get().pb.pacbio_min_gap_quantity;
- rtype = 1;
- consensus_gap_closing = true;
- } else {
- min_gap_quantity = cfg::get().pb.contigs_min_gap_quantity;
- rtype = 2;
- }
- pacbio::GapStorage<ConjugateDeBruijnGraph> gaps(gp.g, min_gap_quantity, cfg::get().pb.long_seq_limit);
- size_t read_buffer_size = 50000;
- std::vector<io::SingleRead> reads(read_buffer_size);
- io::SingleRead read;
- size_t buffer_no = 0;
- INFO("Usign seed size: " << cfg::get().pb.pacbio_k);
- pacbio::PacBioMappingIndex<ConjugateDeBruijnGraph> pac_index(gp.g,
- cfg::get().pb.pacbio_k,
- cfg::get().K, cfg::get().pb.ignore_middle_alignment, cfg::get().output_dir, cfg::get().pb);
-
-// path_extend::ContigWriter cw(gp.g);
-// cw.WriteEdges("before_rr_with_ids.fasta");
-// ofstream filestr("pacbio_mapped.mpr");
-// filestr.close();
- for (auto iter = streams.begin(); iter != streams.end(); ++iter) {
- auto &stream = *iter;
- while (!stream.eof()) {
- size_t buf_size = 0;
- for (; buf_size < read_buffer_size && !stream.eof(); ++buf_size)
- stream >> reads[buf_size];
- INFO("Prepared batch " << buffer_no << " of " << buf_size << " reads.");
- DEBUG("master thread number " << omp_get_thread_num());
- ProcessReadsBatch(gp, reads, pac_index, long_reads, gaps, buf_size, n, min_gap_quantity, stats);
- // INFO("Processed batch " << buffer_no);
- ++buffer_no;
- }
- }
- string ss = (rtype == 1 ? "long reads": "contigs");
- INFO("For lib " << lib_id << " of " << ss <<" :");
- stats.report();
- map<EdgeId, EdgeId> replacement;
- size_t min_stats_cutoff =(rtype == 1 ? 1 : 0);
- if (make_additional_saves)
- long_reads.DumpToFile(cfg::get().output_saves + "long_reads_before_rep.mpr",
- replacement, min_stats_cutoff, true);
- gaps.DumpToFile(cfg::get().output_saves + "gaps.mpr");
- gaps.PadGapStrings();
- if (make_additional_saves)
- gaps.DumpToFile(cfg::get().output_saves + "gaps_padded.mpr");
- pacbio::PacbioGapCloser<Graph> gap_closer(gp.g, consensus_gap_closing, cfg::get().pb.max_contigs_gap_length);
- gap_closer.ConstructConsensus(cfg::get().max_threads, gaps);
- gap_closer.CloseGapsInGraph(replacement);
- long_reads.ReplaceEdges(replacement);
- for(int j = 0; j < lib_id; j++) {
- gp.single_long_reads[j].ReplaceEdges(replacement);
- }
-
- gap_closer.DumpToFile(cfg::get().output_saves + "gaps_pb_closed.fasta");
- INFO("PacBio aligning finished");
- return;
-}
-
-void PacBioAligning::run(conj_graph_pack &gp, const char*) {
- using namespace omnigraph;
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
- int lib_id = -1;
- bool make_additional_saves = parent_->saves_policy().make_saves_;
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if ( cfg::get().ds.reads[i].is_pacbio_alignable() ) {
- lib_id = (int) i;
- align_pacbio(gp, lib_id, make_additional_saves);
- }
- }
-
- if (lib_id == -1)
- INFO("no PacBio lib found");
-
- stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
- printer(config::info_printer_pos::final_gap_closed);
-}
-
-}
-
diff --git a/src/projects/spades/pacbio_aligning.hpp b/src/projects/spades/pacbio_aligning.hpp
deleted file mode 100644
index 4e7d2a9..0000000
--- a/src/projects/spades/pacbio_aligning.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "pipeline/stage.hpp"
-
-namespace debruijn_graph {
-
-class PacBioAligning : public spades::AssemblyStage {
-public:
- PacBioAligning()
- : AssemblyStage("PacBio Aligning", "pacbio_aligning") {
- }
- void run(conj_graph_pack &gp, const char*);
-};
-
-}
-
diff --git a/src/projects/spades/pair_info_count.cpp b/src/projects/spades/pair_info_count.cpp
index bc01e1d..30edba3 100644
--- a/src/projects/spades/pair_info_count.cpp
+++ b/src/projects/spades/pair_info_count.cpp
@@ -9,27 +9,129 @@
#include "io/dataset_support/read_converter.hpp"
#include "pair_info_count.hpp"
-#include "assembly_graph/graph_alignment/short_read_mapper.hpp"
-#include "assembly_graph/graph_alignment/long_read_mapper.hpp"
+#include "modules/alignment/short_read_mapper.hpp"
+#include "modules/alignment/long_read_mapper.hpp"
+#include "modules/alignment/bwa_sequence_mapper.hpp"
#include "paired_info/pair_info_filler.hpp"
-#include "algorithms/path_extend/split_graph_pair_info.hpp"
-#include "paired_info/bwa_pair_info_filler.hpp"
+#include "modules/path_extend/split_graph_pair_info.hpp"
+
+#include "adt/bf.hpp"
+#include "adt/hll.hpp"
namespace debruijn_graph {
+typedef io::SequencingLibrary<config::DataSetData> SequencingLib;
+using PairedInfoFilter = bf::counting_bloom_filter<std::pair<EdgeId, EdgeId>, 2>;
+using EdgePairCounter = hll::hll<std::pair<EdgeId, EdgeId>>;
+
+class DEFilter : public SequenceMapperListener {
+ public:
+ DEFilter(PairedInfoFilter &filter, const Graph &g)
+ : bf_(filter), g_(g) {}
+
+ void ProcessPairedRead(size_t,
+ const io::PairedRead&,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(read1, read2);
+ }
+ void ProcessPairedRead(size_t,
+ const io::PairedReadSeq&,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(read1, read2);
+ }
+ private:
+ void ProcessPairedRead(const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2) {
+ for (size_t i = 0; i < path1.size(); ++i) {
+ std::pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
+ for (size_t j = 0; j < path2.size(); ++j) {
+ std::pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
+ bf_.add({mapping_edge_1.first, mapping_edge_2.first});
+ bf_.add({g_.conjugate(mapping_edge_2.first), g_.conjugate(mapping_edge_1.first)});
+ }
+ }
+ }
+
+ PairedInfoFilter &bf_;
+ const Graph &g_;
+};
+
+class EdgePairCounterFiller : public SequenceMapperListener {
+ static uint64_t EdgePairHash(const std::pair<EdgeId, EdgeId> &e) {
+ uint64_t h1 = e.first.hash();
+ return CityHash64WithSeeds((const char*)&h1, sizeof(h1), e.second.hash(), 0x0BADF00D);
+ }
+
+ public:
+ EdgePairCounterFiller(size_t thread_num)
+ : counter_(EdgePairHash) {
+ buf_.reserve(thread_num);
+ for (unsigned i = 0; i < thread_num; ++i)
+ buf_.emplace_back(EdgePairHash);
+ }
+
+ void MergeBuffer(size_t i) override {
+ counter_.merge(buf_[i]);
+ buf_[i].clear();
+ }
-bool RefineInsertSizeForLib(conj_graph_pack &gp, size_t ilib, size_t edge_length_threshold) {
+ void ProcessPairedRead(size_t idx,
+ const io::PairedRead&,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(buf_[idx], read1, read2);
+ }
+ void ProcessPairedRead(size_t idx,
+ const io::PairedReadSeq&,
+ const MappingPath<EdgeId>& read1,
+ const MappingPath<EdgeId>& read2) override {
+ ProcessPairedRead(buf_[idx], read1, read2);
+ }
+ std::pair<double, bool> cardinality() const {
+ return counter_.cardinality();
+ }
+ private:
+ void ProcessPairedRead(EdgePairCounter &buf,
+ const MappingPath<EdgeId>& path1,
+ const MappingPath<EdgeId>& path2) {
+ for (size_t i = 0; i < path1.size(); ++i) {
+ std::pair<EdgeId, MappingRange> mapping_edge_1 = path1[i];
+ for (size_t j = 0; j < path2.size(); ++j) {
+ std::pair<EdgeId, MappingRange> mapping_edge_2 = path2[j];
+ buf.add({mapping_edge_1.first, mapping_edge_2.first});
+ }
+ }
+ }
+
+ std::vector<EdgePairCounter> buf_;
+ EdgePairCounter counter_;
+};
+
+static bool CollectLibInformation(const conj_graph_pack &gp,
+ size_t &edgepairs,
+ size_t ilib, size_t edge_length_threshold) {
INFO("Estimating insert size (takes a while)");
- InsertSizeCounter hist_counter(gp, edge_length_threshold, /* ignore negative */ true);
+ InsertSizeCounter hist_counter(gp, edge_length_threshold);
+ EdgePairCounterFiller pcounter(cfg::get().max_threads);
+
SequenceMapperNotifier notifier(gp);
notifier.Subscribe(ilib, &hist_counter);
+ notifier.Subscribe(ilib, &pcounter);
- auto& reads = cfg::get_writable().ds.reads[ilib];
+ SequencingLib &reads = cfg::get_writable().ds.reads[ilib];
+ auto &data = reads.data();
auto paired_streams = paired_binary_readers(reads, false);
+ notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads, cfg::get().bwa.bwa_enable));
+ //Check read length after lib processing since mate pairs are not used until this step
VERIFY(reads.data().read_length != 0);
- notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads));
+
+ auto pres = pcounter.cardinality();
+ edgepairs = (!pres.second ? 64ull * 1024 * 1024 : size_t(pres.first));
+ INFO("Edge pairs: " << edgepairs << (!pres.second ? " (rough upper limit)" : ""));
INFO(hist_counter.mapped() << " paired reads (" <<
((double) hist_counter.mapped() * 100.0 / (double) hist_counter.total()) <<
@@ -39,125 +141,150 @@ bool RefineInsertSizeForLib(conj_graph_pack &gp, size_t ilib, size_t edge_length
if (hist_counter.mapped() == 0)
return false;
+
std::map<size_t, size_t> percentiles;
- hist_counter.FindMean(reads.data().mean_insert_size, reads.data().insert_size_deviation, percentiles);
- hist_counter.FindMedian(reads.data().median_insert_size, reads.data().insert_size_mad,
- reads.data().insert_size_distribution);
- if (reads.data().median_insert_size < gp.k_value + 2) {
+ hist_counter.FindMean(data.mean_insert_size, data.insert_size_deviation, percentiles);
+ hist_counter.FindMedian(data.median_insert_size, data.insert_size_mad,
+ data.insert_size_distribution);
+ if (data.median_insert_size < gp.k_value + 2)
return false;
- }
- std::tie(reads.data().insert_size_left_quantile,
- reads.data().insert_size_right_quantile) = omnigraph::GetISInterval(0.8,
- reads.data().insert_size_distribution);
+ std::tie(data.insert_size_left_quantile,
+ data.insert_size_right_quantile) = omnigraph::GetISInterval(0.8,
+ data.insert_size_distribution);
- return !reads.data().insert_size_distribution.empty();
+ return !data.insert_size_distribution.empty();
}
-void ProcessSingleReads(conj_graph_pack &gp, size_t ilib,
- bool use_binary = true) {
+// FIXME: This needs to be static
+void ProcessSingleReads(conj_graph_pack &gp,
+ size_t ilib,
+ bool use_binary = true,
+ bool map_paired = false) {
+ //FIXME make const
auto& reads = cfg::get_writable().ds.reads[ilib];
+
SequenceMapperNotifier notifier(gp);
- GappedLongReadMapper read_mapper(gp, gp.single_long_reads[ilib]);
- SimpleLongReadMapper simple_read_mapper(gp, gp.single_long_reads[ilib]);
+ //FIXME pretty awful, would be much better if listeners were shared ptrs
+ LongReadMapper read_mapper(gp.g, gp.single_long_reads[ilib],
+ ChooseProperReadPathExtractor(gp.g, reads.type()));
- if(reads.type() == io::LibraryType::PathExtendContigs) {
- notifier.Subscribe(ilib, &read_mapper);
- } else {
- notifier.Subscribe(ilib, &simple_read_mapper);
- }
+ notifier.Subscribe(ilib, &read_mapper);
- auto mapper_ptr = ChooseProperMapper(gp, reads);
+ auto mapper_ptr = ChooseProperMapper(gp, reads, cfg::get().bwa.bwa_enable);
if (use_binary) {
- auto single_streams = single_binary_readers(reads, false, true);
+ auto single_streams = single_binary_readers(reads, false, map_paired);
notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr);
} else {
auto single_streams = single_easy_readers(reads, false,
- true, /*handle Ns*/false);
+ map_paired, /*handle Ns*/false);
notifier.ProcessLibrary(single_streams, ilib, *mapper_ptr);
}
cfg::get_writable().ds.reads[ilib].data().single_reads_mapped = true;
}
-void ProcessPairedReads(conj_graph_pack &gp, size_t ilib, bool map_single_reads) {
- auto& reads = cfg::get_writable().ds.reads[ilib];
- bool calculate_threshold = (reads.type() == io::LibraryType::PairedEnd);
- SequenceMapperNotifier notifier(gp);
- INFO("Left insert size qauntile " << reads.data().insert_size_left_quantile <<
- ", right insert size quantile " << reads.data().insert_size_right_quantile);
+static void ProcessPairedReads(conj_graph_pack &gp,
+ std::unique_ptr<PairedInfoFilter> filter, unsigned filter_threshold,
+ size_t ilib) {
+ SequencingLib &reads = cfg::get_writable().ds.reads[ilib];
+ const auto &data = reads.data();
- SimpleLongReadMapper read_mapper(gp, gp.single_long_reads[ilib]);
- if (map_single_reads) {
- notifier.Subscribe(ilib, &read_mapper);
- }
+ bool calculate_threshold = (reads.type() == io::LibraryType::PairedEnd &&
+ !cfg::get().pe_params.param_set.extension_options.use_default_single_threshold);
+ unsigned round_thr = 0;
+ // Do not round if filtering is disabled
+ if (filter)
+ round_thr = unsigned(std::min(cfg::get().de.max_distance_coeff * data.insert_size_deviation * cfg::get().de.rounding_coeff,
+ cfg::get().de.rounding_thr));
- path_extend::SplitGraphPairInfo split_graph(
- gp, (size_t) reads.data().median_insert_size,
- (size_t) reads.data().insert_size_deviation,
- (size_t) reads.data().insert_size_left_quantile,
- (size_t) reads.data().insert_size_right_quantile,
- reads.data().read_length, gp.g.k(),
- cfg::get().pe_params.param_set.split_edge_length,
- reads.data().insert_size_distribution);
- if (calculate_threshold) {
+ SequenceMapperNotifier notifier(gp);
+ INFO("Left insert size quantile " << data.insert_size_left_quantile <<
+ ", right insert size quantile " << data.insert_size_right_quantile <<
+ ", filtering threshold " << filter_threshold <<
+ ", rounding threshold " << round_thr);
+
+ path_extend::SplitGraphPairInfo
+ split_graph(gp, (size_t)data.median_insert_size,
+ (size_t) data.insert_size_deviation,
+ (size_t) data.insert_size_left_quantile,
+ (size_t) data.insert_size_right_quantile,
+ data.read_length, gp.g.k(),
+ cfg::get().pe_params.param_set.split_edge_length,
+ data.insert_size_distribution);
+ if (calculate_threshold)
notifier.Subscribe(ilib, &split_graph);
+
+ LatePairedIndexFiller::WeightF weight;
+ if (filter) {
+ weight = [&](const std::pair<EdgeId, EdgeId> &ep,
+ const MappingRange&, const MappingRange&) {
+ return (filter->lookup(ep) > filter_threshold ? 1. : 0.);
+ };
+ } else {
+ weight = [&](const std::pair<EdgeId, EdgeId> &,
+ const MappingRange&, const MappingRange&) {
+ return 1.;
+ };
}
- LatePairedIndexFiller pif(gp.g, PairedReadCountWeight, gp.paired_indices[ilib]);
+ LatePairedIndexFiller pif(gp.g,
+ weight, round_thr,
+ gp.paired_indices[ilib]);
notifier.Subscribe(ilib, &pif);
- auto paired_streams = paired_binary_readers(reads, false, (size_t) reads.data().mean_insert_size);
- notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads));
+ auto paired_streams = paired_binary_readers(reads, false, (size_t) data.mean_insert_size);
+ notifier.ProcessLibrary(paired_streams, ilib, *ChooseProperMapper(gp, reads, cfg::get().bwa.bwa_enable));
cfg::get_writable().ds.reads[ilib].data().pi_threshold = split_graph.GetThreshold();
-
- if (map_single_reads) {
- ProcessSingleReads(gp, ilib);
- }
}
-bool HasGoodRRLibs() {
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto &lib = cfg::get().ds.reads[i];
+static bool HasGoodRRLibs() {
+ for (const auto &lib : cfg::get().ds.reads) {
if (lib.is_contig_lib())
continue;
+
if (lib.is_paired() &&
- lib.data().mean_insert_size == 0.0) {
+ lib.data().mean_insert_size == 0.0)
continue;
- }
- if (lib.is_repeat_resolvable()) {
+
+ if (lib.is_repeat_resolvable())
return true;
- }
}
+
return false;
}
-bool HasOnlyMP() {
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- if (cfg::get().ds.reads[i].type() == io::LibraryType::PathExtendContigs)
+static bool HasOnlyMP() {
+ for (const auto &lib : cfg::get().ds.reads) {
+ if (lib.type() == io::LibraryType::PathExtendContigs)
continue;
- if (cfg::get().ds.reads[i].type() != io::LibraryType::MatePairs &&
- cfg::get().ds.reads[i].type() != io::LibraryType::HQMatePairs) {
+
+ if (lib.type() != io::LibraryType::MatePairs &&
+ lib.type() != io::LibraryType::HQMatePairs)
return false;
- }
}
+
return true;
}
//todo improve logic
-bool ShouldMapSingleReads(size_t ilib) {
+static bool ShouldMapSingleReads(size_t ilib) {
using config::single_read_resolving_mode;
switch (cfg::get().single_reads_rr) {
- case single_read_resolving_mode::none: {
- return false;
- }
- case single_read_resolving_mode::all: {
+ case single_read_resolving_mode::all:
return true;
- }
- case single_read_resolving_mode::only_single_libs: {
+ case single_read_resolving_mode::only_single_libs:
//Map when no PacBio/paired libs or only mate-pairs or single lib itself
- return !HasGoodRRLibs() || HasOnlyMP() ||
- (cfg::get().ds.reads[ilib].type() == io::LibraryType::SingleReads);
- }
+ if (!HasGoodRRLibs() || HasOnlyMP() ||
+ cfg::get().ds.reads[ilib].type() == io::LibraryType::SingleReads) {
+ if (cfg::get().mode != debruijn_graph::config::pipeline_type::meta) {
+ return true;
+ } else {
+ WARN("Single reads are not used in metagenomic mode");
+ }
+ }
+ break;
+ case single_read_resolving_mode::none:
+ break;
default:
VERIFY_MSG(false, "Invalid mode value");
}
@@ -168,86 +295,82 @@ void PairInfoCount::run(conj_graph_pack &gp, const char *) {
gp.InitRRIndices();
gp.EnsureBasicMapping();
- //fixme implement better universal logic
- size_t edge_length_threshold = cfg::get().mode == config::pipeline_type::meta ? 1000 : stats::Nx(gp.g, 50);
+ //TODO implement better universal logic
+ size_t edge_length_threshold = cfg::get().mode == config::pipeline_type::meta ? 900 : stats::Nx(gp.g, 50);
INFO("Min edge length for estimation: " << edge_length_threshold);
- bwa_pair_info::BWAPairInfoFiller bwa_counter(gp.g,
- cfg::get().bwa.path_to_bwa,
- path::append_path(cfg::get().output_dir, "bwa_count"),
- cfg::get().max_threads, !cfg::get().bwa.debug);
-
for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto &lib = cfg::get().ds.reads[i];
-
- if (cfg::get().bwa.bwa_enable && lib.is_bwa_alignable()) {
- //Run insert size estimation and pair index filler together to save disc space (removes SAM file right after processing the lib)
- bwa_counter.ProcessLib(i, cfg::get_writable().ds.reads[i], gp.paired_indices[i],
- edge_length_threshold, cfg::get().bwa.min_contig_len);
- } else if (lib.is_paired()) {
- INFO("Estimating insert size for library #" << i);
- const auto &lib_data = lib.data();
- size_t rl = lib_data.read_length;
- size_t k = cfg::get().K;
- bool insert_size_refined = RefineInsertSizeForLib(gp, i, edge_length_threshold);
-
- if (!insert_size_refined) {
- cfg::get_writable().ds.reads[i].data().mean_insert_size = 0.0;
- WARN("Unable to estimate insert size for paired library #" << i);
- if (rl > 0 && rl <= k) {
- WARN("Maximum read length (" << rl << ") should be greater than K (" << k << ")");
- } else if (rl <= k * 11 / 10) {
- WARN("Maximum read length (" << rl << ") is probably too close to K (" << k << ")");
- } else {
- WARN("None of paired reads aligned properly. Please, check orientation of your read pairs.");
+ auto &lib = cfg::get_writable().ds.reads[i];
+ if (lib.is_hybrid_lib()) {
+ INFO("Library #" << i << " was mapped earlier on hybrid aligning stage, skipping");
+ continue;
+ } else if (lib.is_contig_lib()) {
+ INFO("Mapping contigs library #" << i);
+ ProcessSingleReads(gp, i, false);
+ } else {
+ if (lib.is_paired()) {
+ INFO("Estimating insert size for library #" << i);
+ const auto &lib_data = lib.data();
+ size_t rl = lib_data.read_length;
+ size_t k = cfg::get().K;
+
+ size_t edgepairs = 0;
+ if (!CollectLibInformation(gp, edgepairs, i, edge_length_threshold)) {
+ cfg::get_writable().ds.reads[i].data().mean_insert_size = 0.0;
+ WARN("Unable to estimate insert size for paired library #" << i);
+ if (rl > 0 && rl <= k) {
+ WARN("Maximum read length (" << rl << ") should be greater than K (" << k << ")");
+ } else if (rl <= k * 11 / 10) {
+ WARN("Maximum read length (" << rl << ") is probably too close to K (" << k << ")");
+ } else {
+ WARN("None of paired reads aligned properly. Please, check orientation of your read pairs.");
+ }
+ continue;
}
- continue;
- } else {
+
INFO(" Insert size = " << lib_data.mean_insert_size <<
", deviation = " << lib_data.insert_size_deviation <<
", left quantile = " << lib_data.insert_size_left_quantile <<
", right quantile = " << lib_data.insert_size_right_quantile <<
", read length = " << lib_data.read_length);
- if (lib_data.mean_insert_size < 1.1 * (double) rl) {
+ if (lib_data.mean_insert_size < 1.1 * (double) rl)
WARN("Estimated mean insert size " << lib_data.mean_insert_size
<< " is very small compared to read length " << rl);
- }
- }
- }
- }
- for (size_t i = 0; i < cfg::get().ds.reads.lib_count(); ++i) {
- const auto &lib = cfg::get().ds.reads[i];
- if (lib.is_pacbio_alignable()) {
- INFO("Library #" << i << " was mapped by PacBio mapper, skipping");
- continue;
- } else if (lib.is_contig_lib()) {
- INFO("Mapping contigs library #" << i);
- ProcessSingleReads(gp, i, false);
- } else if (cfg::get().bwa.bwa_enable && lib.is_bwa_alignable()) {
- INFO("Library #" << i << " was mapped by BWA, skipping");
- continue;
- } else {
- INFO("Mapping library #" << i);
- bool map_single_reads = ShouldMapSingleReads(i);
- cfg::get_writable().use_single_reads |= map_single_reads;
-
- if(cfg::get().mode == debruijn_graph::config::pipeline_type::meta
- && cfg::get().use_single_reads) {
- map_single_reads = false;
- cfg::get_writable().use_single_reads = false;
- WARN("Single reads mappings are not used in metagenomic mode");
- }
+ std::unique_ptr<PairedInfoFilter> filter;
+ unsigned filter_threshold = cfg::get().de.raw_filter_threshold;
+
+ // Only filter paired-end libraries
+ if (filter_threshold && lib.type() == io::LibraryType::PairedEnd) {
+ filter.reset(new PairedInfoFilter([](const std::pair<EdgeId, EdgeId> &e, uint64_t seed) {
+ uint64_t h1 = e.first.hash();
+ return CityHash64WithSeeds((const char*)&h1, sizeof(h1), e.second.hash(), seed);
+ },
+ 12 * edgepairs));
+
+ INFO("Filtering data for library #" << i);
+ {
+ SequenceMapperNotifier notifier(gp);
+ DEFilter filter_counter(*filter, gp.g);
+ notifier.Subscribe(i, &filter_counter);
+
+ auto reads = paired_binary_readers(lib, false);
+ VERIFY(lib.data().read_length != 0);
+ notifier.ProcessLibrary(reads, i, *ChooseProperMapper(gp, lib, cfg::get().bwa.bwa_enable));
+ }
+ }
- if (lib.is_paired() && lib.data().mean_insert_size != 0.0) {
- INFO("Mapping paired reads (takes a while) ");
- ProcessPairedReads(gp, i, map_single_reads);
- } else if (map_single_reads) {
- INFO("Mapping single reads (takes a while) ");
- ProcessSingleReads(gp, i);
+ INFO("Mapping library #" << i);
+ if (lib.data().mean_insert_size != 0.0) {
+ INFO("Mapping paired reads (takes a while) ");
+ ProcessPairedReads(gp, std::move(filter), filter_threshold, i);
+ }
}
- if (map_single_reads) {
+ if (ShouldMapSingleReads(i)) {
+ cfg::get_writable().use_single_reads = true;
+ INFO("Mapping single reads of library #" << i);
+ ProcessSingleReads(gp, i, /*use_binary*/true, /*map_paired*/true);
INFO("Total paths obtained from single reads: " << gp.single_long_reads[i].size());
}
}
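To make the weighting logic set up in ProcessPairedReads above easier to follow, the same decision can be written as a small factory (MakeWeightF is a hypothetical name, not part of this file): with a filter present, an edge pair contributes to the paired index only if the counting Bloom filter saw it more than filter_threshold times; without a filter every pair gets full weight.

    static LatePairedIndexFiller::WeightF MakeWeightF(PairedInfoFilter *filter,
                                                      unsigned filter_threshold) {
        if (!filter)
            return [](const std::pair<EdgeId, EdgeId> &,
                      const MappingRange&, const MappingRange&) { return 1.; };
        return [=](const std::pair<EdgeId, EdgeId> &ep,
                   const MappingRange&, const MappingRange&) {
            // Counts at or below the raw filter threshold are treated as noise.
            return filter->lookup(ep) > filter_threshold ? 1. : 0.;
        };
    }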
diff --git a/src/projects/spades/repeat_resolving.cpp b/src/projects/spades/repeat_resolving.cpp
index e5044d8..8deb72b 100644
--- a/src/projects/spades/repeat_resolving.cpp
+++ b/src/projects/spades/repeat_resolving.cpp
@@ -5,73 +5,63 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/logger/logger.hpp"
+#include "utils/logger/logger.hpp"
#include "assembly_graph/stats/picture_dump.hpp"
-#include "visualization/graph_labeler.hpp"
-#include "paired_info/distance_estimation.hpp"
-#include "paired_info/smoothing_distance_estimation.hpp"
-#include "algorithms/path_extend/path_extend_launch.hpp"
-#include "assembly_graph/graph_support/contig_output.hpp"
-#include "visualization/position_filler.hpp"
-#include "assembly_graph/graph_alignment/long_read_storage.hpp"
+#include "modules/path_extend/pipeline/launcher.hpp"
+
#include "repeat_resolving.hpp"
namespace debruijn_graph {
-void PEResolving(conj_graph_pack& gp) {
- string scaffolds_name = cfg::get().mode == config::pipeline_type::rna ? "transcripts" : "scaffolds";
- bool output_broke_scaffolds = cfg::get().mode != config::pipeline_type::rna;
-
- path_extend::PathExtendParamsContainer params(cfg::get().pe_params,
+static void PEResolving(conj_graph_pack& gp) {
+ path_extend::PathExtendParamsContainer params(cfg::get().ds,
+ cfg::get().pe_params,
cfg::get().output_dir,
- "final_contigs",
- scaffolds_name,
cfg::get().mode,
cfg::get().uneven_depth,
cfg::get().avoid_rc_connections,
- cfg::get().use_scaffolder,
- output_broke_scaffolds);
+ cfg::get().use_scaffolder);
- path_extend::ResolveRepeatsPe(cfg::get().ds, params, gp);
+ path_extend::PathExtendLauncher expander(cfg::get().ds, params, gp);
+ expander.Launch();
}
-inline bool HasValidLibs() {
+static bool HasValidLibs() {
for (const auto& lib : cfg::get().ds.reads) {
- if (lib.is_repeat_resolvable()) {
- if (!lib.is_paired() || !math::eq(lib.data().mean_insert_size, 0.0)) {
- return true;
- }
+ if (!lib.is_repeat_resolvable())
+ continue;
+
+ if (!lib.is_paired() ||
+ !math::eq(lib.data().mean_insert_size, 0.0)) {
+ return true;
}
}
+
return false;
}
void RepeatResolution::run(conj_graph_pack &gp, const char*) {
- if (cfg::get().developer_mode) {
+ if (cfg::get().developer_mode)
stats::PrepareForDrawing(gp);
- }
- omnigraph::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
stats::detail_info_printer printer(gp, labeler, cfg::get().output_dir);
printer(config::info_printer_pos::before_repeat_resolution);
//todo awful hack to get around PE using cfg::get everywhere...
+ //Is it possible to fix this problem now or still too soon?
auto tmp_params_storage = cfg::get().pe_params;
if (preliminary_) {
INFO("Setting up preliminary path extend settings")
cfg::get_writable().pe_params = *cfg::get().prelim_pe_params;
}
- OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false);
- OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph",gp.components);
bool no_valid_libs = !HasValidLibs();
-
bool use_single_reads = cfg::get().use_single_reads;
if (cfg::get().rr_enable && no_valid_libs && !use_single_reads)
WARN("Insert size was not estimated for any of the paired libraries, repeat resolution module will not run.");
if ((no_valid_libs || cfg::get().rm == config::resolving_mode::none) && !use_single_reads) {
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false);
return;
}
if (cfg::get().rm == config::resolving_mode::path_extend) {
@@ -79,7 +69,6 @@ void RepeatResolution::run(conj_graph_pack &gp, const char*) {
PEResolving(gp);
} else {
INFO("Unsupported repeat resolver");
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false);
}
if (preliminary_) {
INFO("Restoring initial path extend settings")
@@ -87,15 +76,6 @@ void RepeatResolution::run(conj_graph_pack &gp, const char*) {
}
}
-void ContigOutput::run(conj_graph_pack &gp, const char*) {
- OutputContigs(gp.g, cfg::get().output_dir + "simplified_contigs", cfg::get().use_unipaths);
- OutputContigs(gp.g, cfg::get().output_dir + "before_rr", false);
- OutputContigsToFASTG(gp.g, cfg::get().output_dir + "assembly_graph", gp.components);
- OutputContigs(gp.g, cfg::get().output_dir + "final_contigs", false);
-}
} // debruijn_graph
-
-
-
diff --git a/src/projects/spades/repeat_resolving.hpp b/src/projects/spades/repeat_resolving.hpp
index 8178e4a..8d34eeb 100644
--- a/src/projects/spades/repeat_resolving.hpp
+++ b/src/projects/spades/repeat_resolving.hpp
@@ -26,17 +26,5 @@ public:
void run(conj_graph_pack &gp, const char *);
};
-class ContigOutput : public spades::AssemblyStage {
-public:
- ContigOutput()
- : AssemblyStage("Contig Output", "contig_output") { }
-
- void load(conj_graph_pack &, const std::string &, const char *) { }
-
- void save(const conj_graph_pack &, const std::string &, const char *) const { }
-
- void run(conj_graph_pack &gp, const char *);
-};
-
}
diff --git a/src/projects/spades/second_phase_setup.cpp b/src/projects/spades/second_phase_setup.cpp
index 9f09674..1a6854d 100644
--- a/src/projects/spades/second_phase_setup.cpp
+++ b/src/projects/spades/second_phase_setup.cpp
@@ -19,6 +19,7 @@ namespace debruijn_graph {
void SecondPhaseSetup::run(conj_graph_pack &gp, const char*) {
INFO("Preparing second phase");
gp.ClearRRIndices();
+ gp.ClearPaths();
std::string old_pe_contigs_filename = cfg::get().output_dir + "final_contigs.fasta";
std::string new_pe_contigs_filename = cfg::get().output_dir + "first_pe_contigs.fasta";
@@ -35,7 +36,7 @@ void SecondPhaseSetup::run(conj_graph_pack &gp, const char*) {
cfg::get_writable().ds.reads.push_back(untrusted_contigs);
//FIXME get rid of this awful variable
- cfg::get_writable().use_single_reads = false;
+ VERIFY(!cfg::get().use_single_reads);
INFO("Ready to run second phase");
}
diff --git a/src/projects/spades/series_analysis.hpp b/src/projects/spades/series_analysis.hpp
new file mode 100644
index 0000000..7860e51
--- /dev/null
+++ b/src/projects/spades/series_analysis.hpp
@@ -0,0 +1,323 @@
+#pragma once
+
+#include "pipeline/stage.hpp"
+#include "assembly_graph/graph_support/graph_processing_algorithm.hpp"
+#include "assembly_graph/graph_support/basic_edge_conditions.hpp"
+#include "modules/simplification/tip_clipper.hpp"
+#include "projects/mts/contig_abundance.hpp"
+#include "io/reads/osequencestream.hpp"
+
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+
+namespace debruijn_graph {
+
+struct SeriesAnalysisConfig {
+ uint k;
+ uint sample_cnt;
+ uint frag_size;
+ uint min_len;
+
+ std::string kmer_mult, bin, bin_prof, edges_sqn, edges_mpl, edge_fragments_mpl;
+};
+
+}
+
+namespace llvm { namespace yaml {
+
+template<> struct MappingTraits<debruijn_graph::SeriesAnalysisConfig> {
+ static void mapping(IO& io, debruijn_graph::SeriesAnalysisConfig& cfg) {
+ io.mapRequired("k", cfg.k);
+ io.mapRequired("sample_cnt", cfg.sample_cnt);
+ io.mapRequired("kmer_mult", cfg.kmer_mult);
+ io.mapRequired("bin", cfg.bin);
+ io.mapRequired("bin_prof", cfg.bin_prof);
+ io.mapRequired("min_len", cfg.min_len);
+ io.mapRequired("edges_sqn", cfg.edges_sqn);
+ io.mapRequired("edges_mpl", cfg.edges_mpl);
+ io.mapRequired("edge_fragments_mpl", cfg.edge_fragments_mpl);
+ io.mapRequired("frag_size", cfg.frag_size);
+ }
+};
+
+} }
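For reference, a config accepted by the mapping above must provide all of the required keys; an example with made-up paths and values, written as a C++ string constant:

    // Every key below is mapRequired in MappingTraits<SeriesAnalysisConfig>;
    // the values are illustrative only.
    static const char example_series_analysis_cfg[] =
        "k: 21\n"
        "sample_cnt: 5\n"
        "kmer_mult: profiles/kmer_mult\n"
        "bin: BIN1\n"
        "bin_prof: profiles/bins.prof\n"
        "min_len: 2000\n"
        "edges_sqn: out/edges.fasta\n"
        "edges_mpl: out/edges.mpl\n"
        "edge_fragments_mpl: out/edge_fragments.mpl\n"
        "frag_size: 10000\n";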
+
+namespace debruijn_graph {
+
+template<class graph_pack>
+shared_ptr<visualization::graph_colorer::GraphColorer<typename graph_pack::graph_t>> DefaultGPColorer(
+ const graph_pack& gp) {
+ io::SingleRead genome("ref", gp.genome.str());
+ auto mapper = MapperInstance(gp);
+ auto path1 = mapper->MapRead(genome).path();
+ auto path2 = mapper->MapRead(!genome).path();
+ return visualization::graph_colorer::DefaultColorer(gp.g, path1, path2);
+}
+
+inline double l2_norm(const AbundanceVector& v) {
+ double s = 0.;
+ for (auto val : v) {
+ s += val * val;
+ }
+ return std::sqrt(s);
+}
+
+inline double cosine_sim(const AbundanceVector& v1, const AbundanceVector& v2) {
+ double s = 0.;
+ for (size_t i = 0; i < v1.size(); ++i) {
+ s += v1[i] * v2[i];
+ }
+ return s / (l2_norm(v1) * l2_norm(v2));
+}
+
+template<class Graph>
+class EdgeAbundance: public omnigraph::GraphActionHandler<Graph> {
+ typedef map<EdgeId, AbundanceVector> Storage;
+ typedef Storage::const_iterator const_iterator;
+ Storage edge_abundance_;
+ const ContigAbundanceCounter& abundance_counter_;
+
+public:
+ EdgeAbundance(const Graph& g, const ContigAbundanceCounter& abundance_counter) :
+ omnigraph::GraphActionHandler<Graph>(g, "EdgeAbundance"),
+ abundance_counter_(abundance_counter){}
+
+ void Fill() {
+ for (auto it = this->g().ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ HandleAdd(*it);
+ }
+ }
+
+ virtual void HandleAdd(EdgeId e) override {
+ auto ab = abundance_counter_(this->g().EdgeNucls(e).str());
+ if (!ab) {
+ INFO("Couldn't estimate abundance of edge " << this->g().str(e));
+ } else {
+ edge_abundance_[e] = *ab;
+ }
+ }
+
+ const_iterator begin() const {
+ return edge_abundance_.begin();
+ }
+
+ const_iterator end() const {
+ return edge_abundance_.end();
+ }
+
+ const_iterator find(EdgeId e) const {
+ return edge_abundance_.find(e);
+ }
+
+ size_t count(EdgeId e) const {
+ return edge_abundance_.count(e);
+ }
+
+private:
+ DECL_LOGGER("EdgeAbundance");
+};
+
+template<class Graph>
+class AggressiveClearing: public omnigraph::EdgeProcessingAlgorithm<Graph> {
+ typedef typename Graph::EdgeId EdgeId;
+ const EdgeAbundance<Graph>& edge_abundance_;
+ const AbundanceVector base_profile_;
+ const double similarity_threshold_;
+ const double norm_ratio_threshold_;
+ EdgeRemover<Graph> edge_remover_;
+ func::TypedPredicate<EdgeId> topological_condition_;
+
+protected:
+ virtual bool ProcessEdge(EdgeId e) override {
+ DEBUG("Processing edge " << this->g().str(e));
+ if (!topological_condition_(e)) {
+ DEBUG("Topological condition failed");
+ return false;
+ }
+ auto it = edge_abundance_.find(e);
+ if (it == edge_abundance_.end()) {
+ DEBUG("Edge " << this->g().str(e) << " did not have valid abundance profile");
+ return false;
+ }
+ const auto& profile = it->second;
+ DEBUG("Edge profile " << PrintVector(profile));
+ double sim = cosine_sim(profile, base_profile_);
+ double norm_ratio = l2_norm(profile) / l2_norm(base_profile_);
+
+ DEBUG("Similarity between edge and base profiles " << sim);
+ DEBUG("Norm ratio " << norm_ratio);
+ if (math::ls(norm_ratio, norm_ratio_threshold_)
+ || math::ls(sim, similarity_threshold_)) {
+ DEBUG("Removing edge " << this->g().str(e));
+
+ edge_remover_.DeleteEdge(e);
+ return true;
+ }
+ return false;
+ }
+
+public:
+ AggressiveClearing(Graph &g,
+ const EdgeAbundance<Graph>& edge_abundance,
+ const AbundanceVector& base_profile,
+ double similarity_threshold,
+ double norm_ratio_threshold,
+ const std::function<void(EdgeId)> &removal_handler = 0) :
+ EdgeProcessingAlgorithm<Graph>(g, true),
+ edge_abundance_(edge_abundance),
+ base_profile_(base_profile),
+ similarity_threshold_(similarity_threshold),
+ norm_ratio_threshold_(norm_ratio_threshold),
+ edge_remover_(g, removal_handler),
+ topological_condition_(func::Or(AlternativesPresenceCondition<Graph>(g), TipCondition<Graph>(g))) {
+ DEBUG("Base profile " << PrintVector(base_profile_));
+ }
+private:
+ DECL_LOGGER("AggressiveClearing");
+};
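A tiny worked example of the clearing criterion (made-up profiles; thresholds 0.8 and 0.3 as passed by SeriesAnalysis::run below), assuming AbundanceVector is the vector of doubles it is used as elsewhere in this file:

    AbundanceVector base(4, 0.0), edge(4, 0.0);
    base[0] = 10.; base[2] = 10.;                  // base bin profile
    edge[1] = 8.;  edge[3] = 8.;                   // profile of a suspicious edge
    double sim = cosine_sim(edge, base);           // 0.0: the profiles are orthogonal
    double ratio = l2_norm(edge) / l2_norm(base);  // ~0.8
    // math::ls(ratio, 0.3) fails, but math::ls(sim, 0.8) holds,
    // so AggressiveClearing::ProcessEdge would delete this edge.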
+
+class SeriesAnalysis : public spades::AssemblyStage {
+
+ boost::optional<AbundanceVector> InferAbundance(const std::string& bin_mult_fn,
+ const std::string& b_id) const {
+ path::CheckFileExistenceFATAL(bin_mult_fn);
+
+ ifstream is(bin_mult_fn);
+ vector<AbundanceVector> abundances;
+ while (true) {
+ string name;
+ is >> name;
+ if (!is.fail()) {
+ AbundanceVector vec(SampleCount(), 0.0);
+ for (size_t i = 0; i < SampleCount(); ++i) {
+ is >> vec[i];
+ VERIFY(!is.fail());
+ }
+ if (name == b_id) {
+ abundances.push_back(vec);
+ }
+ } else {
+ INFO("Read " << abundances.size() << " profiles for bin " << b_id);
+ break;
+ }
+ }
+ return boost::optional<AbundanceVector>(MeanVector(abundances));
+ }
+
+ void PrintEdgeFragmentProfiles(const conj_graph_pack &gp, const ContigAbundanceCounter &abundance_counter,
+ size_t split_length, size_t min_len, std::ostream &os) const {
+ for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ io::SingleRead full_contig(ToString(gp.g.int_id(e)), gp.g.EdgeNucls(e).str());
+ for (size_t i = 0; i < full_contig.size(); i += split_length) {
+ if (full_contig.size() - i < min_len) {
+ DEBUG("Fragment shorter than min_length_bound " << min_len);
+ break;
+ }
+
+ io::SingleRead contig = full_contig.Substr(i, std::min(i + split_length, full_contig.size()));
+
+ DEBUG("Processing fragment # " << (i / split_length) << " with id " << contig.name());
+
+ auto abundance_vec = abundance_counter(contig.GetSequenceString(), contig.name());
+
+ if (abundance_vec) {
+ size_t len = contig.GetSequenceString().size();
+ os << contig.name() << " " << len << " " << PrintVector(*abundance_vec) << std::endl;
+ //copy(abundance_vec->begin(), abundance_vec->begin() + config.sample_cnt,
+ // ostream_iterator<Mpl>(ss, " "));
+ DEBUG("Successfully estimated abundance of " << contig.name());
+ } else {
+ DEBUG("Failed to estimate abundance of " << contig.name());
+ }
+ }
+ }
+ }
+
+public:
+ SeriesAnalysis() : AssemblyStage("Series Analysis", "series_analysis") { }
+
+ void load(conj_graph_pack &, const std::string &, const char *) { }
+
+ void save(const conj_graph_pack &, const std::string &, const char *) const { }
+
+ void run(conj_graph_pack &gp, const char *) {
+ std::string cfg = cfg::get().series_analysis;
+ INFO("Series analysis enabled with config " << cfg);
+
+ auto Buf = llvm::MemoryBuffer::getFile(cfg);
+ VERIFY_MSG(Buf, "Failed to load config file " + cfg);
+
+ llvm::yaml::Input yin(*Buf.get());
+ SeriesAnalysisConfig config;
+ yin >> config;
+
+ SetSampleCount(config.sample_cnt);
+
+ ContigAbundanceCounter abundance_counter(config.k,
+ SingleClusterAnalyzer(2., 0.4),
+ cfg::get().tmp_dir);
+
+ DEBUG("Initiating abundance counter");
+ abundance_counter.Init(config.kmer_mult);
+ DEBUG("Abundance counter ready");
+
+ if (!config.edges_sqn.empty()) {
+ io::osequencestream oss(config.edges_sqn);
+ for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ string s = gp.g.EdgeNucls(e).str();
+ oss << io::SingleRead(io::MakeContigId(gp.g.int_id(e), s.size()), s);
+ }
+ }
+
+ if (!config.edges_mpl.empty()) {
+ ofstream os(config.edges_mpl);
+ PrintEdgeFragmentProfiles(gp, abundance_counter, -1ul, config.min_len, os);
+ }
+
+ if (!config.edge_fragments_mpl.empty()) {
+ ofstream os(config.edge_fragments_mpl);
+ PrintEdgeFragmentProfiles(gp, abundance_counter, config.frag_size, config.min_len, os);
+ }
+
+ boost::optional<AbundanceVector> bin_profile = InferAbundance(config.bin_prof, config.bin);
+ if (!bin_profile) {
+ ERROR("Couldn't estimate profile of bin");
+ return;
+ }
+
+ EdgeAbundance<Graph> edge_abundance(gp.g, abundance_counter);
+ edge_abundance.Fill();
+
+ gp.EnsureBasicMapping();
+ gp.FillQuality();
+ visualization::graph_labeler::DefaultLabeler<Graph> labeler(gp.g, gp.edge_pos);
+ auto colorer = DefaultGPColorer(gp);
+ path::make_dir(cfg::get().output_dir + "pictures/");
+ QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler(gp.g, gp.edge_qual, labeler, colorer,
+ cfg::get().output_dir + "pictures/");
+
+ INFO("Launching aggressive graph clearing");
+ //positive quality edges removed (folder colored_edges_deleted)
+ AggressiveClearing<Graph> clearing(gp.g, edge_abundance,
+ *bin_profile, 0.8, 0.3, [&](EdgeId e) {
+ qual_removal_handler.HandleDelete(e);});
+ clearing.Run();
+ INFO("Graph clearing finished");
+
+ INFO("Drawing edges with failed abundance estimate")
+ path::make_dir(cfg::get().output_dir + "pictures_no_ab/");
+ QualityEdgeLocalityPrintingRH<Graph> qual_removal_handler2(gp.g, gp.edge_qual, labeler, colorer,
+ cfg::get().output_dir + "pictures_no_ab/");
+
+ for (auto it = gp.g.ConstEdgeBegin(true); !it.IsEnd(); ++it) {
+ EdgeId e = *it;
+ if (edge_abundance.count(e) == 0) {
+ qual_removal_handler2.HandleDelete(e);
+ }
+ }
+ }
+};
+
+}
diff --git a/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp b/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp
index b0c4f8f..de95af6 100644
--- a/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp
+++ b/src/projects/truseq_analysis/AlignmentAnalyserNew.cpp
@@ -9,8 +9,8 @@
// Created by anton on 5/15/15.
//
-#include "dev_support/standard_base.hpp"
-#include "algorithms/dijkstra/dijkstra_helper.hpp"
+#include "utils/standard_base.hpp"
+#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
#include "AlignmentAnalyserNew.hpp"
namespace alignment_analysis {
diff --git a/src/projects/truseq_analysis/AlignmentAnalyserNew.hpp b/src/projects/truseq_analysis/AlignmentAnalyserNew.hpp
index 0ad6484..d0a65d3 100644
--- a/src/projects/truseq_analysis/AlignmentAnalyserNew.hpp
+++ b/src/projects/truseq_analysis/AlignmentAnalyserNew.hpp
@@ -7,7 +7,7 @@
#pragma once
-#include "assembly_graph/graph_core/graph.hpp"
+#include "assembly_graph/core/graph.hpp"
#include "assembly_graph/paths/mapping_path.hpp"
#include "consistent_mapping.h"
diff --git a/src/projects/truseq_analysis/CMakeLists.txt b/src/projects/truseq_analysis/CMakeLists.txt
index 0b07475..3fa5aa1 100644
--- a/src/projects/truseq_analysis/CMakeLists.txt
+++ b/src/projects/truseq_analysis/CMakeLists.txt
@@ -11,5 +11,5 @@ add_executable(truseq_analysis
main.cpp
alignment_analyser.cpp AlignmentAnalyserNew.cpp consistent_mapping.cpp analysis_pipeline.cpp)
-target_link_libraries(truseq_analysis spades_modules ${COMMON_LIBRARIES})
+target_link_libraries(truseq_analysis common_modules ${COMMON_LIBRARIES})
diff --git a/src/projects/truseq_analysis/alignment_analyser.cpp b/src/projects/truseq_analysis/alignment_analyser.cpp
index 9f5c102..11e03ef 100644
--- a/src/projects/truseq_analysis/alignment_analyser.cpp
+++ b/src/projects/truseq_analysis/alignment_analyser.cpp
@@ -5,7 +5,7 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/standard_base.hpp"
+#include "utils/standard_base.hpp"
#include "alignment_analyser.hpp"
namespace alignment_analysis {
diff --git a/src/projects/truseq_analysis/alignment_analyser.hpp b/src/projects/truseq_analysis/alignment_analyser.hpp
index 2da4fde..7bca8d8 100644
--- a/src/projects/truseq_analysis/alignment_analyser.hpp
+++ b/src/projects/truseq_analysis/alignment_analyser.hpp
@@ -7,7 +7,7 @@
#pragma once
-#include "dev_support/standard_base.hpp"
+#include "utils/standard_base.hpp"
#include "pipeline/graph_pack.hpp"
#include "consistent_mapping.h"
@@ -18,7 +18,7 @@ namespace alignment_analysis {
typedef debruijn_graph::conj_graph_pack::graph_t Graph;
typedef Graph::EdgeId EdgeId;
typedef Graph::VertexId VertexId;
- typedef debruijn_graph::NewExtendedSequenceMapper<Graph, debruijn_graph::conj_graph_pack::index_t> Mapper;
+ typedef debruijn_graph::BasicSequenceMapper<Graph, debruijn_graph::conj_graph_pack::index_t> Mapper;
stringstream log_;
const Graph &graph_;
const Mapper &mapper_;
diff --git a/src/projects/truseq_analysis/analysis_pipeline.cpp b/src/projects/truseq_analysis/analysis_pipeline.cpp
index 413e6cc..2b39f5f 100644
--- a/src/projects/truseq_analysis/analysis_pipeline.cpp
+++ b/src/projects/truseq_analysis/analysis_pipeline.cpp
@@ -9,10 +9,10 @@
// Created by anton on 16.05.15.
//
+#include "io/reads/file_reader.hpp"
#include "stages/construction.hpp"
-#include "dev_support/standard_base.hpp"
+#include "utils/standard_base.hpp"
#include "analysis_pipeline.hpp"
-#include "modules/io/reads_io/file_reader.hpp"
spades::VariationDetectionStage::VariationDetectionStage(string output_file, const Config &config) : AssemblyStage("VariationDetection", "variation_detection"),
output_file_(output_file), config_(config) {
@@ -138,4 +138,4 @@ vector <alignment_analysis::ConsistentMapping> spades::VariationDetectionStage::
}
}
return result;
-}
\ No newline at end of file
+}
diff --git a/src/projects/truseq_analysis/analysis_pipeline.hpp b/src/projects/truseq_analysis/analysis_pipeline.hpp
index a2d330f..4269650 100644
--- a/src/projects/truseq_analysis/analysis_pipeline.hpp
+++ b/src/projects/truseq_analysis/analysis_pipeline.hpp
@@ -7,7 +7,7 @@
#pragma once
-#include "dev_support/standard_base.hpp"
+#include "utils/standard_base.hpp"
#include <pipeline/stage.hpp>
#include "alignment_analyser.hpp"
#include "AlignmentAnalyserNew.hpp"
diff --git a/src/projects/truseq_analysis/consistent_mapping.cpp b/src/projects/truseq_analysis/consistent_mapping.cpp
index 2e3cc63..449f9cf 100644
--- a/src/projects/truseq_analysis/consistent_mapping.cpp
+++ b/src/projects/truseq_analysis/consistent_mapping.cpp
@@ -5,7 +5,7 @@
//* See file LICENSE for details.
//***************************************************************************
-#include "dev_support/standard_base.hpp"
+#include "utils/standard_base.hpp"
#include "AlignmentAnalyserNew.hpp"
#include "consistent_mapping.h"
diff --git a/src/projects/truseq_analysis/main.cpp b/src/projects/truseq_analysis/main.cpp
index 3cd961b..1588396 100644
--- a/src/projects/truseq_analysis/main.cpp
+++ b/src/projects/truseq_analysis/main.cpp
@@ -8,10 +8,10 @@
/*
* TruSeq Analysis Main
*/
-#include "dev_support/logger/log_writers.hpp"
-#include "dev_support/segfault_handler.hpp"
-#include "dev_support/memory_limit.hpp"
-#include "dev_support/copy_file.hpp"
+#include "utils/logger/log_writers.hpp"
+#include "utils/segfault_handler.hpp"
+#include "utils/memory_limit.hpp"
+#include "utils/copy_file.hpp"
#include "pipeline/config_struct.hpp"
#include "analysis_pipeline.hpp"
diff --git a/src/spades_pipeline/hammer_logic.py b/src/spades_pipeline/hammer_logic.py
index 1e2b035..1d971b8 100644
--- a/src/spades_pipeline/hammer_logic.py
+++ b/src/spades_pipeline/hammer_logic.py
@@ -75,6 +75,8 @@ def prepare_config_bh(filename, cfg, log):
subst_dict["input_qvoffset"] = cfg.qvoffset
if "count_filter_singletons" in cfg.__dict__:
subst_dict["count_filter_singletons"] = cfg.count_filter_singletons
+ if "read_buffer_size" in cfg.__dict__:
+ subst_dict["count_split_buffer"] = cfg.read_buffer_size
process_cfg.substitute_params(filename, subst_dict, log)
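
The new branch only forwards --read-buffer-size into BayesHammer's config when the option was actually given; a minimal standalone sketch of that substitution step (the cfg object and buffer value below are made up for illustration):

class Cfg(object):
    pass

cfg = Cfg()
cfg.qvoffset = 33
cfg.read_buffer_size = 536870912   # illustrative value only

subst_dict = {}
subst_dict["input_qvoffset"] = cfg.qvoffset
if "read_buffer_size" in cfg.__dict__:
    # note the rename: CLI read_buffer_size -> hammer config key count_split_buffer
    subst_dict["count_split_buffer"] = cfg.read_buffer_size

print(subst_dict["count_split_buffer"])   # 536870912
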
diff --git a/src/spades_pipeline/options_storage.py b/src/spades_pipeline/options_storage.py
index 92e6580..1919e5a 100644
--- a/src/spades_pipeline/options_storage.py
+++ b/src/spades_pipeline/options_storage.py
@@ -27,12 +27,13 @@ MAX_LIBS_NUMBER = 9
OLD_STYLE_READS_OPTIONS = ["--12", "-1", "-2", "-s"]
SHORT_READS_TYPES = {"pe": "paired-end", "s": "single", "mp": "mate-pairs", "hqmp": "hq-mate-pairs", "nxmate": "nxmate"}
# other libs types:
-LONG_READS_TYPES = ["pacbio", "sanger", "nanopore", "trusted-contigs", "untrusted-contigs"]
+LONG_READS_TYPES = ["pacbio", "sanger", "nanopore", "tslr", "trusted-contigs", "untrusted-contigs"]
# final contigs and scaffolds names
contigs_name = "contigs.fasta"
scaffolds_name = "scaffolds.fasta"
assembly_graph_name = "assembly_graph.fastg"
+assembly_graph_name_gfa = "assembly_graph.gfa"
contigs_paths = "contigs.paths"
scaffolds_paths = "scaffolds.paths"
transcripts_name = "transcripts.fasta"
@@ -88,6 +89,7 @@ cov_cutoff = 'off' # default is 'off'
# hidden options
mismatch_corrector = None
reference = None
+series_analysis = None
configs_dir = None
iterations = None
bh_heap_check = None
@@ -129,7 +131,7 @@ dict_of_rel2abs = dict()
long_options = "12= threads= memory= tmp-dir= iterations= phred-offset= sc iontorrent meta large-genome rna plasmid "\
"only-error-correction only-assembler "\
"disable-gzip-output disable-gzip-output:false disable-rr disable-rr:false " \
- "help version test debug debug:false reference= config-file= dataset= "\
+ "help version test debug debug:false reference= series-analysis= config-file= dataset= "\
"bh-heap-check= spades-heap-check= read-buffer-size= help-hidden "\
"mismatch-correction mismatch-correction:false careful careful:false "\
"continue restart-from= diploid truseq cov-cutoff= configs-dir= stop-after=".split()
@@ -245,6 +247,7 @@ def usage(spades_version, show_hidden=False, mode=None):
sys.stderr.write("--sanger\t<filename>\tfile with Sanger reads\n")
sys.stderr.write("--pacbio\t<filename>\tfile with PacBio reads\n")
sys.stderr.write("--nanopore\t<filename>\tfile with Nanopore reads\n")
+ sys.stderr.write("--tslr\t<filename>\tfile with TSLR-contigs\n")
sys.stderr.write("--trusted-contigs\t<filename>\tfile with trusted contigs\n")
sys.stderr.write("--untrusted-contigs\t<filename>\tfile with untrusted contigs\n")
if mode == "dip":
@@ -310,6 +313,7 @@ def usage(spades_version, show_hidden=False, mode=None):
" of mismatches and short indels" + "\n")
sys.stderr.write("--reference\t<filename>\tfile with reference for deep analysis"\
" (only in debug mode)" + "\n")
+ sys.stderr.write("--series-analysis\t<filename>\tconfig for metagenomics-series-augmented reassembly" + "\n")
sys.stderr.write("--configs-dir\t<configs_dir>\tdirectory with configs" + "\n")
sys.stderr.write("-i/--iterations\t<int>\t\tnumber of iterations for read error"\
" correction [default: %s]\n" % ITERATIONS)
@@ -334,8 +338,8 @@ def usage(spades_version, show_hidden=False, mode=None):
def auto_K_allowed():
- return not k_mers and not single_cell and not iontorrent and not meta
- # kmers were set by default, not SC, and not IonTorrent data, and not metagenomic
+ return not k_mers and not single_cell and not iontorrent and not rna and not meta
+ # automatic K is allowed only when k-mers were not set manually, and the data is not SC, not IonTorrent, not RNA and (for now) not meta
def set_default_values():
@@ -501,3 +505,10 @@ def enable_truseq_mode():
correct_scaffolds = True
run_truseq_postprocessing = True
only_assembler = True
+
+
+def will_rerun(options):
+ for opt, arg in options:
+ if opt == '--continue' or opt.startswith('--restart-from'): # checks both --restart-from k33 and --restart-from=k33
+ return True
+ return False
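
will_rerun() operates on the (option, value) pairs produced by getopt; a short sketch of the expected behaviour (the function is copied from the hunk above so the snippet runs standalone, and the option tuples are hand-written examples):

def will_rerun(options):
    for opt, arg in options:
        if opt == '--continue' or opt.startswith('--restart-from'):
            return True
    return False

print(will_rerun([("--restart-from", "k55"), ("-o", "outdir")]))  # True
print(will_rerun([("--continue", "")]))                           # True
print(will_rerun([("-o", "outdir")]))                             # False
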
diff --git a/src/spades_pipeline/spades_logic.py b/src/spades_pipeline/spades_logic.py
index 1aafd6b..29acf9f 100644
--- a/src/spades_pipeline/spades_logic.py
+++ b/src/spades_pipeline/spades_logic.py
@@ -63,6 +63,8 @@ def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage
if "bwa_paired" in cfg.__dict__:
subst_dict["bwa_enable"] = bool_to_str(True)
subst_dict["path_to_bwa"] = os.path.join(execution_home, "bwa-spades")
+ if "series_analysis" in cfg.__dict__:
+ subst_dict["series_analysis"] = cfg.series_analysis
process_cfg.substitute_params(filename, subst_dict, log)
@@ -120,15 +122,17 @@ def reveal_original_k_mers(RL):
def add_configs(command, configs_dir):
#Order matters here!
mode_config_mapping = [("single_cell", "mda_mode"),
- ("meta", "meta_mode"),
+ ("meta", "meta_mode"),
("truseq_mode", "moleculo_mode"),
("rna", "rna_mode"),
+ ("large_genome", "large_genome_mode"),
("plasmid", "plasmid_mode"),
("careful", "careful_mode"),
("diploid_mode", "diploid_mode")]
-
for (mode, config) in mode_config_mapping:
if options_storage.__dict__[mode]:
+ if mode == "rna" or mode == "meta":
+ command.append(os.path.join(configs_dir, "mda_mode.info"))
command.append(os.path.join(configs_dir, config + ".info"))
@@ -362,6 +366,9 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
if os.path.isfile(os.path.join(latest, "scaffolds.paths")):
if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
+ if os.path.isfile(os.path.join(latest, "assembly_graph.gfa")):
+ if not os.path.isfile(cfg.result_graph_gfa) or not options_storage.continue_mode:
+ shutil.copyfile(os.path.join(latest, "assembly_graph.gfa"), cfg.result_graph_gfa)
if os.path.isfile(os.path.join(latest, "assembly_graph.fastg")):
if not os.path.isfile(cfg.result_graph) or not options_storage.continue_mode:
shutil.copyfile(os.path.join(latest, "assembly_graph.fastg"), cfg.result_graph)
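
To make the new rna/meta handling in add_configs() concrete: both modes now get mda_mode.info appended before their own config, and the list order follows mode_config_mapping. A self-contained rendering of that mapping logic (illustration only; the real function reads its flags from options_storage and appends to the assembler command line):

import os

def configs_for(flags, configs_dir="configs"):
    mode_config_mapping = [("single_cell", "mda_mode"),
                           ("meta", "meta_mode"),
                           ("truseq_mode", "moleculo_mode"),
                           ("rna", "rna_mode"),
                           ("large_genome", "large_genome_mode"),
                           ("plasmid", "plasmid_mode"),
                           ("careful", "careful_mode"),
                           ("diploid_mode", "diploid_mode")]
    result = []
    for mode, config in mode_config_mapping:
        if flags.get(mode):
            if mode in ("rna", "meta"):
                result.append(os.path.join(configs_dir, "mda_mode.info"))
            result.append(os.path.join(configs_dir, config + ".info"))
    return result

print(configs_for({"meta": True}))
# ['configs/mda_mode.info', 'configs/meta_mode.info']
print(configs_for({"rna": True, "careful": True}))
# ['configs/mda_mode.info', 'configs/rna_mode.info', 'configs/careful_mode.info']
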
diff --git a/src/spades_pipeline/support.py b/src/spades_pipeline/support.py
index 2df2199..7fc8d15 100644
--- a/src/spades_pipeline/support.py
+++ b/src/spades_pipeline/support.py
@@ -80,6 +80,7 @@ def check_binaries(binary_dir, log):
def check_file_existence(input_filename, message="", log=None, dipspades=False):
filename = abspath(expanduser(input_filename))
+ check_path_is_ascii(filename, message)
if not os.path.isfile(filename):
error("file not found: %s (%s)" % (filename, message), log=log, dipspades=dipspades)
options_storage.dict_of_rel2abs[input_filename] = filename
@@ -88,17 +89,25 @@ def check_file_existence(input_filename, message="", log=None, dipspades=False):
def check_dir_existence(input_dirname, message="", log=None, dipspades=False):
dirname = abspath(expanduser(input_dirname))
+ check_path_is_ascii(dirname, message)
if not os.path.isdir(dirname):
error("directory not found: %s (%s)" % (dirname, message), log=log, dipspades=dipspades)
options_storage.dict_of_rel2abs[input_dirname] = dirname
return dirname
+
+def check_path_is_ascii(path, message=""):
+ if not is_ascii_string(path):
+ error("path contains non-ASCII characters: %s (%s)" % (path, message))
+
+
def ensure_dir_existence(dirname):
if os.path.isfile(dirname):
os.remove(dirname)
if not os.path.exists(dirname):
os.makedirs(dirname)
+
def recreate_dir(dirname):
if os.path.exists(dirname):
shutil.rmtree(dirname)
@@ -172,6 +181,18 @@ def get_available_memory():
return None
+# based on http://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii
+def is_ascii_string(line):
+ try:
+ line.encode('ascii')
+ except UnicodeDecodeError: # python2
+ return False
+ except UnicodeEncodeError: # python3
+ return False
+ else:
+ return True
+
+
def process_readline(line, is_python3=sys.version.startswith('3.')):
if is_python3:
return str(line, 'utf-8').rstrip()
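
check_path_is_ascii() now rejects input paths containing non-ASCII characters up front instead of failing later inside the assembler; the underlying check is small enough to show on its own (copied here so it runs without support.py's error()/logging machinery):

def is_ascii_string(line):
    try:
        line.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):   # python2 / python3
        return False
    return True

print(is_ascii_string("/data/sample_R1.fastq.gz"))      # True
print(is_ascii_string(u"/данные/sample_R1.fastq.gz"))   # False -> check_path_is_ascii() would call error()
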
diff --git a/src/utils/adt/bag.hpp b/src/utils/adt/bag.hpp
deleted file mode 100644
index c5abbb3..0000000
--- a/src/utils/adt/bag.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "dev_support/verify.hpp"
-
-template<class T, class hash = std::hash<T>>
-class bag {
- typedef std::unordered_map<T, size_t, hash> Data;
- Data data_;
- size_t size_;
-public:
-
- bag() : size_(0) {
- }
-
- typedef typename Data::const_iterator const_iterator;
-
- void put(const T& t, size_t mult) {
- VERIFY(mult > 0);
- data_[t] += mult;
- size_ += mult;
- }
-
- void put(const T& t) {
- put(t, 1);
- }
-
- bool take(const T& t, size_t mult) {
- VERIFY(mult > 0);
- /*typename map<T, size_t>::iterator*/auto it = data_.find(t);
- if (it == data_.end()) {
- return false;
- } else {
- size_t have = it->second;
- if (have < mult) {
- data_.erase(it->first);
- size_ -= have;
- return false;
- } else if (have == mult) {
- data_.erase(it->first);
- size_ -= have;
- return true;
- } else {
- it->second -= mult;
- size_ -= mult;
- return true;
- }
- }
- }
-
- bool take(const T& t) {
- return take(t, 1);
- }
-
- size_t mult(const T& t) const {
- auto it = data_.find(t);
- if (it == data_.end()) {
- return 0;
- } else {
- return it->second;
- }
- }
-
- void clear() {
- data_.clear();
- size_ = 0;
- }
-
- const_iterator begin() const {
- return data_.begin();
- }
-
- const_iterator end() const {
- return data_.end();
- }
-
- size_t size() const {
- return size_;
- }
-
-};
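
The removed bag<T> was a thin counted multiset over an unordered_map (put/take/mult/size); for orientation, collections.Counter covers roughly the same operations in Python (a sketch, not a drop-in for the C++ API -- take() below is simplified and does not erase the key on underflow the way the original does):

from collections import Counter

bag = Counter()
bag["ACGT"] += 3            # put(t, 3)
bag["TTAA"] += 1            # put(t)
print(bag["ACGT"])          # mult(t) -> 3
print(sum(bag.values()))    # size() -> 4

# take(t, 2): only succeeds if at least 2 copies are present
if bag["ACGT"] >= 2:
    bag["ACGT"] -= 2
print(bag["ACGT"])          # 1
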
diff --git a/src/utils/adt/concurrent_dsu.hpp b/src/utils/adt/concurrent_dsu.hpp
deleted file mode 100644
index 176a5e3..0000000
--- a/src/utils/adt/concurrent_dsu.hpp
+++ /dev/null
@@ -1,297 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef CONCURRENTDSU_HPP_
-#define CONCURRENTDSU_HPP_
-
-#include "io/kmers_io/mmapped_writer.hpp"
-
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <cstdarg>
-#include <cstdint>
-
-#include <algorithm>
-#include <vector>
-#include <unordered_map>
-#include <atomic>
-#include <fstream>
-
-// Silence bogus gcc warnings
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wconversion"
-
-class ConcurrentDSU {
- struct atomic_set_t {
- uint64_t data : 61;
- uint64_t aux : 2;
- bool root : 1;
- } __attribute__ ((packed));
-
- static_assert(sizeof(atomic_set_t) == 8, "Unexpected size of atomic_set_t");
-
-public:
- ConcurrentDSU(size_t size)
- : data_(size) {
-
- for (size_t i = 0; i < size; i++)
- data_[i] = {.data = 1, .aux = 0, .root = true};
- }
-
- ~ConcurrentDSU() { }
-
- void unite(size_t x, size_t y) {
- uint64_t x_size, y_size;
- uint64_t x_aux, y_aux;
-
- // Step one: update the links
- while (true) {
- x = find_set(x);
- y = find_set(y);
- if (x == y)
- return;
-
- atomic_set_t x_entry = data_[x], y_entry = data_[y];
- // If someone already changed roots => retry
- if (!x_entry.root || !y_entry.root)
- continue;
-
- // We need to link the smallest subtree to the largest
- x_size = x_entry.data, y_size = y_entry.data;
- x_aux = x_entry.aux, y_aux = y_entry.aux;
- if (x_size > y_size || (x_size == y_size && x > y)) {
- std::swap(x, y);
- std::swap(x_size, y_size);
- std::swap(x_aux, y_aux);
- std::swap(x_entry, y_entry);
- }
-
- // Link 'x' to 'y'. If someone already changed 'x' => try again.
- atomic_set_t new_x_entry = {.data = y, .aux = x_aux, .root = false};
- if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
- continue;
-
- break;
- }
-
- // Step two: update the size. We already linked 'x' to 'y'. Therefore we
- // need to add 'x_size' to whichever value is currently inside 'y'.
- while (true) {
- y = find_set(y);
- atomic_set_t y_entry = data_[y];
- // If someone already changed the roots => retry
- if (!y_entry.root)
- continue;
-
- // Update the size. If someone already changed 'y' => try again.
- atomic_set_t new_y_entry = {.data = x_size + y_entry.data, .aux = y_aux, .root = true};
- if (!data_[y].compare_exchange_strong(y_entry, new_y_entry))
- continue;
-
- break;
- }
- }
-
- size_t set_size(size_t i) const {
- while (true) {
- size_t el = find_set(i);
- atomic_set_t entry = data_[el];
- if (!entry.root)
- continue;
-
- return entry.data;
- }
- }
-
- size_t find_set(size_t x) const {
- // Step one: find the root
- size_t r = x;
- atomic_set_t r_entry = data_[r];
- while (!r_entry.root) {
- r = r_entry.data;
- r_entry = data_[r];
- }
-
- // Step two: traverse the path from 'x' to root trying to update the links
- // Note that the links might change, therefore we stop as soon as we'll
- // end at 'some' root.
- while (x != r) {
- atomic_set_t x_entry = data_[x];
- if (x_entry.root)
- break;
-
- // Try to update parent (may fail, it's ok)
- atomic_set_t new_x_entry = {.data = r, .aux = x_entry.aux, .root = false};
- data_[x].compare_exchange_weak(x_entry, new_x_entry);
- x = x_entry.data;
- }
-
- return x;
- }
-
- bool same(size_t x, size_t y) const {
- while (true) {
- x = find_set(x);
- y = find_set(y);
- if (x == y)
- return true;
- if (data_[x].load().root)
- return false;
- }
- }
-
- size_t num_sets() const {
- size_t count = 0;
- for (const auto &entry : data_) {
- count += entry.load(std::memory_order_relaxed).root;
- }
-
- return count;
- }
-
- bool is_root(size_t x) const {
- return data_[x].load(std::memory_order_relaxed).root;
- }
-
- uint64_t aux(size_t x) const {
- return data_[x].load(std::memory_order_relaxed).aux;
- }
-
- uint64_t root_aux(size_t x) const {
- while (true) {
- x = find_set(x);
- atomic_set_t entry = data_[x];
-
- if (!entry.root)
- continue;
-
- return entry.aux;
- }
- }
-
- void set_aux(size_t x, uint64_t data) {
- while (true) {
- atomic_set_t x_entry = data_[x];
- atomic_set_t new_x_entry = {.data = x_entry.data, .aux = data, .root = x_entry.root};
- if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
- continue;
-
- break;
- }
- }
-
- void set_root_aux(size_t x, uint64_t data) {
- while (true) {
- x = find_set(x);
- atomic_set_t x_entry = data_[x];
- if (!x_entry.root)
- continue;
-
- atomic_set_t new_x_entry = {.data = x_entry.data, .aux = data, .root = true};
- if (!data_[x].compare_exchange_strong(x_entry, new_x_entry))
- continue;
-
- break;
- }
- }
-
- size_t extract_to_file(const std::string &Prefix) {
- // First, touch all the sets to make them directly connect to the root
-# pragma omp parallel for
- for (size_t x = 0; x < data_.size(); ++x)
- (void) find_set(x);
-
- std::unordered_map<size_t, size_t> sizes;
-
-#if 0
- for (size_t x = 0; x < size; ++x) {
- if (data_[x].parent != x) {
- size_t t = data_[x].parent;
- VERIFY(data_[t].parent == t)
- }
- }
-#endif
-
- // Insert all the root elements into the map
- sizes.reserve(num_sets());
- for (size_t x = 0; x < data_.size(); ++x) {
- if (is_root(x))
- sizes[x] = 0;
- }
-
- // Now, calculate the counts. We can do this in parallel, because we know no
- // insertion can occur.
-# pragma omp parallel for
- for (size_t x = 0; x < data_.size(); ++x) {
- size_t &entry = sizes[parent(x)];
-# pragma omp atomic
- entry += 1;
- }
-
- // Now we know the sizes of each cluster. Go over again and calculate the
- // file-relative (cumulative) offsets.
- size_t off = 0;
- for (size_t x = 0; x < data_.size(); ++x) {
- if (is_root(x)) {
- size_t &entry = sizes[x];
- size_t noff = off + entry;
- entry = off;
- off = noff;
- }
- }
-
- // Write down the entries
- std::vector<size_t> out(off);
- for (size_t x = 0; x < data_.size(); ++x) {
- size_t &entry = sizes[parent(x)];
- out[entry++] = x;
- }
- std::ofstream os(Prefix, std::ios::binary | std::ios::out);
- os.write((char *) &out[0], out.size() * sizeof(out[0]));
- os.close();
-
- // Write down the sizes
- MMappedRecordWriter<size_t> index(Prefix + ".idx");
- index.reserve(sizes.size());
- size_t *idx = index.data();
- for (size_t x = 0, i = 0, sz = 0; x < data_.size(); ++x) {
- if (is_root(x)) {
- idx[i++] = sizes[x] - sz;
- sz = sizes[x];
- }
- }
-
- return sizes.size();
- }
-
- void get_sets(std::vector<std::vector<size_t> > &otherWay) {
- otherWay.resize(data_.size());
- for (size_t i = 0; i < data_.size(); i++) {
- size_t set = find_set(i);
- otherWay[set].push_back(i);
- }
- otherWay.erase(remove_if(otherWay.begin(), otherWay.end(), zero_size),
- otherWay.end());
- }
-
-private:
- size_t parent(size_t x) const {
- atomic_set_t val = data_[x];
- return (val.root ? x : val.data);
- }
-
- static bool zero_size(const std::vector<size_t> &v) {
- return v.size() == 0;
- }
-
- mutable std::vector<std::atomic<atomic_set_t> > data_;
-};
-
-#pragma GCC diagnostic pop
-
-#endif /* CONCURRENTDSU_HPP_ */
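
ConcurrentDSU above packs size/aux/root into a single 64-bit word and performs unite/find with compare-and-swap loops; stripped of the atomics, the underlying scheme is classic union-by-size with path compression. A sequential Python sketch for orientation only (none of the concurrency or the aux payload is reproduced):

class DSU(object):
    def __init__(self, n):
        self.parent = list(range(n))
        self.size = [1] * n

    def find_set(self, x):
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # path compression: point every visited node straight at the root
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]
        return root

    def unite(self, x, y):
        x, y = self.find_set(x), self.find_set(y)
        if x == y:
            return
        if self.size[x] > self.size[y]:   # always link the smaller tree under the larger
            x, y = y, x
        self.parent[x] = y
        self.size[y] += self.size[x]

d = DSU(5)
d.unite(0, 1)
d.unite(3, 4)
print(d.find_set(0) == d.find_set(1))   # True
print(d.find_set(2) == d.find_set(3))   # False
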
diff --git a/src/utils/adt/function_traits.hpp b/src/utils/adt/function_traits.hpp
deleted file mode 100644
index 5729a41..0000000
--- a/src/utils/adt/function_traits.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef __ADT_FUNCTION_TRAITS__
-#define __ADT_FUNCTION_TRAITS__
-
-#pragma once
-
-#include <functional>
-
-namespace adt {
-
-template<class F>
-struct function_traits;
-
-// function pointer
-template<class R, class... Args>
-struct function_traits<R(*)(Args...)> : public function_traits<R(Args...)> {
-};
-
-// member function pointer
-template<class C, class R, class... Args>
-struct function_traits<R(C::*)(Args...)> : public function_traits<R(C &, Args...)> {
-};
-
-// const member function pointer
-template<class C, class R, class... Args>
-struct function_traits<R(C::*)(Args...) const> : public function_traits<R(C &, Args...)> {
-};
-
-// member object pointer
-template<class C, class R>
-struct function_traits<R(C::*)> : public function_traits<R(C &)> {
-};
-
-template<class R, class... Args>
-struct function_traits<R(Args...)> {
- using return_type = R;
-
- static constexpr std::size_t arity = sizeof...(Args);
-
- template<std::size_t N>
- struct arg {
- static_assert(N < arity, "invalid argument index");
- using type = typename std::tuple_element<N, std::tuple<Args...>>::type;
- };
-};
-
-template<class F>
-struct function_traits<F &> : public function_traits<F> {
-};
-
-template<class F>
-struct function_traits<F &&> : public function_traits<F> {
-};
-
-// functors & default implementation
-template<class F>
-struct function_traits {
-private:
- using call_type = function_traits<decltype(&F::operator())>;
-
-public:
- using return_type = typename call_type::return_type;
-
- // Remember to get rid of this argument
- static constexpr std::size_t arity = call_type::arity - 1;
-
- template<std::size_t N>
- struct arg {
- static_assert(N < arity, "invalid argument index");
- // Remember to get rid of this argument
- using type = typename call_type::template arg<N + 1>::type;
- };
-};
-
-} // namespace adt
-
-#endif // __ADT_FUNCTION_TRAITS__
diff --git a/src/utils/adt/kmer_hash_vector.hpp b/src/utils/adt/kmer_hash_vector.hpp
deleted file mode 100644
index f2b6861..0000000
--- a/src/utils/adt/kmer_hash_vector.hpp
+++ /dev/null
@@ -1,370 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-/*
- * kmer_hash_vector.hpp
- *
- * Created on: Jul 19, 2012
- * Author: alex
- */
-
-#ifndef KMER_HASH_VECTOR_HPP_
-#define KMER_HASH_VECTOR_HPP_
-
-
-#include "data_structures/sequence/runtime_k.hpp"
-#include "kmer_map.hpp"
-
-
-namespace runtime_k {
-
-class IKmerHashVector {
-
-protected:
- static const size_t LOAD_OVERHEAD = 1000;
-
- size_t nthreads_;
-
- size_t cell_size_;
-
-public:
- typedef RtSeq input_value_type;
-
- IKmerHashVector(size_t nthreads)
- : nthreads_ (nthreads)
- , cell_size_ (LOAD_OVERHEAD) {
- }
-
- virtual ~IKmerHashVector() {
-
- }
-
- virtual IKmerHashVector * copy() const = 0;
-
- virtual void clear() = 0;
-
- virtual void clear(size_t i) = 0;
-
- virtual bool is_full() const = 0;
-
- virtual bool is_presisely_full() const = 0;
-
- virtual size_t capacity(size_t i) const = 0;
-
- virtual size_t size(size_t i) const = 0;
-
-
- virtual void insert(const input_value_type& value) = 0;
-
- virtual void reserve(size_t cell_size) = 0;
-
-
- virtual size_t get_k() const = 0;
-
- size_t get_threads_num() const
- {
- return nthreads_;
- }
-
- virtual void dump (KmerMap<int>& destination, size_t bucketNum) = 0;
-};
-
-
-
-class KmerHashVector {
-
-public:
-
- typedef IKmerHashVector base_vector_type;
-
-private:
-
- base_vector_type * data_;
-
-public:
-
- typedef KmerHashVector vector_type;
-
- typedef base_vector_type::input_value_type input_value_type;
-
-
- KmerHashVector(size_t k, size_t nthreads);
-
- KmerHashVector(base_vector_type * vec): data_(vec) {
- }
-
- KmerHashVector(const vector_type& vec) {
- data_ = vec.data_->copy();
- }
-
- vector_type& operator=(const vector_type& vec) {
- if (vec.data_ != data_) {
- delete data_;
- data_ = vec.data_->copy();
- }
-
- return *this;
- }
-
- ~KmerHashVector() {
- delete data_;
- }
-
-
-
- bool is_full() const {
- return data_->is_full();
- }
-
- bool is_presisely_full() const {
- return data_->is_presisely_full();
- }
-
- size_t get_threads_num() const
- {
- return data_->get_threads_num();
- }
-
-
- void insert(const input_value_type& value) {
- data_->insert(value);
- }
-
- void clear() {
- data_->clear();
- }
-
-
- void clear(size_t i) {
- data_->clear(i);
- }
-
- size_t get_k() const {
- return data_->get_k();
- }
-
- size_t capacity(size_t i) const {
- return data_->capacity(i);
- }
-
- void reserve(size_t cell_size) {
- data_->reserve(cell_size);
- }
-
- base_vector_type * get_data() const {
- return data_;
- }
-
- void print_sizes() {
- for (size_t i = 0; i < data_->get_threads_num(); ++i) {
- INFO("Size " << i << ": " << data_->size(i));
- }
- }
-
- void dump (KmerMap<int>& destination, size_t bucketNum) {
- data_->dump(destination, bucketNum);
- }
-};
-
-
-// ================================= VECTOR IMPLEMENTATION =================================
-
-template <size_t size_>
-class KmerHashVectorImpl: public IKmerHashVector {
-
-public:
-
- typedef TypeContainerImpl<size_> type_container;
-
- typedef typename type_container::Kmer Kmer;
-
- typedef typename type_container::vector_type vector_type;
-
- typedef std::vector<vector_type> data_type;
-
- typedef IKmerHashVector base_type;
-
- typedef typename base_type::input_value_type input_value_type;
-
-private:
-
- data_type data_;
-
- size_t k_;
-
-public:
-
- KmerHashVectorImpl(size_t k, size_t nthreads):
- IKmerHashVector(nthreads)
- , data_ (nthreads)
- , k_ (k) {
- }
-
- virtual base_type * copy() const {
- return new KmerHashVectorImpl<size_>(*this);
- }
-
- virtual bool is_full() const {
- return data_[0].size() >= cell_size_;
- }
-
- virtual bool is_presisely_full() const {
- for (size_t i = 0; i < nthreads_; ++i) {
- if (data_[i].size() >= cell_size_)
- return true;
- }
- return false;
- }
-
- virtual void insert(const input_value_type& value) {
- Kmer kmer = type_container::from_sequence(value);
- data_[kmer.GetHash() % nthreads_].push_back(kmer);
- }
-
- virtual void clear() {
- for (size_t i = 0; i < nthreads_; ++i) {
- data_[i].clear();
- }
- }
-
- virtual void clear(size_t i) {
- data_[i].clear();
- }
-
- virtual size_t get_k() const {
- return k_;
- }
-
- virtual size_t capacity(size_t i) const {
- return data_[i].capacity();
- }
-
- virtual size_t size(size_t i) const {
- return data_[i].size();
- }
-
- virtual void reserve(size_t cell_size) {
- cell_size_ = cell_size;
- for (size_t i = 0; i < nthreads_; ++i) {
- data_[i].reserve(cell_size_ + LOAD_OVERHEAD);
- }
- }
-
- const data_type& get_data() const {
- return data_;
- }
-
- virtual void dump (KmerMap<int>& destination, size_t bucketNum) {
- KmerMapImpl<size_, int>& destImpl = dynamic_cast<KmerMapImpl<size_, int>&>(destination.get_data());
-
- for (auto it = data_[bucketNum].begin(), end = data_[bucketNum].end(); it != end; ++it) {
- ++destImpl[*it];
- }
- }
-};
-
-
-// ================================= VECTOR FACTORIES =================================
-// Single factory interface
-class SingleKmerHashVectorFactory {
-
-public:
-
- virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const = 0;
-
- virtual ~SingleKmerHashVectorFactory() {
-
- }
-};
-
-
-// Single factory for specific k and value
-template <size_t ts_>
-class SingleKmerHashVectorFactoryImpl: public SingleKmerHashVectorFactory {
-
-public:
-
- virtual IKmerHashVector * GetHashVector(size_t k, size_t nthreads) const {
- VERIFY_MSG(GET_UPPER_BOUND(k) == GET_K_BY_TS(ts_), k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
- //INFO(k << " -> " << GET_UPPER_BOUND(k) << ", " << ts_ << " -> " << GET_K_BY_TS(ts_));
-
- return new KmerHashVectorImpl< GET_K_BY_TS(ts_) >(k, nthreads);
- }
-
-};
-
-//Factory generator
-template<size_t ts_>
-class HashVectorGenerator {
-
-public:
-
- static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
- factories[ts_] = new SingleKmerHashVectorFactoryImpl<ts_>();
- HashVectorGenerator<ts_ - 1> :: GenerateHashVectors (factories);
- }
-};
-
-//Terminating factory generator
-template<>
-class HashVectorGenerator<MIN_TS> {
-
-public:
-
- static void GenerateHashVectors(std::vector< SingleKmerHashVectorFactory* > & factories) {
- factories[MIN_TS] = new SingleKmerHashVectorFactoryImpl<MIN_TS>;
- }
-};
-
-
-//Lazy singleton for factory for every required value
-class KmerHashVectorFactory {
-
-private:
-
- std::vector < SingleKmerHashVectorFactory* > single_factories_;
-
- KmerHashVectorFactory() {
- VERIFY_MSG(MIN_K <= MAX_K, "Invalid K value range");
-
- single_factories_ = std::vector < SingleKmerHashVectorFactory* >(MAX_TS + 1);
- HashVectorGenerator<MAX_TS>::GenerateHashVectors(single_factories_);
- }
-
-public:
-
- static KmerHashVectorFactory& GetInstance() {
- static KmerHashVectorFactory instance;
-
- return instance;
- }
-
- KmerHashVector GetHashVector(size_t k, size_t nthreads) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return KmerHashVector(single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads));
- }
-
- IKmerHashVector * GetRawHashVector(size_t k, size_t nthreads) {
- VERIFY_MSG(k >= MIN_K && k <= MAX_K, "K value " + ToString(k) + " is not supported, should be >= " +
- ToString(MIN_K) + " and <= " + ToString(MAX_K));
-
- return single_factories_[GET_T_ELEMENTS_NUMBER(k)]->GetHashVector(k, nthreads);
- }
-};
-
-KmerHashVector GetHashVector(size_t k, size_t nthreads) {
- return KmerHashVectorFactory::GetInstance().GetHashVector(k, nthreads);
-}
-
-KmerHashVector::KmerHashVector(size_t k, size_t nthreads): data_(KmerHashVectorFactory::GetInstance().GetRawHashVector(k, nthreads)) {
-}
-
-} //namespace runtime_k
-
-#endif /* KMER_HASH_VECTOR_HPP_ */
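
KmerHashVectorImpl distributes incoming k-mers across per-thread buckets by hash modulo the thread count before they are merged into KmerMap buckets; the distribution step itself is a one-liner. A Python sketch of the same idea (plain string k-mers instead of packed RtSeq, and Python's built-in hash in place of Kmer::GetHash):

def bucket_kmers(kmers, nthreads):
    # mirrors data_[kmer.GetHash() % nthreads_].push_back(kmer)
    buckets = [[] for _ in range(nthreads)]
    for km in kmers:
        buckets[hash(km) % nthreads].append(km)
    return buckets

print(bucket_kmers(["ACGT", "CGTA", "GTAC", "TACG"], 2))
# two lists whose union is the input; the exact split depends on the hash function
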
diff --git a/src/utils/adt/kmer_vector.hpp b/src/utils/adt/kmer_vector.hpp
deleted file mode 100644
index 06b9eb3..0000000
--- a/src/utils/adt/kmer_vector.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef __KMER_VECTOR_HPP__
-#define __KMER_VECTOR_HPP__
-
-#include "array_vector.hpp"
-#include "config.hpp"
-
-#ifdef SPADES_USE_JEMALLOC
-
-# include <jemalloc/jemalloc.h>
-
-#endif
-
-template<class Seq>
-class KMerVector {
-private:
- typedef typename Seq::DataType ElTy;
-
- ElTy *realloc() {
-#ifdef SPADES_USE_JEMALLOC
- // First, try to expand in-place
- if (storage_ && sizeof(ElTy) * capacity_ * el_sz_ > 4096 &&
- je_rallocm((void **) &storage_, NULL, sizeof(ElTy) * capacity_ * el_sz_, 0, ALLOCM_NO_MOVE) ==
- ALLOCM_SUCCESS)
- return storage_;
-
- // Failed, do usual malloc / memcpy / free cycle
- ElTy *res = (ElTy *) je_malloc(sizeof(ElTy) * capacity_ * el_sz_);
- if (storage_)
- std::memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
- je_free(storage_);
- storage_ = res;
-#else
- // No JEMalloc, no cookies
- ElTy *res = new ElTy[capacity_ * el_sz_];
- if (storage_)
- std:: memcpy(res, storage_, size_ * sizeof(ElTy) * el_sz_);
-
- delete[] storage_;
- storage_ = res;
-#endif
-
- return storage_;
- }
-
-public:
- typedef typename array_vector<ElTy>::reference reference;
- typedef typename array_vector<ElTy>::value_type value_type;
- typedef typename array_vector<ElTy>::iterator iterator;
- typedef typename array_vector<ElTy>::const_iterator const_iterator;
-
- typedef array_less<ElTy> less2_fast;
- typedef array_equal_to<ElTy> equal_to;
-
- explicit KMerVector(unsigned K, size_t capacity = 1)
- : K_(K), size_(0), capacity_(std::max(capacity, (size_t) 1)), el_sz_(Seq::GetDataSize(K)), storage_(NULL),
- vector_(realloc(), size_, el_sz_) {
- }
-
- KMerVector(KMerVector &&that)
- : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(that.storage_),
- vector_(storage_, size_, el_sz_) {
- that.storage_ = NULL;
- }
-
- KMerVector(const KMerVector &that)
- : K_(that.K_), size_(that.size_), capacity_(that.capacity_), el_sz_(that.el_sz_), storage_(NULL),
- vector_(realloc(), size_, el_sz_) {
- memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
- }
-
- ~KMerVector() {
-#ifdef SPADES_USE_JEMALLOC
- je_free(storage_);
-#else
- delete[] storage_;
-#endif
- }
-
- KMerVector &operator=(const KMerVector &that) {
- if (this != &that) {
- K_ = that.K_;
- size_ = that.size_;
- capacity_ = that.capacity_;
- el_sz_ = that.el_sz_;
-
- storage_ = NULL;
- realloc();
- memcpy(storage_, that.storage_, size_ * sizeof(ElTy) * el_sz_);
-
- vector_.set_data(storage_);
- vector_.set_size(size_);
- }
-
- return *this;
- }
-
- void push_back(const ElTy *data) {
- if (capacity_ == size_)
- reserve(capacity_ * 2);
-
- vector_[size_] = data;
- size_ += 1;
- vector_.set_size(size_);
- }
-
- void push_back(const Seq &s) {
- push_back(s.data());
- }
-
- void reserve(size_t amount) {
- if (capacity_ < amount) {
- capacity_ = amount;
- vector_.set_data(realloc());
- }
- }
-
- void clear() {
- size_ = 0;
- vector_.set_size(size_);
- }
-
- iterator begin() {
- return vector_.begin();
- }
-
- const_iterator begin() const {
- return vector_.begin();
- }
-
- iterator end() {
- return vector_.end();
- }
-
- const_iterator end() const {
- return vector_.end();
- }
-
- const ElTy *data() const {
- return storage_;
- }
-
- size_t size() const {
- return size_;
- }
-
- size_t el_size() const {
- return el_sz_;
- }
-
- size_t el_data_size() const {
- return el_sz_ * sizeof(ElTy);
- }
-
- size_t capacity() const {
- return capacity_;
- }
-
- const ElTy *operator[](size_t idx) const {
- return vector_[idx];
- }
-
-private:
- unsigned K_;
- size_t size_;
- size_t capacity_;
- size_t el_sz_;
- ElTy *storage_;
- array_vector<ElTy> vector_;
-};
-
-
-#endif /* __KMER_VECTOR_HPP */
diff --git a/src/utils/adt/parallel_seq_vector.hpp b/src/utils/adt/parallel_seq_vector.hpp
deleted file mode 100644
index 209cb84..0000000
--- a/src/utils/adt/parallel_seq_vector.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include "utils/adt/parallel_unordered_map.hpp"
-#include "dev_support/openmp_wrapper.h"
-
-#include "data_structures/sequence/runtime_k.hpp"
-#include "kmer_map.hpp"
-#include "kmer_hash_vector.hpp"
-
-class ParallelSeqVector {
-
-public:
- typedef runtime_k::KmerHashVector par_container_t;
-
- typedef runtime_k::KmerMap<int> destination_container_t;
-
- typedef runtime_k::RtSeq Kmer;
-
-private:
-
- size_t k_;
-
- size_t nthreads_;
-
- std::vector<par_container_t> nodes_;
-
-public:
-
- ParallelSeqVector(size_t k, size_t nthreads, size_t cell_size) :
- k_(k),
- nthreads_(nthreads),
- nodes_()
-
- {
- for (size_t i = 0; i < nthreads_; ++i) {
- nodes_.push_back(runtime_k::GetHashVector(k_, nthreads_));
- }
-
- for (size_t i = 0; i < nthreads_; ++i) {
- nodes_[i].reserve(cell_size);
- }
- }
-
-
- void AddEdge(const Kmer &kmer, size_t thread_number) {
- nodes_[thread_number].insert(kmer);
- }
-
- void CountSequence(const Sequence& s, size_t thread_number) {
- if (s.size() < k_)
- return;
-
- Kmer kmer = s.start<Kmer>(k_);
-
- AddEdge(kmer, thread_number);
- for (size_t j = k_; j < s.size(); ++j) {
- kmer <<= s[j];
- AddEdge(kmer, thread_number);
- }
-
- }
-//
-// void MergeMaps(destination_container_t & dest_container, size_t i) {
-// for (size_t j = 0; j < nthreads_; ++j) {
-// dest_container.transfer(nodes_[j], i);
-// }
-// }
-
- void Dump(destination_container_t & bucket, size_t bucket_number) {
- for (size_t i = 0; i < nodes_.size(); ++i) {
- nodes_[i].dump(bucket, bucket_number);
- nodes_[i].clear(bucket_number);
- }
- }
-
-
- size_t SingleBucketCount() const {
- return nodes_[0].capacity(0);
- }
-
- bool IsFull(size_t i) const {
- return nodes_[i].is_full();
- }
-
- void Clear(size_t i) {
- nodes_[i].clear();
- }
-
- void Clear() {
- for (size_t i = 0; i < nthreads_; ++i) {
- nodes_[i].clear();
- }
- }
-
- void print_sizes() {
- for (size_t i = 0; i < nodes_.size(); ++i) {
- INFO("Size " << i << "::: ");
- nodes_[i].print_sizes();
- }
- }
-
-
-};
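
ParallelSeqVector::CountSequence walks each read once, seeding the first k-mer and then shifting in one character per position; over plain strings the same enumeration looks like this (sketch only; the C++ code shifts packed 2-bit sequences and routes each k-mer to a bucket via AddEdge):

def count_sequence(s, k, add_edge):
    if len(s) < k:
        return
    for j in range(len(s) - k + 1):
        add_edge(s[j:j + k])

seen = []
count_sequence("ACGTAC", 4, seen.append)
print(seen)   # ['ACGT', 'CGTA', 'GTAC']
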
diff --git a/src/utils/adt/queue_iterator.hpp b/src/utils/adt/queue_iterator.hpp
deleted file mode 100644
index c879541..0000000
--- a/src/utils/adt/queue_iterator.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#ifndef QUEUE_ITERATOR_HPP_
-#define QUEUE_ITERATOR_HPP_
-
-#include "dev_support/verify.hpp"
-#include <set>
-
-template<typename T, typename Comparator>
-class erasable_priority_queue {
-private:
- std::set<T, Comparator> storage_;
-public:
- /*
- * Be careful! This constructor requires Comparator to have default constructor even if you call it with
- * specified comparator. In this case just create default constructor with VERIFY(false) inside it.
- */
- erasable_priority_queue(const Comparator& comparator = Comparator()) :
- storage_(comparator) {
- }
-
- template<typename InputIterator>
- erasable_priority_queue(InputIterator begin, InputIterator end,
- const Comparator& comparator = Comparator()) :
- storage_(begin, end, comparator) {
- }
-
- void pop() {
- VERIFY(!storage_.empty());
- storage_.erase(storage_.begin());
- }
-
- const T& top() const {
- VERIFY(!storage_.empty());
- return *(storage_.begin());
- }
-
- void push(const T& key) {
- storage_.insert(key);
- }
-
- bool erase(const T& key) {
- bool res = storage_.erase(key) > 0;
- return res;
- }
-
- void clear() {
- storage_.clear();
- }
-
- bool empty() const {
- return storage_.empty();
- }
-
- size_t size() const {
- return storage_.size();
- }
-
- template <class InputIterator>
- void insert ( InputIterator first, InputIterator last ) {
- storage_.insert(first, last);
- }
-
-};
-
-template<typename T, typename Comparator = std::less<T>>
-class DynamicQueueIterator {
-
- bool current_actual_;
- bool current_deleted_;
- T current_;
- erasable_priority_queue<T, Comparator> queue_;
-
-public:
-
- DynamicQueueIterator(const Comparator& comparator = Comparator()) :
- current_actual_(false), current_deleted_(false), queue_(comparator) {
- }
-
- template<typename InputIterator>
- void insert(InputIterator begin, InputIterator end) {
- queue_.insert(begin, end);
- }
-
- void push(const T& to_add) {
- queue_.push(to_add);
- }
-
- void erase(const T& to_remove) {
- if (current_actual_ && to_remove == current_) {
- current_deleted_ = true;
- }
- queue_.erase(to_remove);
- }
-
- void clear() {
- queue_.clear();
- current_actual_ = false;
- current_deleted_ = false;
- }
-
- bool IsEnd() const {
- return queue_.empty();
- }
-
- size_t size() const {
- return queue_.size();
- }
-
- const T& operator*() {
- VERIFY(!queue_.empty());
- if(!current_actual_ || current_deleted_) {
- current_ = queue_.top();
- current_actual_ = true;
- current_deleted_ = false;
- }
- return current_;
- }
-
- void operator++() {
- if (!current_actual_) {
- queue_.pop();
- } else if (!current_deleted_) {
- queue_.erase(current_);
- }
- current_actual_ = false;
- }
-
- //use carefully!
- void ReleaseCurrent() {
- current_actual_ = false;
- }
-
-};
-
-
-#endif /* QUEUE_ITERATOR_HPP_ */
-
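
erasable_priority_queue is a std::set used as a min-priority queue with erase-by-key, which DynamicQueueIterator needs so that iteration survives deletion of its current element. A rough heap-based Python analogue using lazy deletion (it assumes a key is not pushed again after being erased, which is enough for illustration):

import heapq

class ErasablePQ(object):
    def __init__(self):
        self._heap = []
        self._erased = set()

    def push(self, key):
        heapq.heappush(self._heap, key)

    def erase(self, key):
        self._erased.add(key)   # lazy: actually dropped once it reaches the top

    def _prune(self):
        while self._heap and self._heap[0] in self._erased:
            self._erased.discard(heapq.heappop(self._heap))

    def empty(self):
        self._prune()
        return not self._heap

    def top(self):
        self._prune()
        return self._heap[0]

    def pop(self):
        self._prune()
        heapq.heappop(self._heap)

q = ErasablePQ()
for key in (5, 1, 3):
    q.push(key)
q.erase(1)
print(q.top())   # 3
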
diff --git a/src/utils/levenshtein.hpp b/src/utils/levenshtein.hpp
deleted file mode 100644
index 007966a..0000000
--- a/src/utils/levenshtein.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-//***************************************************************************
-//* Copyright (c) 2015 Saint Petersburg State University
-//* Copyright (c) 2011-2014 Saint Petersburg Academic University
-//* All Rights Reserved
-//* See file LICENSE for details.
-//***************************************************************************
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "dev_support/simple_tools.hpp"
-
-/*
- * Little modified copy-paste from http://www.merriampark.com/ldcpp.htm
- */
-inline size_t edit_distance(const std::string &source, const std::string &target) {
-
- // Step 1
-
- const size_t n = source.length();
- const size_t m = target.length();
- if (n == 0) {
- return m;
- }
- if (m == 0) {
- return n;
- }
-
- // Good form to declare a TYPEDEF
-
- typedef std::vector<std::vector<size_t> > Tmatrix;
-
- Tmatrix matrix(n + 1);
-
- // Size the vectors in the 2.nd dimension. Unfortunately C++ doesn't
- // allow for allocation on declaration of 2.nd dimension of vec of vec
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i].resize(m + 1);
- }
-
- // Step 2
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i][0] = i;
- }
-
- for (size_t j = 0; j <= m; j++) {
- matrix[0][j] = j;
- }
-
- // Step 3
-
- for (size_t i = 1; i <= n; i++) {
-
- const char s_i = source[i - 1];
-
- // Step 4
-
- for (size_t j = 1; j <= m; j++) {
-
- const char t_j = target[j - 1];
-
- // Step 5
-
- size_t cost;
- if (s_i == t_j) {
- cost = 0;
- }
- else {
- cost = 1;
- }
-
- // Step 6
-
- const size_t above = matrix[i - 1][j];
- const size_t left = matrix[i][j - 1];
- const size_t diag = matrix[i - 1][j - 1];
- size_t cell = std::min(above + 1, std::min(left + 1, diag + cost));
-
- // Step 6A: Cover transposition, in addition to deletion,
- // insertion and substitution. This step is taken from:
- // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
- // Enhanced Dynamic Programming ASM Algorithm"
- // (http://www.acm.org/~hlb/publications/asm/asm.html)
-
- if (i > 2 && j > 2) {
- size_t trans = matrix[i - 2][j - 2] + 1;
- if (source[i - 2] != t_j) trans++;
- if (s_i != target[j - 2]) trans++;
- if (cell > trans) cell = trans;
- }
-
- matrix[i][j] = cell;
- }
- }
-
- // Step 7
-
- return matrix[n][m];
-}
-
-inline std::pair<std::pair<int, int>, std::string> best_edit_distance_cigar(const std::string &source,
- const std::string &target) {
-
- // Step 1
-
- const size_t n = source.length();
- const size_t m = target.length();
-// if (n == 0) {
-// return m;
-// }
-// if (m == 0) {
-// return n;
-// }
-
- // Good form to declare a TYPEDEF
-
- typedef std::vector<std::vector<int> > Tmatrix;
-
- Tmatrix matrix(n + 1);
-
- // Size the vectors in the 2.nd dimension. Unfortunately C++ doesn't
- // allow for allocation on declaration of 2.nd dimension of vec of vec
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i].resize(m + 1);
- }
-
- // Step 2
-
- for (size_t i = 0; i <= n; i++) {
- matrix[i][0] = (int) i;
- }
-
- for (size_t j = 0; j <= m; j++) {
- matrix[0][j] = 0; //free inserts in front
- }
-
- // Step 3
-
- for (size_t i = 1; i <= n; i++) {
-
- const char s_i = source[i - 1];
-
- // Step 4
-
- for (size_t j = 1; j <= m; j++) {
-
- const char t_j = target[j - 1];
-
- // Step 5
-
- int cost;
- if (s_i == t_j) {
- cost = 0;
- }
- else {
- cost = 1;
- }
-
- // Step 6
-
- const int above = matrix[i - 1][j];
- const int left = matrix[i][j - 1];
- const int diag = matrix[i - 1][j - 1];
- int cell = std::min(above + 1, std::min(left + 1, diag + cost));
-
- // Step 6A: Cover transposition, in addition to deletion,
- // insertion and substitution. This step is taken from:
- // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
- // Enhanced Dynamic Programming ASM Algorithm"
- // (http://www.acm.org/~hlb/publications/asm/asm.html)
-
-// if (i>2 && j>2) {
-// int trans=matrix[i-2][j-2]+1;
-// if (source[i-2]!=t_j) trans++;
-// if (s_i!=target[j-2]) trans++;
-// if (cell>trans) cell=trans;
-// }
-
- matrix[i][j] = cell;
- }
- }
-
- // Step 7
- int min = matrix[n][m];
- size_t min_m = m;
-
- for (size_t j = 0; j <= m; j++) {
- if (min > matrix[n][j]) {
- min = matrix[n][j];
- min_m = j;
- }
- }
-
-// INFO("min = "<<min<< " min_m = "<< min_m);
- std::string res = "";
- char last_operation = 0;
- int cnt_last_operation = 0;
- size_t cur_pos_i = n;
- size_t cur_pos_j = min_m;
- char cur_operation = 0;
-
-
-// if (min > 0) {
-// for (int i = 0; i <= n; i++) {
-// INFO(ToString(matrix[i]));
-// }
-// }
-
- while ((cur_pos_i > 0) && (cur_pos_j > 0)) {
- if (matrix[cur_pos_i - 1][cur_pos_j] < matrix[cur_pos_i][cur_pos_j]) {
- cur_operation = 'I';
- cur_pos_i--;
- }
- else {
- if (matrix[cur_pos_i][cur_pos_j - 1] < matrix[cur_pos_i][cur_pos_j]) {
- cur_operation = 'D';
- cur_pos_j--;
- }
- else {
- cur_operation = 'M';
- cur_pos_i--;
- cur_pos_j--;
- }
- }
- if (cur_operation != last_operation) {
- if (last_operation != 0)
- res = ToString(cnt_last_operation) + last_operation + res;
- last_operation = cur_operation;
- cnt_last_operation = 1;
- }
- else {
- cnt_last_operation++;
- }
- }
- res = ToString(cnt_last_operation) + last_operation + res;
- return std::make_pair(std::make_pair(cur_pos_j, min_m), res);
-}
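
edit_distance() above is the textbook Levenshtein dynamic program plus the Berghel/Roach transposition extension; best_edit_distance_cigar() reuses the same table with free leading insertions and a traceback that emits a CIGAR-like string. The core recurrence, without the transposition step, fits in a few lines (Python sketch, same O(n*m) table as the original):

def edit_distance(source, target):
    n, m = len(source), len(target)
    dist = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dist[i][0] = i
    for j in range(m + 1):
        dist[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if source[i - 1] == target[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # match / substitution
    return dist[n][m]

print(edit_distance("ACGT", "AGGT"))       # 1
print(edit_distance("kitten", "sitting"))  # 3
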
diff --git a/test_dataset_plasmid/pl1.fq.gz b/test_dataset_plasmid/pl1.fq.gz
new file mode 100644
index 0000000..c938262
Binary files /dev/null and b/test_dataset_plasmid/pl1.fq.gz differ
diff --git a/test_dataset_plasmid/pl2.fq.gz b/test_dataset_plasmid/pl2.fq.gz
new file mode 100644
index 0000000..f24a455
Binary files /dev/null and b/test_dataset_plasmid/pl2.fq.gz differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/spades.git