[med-svn] [spades] 01/06: Imported Upstream version 3.8.2+dfsg
Sascha Steinbiss
satta at debian.org
Mon Jul 11 10:02:05 UTC 2016
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository spades.
commit cba6d9c7b86edd43a3d71a191f02cc3ee98c45b1
Author: Sascha Steinbiss <satta at debian.org>
Date: Mon Jul 11 08:36:20 2016 +0000
Imported Upstream version 3.8.2+dfsg
---
VERSION | 2 +-
changelog.html | 4 +
configs/debruijn/path_extend/pe_params.info | 8 +-
ext/src/ssw/ssw.c | 3 +-
manual.html | 40 ++++----
metaspades.py | 4 +-
plasmidspades.py | 4 +-
spades.py | 4 +-
spades_init.py | 29 ++++--
src/cmake/pack.cmake | 4 +-
.../algorithms/path_extend/extension_chooser.hpp | 108 +++++++++++++--------
.../algorithms/path_extend/loop_traverser.hpp | 15 ++-
.../algorithms/path_extend/overlap_analysis.hpp | 8 +-
.../algorithms/path_extend/path_extend_launch.hpp | 2 +-
src/modules/algorithms/path_extend/pe_io.hpp | 28 +++++-
.../simplification/complex_tip_clipper.hpp | 11 ++-
src/modules/paired_info/pair_info_improver.hpp | 15 +--
src/projects/hammer/hammer_tools.cpp | 1 +
src/projects/spades/distance_estimation.cpp | 4 +-
src/projects/spades/main.cpp | 3 +-
src/spades_pipeline/spades_logic.py | 24 +++--
src/spades_pipeline/support.py | 3 +
22 files changed, 207 insertions(+), 117 deletions(-)
diff --git a/VERSION b/VERSION
index 1693986..0418bab 100644
--- a/VERSION
+++ b/VERSION
@@ -1,2 +1,2 @@
-3.8.1
+3.8.2
diff --git a/changelog.html b/changelog.html
index a11ed47..e83ce50 100644
--- a/changelog.html
+++ b/changelog.html
@@ -3,6 +3,10 @@
<h2>SPAdes Genome Assembler changelog</h2>
+<h3>SPAdes 3.8.2, 10 July 2016</h3>
+
+<p> FIX: Several minor bug-fixes for metaSPAdes and SPAdes pipelines.</p>
+
<h3>SPAdes 3.8.1, 8 June 2016</h3>
<p>FIX: plasmidSPAdes now works with PacBio/Nanopore reads.</p>
diff --git a/configs/debruijn/path_extend/pe_params.info b/configs/debruijn/path_extend/pe_params.info
index 13e04ad..86f1cd6 100644
--- a/configs/debruijn/path_extend/pe_params.info
+++ b/configs/debruijn/path_extend/pe_params.info
@@ -2,7 +2,7 @@ default_pe {
; output options
-debug_output false
+debug_output true
output {
write_overlaped_paths true
@@ -20,7 +20,7 @@ output_broken_scaffolds break_gaps
params {
multi_path_extend false
; old | 2015 | combined | old_pe_2015
- scaffolding_mode old
+ scaffolding_mode old_pe_2015
remove_overlaps true
cut_all_overlaps false
@@ -93,8 +93,8 @@ params {
}
scaffold_graph {
- construct false
- output false
+ construct true
+ output true
always_add 40 ; connection with read count >= always_add are always added to the graph
never_add 5 ; connection with read count < never_add are never added to the graph
relative_threshold 0.25 ; connection with read count >= max_read_count * relative_threshod are added to the graph if satisfy condition above, max_read_count is calculated amond all alternatives
diff --git a/ext/src/ssw/ssw.c b/ext/src/ssw/ssw.c
index a77f39a..39682f2 100755
--- a/ext/src/ssw/ssw.c
+++ b/ext/src/ssw/ssw.c
@@ -470,7 +470,8 @@ static alignment_end* sw_sse2_word (const int8_t* ref,
for (j = 0; LIKELY(j < segLen); ++j) {
vH = _mm_load_si128(pvHStore + j);
vH = _mm_max_epi16(vH, vF);
- _mm_store_si128(pvHStore + j, vH);
+ vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
+ _mm_store_si128(pvHStore + j, vH);
vH = _mm_subs_epu16(vH, vGapO);
vF = _mm_subs_epu16(vF, vGapE);
if (UNLIKELY(! _mm_movemask_epi8(_mm_cmpgt_epi16(vF, vH)))) goto end;
diff --git a/manual.html b/manual.html
index 9e7e799..e6cbad1 100644
--- a/manual.html
+++ b/manual.html
@@ -1,6 +1,6 @@
<html>
<head>
- <title>SPAdes 3.8.1 Manual</title>
+ <title>SPAdes 3.8.2 Manual</title>
<style type="text/css">
.code {
background-color: lightgray;
@@ -8,7 +8,7 @@
</style>
</head>
<body>
-<h1>SPAdes 3.8.1 Manual</h1>
+<h1>SPAdes 3.8.2 Manual</h1>
1. <a href="#sec1">About SPAdes</a><br>
1.1. <a href="#sec1.1">Supported data types</a><br>
@@ -35,18 +35,18 @@
<h2>1. About SPAdes</h2>
<p>
SPAdes – St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. This manual will help you to install and run SPAdes.
-SPAdes version 3.8.1 was released under GPLv2 on June 8, 2016 and can be downloaded from <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
+SPAdes version 3.8.2 was released under GPLv2 on July 10, 2016 and can be downloaded from <a href="http://bioinf.spbau.ru/en/spades" target="_blank">http://bioinf.spbau.ru/en/spades</a>.
<a name="sec1.1"></a>
<h3>1.1 Supported data types</h3>
<p>
The current version of SPAdes works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. You can also provide additional contigs that will be used as long reads.
<p>
- Version 3.8.1 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
+ Version 3.8.2 of SPAdes supports paired-end reads, mate-pairs and unpaired reads. SPAdes can take as input several paired-end and mate-pair libraries simultaneously. Note, that SPAdes was initially designed for small genomes. It was tested on single-cell and standard bacterial and fungal data sets. SPAdes is not intended for larger genomes (e.g. mammalian size genomes). For such purposes you can use it at your own risk.
<p>
- SPAdes 3.8.1 includes metaSPAdes – a pipeline designed specially for metagenomic data sets. To learn more see <a href="#meta">options</a>.
+ SPAdes 3.8.2 includes metaSPAdes – a pipeline designed specially for metagenomic data sets. To learn more see <a href="#meta">options</a>.
<p>
- Also, SPAdes 3.8.1 includes plasmidSPAdes – a pipeline designed for extracting and assembling plasmids from WGS data sets. To learn more see <a href="#plasmid">options</a>.
+ Also, SPAdes 3.8.2 includes plasmidSPAdes – a pipeline designed for extracting and assembling plasmids from WGS data sets. To learn more see <a href="#plasmid">options</a>.
<p>
Additionally, SPAdes has a separate modules for assembling highly polymorphic diploid genomes and for TruSeq barcode assembly. For more information see <a href="dipspades_manual.html" target="_blank">dipSPAdes manual</a> and <a href="truspades_manual.html" target="_blank">truSPAdes manual</a> .
@@ -143,7 +143,7 @@ SPAdes comes in several separate modules:
<li> Running SPAdes without preliminary read error correction (e.g. without BayesHammer or IonHammer) will likely require more time and memory. </li>
<li> Each module removes its temporary files as soon as it finishes. </li>
<li> SPAdes uses 512 Mb per thread for buffers, which results in higher memory consumption. If you set memory limit manually, SPAdes will use smaller buffers and thus less RAM. </li>
- <li> Performance statistics is given for SPAdes version 3.8.1. </li>
+ <li> Performance statistics is given for SPAdes version 3.8.2. </li>
</ul>
@@ -157,13 +157,13 @@ SPAdes comes in several separate modules:
<h3>2.1 Downloading SPAdes Linux binaries</h3>
<p>
- To download <a href="http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
+ To download <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Linux.tar.gz">SPAdes Linux binaries</a> and extract them, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- wget http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Linux.tar.gz
- tar -xzf SPAdes-3.8.1-Linux.tar.gz
- cd SPAdes-3.8.1-Linux/bin/
+ wget http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Linux.tar.gz
+ tar -xzf SPAdes-3.8.2-Linux.tar.gz
+ cd SPAdes-3.8.2-Linux/bin/
</code>
</pre>
@@ -189,13 +189,13 @@ SPAdes comes in several separate modules:
<h3>2.2 Downloading SPAdes binaries for Mac</h3>
<p>
- To obtain <a href="http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
+ To obtain <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Darwin.tar.gz">SPAdes binaries for Mac</a>, go to the directory in which you wish SPAdes to be installed and run:
<pre class="code">
<code>
- curl http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1-Darwin.tar.gz -o SPAdes-3.8.1-Darwin.tar.gz
- tar -zxf SPAdes-3.8.1-Darwin.tar.gz
- cd SPAdes-3.8.1-Darwin/bin/
+ curl http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2-Darwin.tar.gz -o SPAdes-3.8.2-Darwin.tar.gz
+ tar -zxf SPAdes-3.8.2-Darwin.tar.gz
+ cd SPAdes-3.8.2-Darwin/bin/
</code>
</pre>
@@ -230,13 +230,13 @@ SPAdes comes in several separate modules:
</ul>
<p>
- If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1.tar.gz">SPAdes source code</a>:
+ If you meet these requirements, you can download the <a href="http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2.tar.gz">SPAdes source code</a>:
<pre class="code">
<code>
- wget http://spades.bioinf.spbau.ru/release3.8.1/SPAdes-3.8.1.tar.gz
- tar -xzf SPAdes-3.8.1.tar.gz
- cd SPAdes-3.8.1
+ wget http://spades.bioinf.spbau.ru/release3.8.2/SPAdes-3.8.2.tar.gz
+ tar -xzf SPAdes-3.8.2.tar.gz
+ cd SPAdes-3.8.2
</code>
</pre>
@@ -340,7 +340,7 @@ Thank you for using SPAdes!
SPAdes takes as input paired-end reads, mate-pairs and single (unpaired) reads in FASTA and FASTQ. For IonTorrent data SPAdes also supports unpaired reads in unmapped BAM format (like the one produced by Torrent Server). However, in order to run read error correction, reads should be in FASTQ or BAM format. Sanger, Oxford Nanopore and PacBio CLR reads can be provided in both formats since SPAdes does not run error correction for these types of data.
<p>
- To run SPAdes 3.8.1 you need at least one library of the following types:
+ To run SPAdes 3.8.2 you need at least one library of the following types:
<ul>
<li>Illumina paired-end/high-quality mate-pairs/unpaired reads</li>
<li>IonTorrent paired-end/high-quality mate-pairs/unpaired reads</li>
diff --git a/metaspades.py b/metaspades.py
index d06205f..9f5034b 100755
--- a/metaspades.py
+++ b/metaspades.py
@@ -756,7 +756,7 @@ def main(args):
dataset_file.close()
spades_cfg.__dict__["dataset"] = dataset_filename
- latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+ used_K = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
if os.path.isdir(misc_dir) and not options_storage.continue_mode:
shutil.rmtree(misc_dir)
@@ -768,7 +768,7 @@ def main(args):
if k_str.find(":") != -1:
k_str = k_str[:k_str.find(":")]
support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
- log.info("\n===== %s finished. \n" % STAGE_NAME)
+ log.info("\n===== %s finished. Used k-mer sizes: %s \n" % (STAGE_NAME, ', '.join(map(str, used_K))))
if not options_storage.run_completed:
if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
support.finish_here(log)
diff --git a/plasmidspades.py b/plasmidspades.py
index d06205f..9f5034b 100755
--- a/plasmidspades.py
+++ b/plasmidspades.py
@@ -756,7 +756,7 @@ def main(args):
dataset_file.close()
spades_cfg.__dict__["dataset"] = dataset_filename
- latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+ used_K = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
if os.path.isdir(misc_dir) and not options_storage.continue_mode:
shutil.rmtree(misc_dir)
@@ -768,7 +768,7 @@ def main(args):
if k_str.find(":") != -1:
k_str = k_str[:k_str.find(":")]
support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
- log.info("\n===== %s finished. \n" % STAGE_NAME)
+ log.info("\n===== %s finished. Used k-mer sizes: %s \n" % (STAGE_NAME, ', '.join(map(str, used_K))))
if not options_storage.run_completed:
if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
support.finish_here(log)
diff --git a/spades.py b/spades.py
index d06205f..9f5034b 100755
--- a/spades.py
+++ b/spades.py
@@ -756,7 +756,7 @@ def main(args):
dataset_file.close()
spades_cfg.__dict__["dataset"] = dataset_filename
- latest_dir = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
+ used_K = spades_logic.run_spades(tmp_configs_dir, bin_home, spades_cfg, dataset_data, ext_python_modules_home, log)
if os.path.isdir(misc_dir) and not options_storage.continue_mode:
shutil.rmtree(misc_dir)
@@ -768,7 +768,7 @@ def main(args):
if k_str.find(":") != -1:
k_str = k_str[:k_str.find(":")]
support.error("failed to continue from K=%s because this K was not processed in the original run!" % k_str, log)
- log.info("\n===== %s finished. \n" % STAGE_NAME)
+ log.info("\n===== %s finished. Used k-mer sizes: %s \n" % (STAGE_NAME, ', '.join(map(str, used_K))))
if not options_storage.run_completed:
if options_storage.stop_after == 'as' or options_storage.stop_after == 'scc' or (options_storage.stop_after and options_storage.stop_after.startswith('k')):
support.finish_here(log)
diff --git a/spades_init.py b/spades_init.py
old mode 100644
new mode 100755
index ce55e1a..4baebdd
--- a/spades_init.py
+++ b/spades_init.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
@@ -7,16 +9,18 @@
import os
import sys
+from os.path import abspath, dirname, realpath, join, isfile
source_dirs = ["", "truspades", "common"]
# developers configuration
-spades_home = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
-bin_home = os.path.join(spades_home, 'bin')
-python_modules_home = os.path.join(spades_home, 'src')
-ext_python_modules_home = os.path.join(spades_home, 'ext', 'src', 'python_libs')
+spades_home = abspath(dirname(realpath(__file__)))
+bin_home = join(spades_home, 'bin')
+python_modules_home = join(spades_home, 'src')
+ext_python_modules_home = join(spades_home, 'ext', 'src', 'python_libs')
spades_version = ''
+
def init():
global spades_home
global bin_home
@@ -25,14 +29,19 @@ def init():
global ext_python_modules_home
# users configuration (spades_init.py and spades binary are in the same directory)
- if os.path.isfile(os.path.join(spades_home, 'spades')):
- install_prefix = os.path.dirname(spades_home)
- bin_home = os.path.join(install_prefix, 'bin')
- spades_home = os.path.join(install_prefix, 'share', 'spades')
+ if isfile(os.path.join(spades_home, 'spades')):
+ install_prefix = dirname(spades_home)
+ bin_home = join(install_prefix, 'bin')
+ spades_home = join(install_prefix, 'share', 'spades')
python_modules_home = spades_home
ext_python_modules_home = spades_home
for dir in source_dirs:
- sys.path.append(os.path.join(python_modules_home, 'spades_pipeline', dir))
+ sys.path.append(join(python_modules_home, 'spades_pipeline', dir))
+
+ spades_version = open(join(spades_home, 'VERSION'), 'r').readline().strip()
+
- spades_version = open(os.path.join(spades_home, 'VERSION'), 'r').readline().strip()
+if __name__ == '__main__':
+ spades_py_path = join(dirname(realpath(__file__)), 'spades.py')
+ sys.stderr.write('Please use ' + spades_py_path + ' for running SPAdes genome assembler\n')
\ No newline at end of file
diff --git a/src/cmake/pack.cmake b/src/cmake/pack.cmake
index 560c2cb..6b72b71 100644
--- a/src/cmake/pack.cmake
+++ b/src/cmake/pack.cmake
@@ -12,10 +12,10 @@ set(CPACK_PACKAGE_NAME "SPAdes")
set(CPACK_PACKAGE_VENDOR "Saint Petersburg Academic University")
set(CPACK_PACKAGE_DESCRIPTION_FILE "${SPADES_MAIN_SRC_DIR}/../README")
set(CPACK_RESOURCE_FILE_LICENSE "${SPADES_MAIN_SRC_DIR}/../LICENSE")
-set(CPACK_PACKAGE_VERSION "3.8.1")
+set(CPACK_PACKAGE_VERSION "3.8.2")
set(CPACK_PACKAGE_VERSION_MAJOR "3")
set(CPACK_PACKAGE_VERSION_MINOR "8")
-set(CPACK_PACKAGE_VERSION_PATCH "1")
+set(CPACK_PACKAGE_VERSION_PATCH "2")
set(CPACK_STRIP_FILES bin/spades bin/hammer bin/ionhammer bin/dipspades bin/spades-bwa bin/corrector bin/scaffold_correction)
# Source stuff
diff --git a/src/modules/algorithms/path_extend/extension_chooser.hpp b/src/modules/algorithms/path_extend/extension_chooser.hpp
index 13f197c..b00f944 100644
--- a/src/modules/algorithms/path_extend/extension_chooser.hpp
+++ b/src/modules/algorithms/path_extend/extension_chooser.hpp
@@ -1401,8 +1401,8 @@ private:
std::queue<VertexId>& can_be_processed, double path_coverage) const {
DEBUG("Updating can be processed");
for (EdgeId e : g_.OutgoingEdges(v)) {
- VertexId neighbour_v = this->g_.EdgeEnd(e);
- if (g_.length(e) < max_edge_length_in_repeat_ && GoodExtension(e, path_coverage)) {
+ VertexId neighbour_v = g_.EdgeEnd(e);
+ if (g_.length(e) <= max_edge_length_in_repeat_ && CompatibleEdge(e, path_coverage)) {
DEBUG("Adding vertex " << neighbour_v.int_id()
<< "through edge " << g_.str(e));
can_be_processed.push(neighbour_v);
@@ -1443,61 +1443,91 @@ private:
return result;
}
- bool GoodExtension(EdgeId e, double path_coverage) const {
+ bool CompatibleEdge(EdgeId e, double path_coverage) const {
return math::ge(g_.coverage(e), path_coverage * delta_);
}
- EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
- set<EdgeId> good_extensions;
- for(auto edge : edges) {
+ //returns lowest coverage among long compatible edges ahead of e
+ //if std::numeric_limits<double>::max() -- no such edges were detected
+ //if negative -- abort at once
+ double AnalyzeExtension(EdgeId ext, double path_coverage) const {
+ double answer = std::numeric_limits<double>::max();
- if(!GoodExtension(edge.e_, path_coverage)) {
- continue;
- }
+ if (!CompatibleEdge(ext, path_coverage)) {
+ DEBUG("Extension coverage too low");
+ return answer;
+ }
- if (g_.length(edge.e_) > max_edge_length_in_repeat_) {
- if (GoodExtension(edge.e_, path_coverage)) {
- good_extensions.insert(edge.e_);
- }
- continue;
- }
+ if (g_.length(ext) > max_edge_length_in_repeat_) {
+ DEBUG("Long extension");
+ return g_.coverage(ext);
+ }
- GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(edge.e_), path_coverage);
- if (gc.v_size() == 0) {
- return EdgeContainer();
- }
+ DEBUG("Short extension, launching repeat component analysis");
+ GraphComponent<Graph> gc = GetRepeatComponent(g_.EdgeEnd(ext), path_coverage);
+ if (gc.v_size() == 0) {
+ DEBUG("Component search failed");
+ return -1.;
+ }
- for (auto e : gc.edges()) {
- if (g_.length(e) > max_edge_length_in_repeat_) {
- DEBUG("Repeat component contains long edges");
- return EdgeContainer();
- }
+ for (auto e : gc.edges()) {
+ if (g_.length(e) > max_edge_length_in_repeat_) {
+ DEBUG("Repeat component contains long edges");
+ return -1.;
}
+ }
- for (auto v : gc.sinks()) {
- for (auto e : g_.OutgoingEdges(v)) {
- if (GoodExtension(e, path_coverage)) {
- good_extensions.insert(edge.e_);
- }
+ DEBUG("Checking long sinks");
+ for (auto v : gc.sinks()) {
+ for (auto e : g_.OutgoingEdges(v)) {
+ if (g_.length(e) > max_edge_length_in_repeat_ &&
+ CompatibleEdge(e, path_coverage) &&
+ math::ls(g_.coverage(e), answer)) {
+ DEBUG("Updating answer to coverage of edge " << g_.str(e));
+ answer = g_.coverage(e);
}
}
}
- DEBUG("Number of good extensions is " << good_extensions.size());
+ return answer;
+ }
- if (good_extensions.size() != 1) {
- DEBUG("Returning");
- return EdgeContainer();
+ EdgeContainer FindExtensionTroughRepeat(const EdgeContainer& edges, double path_coverage) const {
+ static EdgeContainer EMPTY_CONTAINER;
+
+ map<EdgeId, double> good_extension_to_ahead_cov;
+
+ for (auto edge : edges) {
+ DEBUG("Processing candidate extension " << g_.str(edge.e_));
+ double analysis_res = AnalyzeExtension(edge.e_, path_coverage);
+
+ if (analysis_res == std::numeric_limits<double>::max()) {
+ DEBUG("Ignoring extension");
+ } else if (math::ls(analysis_res, 0.)) {
+ DEBUG("Troubles detected, abort mission");
+ return EMPTY_CONTAINER;
+ } else {
+ good_extension_to_ahead_cov[edge.e_] = analysis_res;
+ DEBUG("Extension mapped to ahead coverage of " << analysis_res);
+ }
}
- auto extension = *good_extensions.begin();
- if(math::ls(path_coverage, g_.coverage(extension) * delta_)) {
- DEBUG("Extension coverage too high");
- return EdgeContainer();
+ DEBUG("Number of good extensions is " << good_extension_to_ahead_cov.size());
+
+ if (good_extension_to_ahead_cov.size() == 1) {
+ auto extension_info = *good_extension_to_ahead_cov.begin();
+ DEBUG("Single extension candidate " << g_.str(extension_info.first));
+ if (math::le(extension_info.second, path_coverage / delta_)) {
+ DEBUG("Extending");
+ return FinalFilter(edges, extension_info.first);
+ } else {
+ DEBUG("Predicted ahead coverage is too high");
+ }
+ } else {
+ DEBUG("Multiple extension candidates");
}
- DEBUG("Filtering... Extend with edge " << extension.int_id());
- return FinalFilter(edges, extension);
+ return EMPTY_CONTAINER;
}
CoverageAwareIdealInfoProvider provider_;
diff --git a/src/modules/algorithms/path_extend/loop_traverser.hpp b/src/modules/algorithms/path_extend/loop_traverser.hpp
index 048615f..57eda57 100644
--- a/src/modules/algorithms/path_extend/loop_traverser.hpp
+++ b/src/modules/algorithms/path_extend/loop_traverser.hpp
@@ -26,6 +26,7 @@ class LoopTraverser {
const Graph& g_;
GraphCoverageMap& covMap_;
shared_ptr<ContigsMaker> extender_;
+ static const size_t MAX_EDGE_LENGTH = 1000;
private:
EdgeId FindStart(const set<VertexId>& component_set) const{
EdgeId result;
@@ -164,7 +165,6 @@ private:
for (size_t i = 0; i < shortest_path.size(); ++i) {
nLen += g_.length(shortest_path[i]);
}
- nLen += g_.k();
}
}
}
@@ -182,6 +182,15 @@ private:
endPath->Clear();
}
+ bool ContainsLongEdges(const GraphComponent<Graph>& component) const {
+ for(auto e : component.edges()) {
+ if(g_.length(e) > MAX_EDGE_LENGTH) {
+ return true;
+ }
+ }
+ return false;
+ }
+
public:
LoopTraverser(const Graph& g, GraphCoverageMap& coverageMap, shared_ptr<ContigsMaker> extender) :
g_(g), covMap_(coverageMap), extender_(extender) {
@@ -189,11 +198,13 @@ public:
void TraverseAllLoops() {
DEBUG("TraverseAllLoops");
- shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, 1000);
+ shared_ptr<GraphSplitter<Graph>> splitter = LongEdgesExclusiveSplitter<Graph>(g_, MAX_EDGE_LENGTH);
while (splitter->HasNext()) {
GraphComponent<Graph> component = splitter->Next();
if (component.v_size() > 10)
continue;
+ if(ContainsLongEdges(component))
+ continue;
set<VertexId> component_set(component.v_begin(), component.v_end());
EdgeId start = FindStart(component_set);
EdgeId finish = FindFinish(component_set);
diff --git a/src/modules/algorithms/path_extend/overlap_analysis.hpp b/src/modules/algorithms/path_extend/overlap_analysis.hpp
index 279dc4a..d773b57 100644
--- a/src/modules/algorithms/path_extend/overlap_analysis.hpp
+++ b/src/modules/algorithms/path_extend/overlap_analysis.hpp
@@ -85,10 +85,10 @@ class SWOverlapAnalyzer {
public:
SWOverlapAnalyzer(size_t flank_length)
: flank_length_(flank_length),
- aligner_(/*match_score*/2,
- /*mismatch_penalty*/6,
- /*gap_opening_penalty*/8,
- /*gap_extending_penalty*/8) {
+ aligner_(/*match_score*/1,
+ /*mismatch_penalty*/3,
+ /*gap_opening_penalty*/4,
+ /*gap_extending_penalty*/3) {
}
diff --git a/src/modules/algorithms/path_extend/path_extend_launch.hpp b/src/modules/algorithms/path_extend/path_extend_launch.hpp
index 7acdeff..5b11bc7 100644
--- a/src/modules/algorithms/path_extend/path_extend_launch.hpp
+++ b/src/modules/algorithms/path_extend/path_extend_launch.hpp
@@ -351,7 +351,7 @@ inline shared_ptr<SimpleExtensionChooser> MakeMetaExtensionChooser(const conj_gr
VERIFY(cfg::get().mode == config::pipeline_type::meta);
VERIFY(!lib->IsMp());
shared_ptr<WeightCounter> wc = make_shared<MetagenomicWeightCounter>(gp.g, lib, /*read_length*/cfg::get().ds.RL(),
- /*normalized_threshold*/ 0.3, /*raw_threshold*/ 3, /*estimation_edge_length*/ 300);
+ /*normalized_threshold*/ 0.3, /*raw_threshold*/ 3, /*estimation_edge_length*/ 0);
return make_shared<SimpleExtensionChooser>(gp.g, wc,
pset.extension_options.weight_threshold,
pset.extension_options.priority_coeff);
diff --git a/src/modules/algorithms/path_extend/pe_io.hpp b/src/modules/algorithms/path_extend/pe_io.hpp
index 60c22f1..4aa9ffd 100644
--- a/src/modules/algorithms/path_extend/pe_io.hpp
+++ b/src/modules/algorithms/path_extend/pe_io.hpp
@@ -43,20 +43,37 @@ protected:
ss << constructor_.construct(path[0]).first.substr(0, k_);
}
- for (size_t i = 0; i < path.Size(); ++i) {
+
+ size_t i = 0;
+ while (i < path.Size()) {
int gap = i == 0 ? 0 : path.GapAt(i);
if (gap > (int) k_) {
for (size_t j = 0; j < gap - k_; ++j) {
ss << "N";
}
ss << constructor_.construct(path[i]).first;
- } else {
+ }
+ else {
int overlapLen = (int) k_ - gap;
if (overlapLen >= (int) g_.length(path[i]) + (int) k_) {
- if(overlapLen > (int) g_.length(path[i]) + (int) k_) {
- WARN("Such scaffolding logic leads to local misassemblies");
+ overlapLen -= (int) g_.length(path[i]) + (int) k_;
+ ++i;
+ //skipping overlapping edges
+ while (i < path.Size() && overlapLen >= (int) g_.length(path[i]) + path.GapAt(i)) {
+ overlapLen -= (int) g_.length(path[i]) + path.GapAt(i);
+ ++i;
+ }
+ if (i == path.Size()) {
+ break;
+ }
+
+ overlapLen = overlapLen + (int) k_ - path.GapAt(i);
+ if(overlapLen < 0) {
+ for (size_t j = 0; j < abs(overlapLen); ++j) {
+ ss << "N";
+ }
+ overlapLen = 0;
}
- continue;
}
auto temp_str = g_.EdgeNucls(path[i]).Subseq(overlapLen).str();
if(i != path.Size() - 1) {
@@ -69,6 +86,7 @@ protected:
}
ss << temp_str;
}
+ ++i;
}
return ss.str();
}
diff --git a/src/modules/algorithms/simplification/complex_tip_clipper.hpp b/src/modules/algorithms/simplification/complex_tip_clipper.hpp
index a5fd240..984cfd5 100644
--- a/src/modules/algorithms/simplification/complex_tip_clipper.hpp
+++ b/src/modules/algorithms/simplification/complex_tip_clipper.hpp
@@ -107,7 +107,12 @@ public:
DEBUG("Processing vertex " << g_.str(*it));
DominatedSetFinder<Graph> dom_finder(g_, *it, max_path_length_ * 2);
- dom_finder.FillDominated();
+
+ if(!dom_finder.FillDominated()) {
+ DEBUG("Tip contains too long paths");
+ continue;
+ }
+
auto component = dom_finder.AsGraphComponent();
if(!CheckEdgeLenghts(component)) {
@@ -142,8 +147,8 @@ public:
RemoveComplexTip(component);
}
CompressAllVertices(g_);
- INFO("Complex tip clipper finished");
- INFO("Tips processed " << cnt);
+ DEBUG("Complex tip clipper finished");
+ DEBUG("Tips processed " << cnt);
return something_done_flag;
}
private:
diff --git a/src/modules/paired_info/pair_info_improver.hpp b/src/modules/paired_info/pair_info_improver.hpp
index 89c8945..f1b392a 100644
--- a/src/modules/paired_info/pair_info_improver.hpp
+++ b/src/modules/paired_info/pair_info_improver.hpp
@@ -84,8 +84,8 @@ class PairInfoImprover {
public:
PairInfoImprover(const Graph& g,
Index& clustered_index,
- const io::SequencingLibrary<config::DataSetData> &lib)
- : graph_(g), index_(clustered_index), lib_(lib) { }
+ const io::SequencingLibrary<config::DataSetData> &lib, size_t max_repeat_length)
+ : graph_(g), index_(clustered_index), lib_(lib), max_repeat_length_(max_repeat_length) { }
void ImprovePairedInfo(unsigned num_threads = 1) {
CorrectPairedInfo(num_threads);
@@ -107,13 +107,13 @@ class PairInfoImprover {
public:
ContradictionalRemover(omnigraph::de::PairedInfoIndicesT<Graph> &to_remove,
const Graph &g,
- omnigraph::de::PairedInfoIndexT<Graph>& index)
- : to_remove_(to_remove), graph_(g), index_(index) {}
+ omnigraph::de::PairedInfoIndexT<Graph>& index, size_t max_repeat_length)
+ : to_remove_(to_remove), graph_(g), index_(index), max_repeat_length_(max_repeat_length) {}
bool operator()(EdgeId e) {
omnigraph::de::PairedInfoIndexT<Graph> &to_remove = to_remove_[omp_get_thread_num()];
- if (graph_.length(e)>= cfg::get().max_repeat_length && index_.contains(e))
+ if (graph_.length(e)>= max_repeat_length_ && index_.contains(e))
FindInconsistent(e, to_remove);
return false;
@@ -176,6 +176,7 @@ class PairInfoImprover {
omnigraph::de::PairedInfoIndicesT<Graph> &to_remove_;
const Graph &graph_;
Index& index_;
+ size_t max_repeat_length_;
};
size_t RemoveContradictional(unsigned nthreads) {
@@ -184,7 +185,7 @@ class PairInfoImprover {
omnigraph::de::PairedInfoIndicesT<Graph> to_remove(graph_, nthreads);
// FIXME: Replace with lambda
- ContradictionalRemover remover(to_remove, graph_, index_);
+ ContradictionalRemover remover(to_remove, graph_, index_, max_repeat_length_);
ParallelEdgeProcessor<Graph>(graph_, nthreads).Run(remover);
DEBUG("ParallelRemoveContraditional: Threads finished");
@@ -272,7 +273,7 @@ class PairInfoImprover {
const Graph& graph_;
Index& index_;
const io::SequencingLibrary<config::DataSetData>& lib_;
-
+ size_t max_repeat_length_;
DECL_LOGGER("PairInfoImprover")
};
diff --git a/src/projects/hammer/hammer_tools.cpp b/src/projects/hammer/hammer_tools.cpp
index 7298097..1fa2461 100644
--- a/src/projects/hammer/hammer_tools.cpp
+++ b/src/projects/hammer/hammer_tools.cpp
@@ -185,6 +185,7 @@ void CorrectPairedReadFiles(const KMerData &data,
INFO("Written batch " << buffer_no);
++buffer_no;
}
+ VERIFY_MSG(irsl.eof() && irsr.eof(), "Pair of read files " + fnamel + " and " + fnamer + " contain unequal amount of reads");
}
std::string getLargestPrefix(const std::string &str1, const std::string &str2) {
diff --git a/src/projects/spades/distance_estimation.cpp b/src/projects/spades/distance_estimation.cpp
index 4444f90..481a457 100644
--- a/src/projects/spades/distance_estimation.cpp
+++ b/src/projects/spades/distance_estimation.cpp
@@ -176,7 +176,9 @@ void estimate_distance(conj_graph_pack& gp,
INFO("The refining of clustered pair information has been finished "); // if so, it resolves such conflicts.
INFO("Improving paired information");
- PairInfoImprover<Graph> improver(gp.g, clustered_index, lib);
+ PairInfoImprover<Graph> improver(gp.g, clustered_index, lib,
+ config.mode == debruijn_graph::config::pipeline_type::meta ? std::numeric_limits<size_t>::max() : config.max_repeat_length);
+
improver.ImprovePairedInfo((unsigned) config.max_threads);
if (cfg::get().pe_params.param_set.scaffolder_options.cluster_info) {
diff --git a/src/projects/spades/main.cpp b/src/projects/spades/main.cpp
index 0f28162..5b66301 100644
--- a/src/projects/spades/main.cpp
+++ b/src/projects/spades/main.cpp
@@ -72,7 +72,7 @@ int main(int argc, char **argv) {
for (const auto& cfg_fn : cfg_fns)
INFO("Loading config from " << cfg_fn);
-
+
VERIFY(cfg::get().K >= runtime_k::MIN_K && cfg::get().K < runtime_k::MAX_K);
VERIFY(cfg::get().K % 2 != 0);
@@ -85,6 +85,7 @@ int main(int argc, char **argv) {
SPADES_GIT_REFSPEC
", git revision "
SPADES_GIT_SHA1);
+ INFO("Maximum k-mer length: " << runtime_k::MAX_K);
INFO("Assembling dataset (" << cfg::get().dataset_file << ") with K=" << cfg::get().K);
spades::assemble_genome();
diff --git a/src/spades_pipeline/spades_logic.py b/src/spades_pipeline/spades_logic.py
index e9237cc..c04f5e9 100644
--- a/src/spades_pipeline/spades_logic.py
+++ b/src/spades_pipeline/spades_logic.py
@@ -88,19 +88,19 @@ def update_k_mers_in_special_cases(cur_k_mers, RL, log, silent=False):
if options_storage.auto_K_allowed():
if RL >= 250:
if not silent:
- support.warning("Default k-mer sizes were set to %s because estimated "
- "read length (%d) is equal to or greater than 250" % (str(options_storage.K_MERS_250), RL), log)
+ log.info("Default k-mer sizes were set to %s because estimated "
+ "read length (%d) is equal to or greater than 250" % (str(options_storage.K_MERS_250), RL))
return options_storage.K_MERS_250
if RL >= 150:
if not silent:
- support.warning("Default k-mer sizes were set to %s because estimated "
- "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log)
+ log.info("Default k-mer sizes were set to %s because estimated "
+ "read length (%d) is equal to or greater than 150" % (str(options_storage.K_MERS_150), RL), log)
return options_storage.K_MERS_150
if RL <= max(cur_k_mers):
new_k_mers = [k for k in cur_k_mers if k < RL]
if not silent:
- support.warning("K-mer sizes were set to %s because estimated "
- "read length (%d) is less than %d" % (str(new_k_mers), RL, max(cur_k_mers)), log)
+ log.info("K-mer sizes were set to %s because estimated "
+ "read length (%d) is less than %d" % (str(new_k_mers), RL, max(cur_k_mers)), log)
return new_k_mers
return cur_k_mers
@@ -201,6 +201,7 @@ def prepare_config_scaffold_correction(filename, cfg, log, saves_dir, K):
#todo
process_cfg.substitute_params(filename, subst_dict, log)
+
def run_scaffold_correction(configs_dir, execution_home, cfg, log, latest, K):
data_dir = os.path.join(cfg.output_dir, "SCC", "K%d" % K)
saves_dir = os.path.join(data_dir, 'saves')
@@ -232,6 +233,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
if not isinstance(cfg.iterative_K, list):
cfg.iterative_K = [cfg.iterative_K]
cfg.iterative_K = sorted(cfg.iterative_K)
+ used_K = []
# checking and removing conflicting K-mer directories
if options_storage.restart_from:
@@ -249,7 +251,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
k_to_delete = []
for id, k in enumerate(needed_K):
if len(processed_K) == id:
- if processed_K[-1] == original_K[-1]: # the last K in the original run was processed in "last_one" mode
+ if processed_K[-1] == original_K[-1]: # the last K in the original run was processed in "last_one" mode
k_to_delete = [original_K[-1]]
break
if processed_K[id] != k:
@@ -272,8 +274,10 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
K = cfg.iterative_K[0]
if len(cfg.iterative_K) == 1:
run_iteration(configs_dir, execution_home, cfg, log, K, None, True)
+ used_K.append(K)
else:
run_iteration(configs_dir, execution_home, cfg, log, K, None, False)
+ used_K.append(K)
if options_storage.stop_after == "k%d" % K:
finished_on_stop_after = True
else:
@@ -290,6 +294,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
"Rerunning for the first value of K (%d) with Repeat Resolving" %
(cfg.iterative_K[1], RL, cfg.iterative_K[0]), log)
run_iteration(configs_dir, execution_home, cfg, log, cfg.iterative_K[0], None, True)
+ used_K.append(cfg.iterative_K[0])
K = cfg.iterative_K[0]
else:
rest_of_iterative_K = cfg.iterative_K
@@ -299,6 +304,7 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
count += 1
last_one = count == len(cfg.iterative_K) or (rest_of_iterative_K[count] + 1 > RL)
run_iteration(configs_dir, execution_home, cfg, log, K, prev_K, last_one)
+ used_K.append(K)
prev_K = K
if last_one:
break
@@ -354,8 +360,6 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
if not os.path.isfile(cfg.result_scaffolds_paths) or not options_storage.continue_mode:
shutil.copyfile(os.path.join(latest, "scaffolds.paths"), cfg.result_scaffolds_paths)
-
-
if cfg.developer_mode:
# saves
saves_link = os.path.join(os.path.dirname(cfg.result_contigs), "saves")
@@ -368,4 +372,4 @@ def run_spades(configs_dir, execution_home, cfg, dataset_data, ext_python_module
if os.path.isdir(cfg.tmp_dir):
shutil.rmtree(cfg.tmp_dir)
- return latest
+ return used_K
diff --git a/src/spades_pipeline/support.py b/src/spades_pipeline/support.py
index 77b8fa0..da7e65a 100644
--- a/src/spades_pipeline/support.py
+++ b/src/spades_pipeline/support.py
@@ -104,6 +104,7 @@ def recreate_dir(dirname):
shutil.rmtree(dirname)
os.makedirs(dirname)
+
def check_files_duplication(filenames, log):
for filename in filenames:
if filenames.count(filename) != 1:
@@ -847,6 +848,8 @@ def read_fasta(filename, gzipped=False):
file_handler = open(filename)
for line in file_handler:
line = process_readline(line, gzipped and sys.version.startswith('3.'))
+ if not line:
+ continue
if line[0] == '>':
res_name.append(line.strip())
if not first:
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/spades.git
More information about the debian-med-commit
mailing list