[med-svn] [Git][med-team/abyss][upstream] New upstream version 2.1.0
Andreas Tille
gitlab at salsa.debian.org
Fri Apr 20 09:44:19 BST 2018
Andreas Tille pushed to branch upstream at Debian Med / abyss
Commits:
e2ac24df by Andreas Tille at 2018-04-20T09:57:31+02:00
New upstream version 2.1.0
- - - - -
16 changed files:
- ChangeLog
- Common/ContigID.h
- Common/Histogram.h
- Common/IOUtil.h
- Common/SAM.h
- MergePaths/MergePaths.cpp
- Misc/samtobreak.hs
- README.md
- Scaffold/longseqdist.cpp
- Scaffold/scaffold.cc
- SimpleGraph/SimpleGraph.cpp
- bin/abyss-pe
- configure.ac
- doc/ABYSS.1
- doc/abyss-pe.1
- doc/abyss-tofastq.1
Changes:
=====================================
ChangeLog
=====================================
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,36 @@
-2017-12-19 Ben Vandervalk <benv at bcgsc.ca>
+2018-04-13 Ben Vandervalk <benv at bcgsc.ca>
+
+ * Release version 2.1.0
+ * Adds support for misassembly correction and scaffolding
+ using linked reads, using Tigmint and ARCS. (Tigmint and
+ ARCS must be installed separately.)
+ * Support simultaneous optimization of `s` (min sequence length)
+ and `n` (min supporting read pairs / Chromium barcodes)
+ during scaffolding
+
+ abyss-longseqdist:
+ * Fix hang on input SAM containing no alignments with MAPQ > 0
+
+ abyss-pe:
+ * New `lr` parameter. Provide linked reads (i.e. 10x Genomics
+ Chromium reads) via this parameter to perform misassembly
+ correction and scaffolding using Chromium barcode information.
+ Requires Tigmint and ARCS tools to be installed in addition
+ to ABySS.
+ * Fix bug where `j` (threads) was not being correctly passed to
+ to `bgzip`/`pigz`
+ * Fix bug where `zsh` time/memory profiling was not being used,
+ even when `zsh` was available
+
+ abyss-scaffold:
+ * Simultaneous optimization of `n` and `s` using line search
+ or grid search [default]
+
+ SimpleGraph:
+ * add options `-s` and `-n` to filter paired-end paths by
+ seed length and edge weight, respectively
+
+2018-03-14 Ben Vandervalk <benv at bcgsc.ca>
* Release version 2.0.3
* Many compiler fixes for GCC >= 6, Boost >= 1.64
=====================================
Common/ContigID.h
=====================================
--- a/Common/ContigID.h
+++ b/Common/ContigID.h
@@ -24,13 +24,32 @@ static inline void setNextContigName(cstring s)
g_nextContigName = 0;
}
+/**
+ * Set the next contig name returned by createContigName
+ * to one plus the current largest numeric contig name.
+ */
+static inline void setNextContigName()
+{
+ if (g_contigNames.empty()) {
+ g_nextContigName = 0;
+ return;
+ }
+ unsigned maxContigName = 0;
+ for (unsigned i = 0; i < g_contigNames.size(); ++i) {
+ cstring s = g_contigNames.getName(i);
+ std::istringstream iss((std::string)s);
+ unsigned contigName;
+ if (iss >> contigName && iss.eof() && contigName > maxContigName)
+ maxContigName = contigName;
+ }
+ g_nextContigName = 1 + maxContigName;
+}
+
/** Return the next unique contig name. */
static inline std::string createContigName()
{
- if (g_nextContigName == 0) {
- assert(!g_contigNames.empty());
- setNextContigName(g_contigNames.back());
- }
+ if (g_nextContigName == 0)
+ setNextContigName();
std::ostringstream ss;
ss << g_nextContigName++;
return ss.str();
=====================================
Common/Histogram.h
=====================================
--- a/Common/Histogram.h
+++ b/Common/Histogram.h
@@ -315,6 +315,29 @@ namespace std {
inline void swap(Histogram&, Histogram&) { assert(false); }
}
+/** Print assembly contiguity statistics header. */
+static inline std::ostream& printContiguityStatsHeader(
+ std::ostream& out,
+ unsigned minSize,
+ const std::string& sep = "\t",
+ const long long unsigned expSize = 0)
+{
+ out << "n" << sep
+ << "n:" << minSize << sep
+ << "L50" << sep;
+ if (expSize > 0)
+ out << "LG50" << sep
+ << "NG50" << sep;
+ return out << "min" << sep
+ << "N80" << sep
+ << "N50" << sep
+ << "N20" << sep
+ << "E-size" << sep
+ << "max" << sep
+ << "sum" << sep
+ << "name" << '\n';
+}
+
/** Print assembly contiguity statistics. */
static inline std::ostream& printContiguityStats(
std::ostream& out, const Histogram& h0,
@@ -323,22 +346,8 @@ static inline std::ostream& printContiguityStats(
const long long unsigned expSize = 0)
{
Histogram h = h0.trimLow(minSize);
- if (printHeader) {
- out << "n" << sep
- << "n:" << minSize << sep
- << "L50" << sep;
- if (expSize > 0)
- out << "LG50" << sep
- << "NG50" << sep;
- out << "min" << sep
- << "N80" << sep
- << "N50" << sep
- << "N20" << sep
- << "E-size" << sep
- << "max" << sep
- << "sum" << sep
- << "name" << '\n';
- }
+ if (printHeader)
+ printContiguityStatsHeader(out, minSize, sep, expSize);
unsigned n50 = h.n50();
out << toEng(h0.size()) << sep
<< toEng(h.size()) << sep
=====================================
Common/IOUtil.h
=====================================
--- a/Common/IOUtil.h
+++ b/Common/IOUtil.h
@@ -51,9 +51,12 @@ static inline std::istream& operator>>(std::istream& in, expect o)
if (!in || c != *p) {
std::cerr << "error: Expected `" << p
<< "' and saw ";
- if (in)
+ if (in) {
std::cerr << '`' << c << "'\n";
- else if (in.eof())
+ std::string s;
+ if (getline(in, s) && !s.empty())
+ std::cerr << "near: " << c << s << '\n';
+ } else if (in.eof())
std::cerr << "end-of-file\n";
else
std::cerr << "I/O error\n";
=====================================
Common/SAM.h
=====================================
--- a/Common/SAM.h
+++ b/Common/SAM.h
@@ -275,6 +275,8 @@ struct SAMRecord : SAMAlignment {
{
flag &= ~(FPROPER_PAIR | FMUNMAP | FMREVERSE);
flag |= FPAIRED;
+ if (rname == o.rname && isReverse() != o.isReverse())
+ flag |= FPROPER_PAIR;
if (o.isUnmapped())
flag |= FMUNMAP;
if (o.isReverse())
@@ -308,7 +310,7 @@ struct SAMRecord : SAMAlignment {
friend std::ostream& operator <<(std::ostream& out,
const SAMRecord& o)
{
- return out << o.qname
+ out << o.qname
<< '\t' << o.flag
<< '\t' << o.rname
<< '\t' << (1 + o.pos)
@@ -316,13 +318,16 @@ struct SAMRecord : SAMAlignment {
<< '\t' << o.cigar
<< '\t' << (o.mrnm == o.rname ? "=" : o.mrnm)
<< '\t' << (1 + o.mpos)
- << '\t' << o.isize
+ << '\t' << o.isize;
#if SAM_SEQ_QUAL
- << '\t' << o.seq
+ out << '\t' << o.seq
<< '\t' << o.qual;
+ if (!o.tags.empty())
+ out << '\t' << o.tags;
#else
- << "\t*\t*";
+ out << "\t*\t*";
#endif
+ return out;
}
friend std::istream& operator >>(std::istream& in, SAMRecord& o)
=====================================
MergePaths/MergePaths.cpp
=====================================
--- a/MergePaths/MergePaths.cpp
+++ b/MergePaths/MergePaths.cpp
@@ -58,6 +58,8 @@ static const char USAGE_MESSAGE[] =
"\n"
" -k, --kmer=KMER_SIZE k-mer size\n"
" -s, --seed-length=L minimum length of a seed contig [0]\n"
+" -G, --genome-size=N expected genome size. Used to calculate NG50\n"
+" and associated stats [disabled]\n"
" -o, --out=FILE write result to FILE\n"
" --no-greedy use the non-greedy algorithm [default]\n"
" --greedy use the greedy algorithm\n"
@@ -86,16 +88,20 @@ namespace opt {
/** Use a greedy algorithm. */
static int greedy;
+ /** Genome size. Used to calculate NG50. */
+ static long long unsigned genomeSize;
+
/** Write the path overlap graph to this file. */
static string graphPath;
}
-static const char shortopts[] = "g:j:k:o:s:v";
+static const char shortopts[] = "G:g:j:k:o:s:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
//enum { OPT_HELP = 1, OPT_VERSION };
static const struct option longopts[] = {
+ { "genome-size", required_argument, NULL, 'G' },
{ "graph", no_argument, NULL, 'g' },
{ "greedy", no_argument, &opt::greedy, true },
{ "no-greedy", no_argument, &opt::greedy, false },
@@ -135,6 +141,52 @@ static ContigPath align(const Lengths& lengths,
static bool gDebugPrint;
+/**
+ * Build a histogram of the lengths of the assembled paths and unused contigs.
+ * Note: This function does not account for the ammount of overlap between contigs.
+ */
+static Histogram
+buildAssembledLengthHistogram(const Lengths& lengths, const ContigPaths& paths)
+{
+ Histogram h;
+
+ // Compute the lengths of the paths
+ // Mark the vertices that are used in paths
+ vector<bool> used(lengths.size());
+ for (ContigPaths::const_iterator pathIt = paths.begin();
+ pathIt != paths.end(); ++pathIt) {
+ const ContigPath& path = *pathIt;
+ size_t totalLength = 0;
+ for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) {
+ if (it->ambiguous())
+ continue;
+ unsigned id = it->id();
+ assert(id < lengths.size());
+ totalLength += lengths[id];
+ used[id] = true;
+ }
+ h.insert(totalLength);
+ }
+
+ // Add the contigs that were not used in paths.
+ for (unsigned i = 0; i < lengths.size(); ++i) {
+ if (!used[i])
+ h.insert(lengths[i]);
+ }
+
+ return h;
+}
+
+/** Report assembly metrics. */
+static void
+reportAssemblyMetrics(const Lengths& lengths, const ContigPaths& paths)
+{
+ Histogram h = buildAssembledLengthHistogram(lengths, paths);
+ const unsigned STATS_MIN_LENGTH = opt::seedLen;
+ printContiguityStats(cerr, h, STATS_MIN_LENGTH, true, "\t", opt::genomeSize)
+ << '\t' << opt::out << '\n';
+}
+
/** Return all contigs that are tandem repeats, identified as those
* contigs that appear more than once in a single path.
*/
@@ -699,7 +751,7 @@ static void outputPathGraph(PathGraph& pathGraph)
}
/** Sort and output the specified paths. */
-static void outputSortedPaths(const ContigPathMap& paths)
+static void outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths)
{
// Sort the paths.
vector<ContigPath> sortedPaths(paths.size());
@@ -715,6 +767,9 @@ static void outputSortedPaths(const ContigPathMap& paths)
it != sortedPaths.end(); ++it)
out << createContigName() << '\t' << *it << '\n';
assert_good(out, opt::out);
+
+ // Report assembly metrics.
+ reportAssemblyMetrics(lengths, sortedPaths);
}
/** Assemble the path overlap graph. */
@@ -753,7 +808,7 @@ static void assemblePathGraph(const Lengths& lengths,
cout << "Removing redundant contigs\n";
removeSubsumedPaths(lengths, paths);
- outputSortedPaths(paths);
+ outputSortedPaths(lengths, paths);
}
/** Read a set of paths from the specified file. */
@@ -891,6 +946,12 @@ int main(int argc, char** argv)
istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
case '?': die = true; break;
+ case 'G': {
+ double x;
+ arg >> x;
+ opt::genomeSize = x;
+ break;
+ }
case 'g': arg >> opt::graphPath; break;
case 'j': arg >> opt::threads; break;
case 'k': arg >> opt::k; break;
@@ -1051,7 +1112,7 @@ int main(int argc, char** argv)
}
originalPathMap.clear();
- outputSortedPaths(resultsPathMap);
+ outputSortedPaths(lengths, resultsPathMap);
return 0;
}
=====================================
Misc/samtobreak.hs
=====================================
--- a/Misc/samtobreak.hs
+++ b/Misc/samtobreak.hs
@@ -279,7 +279,7 @@ parseArgs = do
where
help = putStr (usageInfo usage options) >> exitSuccess
tryHelp = "Try 'abyss-samtobreak --help' for more information."
- version = "abyss-samtobreak (ABySS) 2.0.3\n"
+ version = "abyss-samtobreak (ABySS) 2.1.0\n"
usage = "Usage: samtobreak [OPTION]... [FILE]...\n\
\Calculate contig and scaffold contiguity and correctness metrics.\n"
=====================================
README.md
=====================================
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ Contents
* [Assembling a paired-end library](#assembling-a-paired-end-library)
* [Assembling multiple libraries](#assembling-multiple-libraries)
* [Scaffolding](#scaffolding)
+* [Scaffolding with linked reads](#scaffolding-with-linked-reads)
* [Rescaffolding with long sequences](#rescaffolding-with-long-sequences)
* [Assembling using a Bloom filter de Bruijn graph](#assembling-using-a-bloom-filter-de-bruijn-graph)
* [Assembling using a paired de Bruijn graph](#assembling-using-a-paired-de-bruijn-graph)
@@ -37,20 +38,29 @@ Contents
Quick Start
===========
-## Install ABySS on Debian or Ubuntu
+## Install ABySS on Linux
-Run the command
+Install [Linuxbrew](http://linuxbrew.sh/), and run the command
- sudo apt-get install abyss
+ brew install brewsci/bio/abyss
+
+## Install ABySS on macOS
+
+Install [Homebrew](https://brew.sh/), and run the command
+
+ brew install brewsci/bio/abyss
+
+## Install ABySS on Windows
+
+Install [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/) and [Linuxbrew](http://linuxbrew.sh/), and run the command
-or download and install the
-[Debian package](http://www.bcgsc.ca/platform/bioinfo/software/abyss).
+ brew install brewsci/bio/abyss
-## Install ABySS on Mac OS X
+## Install ABySS on Debian or Ubuntu
-Install [Homebrew](http://brew.sh/), and run the commands
+Run the command
- brew install homebrew/science/abyss
+ sudo apt-get install abyss
## Assemble a small synthetic data set
@@ -66,19 +76,37 @@ Install [Homebrew](http://brew.sh/), and run the commands
Dependencies
============
+Dependencies may be installed using the package manager [Homebrew](https://homebrew.sh) on macOS and [Linxubrew](http://linuxbrew.sh) on Linux and Windows, using Windows Subsystem for Linux.
+
+ABySS requires a C++ compiler that supports
+[OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
+
ABySS requires the following libraries:
* [Boost](http://www.boost.org/)
* [Open MPI](http://www.open-mpi.org)
* [sparsehash](https://code.google.com/p/sparsehash/)
-* [SQLite](http://www.sqlite.org/)
-ABySS requires a C++ compiler that supports
-[OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
+ brew install boost open-mpi google-sparsehash
ABySS will receive an error when compiling with Boost 1.51.0 or 1.52.0
since they contain a bug. Later versions of Boost compile without error.
+## Dependencies for linked reads
+
+- [ARCS](https://github.com/bcgsc/arcs) to scaffold
+- [Tigmint](https://github.com/bcgsc/tigmint) to correct assembly errors
+
+ brew install brewsci/bio/arcs brewsci/bio/links
+
+## Optional dependencies
+
+- [pigz](https://zlib.net/pigz/) for parallel gzip
+- [samtools](https://samtools.github.io) for reading BAM files
+- [zsh](https://sourceforge.net/projects/zsh/) for reporting time and memory usage
+
+ brew install pigz samtools zsh
+
Compiling ABySS from GitHub
===========================
@@ -142,8 +170,7 @@ usage, although it will build without. sparsehash should be found in
./configure CPPFLAGS=-I/usr/local/include
-If SQLite is installed in non-default directories, its location can be
-specified to `configure`:
+If the optional dependency SQLite is installed in non-default directories, its location can be specified to `configure`:
./configure --with-sqlite=/opt/sqlite3
@@ -169,7 +196,7 @@ To assemble paired reads in two files named `reads1.fa` and
`reads2.fa` into contigs in a file named `ecoli-contigs.fa`, run the
command:
- abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa'
+ abyss-pe name=ecoli k=96 in='reads1.fa reads2.fa'
The parameter `in` specifies the input files to read, which may be in
FASTA, FASTQ, qseq, export, SRA, SAM or BAM format and compressed with
@@ -208,7 +235,7 @@ fragment libraries and single-end reads. Note that the names of the libraries
The command line to assemble this example data set is:
- abyss-pe k=64 name=ecoli lib='pea peb' \
+ abyss-pe k=96 name=ecoli lib='pea peb' \
pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
se='se1.fa se2.fa'
@@ -228,13 +255,29 @@ Here's an example of assembling a data set with two paired-end
libraries and two mate-pair libraries. Note that the names of the libraries
(`pea`, `peb`, `mpa`, `mpb`) are arbitrary.
- abyss-pe k=64 name=ecoli lib='pea peb' mp='mpc mpd' \
+ abyss-pe k=96 name=ecoli lib='pea peb' mp='mpc mpd' \
pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
mpc='mpc_1.fa mpc_2.fa' mpd='mpd_1.fa mpd_2.fa'
The mate-pair libraries are used only for scaffolding and do not
contribute towards the consensus sequence.
+Scaffolding with linked reads
+================================================================================
+
+ABySS can scaffold using linked reads from 10x Genomics Chromium. The barcodes must first be extracted from the read sequences and added to the `BX:Z` tag of the FASTQ header, typically using the `longranger basic` command of [Long Ranger](https://support.10xgenomics.com/genome-exome/software/overview/welcome) or [EMA preproc](https://github.com/arshajii/ema#readme). The linked reads are used to correct assembly errors, which requires that [Tigmint](https://github.com/bcgsc/tigmint). The linked reads are also used for scaffolding, which requires [ARCS](https://github.com/bcgsc/arcs). See [Dependencies](#dependencies) for installation instructions.
+
+ABySS can combine paired-end, mate-pair, and linked-read libraries. The `pe` and `lr` libraries will be used to build the de Bruijn graph. The `mp` libraries will be used for paired-end/mate-pair scaffolding. The `lr` libraries will be used for misassembly correction using Tigmint and scaffolding using ARCS.
+
+ abyss-pe k=96 name=hsapiens \
+ pe='pea' pea='lra.fastq.gz' \
+ mp='mpa' mpa='lra.fastq.gz' \
+ lr='lra' lra='lra.fastq.gz'
+
+ABySS performs better with a mixture of paired-end, mate-pair, and linked reads, but it is possible to assemble only linked reads using ABySS, though this mode of operation is experimental.
+
+ abyss-pe k=96 name=hsapiens lr='lra' lra='lra.fastq.gz'
+
Rescaffolding with long sequences
=================================
@@ -247,7 +290,7 @@ Similar to scaffolding, the names of the datasets can be specified with
the `long` parameter. These scaffolds will be stored in the file
`${name}-long-scaffs.fa`. The following is an example of an assembly with PET, MPET and an RNA-Seq assembly. Note that the names of the libraries are arbitrary.
- abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
+ abyss-pe k=96 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
pe1='pe1_1.fa pe1_2.fa' pe2='pe2_1.fa pe2_2.fa' \
mp1='mp1_1.fa mp1_2.fa' mp2='mp2_1.fa mp2_2.fa' \
longa='longa.fa'
@@ -266,7 +309,7 @@ example, the following will run a E. coli assembly with an overall memory budget
of 100 megabytes, 3 hash functions, a minimum k-mer count threshold of 3, with
verbose logging enabled:
- abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
+ abyss-pe name=ecoli k=96 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
At the current time, the user must calculate suitable values for `B` and `H` on
their own, and finding the best value for `kc` may require experimentation
@@ -291,7 +334,7 @@ To assemble using paired de Bruijn graph mode, specify both individual
k-mer size (`K`) and k-mer pair span (`k`). For example, to assemble E.
coli with a individual k-mer size of 16 and a k-mer pair span of 64:
- abyss-pe name=ecoli K=16 k=64 in='reads1.fa reads2.fa'
+ abyss-pe name=ecoli K=16 k=96 in='reads1.fa reads2.fa'
In this example, the size of the intervening gap between k-mer pairs is
32 bp (64 - 2\*16). Note that the `k` parameter takes on a new meaning
@@ -308,7 +351,7 @@ respect to the original transcripts that were sequenced. In order to
run ABySS in strand-specific mode, the `SS` parameter must be used as
in the following example:
- abyss-pe name=SS-RNA k=64 in='reads1.fa reads2.fa' SS=--SS
+ abyss-pe name=SS-RNA k=96 in='reads1.fa reads2.fa' SS=--SS
The expected orientation for the read sequences with respect to the
original RNA is RF. i.e. the first read in a read pair is always in
@@ -406,6 +449,8 @@ Parameters of the driver script, `abyss-pe`
* `t`: maximum length of blunt contigs to trim [`k`]
* `v`: use `v=-v` for verbose logging, `v=-vv` for extra verbose
* `x`: spaced seed (Bloom filter assembly only)
+ * `lr_s`: minimum contig size required for building scaffolds with linked reads (bp) [`S`]
+ * `lr_n`: minimum number of barcodes required for building scaffolds with linked reads [`10`]
Please see the
[abyss-pe](http://manpages.ubuntu.com/abyss-pe.1.html)
@@ -414,7 +459,7 @@ manual page for more information on assembly parameters.
Environment variables
=====================
-`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=20`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
+`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=96`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
abyss-pe env [options]
=====================================
Scaffold/longseqdist.cpp
=====================================
--- a/Scaffold/longseqdist.cpp
+++ b/Scaffold/longseqdist.cpp
@@ -120,8 +120,7 @@ static void readAlignments(istream& in, Graph& g)
SAMRecord rec, prev;
vector<Alignment> recs;
int i = 0;
- while (prev.isUnmapped() || prev.mapq == 0)
- in >> prev;
+ while (in >> prev && (prev.isUnmapped() || prev.mapq == 0));
recs.push_back(prev);
while (in >> rec) {
if (rec.isUnmapped() || rec.mapq == 0)
=====================================
Scaffold/scaffold.cc
=====================================
--- a/Scaffold/scaffold.cc
+++ b/Scaffold/scaffold.cc
@@ -6,6 +6,7 @@
#include "IOUtil.h"
#include "Iterator.h"
#include "Uncompress.h"
+#include "Common/UnorderedMap.h"
#include "Graph/Assemble.h"
#include "Graph/ContigGraph.h"
#include "Graph/ContigGraphAlgorithms.h"
@@ -39,7 +40,7 @@ static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Shaun Jackman.\n"
"\n"
-"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
+"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA|OVERLAP DIST...\n"
@@ -54,9 +55,14 @@ static const char USAGE_MESSAGE[] =
" Options:\n"
"\n"
" -n, --npairs=N minimum number of pairs [0]\n"
-" -s, --seed-length=N minimum contig length [200]\n"
-" or -s N0-N1 Find the value of s in [N0,N1]\n"
+" or -n A-B:S Find the value of n in [A,B] with step size S\n"
" that maximizes the scaffold N50.\n"
+" Default value for the step size is 1, if unspecified.\n"
+" -s, --seed-length=N minimum contig length [1000]\n"
+" or -s A-B Find the value of s in [A,B]\n"
+" that maximizes the scaffold N50.\n"
+" --grid optimize using a grid search [default]\n"
+" --line optimize using a line search\n"
" -k, --kmer=N length of a k-mer\n"
" -G, --genome-size=N expected genome size. Used to calculate NG50\n"
" and associated stats [disabled]\n"
@@ -84,12 +90,17 @@ namespace opt {
unsigned k; // used by ContigProperties
+ /** Optimization search strategy. */
+ static int searchStrategy;
+
/** Minimum number of pairs. */
- static unsigned minNumPairs;
+ static unsigned minEdgeWeight;
+ static unsigned minEdgeWeightEnd;
+ static unsigned minEdgeWeightStep;
/** Minimum contig length. */
- static unsigned minContigLength = 200;
- static unsigned minContigLengthEnd;
+ static unsigned minContigLength = 1000;
+ static unsigned minContigLengthEnd = 1000;
/** Genome size. Used to calculate NG50. */
static long long unsigned genomeSize;
@@ -124,7 +135,9 @@ static const char shortopts[] = "G:g:k:n:o:s:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
-//enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP };
+
+/** Optimization search strategy. */
+enum { GRID_SEARCH, LINE_SEARCH };
static const struct option longopts[] = {
{ "graph", no_argument, NULL, 'g' },
@@ -133,6 +146,8 @@ static const struct option longopts[] = {
{ "min-gap", required_argument, NULL, OPT_MIN_GAP },
{ "max-gap", required_argument, NULL, OPT_MAX_GAP },
{ "npairs", required_argument, NULL, 'n' },
+ { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH },
+ { "line", no_argument, &opt::searchStrategy, LINE_SEARCH },
{ "out", required_argument, NULL, 'o' },
{ "seed-length", required_argument, NULL, 's' },
{ "complex", no_argument, &opt::comp_trans, 1 },
@@ -171,16 +186,17 @@ struct InvalidEdge {
/** Return whether the specified edges has sufficient support. */
struct PoorSupport {
- PoorSupport(Graph& g) : m_g(g) { }
+ PoorSupport(Graph& g, unsigned minEdgeWeight) : m_g(g), m_minEdgeWeight(minEdgeWeight) { }
bool operator()(graph_traits<Graph>::edge_descriptor e) const
{
- return m_g[e].numPairs < opt::minNumPairs;
+ return m_g[e].numPairs < m_minEdgeWeight;
}
const Graph& m_g;
+ unsigned m_minEdgeWeight;
};
/** Remove short vertices and unsupported edges from the graph. */
-static void filterGraph(Graph& g, unsigned minContigLength)
+static void filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength)
{
typedef graph_traits<Graph> GTraits;
typedef GTraits::vertex_descriptor V;
@@ -203,7 +219,7 @@ static void filterGraph(Graph& g, unsigned minContigLength)
// Remove poorly-supported edges.
unsigned numBefore = num_edges(g);
- remove_edge_if(PoorSupport(g), static_cast<DG&>(g));
+ remove_edge_if(PoorSupport(g, minEdgeWeight), static_cast<DG&>(g));
unsigned numRemovedE = numBefore - num_edges(g);
if (opt::verbose > 0)
cerr << "Removed " << numRemovedE << " edges.\n";
@@ -527,6 +543,8 @@ static ContigPath addDistEst(const Graph& g0, const Graph& g1,
assert(!v.ambiguous());
pair<E, bool> e0 = edge(u, v, g0);
pair<E, bool> e1 = edge(u, v, g1);
+ if (!e0.second && !e1.second)
+ std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " << get(vertex_name, g0, v) << '\n';
assert(e0.second || e1.second);
const EP& ep = e0.second ? g0[e0.first] : g1[e1.first];
if (!isOverlap(ep)) {
@@ -591,7 +609,10 @@ unsigned addLength(const Graph& g, It first, It last)
/** A container of contig paths. */
typedef vector<ContigPath> ContigPaths;
-/** Build the scaffold length histogram. */
+/**
+ * Build the scaffold length histogram.
+ * @param g The graph g is destroyed.
+ */
static Histogram buildScaffoldLengthHistogram(
Graph& g, const ContigPaths& paths)
{
@@ -647,17 +668,45 @@ static void addCntgStatsToDb(
}
}
+/** Parameters of scaffolding. */
+struct ScaffoldParam {
+ ScaffoldParam(unsigned n, unsigned s) : n(n), s(s) { }
+ bool operator==(const ScaffoldParam& o) const { return n == o.n && s == o.s; }
+ unsigned n;
+ unsigned s;
+};
+
+NAMESPACE_STD_HASH_BEGIN
+ template <> struct hash<ScaffoldParam> {
+ size_t operator()(const ScaffoldParam& param) const
+ {
+ return hash<unsigned>()(param.n) ^ hash<unsigned>()(param.s);
+ }
+ };
+NAMESPACE_STD_HASH_END
+
+/** Result of scaffolding. */
+struct ScaffoldResult : ScaffoldParam{
+ ScaffoldResult() : ScaffoldParam(0, 0), n50(0) { }
+ ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics)
+ : ScaffoldParam(n, s), n50(n50), metrics(metrics) { }
+ unsigned n50;
+ std::string metrics;
+};
+
/** Build scaffold paths.
* @param output write the results
* @return the scaffold N50
*/
-unsigned scaffold(const Graph& g0, unsigned minContigLength,
+ScaffoldResult
+scaffold(const Graph& g0,
+ unsigned minEdgeWeight, unsigned minContigLength,
bool output)
{
Graph g(g0);
// Filter the graph.
- filterGraph(g, minContigLength);
+ filterGraph(g, minEdgeWeight, minContigLength);
if (opt::verbose > 0)
printGraphStats(cerr, g);
@@ -737,45 +786,198 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
addToDb(db, "scaffolds_assembled", paths.size());
}
+ if (output) {
+ // Output the paths.
+ ofstream fout(opt::out.c_str());
+ ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
+ assert_good(out, opt::out);
+ g_contigNames.unlock();
+ for (vector<ContigPath>::const_iterator it = paths.begin();
+ it != paths.end(); ++it)
+ out << createContigName() << '\t'
+ << addDistEst(g0, g, *it) << '\n';
+ assert_good(out, opt::out);
+
+ // Output the graph.
+ if (!opt::graphPath.empty()) {
+ ofstream out(opt::graphPath.c_str());
+ assert_good(out, opt::graphPath);
+ write_dot(out, g);
+ assert_good(out, opt::graphPath);
+ }
+ }
+
+ // Compute contiguity metrics.
+ const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+ std::ostringstream ss;
+ Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths);
+ printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH,
+ false, "\t", opt::genomeSize)
+ << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n';
+ std::string metrics = ss.str();
+ addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH);
+
+ return ScaffoldResult(minEdgeWeight, minContigLength,
+ scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(),
+ metrics);
+}
+
+/** Memoize the optimization results so far. */
+typedef unordered_map<ScaffoldParam, ScaffoldResult> ScaffoldMemo;
+
+/** Build scaffold paths, memoized. */
+ScaffoldResult
+scaffold_memoized(const Graph& g, unsigned n, unsigned s, ScaffoldMemo& memo)
+{
+ ScaffoldParam param(n, s);
+ ScaffoldMemo::const_iterator it = memo.find(param);
+ if (it != memo.end()) {
+ // Clear the metrics string, so that this result is not listed
+ // multiple times in the final table of metrics.
+ ScaffoldResult result(it->second);
+ result.metrics.clear();
+ return result;
+ }
+
+ if (opt::verbose > 0)
+ std::cerr << "\nScaffolding with n=" << n << " s=" << s << "\n\n";
+ ScaffoldResult result = scaffold(g, n, s, false);
+ memo[param] = result;
+
+ // Print assembly metrics.
+ if (opt::verbose > 0) {
+ std::cerr << '\n';
+ const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+ }
+ std::cerr << result.metrics;
+ if (opt::verbose > 0)
+ cerr << '\n';
+ return result;
+}
+
+/** Find the value of n that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_n(const Graph& g,
+ std::pair<unsigned, unsigned> minEdgeWeight,
+ unsigned minContigLength,
+ ScaffoldMemo& memo)
+{
+ std::string metrics_table;
+ unsigned bestn = 0, bestN50 = 0;
+ for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
+ ScaffoldResult result = scaffold_memoized(g, n, minContigLength, memo);
+ metrics_table += result.metrics;
+ if (result.n50 > bestN50) {
+ bestN50 = result.n50;
+ bestn = n;
+ }
+ }
+
+ return ScaffoldResult(bestn, minContigLength, bestN50, metrics_table);
+}
+
+/** Find the value of s that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_s(const Graph& g,
+ unsigned minEdgeWeight,
+ std::pair<unsigned, unsigned> minContigLength,
+ ScaffoldMemo& memo)
+{
+ std::string metrics_table;
+ unsigned bests = 0, bestN50 = 0;
+ const double STEP = cbrt(10); // Three steps per decade.
+ unsigned ilast = (unsigned)round(
+ log(minContigLength.second) / log(STEP));
+ for (unsigned i = (unsigned)round(
+ log(minContigLength.first) / log(STEP));
+ i <= ilast; ++i) {
+ unsigned s = (unsigned)pow(STEP, (int)i);
+
+ // Round to 1 figure.
+ double nearestDecade = pow(10, floor(log10(s)));
+ s = unsigned(round(s / nearestDecade) * nearestDecade);
+
+ ScaffoldResult result = scaffold_memoized(g, minEdgeWeight, s, memo);
+ metrics_table += result.metrics;
+ if (result.n50 > bestN50) {
+ bestN50 = result.n50;
+ bests = s;
+ }
+ }
+
+ return ScaffoldResult(minEdgeWeight, bests, bestN50, metrics_table);
+}
+
+/** Find the values of n and s that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_grid_search(const Graph& g,
+ std::pair<unsigned, unsigned> minEdgeWeight,
+ std::pair<unsigned, unsigned> minContigLength)
+{
const unsigned STATS_MIN_LENGTH = opt::minContigLength;
- if (!output) {
- static bool printHeader = true;
- Histogram h = buildScaffoldLengthHistogram(g, paths);
- printContiguityStats(cerr, h, STATS_MIN_LENGTH,
- printHeader, "\t", opt::genomeSize)
- << "\ts=" << minContigLength << '\n';
- if (opt::verbose == 0)
- printHeader = false;
- addCntgStatsToDb(h, STATS_MIN_LENGTH);
- return h.trimLow(STATS_MIN_LENGTH).n50();
- }
-
- // Output the paths.
- ofstream fout(opt::out.c_str());
- ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
- assert_good(out, opt::out);
- g_contigNames.unlock();
- for (vector<ContigPath>::const_iterator it = paths.begin();
- it != paths.end(); ++it)
- out << createContigName() << '\t'
- << addDistEst(g0, g, *it) << '\n';
- assert_good(out, opt::out);
-
- // Output the graph.
- if (!opt::graphPath.empty()) {
- ofstream out(opt::graphPath.c_str());
- assert_good(out, opt::graphPath);
- write_dot(out, g);
- assert_good(out, opt::graphPath);
- }
-
- // Print assembly contiguity statistics.
- Histogram h = buildScaffoldLengthHistogram(g, paths);
- printContiguityStats(cerr, h, STATS_MIN_LENGTH,
- true, "\t", opt::genomeSize)
- << "\ts=" << minContigLength << '\n';
- addCntgStatsToDb(h, STATS_MIN_LENGTH);
- return h.trimLow(STATS_MIN_LENGTH).n50();
+ if (opt::verbose == 0)
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+
+ ScaffoldMemo memo;
+ std::string metrics_table;
+ ScaffoldResult best(0, 0, 0, "");
+ for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
+ ScaffoldResult result = optimize_s(g, n, minContigLength, memo);
+ metrics_table += result.metrics;
+ if (result.n50 > best.n50)
+ best = result;
+ }
+
+ best.metrics = metrics_table;
+ return best;
+}
+
+/** Find the values of n and s that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_line_search(const Graph& g,
+ std::pair<unsigned, unsigned> minEdgeWeight,
+ std::pair<unsigned, unsigned> minContigLength)
+{
+ const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+ if (opt::verbose == 0)
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+
+ ScaffoldMemo memo;
+ std::string metrics_table;
+ ScaffoldResult best(
+ (minEdgeWeight.first + minEdgeWeight.second) / 2,
+ minContigLength.second, 0, "");
+ // An upper limit on the number of iterations.
+ const unsigned MAX_ITERATIONS = 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep;
+ for (unsigned i = 0; i < MAX_ITERATIONS; ++i) {
+ // Optimize s.
+ if (opt::verbose > 0) {
+ std::cerr << "\nOptimizing s for n=" << best.n << "\n\n";
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+ }
+ unsigned previous_s = best.s;
+ best = optimize_s(g, best.n, minContigLength, memo);
+ metrics_table += best.metrics;
+ if (best.s == previous_s)
+ break;
+
+ // Optimize n.
+ if (opt::verbose > 0) {
+ std::cerr << "\nOptimizing n for s=" << best.s << "\n\n";
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+ }
+ unsigned previous_n = best.n;
+ best = optimize_n(g, minEdgeWeight, best.s, memo);
+ metrics_table += best.metrics;
+ if (best.n == previous_n)
+ break;
+ }
+
+ std::cerr << "\nLine search converged in " << memo.size() << " iterations.\n";
+
+ best.metrics = metrics_table;
+ return best;
}
/** Run abyss-scaffold. */
@@ -806,7 +1008,16 @@ int main(int argc, char** argv)
arg >> opt::graphPath;
break;
case 'n':
- arg >> opt::minNumPairs;
+ arg >> opt::minEdgeWeight;
+ if (arg.peek() == '-') {
+ arg >> expect("-") >> opt::minEdgeWeightEnd;
+ assert(opt::minEdgeWeight <= opt::minEdgeWeightEnd);
+ } else
+ opt::minEdgeWeightEnd = opt::minEdgeWeight;
+ if (arg.peek() == ':')
+ arg >> expect(":") >> opt::minEdgeWeightStep;
+ else
+ opt::minEdgeWeightStep = 1;
break;
case 'o':
arg >> opt::out;
@@ -818,7 +1029,8 @@ int main(int argc, char** argv)
arg >> expect("-") >> opt::minContigLengthEnd;
assert(opt::minContigLength
<= opt::minContigLengthEnd);
- }
+ } else
+ opt::minContigLengthEnd = opt::minContigLength;
break;
case 'v':
opt::verbose++;
@@ -908,37 +1120,51 @@ int main(int argc, char** argv)
if (!opt::db.empty())
addToDb(db, "Edges_invalid", numRemoved);
- if (opt::minContigLengthEnd == 0) {
- scaffold(g, opt::minContigLength, true);
- return 0;
- }
-
- // Find the value of s that maximizes the scaffold N50.
- unsigned bests = 0, bestN50 = 0;
- const double STEP = cbrt(10); // Three steps per decade.
- unsigned ilast = (unsigned)round(
- log(opt::minContigLengthEnd) / log(STEP));
- for (unsigned i = (unsigned)round(
- log(opt::minContigLength) / log(STEP));
- i <= ilast; ++i) {
- unsigned s = (unsigned)pow(STEP, (int)i);
-
- // Round to 1 figure.
- double nearestDecade = pow(10, floor(log10(s)));
- s = unsigned(round(s / nearestDecade) * nearestDecade);
+ const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+ if (opt::minEdgeWeight == opt::minEdgeWeightEnd
+ && opt::minContigLength == opt::minContigLengthEnd) {
+ ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true);
+ // Print assembly contiguity statistics.
+ if (opt::verbose > 0)
+ std::cerr << '\n';
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+ std::cerr << result.metrics;
+ } else {
+ ScaffoldResult best(0, 0, 0, "");
+ switch (opt::searchStrategy) {
+ case GRID_SEARCH:
+ best = optimize_grid_search(g,
+ std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
+ std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
+ break;
+ case LINE_SEARCH:
+ best = optimize_line_search(g,
+ std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
+ std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
+ break;
+ default:
+ abort();
+ break;
+ }
- unsigned n50 = scaffold(g, s, false);
if (opt::verbose > 0)
- cerr << '\n';
- if (n50 > bestN50) {
- bestN50 = n50;
- bests = s;
+ std::cerr << "\nScaffolding with n=" << best.n << " s=" << best.s << "\n\n";
+ ScaffoldResult result = scaffold(g, best.n, best.s, true);
+
+ // Print the table of all the parameter values attempted and their metrics.
+ if (opt::verbose > 0) {
+ std::cerr << '\n';
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+ std::cerr << best.metrics;
}
- }
- bestN50 = scaffold(g, bests, true);
- cerr << "Best scaffold N50 is " << bestN50
- << " at s=" << bests << ".\n";
+ std::cerr << '\n' << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s << ".\n";
+
+ // Print assembly contiguity statistics.
+ std::cerr << '\n';
+ printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+ std::cerr << result.metrics;
+ }
return 0;
}
=====================================
SimpleGraph/SimpleGraph.cpp
=====================================
--- a/SimpleGraph/SimpleGraph.cpp
+++ b/SimpleGraph/SimpleGraph.cpp
@@ -30,7 +30,7 @@ static const char VERSION_MESSAGE[] =
PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Written by Jared Simpson and Shaun Jackman.\n"
"\n"
-"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
+"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
static const char USAGE_MESSAGE[] =
"Usage: " PROGRAM " -k<kmer> -o<out.path> [OPTION]... ADJ DIST\n"
@@ -44,6 +44,8 @@ static const char USAGE_MESSAGE[] =
" Options:\n"
"\n"
" -k, --kmer=KMER_SIZE k-mer size\n"
+" -n, --npairs=N minimum number of pairs [0]\n"
+" -s, --seed-length=N minimum seed contig length [0]\n"
" -d, --dist-error=N acceptable error of a distance estimate\n"
" default is 6 bp\n"
" --max-cost=COST maximum computational cost\n"
@@ -73,6 +75,12 @@ namespace opt {
static int verbose;
static string out;
+ /** Minimum number of pairs. */
+ static unsigned minEdgeWeight;
+
+ /** Minimum seed length. */
+ static unsigned minSeedLength;
+
/** The acceptable error of a distance estimate. */
unsigned distanceError = 6;
@@ -80,7 +88,7 @@ namespace opt {
int format = DIST; // used by Estimate
}
-static const char shortopts[] = "d:j:k:o:v";
+static const char shortopts[] = "d:j:k:n:o:s:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_MAX_COST,
OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
@@ -88,6 +96,8 @@ enum { OPT_HELP = 1, OPT_VERSION, OPT_MAX_COST,
static const struct option longopts[] = {
{ "kmer", required_argument, NULL, 'k' },
+ { "npairs", required_argument, NULL, 'n' },
+ { "seed-length", required_argument, NULL, 's' },
{ "dist-error", required_argument, NULL, 'd' },
{ "max-cost", required_argument, NULL, OPT_MAX_COST },
{ "out", required_argument, NULL, 'o' },
@@ -125,7 +135,9 @@ int main(int argc, char** argv)
case 'j': arg >> opt::threads; break;
case 'k': arg >> opt::k; break;
case OPT_MAX_COST: arg >> opt::maxCost; break;
+ case 'n': arg >> opt::minEdgeWeight; break;
case 'o': arg >> opt::out; break;
+ case 's': arg >> opt::minSeedLength; break;
case 'v': opt::verbose++; break;
case OPT_HELP:
cout << USAGE_MESSAGE;
@@ -248,6 +260,9 @@ static unsigned g_minNumPairs = UINT_MAX;
static unsigned g_minNumPairsUsed = UINT_MAX;
static struct {
+ unsigned seedTooShort;
+ unsigned noEdges;
+ unsigned edgesRemoved;
unsigned totalAttempted;
unsigned uniqueEnd;
unsigned noPossiblePaths;
@@ -631,6 +646,16 @@ static void handleEstimate(const Graph& g,
pthread_mutex_unlock(&coutMutex);
}
+/** Return whether the specified edge has sufficient support. */
+struct PoorSupport {
+ PoorSupport(unsigned minEdgeWeight) : m_minEdgeWeight(minEdgeWeight) { }
+ bool operator()(const Estimates::value_type& estimate) const
+ {
+ return estimate.second.numPairs < m_minEdgeWeight;
+ }
+ const unsigned m_minEdgeWeight;
+};
+
struct WorkerArg {
istream* in;
ostream* out;
@@ -642,6 +667,9 @@ struct WorkerArg {
static void* worker(void* pArg)
{
WorkerArg& arg = *static_cast<WorkerArg*>(pArg);
+ unsigned countSeedTooShort = 0;
+ unsigned countNoEdges = 0;
+ unsigned countEdgesRemoved = 0;
for (;;) {
/** Lock the input stream. */
static pthread_mutex_t inMutex = PTHREAD_MUTEX_INITIALIZER;
@@ -651,6 +679,26 @@ static void* worker(void* pArg)
pthread_mutex_unlock(&inMutex);
if (!good)
break;
+ if ((*arg.graph)[ContigNode(er.refID, false)].length < opt::minSeedLength) {
+ ++countSeedTooShort;
+ continue;
+ }
+
+ // Remove edges with insufficient support.
+ for (unsigned i = 0; i < 2; ++i) {
+ Estimates& estimates = er.estimates[i];
+ if (estimates.empty())
+ continue;
+ unsigned sizeBefore = estimates.size();
+ estimates.erase(
+ remove_if(estimates.begin(), estimates.end(), PoorSupport(opt::minEdgeWeight)),
+ estimates.end());
+ unsigned sizeAfter = estimates.size();
+ unsigned edgesRemoved = sizeBefore - sizeAfter;
+ countEdgesRemoved += edgesRemoved;
+ if (sizeAfter == 0)
+ ++countNoEdges;
+ }
// Flip the anterior distance estimates.
for (Estimates::iterator it = er.estimates[1].begin();
@@ -673,6 +721,14 @@ static void* worker(void* pArg)
pthread_mutex_unlock(&outMutex);
}
}
+
+ static pthread_mutex_t statsMutex = PTHREAD_MUTEX_INITIALIZER;
+ pthread_mutex_lock(&statsMutex);
+ stats.seedTooShort += countSeedTooShort;
+ stats.noEdges += countNoEdges;
+ stats.edgesRemoved += countEdgesRemoved;
+ pthread_mutex_unlock(&statsMutex);
+
return NULL;
}
@@ -707,6 +763,9 @@ static void generatePathsThroughEstimates(const Graph& g,
cout << '\n';
cout <<
+ "Seed too short: " << stats.seedTooShort << "\n"
+ "Seeds with no edges: " << stats.noEdges << "\n"
+ "Edges removed: " << stats.edgesRemoved << "\n"
"Total paths attempted: " << stats.totalAttempted << "\n"
"Unique path: " << stats.uniqueEnd << "\n"
"No possible paths: " << stats.noPossiblePaths << "\n"
=====================================
bin/abyss-pe
=====================================
--- a/bin/abyss-pe
+++ b/bin/abyss-pe
@@ -3,14 +3,22 @@
# Written by Shaun Jackman <sjackman at bcgsc.ca> and
# Anthony Raymond <traymond at bcgsc.ca>.
+SHELL=bash -e -o pipefail
ifneq ($(shell command -v zsh),)
# Set pipefail to ensure that all commands of a pipe succeed.
SHELL=zsh -e -o pipefail
# Report run time and memory usage with zsh.
export REPORTTIME=1
export TIMEFMT=time user=%U system=%S elapsed=%E cpu=%P memory=%M job=%J
+endif
+
+# Record run time and memory usage in a file using GNU time.
+ifdef time
+ifneq ($(shell command -v gtime),)
+gtime=command gtime -v -o $@.time
else
-SHELL=bash -e -o pipefail
+gtime=command time -v -o $@.time
+endif
endif
# Define this environment variable on Mac OS X to read
@@ -93,10 +101,10 @@ endif
# Use pigz or bgzip for parallel compression if available.
ifneq ($(shell command -v pigz),)
-gzip=pigz -p$t
+gzip=pigz -p$j
else
ifneq ($(shell command -v bgzip),)
-gzip=bgzip -@$t
+gzip=bgzip -@$j
else
gzip=gzip
endif
@@ -123,6 +131,13 @@ MARKDOWN=pandoc
map=$(foreach a,$(2),$(call $(1),$(a)))
deref=$($1)
+
+ifdef lr
+ifndef lib
+lib:=$(pe) $(lr)
+endif
+endif
+
ifdef lib
in?=$(call map, deref, $(lib))
else
@@ -131,6 +146,7 @@ lib?=$(name)
$(lib)?=$(in)
endif
endif
+
pe?=$(lib)
mp?=$(pe)
@@ -141,6 +157,9 @@ endif
ifdef se
override se:=$(strip $(se))
endif
+ifdef lr
+override lr_reads=$(strip $(call map, deref, $(lr)))
+endif
# Graph file format
graph?=dot
@@ -287,13 +306,16 @@ $(foreach i,$(pe),$(eval $i_n?=$n))
override deopt=$v $(dbopt) -j$j -k$k $(DISTANCEEST_OPTIONS) -l$($*_l) -s$($*_s) -n$($*_n) $($*_de)
# SimpleGraph parameters
-sgopt += $(dbopt)
+sgopt += $(dbopt) -s$s -n$n
ifdef d
sgopt += -d$d
endif
# MergePaths parameters
-mpopt += $v $(dbopt) -j$j -k$k
+mpopt += $v $(dbopt) -j$j -k$k -s$s
+ifdef G
+mpopt += -G$G
+endif
# PathOverlap parameters
poopt += $v $(dbopt) -k$k
@@ -346,9 +368,9 @@ ifndef k
error::
@>&2 echo 'abyss-pe: missing parameter `k`'
endif
-ifeq ($(lib)$(in)$(se),)
+ifeq ($(lib)$(in)$(se)$(lr),)
error::
- @>&2 echo 'abyss-pe: missing parameter `lib`, `in` or `se`'
+ @>&2 echo 'abyss-pe: missing parameter `lib`, `in`, `se`, or `lr`'
endif
default:
@@ -371,7 +393,7 @@ help:
@echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users at bcgsc.ca.'
version:
- @echo "abyss-pe (ABySS) 2.0.3"
+ @echo "abyss-pe (ABySS) 2.1.0"
@echo "Written by Shaun Jackman and Anthony Raymond."
@echo
@echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"
@@ -689,9 +711,106 @@ endif
%-8.$g: %-7.$g %-7.path
PathOverlap --overlap $(poopt) --$g $^ >$@
+# Scaffold using linked reads
+ifdef lr
+
+# Tigmint
+
+# Options for mapping the reads to the draft assembly.
+lr_l=$l
+override lrmapopt=$v -j$j -l$(lr_l) $(LR_MAP_OPTIONS)
+
+# Options for abyss-scaffold
+lr_s=1000-100000
+lr_n=5-20
+
+# Minimum AS/Read length ratio
+tigmint_as=0.65
+
+# Maximum number of mismatches
+tigmint_nm=5
+
+# Minimum mapping quality threshold
+tigmint_mapq=0
+
+# Maximum distance between reads to be considered the same molecule
+tigmint_d=50000
+
+# Minimum number of spanning molecules
+tigmint_n=10
+
+# Size of the window that must be spanned by moecules
+tigmint_w=1000
+
+# Align paired-end reads to the draft genome, sort by BX tag,
+# and create molecule extents BED.
+%.lr.bed: %.fa
+ $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+ | samtools sort -@$j -tBX -l0 -T$$(mktemp -u -t $@.XXXXXX) \
+ | tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) - \
+ | sort -k1,1 -k2,2n -k3,3n >$@
+
+# Align paired-end reads to the draft genome and sort by BX tag.
+%.lr.sortbx.bam: %.fa
+ $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+ | samtools sort -@$j -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@
+
+# Filter the BAM file, create molecule extents BED.
+%.lr.bed: %.lr.sortbx.bam
+ $(gtime) tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) $< \
+ | sort -k1,1 -k2,2n -k3,3n >$@
+
+# Cut sequences at assembly errors.
+%.tigmint.fa: %.lr.bed %.fa %.fa.fai
+ $(gtime) tigmint-cut -p$j -n$(tigmint_n) -w$(tigmint_w) -o $@ $*.fa $<
+
+# ARCS
+arcs_c=2
+arcs_d=0
+arcs_e=30000
+arcs_l=0
+arcs_m=4-20000
+arcs_r=0.05
+arcs_s=98
+arcs_z=500
+
+# Align reads and create a graph of linked contigs using ARCS.
+%.arcs.dist.gv: %.fa
+ $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+ | abyss-fixmate-ssq --all --qname $v -l$(lr_l) $(FIXMATE_OPTIONS) \
+ | arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
+ -g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
+
+# Align paired-end reads to the draft genome and do not sort.
+%.lr.sortn.sam.gz: %.fa
+ $(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+ | abyss-fixmate --all --qname $v -l$(lr_l) $(FIXMATE_OPTIONS) \
+ | $(gzip) >$@
+
+# Create a graph of linked contigs using ARCS.
+%.arcs.dist.gv: %.lr.sortn.sam.gz
+ gunzip -c $< \
+ | arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
+ -g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
+
+# Scaffold using ARCS and abyss-scaffold.
+%.arcs.path: %.arcs.dist.gv
+ abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@
+
+# Create the FASTA file of ARCS scaffolds.
+%.arcs.fa: %.fa %.arcs.path
+ MergeContigs $(mcopt) -o $@ $^
+
+%-scaffolds.fa: %-8.tigmint.arcs.fa
+ ln -sf $< $@
+
+else
+
%-scaffolds.fa: %-8.fa
ln -sf $< $@
+endif
+
%-scaffolds.$g: %-8.$g
ln -sf $< $@
@@ -854,4 +973,4 @@ env:
@echo '[override], if variable was defined with an override directive in (this) makefile.'
@$(foreach var,$(varList),\
- echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]" | grep -v "undefined";)
+ echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]";)
=====================================
configure.ac
=====================================
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
AC_PREREQ(2.62)
-AC_INIT(ABySS, 2.0.3, abyss-users at bcgsc.ca, abyss,
+AC_INIT(ABySS, 2.1.0, abyss-users at bcgsc.ca, abyss,
http://www.bcgsc.ca/platform/bioinfo/software/abyss)
m4_include(m4/m4_ax_pthread.m4)
AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects)
=====================================
doc/ABYSS.1
=====================================
--- a/doc/ABYSS.1
+++ b/doc/ABYSS.1
@@ -1,4 +1,4 @@
-.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.0.3" "User Commands"
+.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.0" "User Commands"
.SH NAME
ABYSS \- assemble short reads into contigs
.SH SYNOPSIS
=====================================
doc/abyss-pe.1
=====================================
--- a/doc/abyss-pe.1
+++ b/doc/abyss-pe.1
@@ -1,4 +1,4 @@
-.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.0.3" "User Commands"
+.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.0" "User Commands"
.SH NAME
abyss-pe - assemble reads into contigs
.SH SYNOPSIS
=====================================
doc/abyss-tofastq.1
=====================================
--- a/doc/abyss-tofastq.1
+++ b/doc/abyss-tofastq.1
@@ -1,4 +1,4 @@
-.TH abyss-tofastq "1" "2015-May" "ABySS 2.0.3" "User Commands"
+.TH abyss-tofastq "1" "2015-May" "ABySS 2.1.0" "User Commands"
.SH NAME
abyss-tofastq \- convert various file formats to FASTQ format
.br
View it on GitLab: https://salsa.debian.org/med-team/abyss/commit/e2ac24df5fbe2bfbc5e90d8f8536715303bda141
---
View it on GitLab: https://salsa.debian.org/med-team/abyss/commit/e2ac24df5fbe2bfbc5e90d8f8536715303bda141
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180420/ed472467/attachment-0001.html>
More information about the debian-med-commit
mailing list