[med-svn] [Git][med-team/abyss][upstream] New upstream version 2.1.0

Andreas Tille gitlab at salsa.debian.org
Fri Apr 20 09:44:19 BST 2018


Andreas Tille pushed to branch upstream at Debian Med / abyss


Commits:
e2ac24df by Andreas Tille at 2018-04-20T09:57:31+02:00
New upstream version 2.1.0
- - - - -


16 changed files:

- ChangeLog
- Common/ContigID.h
- Common/Histogram.h
- Common/IOUtil.h
- Common/SAM.h
- MergePaths/MergePaths.cpp
- Misc/samtobreak.hs
- README.md
- Scaffold/longseqdist.cpp
- Scaffold/scaffold.cc
- SimpleGraph/SimpleGraph.cpp
- bin/abyss-pe
- configure.ac
- doc/ABYSS.1
- doc/abyss-pe.1
- doc/abyss-tofastq.1


Changes:

=====================================
ChangeLog
=====================================
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,36 @@
-2017-12-19  Ben Vandervalk <benv at bcgsc.ca>
+2018-04-13  Ben Vandervalk <benv at bcgsc.ca>
+
+	* Release version 2.1.0
+	* Adds support for misassembly correction and scaffolding
+	using linked reads, using Tigmint and ARCS. (Tigmint and
+	ARCS must be installed separately.)
+	* Support simultaneous optimization of `s` (min sequence length)
+	and `n` (min supporting read pairs / Chromium barcodes)
+	during scaffolding
+
+	abyss-longseqdist:
+	* Fix hang on input SAM containing no alignments with MAPQ > 0
+
+	abyss-pe:
+	* New `lr` parameter. Provide linked reads (i.e. 10x Genomics
+	Chromium reads) via this parameter to perform misassembly
+	correction and scaffolding using Chromium barcode information.
+	Requires Tigmint and ARCS tools to be installed in addition
+	to ABySS.
+	* Fix bug where `j` (threads) was not being correctly passed to
+	to `bgzip`/`pigz`
+	* Fix bug where `zsh` time/memory profiling was not being used,
+	even when `zsh` was available
+
+	abyss-scaffold:
+	* Simultaneous optimization of `n` and `s` using line search
+	or grid search [default]
+
+	SimpleGraph:
+	* add options `-s` and `-n` to filter paired-end paths by
+	seed length and edge weight, respectively
+
+2018-03-14  Ben Vandervalk <benv at bcgsc.ca>
 
 	* Release version 2.0.3
 	* Many compiler fixes for GCC >= 6, Boost >= 1.64


=====================================
Common/ContigID.h
=====================================
--- a/Common/ContigID.h
+++ b/Common/ContigID.h
@@ -24,13 +24,32 @@ static inline void setNextContigName(cstring s)
 		g_nextContigName = 0;
 }
 
+/**
+ * Set the next contig name returned by createContigName
+ * to one plus the current largest numeric contig name.
+ */
+static inline void setNextContigName()
+{
+	if (g_contigNames.empty()) {
+		g_nextContigName = 0;
+		return;
+	}
+	unsigned maxContigName = 0;
+	for (unsigned i = 0; i < g_contigNames.size(); ++i) {
+		cstring s = g_contigNames.getName(i);
+		std::istringstream iss((std::string)s);
+		unsigned contigName;
+		if (iss >> contigName && iss.eof() && contigName > maxContigName)
+			maxContigName = contigName;
+	}
+	g_nextContigName = 1 + maxContigName;
+}
+
 /** Return the next unique contig name. */
 static inline std::string createContigName()
 {
-	if (g_nextContigName == 0) {
-		assert(!g_contigNames.empty());
-		setNextContigName(g_contigNames.back());
-	}
+	if (g_nextContigName == 0)
+		setNextContigName();
 	std::ostringstream ss;
 	ss << g_nextContigName++;
 	return ss.str();


=====================================
Common/Histogram.h
=====================================
--- a/Common/Histogram.h
+++ b/Common/Histogram.h
@@ -315,6 +315,29 @@ namespace std {
 	inline void swap(Histogram&, Histogram&) { assert(false); }
 }
 
+/** Print assembly contiguity statistics header. */
+static inline std::ostream& printContiguityStatsHeader(
+		std::ostream& out,
+		unsigned minSize,
+		const std::string& sep = "\t",
+		const long long unsigned expSize = 0)
+{
+	out << "n" << sep
+		<< "n:" << minSize << sep
+		<< "L50" << sep;
+	if (expSize > 0)
+		out << "LG50" << sep
+			<< "NG50" << sep;
+	return out << "min" << sep
+		<< "N80" << sep
+		<< "N50" << sep
+		<< "N20" << sep
+		<< "E-size" << sep
+		<< "max" << sep
+		<< "sum" << sep
+		<< "name" << '\n';
+}
+
 /** Print assembly contiguity statistics. */
 static inline std::ostream& printContiguityStats(
 		std::ostream& out, const Histogram& h0,
@@ -323,22 +346,8 @@ static inline std::ostream& printContiguityStats(
 		const long long unsigned expSize = 0)
 {
 	Histogram h = h0.trimLow(minSize);
-	if (printHeader) {
-		out << "n" << sep
-			<< "n:" << minSize << sep
-			<< "L50" << sep;
-		if (expSize > 0)
-			out << "LG50" << sep
-				<< "NG50" << sep;
-		out << "min" << sep
-			<< "N80" << sep
-			<< "N50" << sep
-			<< "N20" << sep
-			<< "E-size" << sep
-			<< "max" << sep
-			<< "sum" << sep
-			<< "name" << '\n';
-	}
+	if (printHeader)
+		printContiguityStatsHeader(out, minSize, sep, expSize);
 	unsigned n50 = h.n50();
 	out << toEng(h0.size()) << sep
 		<< toEng(h.size()) << sep


=====================================
Common/IOUtil.h
=====================================
--- a/Common/IOUtil.h
+++ b/Common/IOUtil.h
@@ -51,9 +51,12 @@ static inline std::istream& operator>>(std::istream& in, expect o)
 			if (!in || c != *p) {
 				std::cerr << "error: Expected `" << p
 					<< "' and saw ";
-				if (in)
+				if (in) {
 					std::cerr << '`' << c << "'\n";
-				else if (in.eof())
+					std::string s;
+					if (getline(in, s) && !s.empty())
+						std::cerr << "near: " << c << s << '\n';
+				} else if (in.eof())
 					std::cerr << "end-of-file\n";
 				else
 					std::cerr << "I/O error\n";


=====================================
Common/SAM.h
=====================================
--- a/Common/SAM.h
+++ b/Common/SAM.h
@@ -275,6 +275,8 @@ struct SAMRecord : SAMAlignment {
 	{
 		flag &= ~(FPROPER_PAIR | FMUNMAP | FMREVERSE);
 		flag |= FPAIRED;
+		if (rname == o.rname && isReverse() != o.isReverse())
+			flag |= FPROPER_PAIR;
 		if (o.isUnmapped())
 			flag |= FMUNMAP;
 		if (o.isReverse())
@@ -308,7 +310,7 @@ struct SAMRecord : SAMAlignment {
 	friend std::ostream& operator <<(std::ostream& out,
 			const SAMRecord& o)
 	{
-		return out << o.qname
+		out << o.qname
 			<< '\t' << o.flag
 			<< '\t' << o.rname
 			<< '\t' << (1 + o.pos)
@@ -316,13 +318,16 @@ struct SAMRecord : SAMAlignment {
 			<< '\t' << o.cigar
 			<< '\t' << (o.mrnm == o.rname ? "=" : o.mrnm)
 			<< '\t' << (1 + o.mpos)
-			<< '\t' << o.isize
+			<< '\t' << o.isize;
 #if SAM_SEQ_QUAL
-			<< '\t' << o.seq
+		out << '\t' << o.seq
 			<< '\t' << o.qual;
+		if (!o.tags.empty())
+			out << '\t' << o.tags;
 #else
-			<< "\t*\t*";
+		out << "\t*\t*";
 #endif
+		return out;
 	}
 
 	friend std::istream& operator >>(std::istream& in, SAMRecord& o)


=====================================
MergePaths/MergePaths.cpp
=====================================
--- a/MergePaths/MergePaths.cpp
+++ b/MergePaths/MergePaths.cpp
@@ -58,6 +58,8 @@ static const char USAGE_MESSAGE[] =
 "\n"
 "  -k, --kmer=KMER_SIZE  k-mer size\n"
 "  -s, --seed-length=L   minimum length of a seed contig [0]\n"
+"  -G, --genome-size=N   expected genome size. Used to calculate NG50\n"
+"                        and associated stats [disabled]\n"
 "  -o, --out=FILE        write result to FILE\n"
 "      --no-greedy       use the non-greedy algorithm [default]\n"
 "      --greedy          use the greedy algorithm\n"
@@ -86,16 +88,20 @@ namespace opt {
 	/** Use a greedy algorithm. */
 	static int greedy;
 
+	/** Genome size. Used to calculate NG50. */
+	static long long unsigned genomeSize;
+
 	/** Write the path overlap graph to this file. */
 	static string graphPath;
 }
 
-static const char shortopts[] = "g:j:k:o:s:v";
+static const char shortopts[] = "G:g:j:k:o:s:v";
 
 enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
 //enum { OPT_HELP = 1, OPT_VERSION };
 
 static const struct option longopts[] = {
+	{ "genome-size", required_argument, NULL, 'G' },
 	{ "graph",       no_argument,       NULL, 'g' },
 	{ "greedy",      no_argument,       &opt::greedy, true },
 	{ "no-greedy",   no_argument,       &opt::greedy, false },
@@ -135,6 +141,52 @@ static ContigPath align(const Lengths& lengths,
 
 static bool gDebugPrint;
 
+/**
+  * Build a histogram of the lengths of the assembled paths and unused contigs.
+  * Note: This function does not account for the ammount of overlap between contigs.
+  */
+static Histogram
+buildAssembledLengthHistogram(const Lengths& lengths, const ContigPaths& paths)
+{
+	Histogram h;
+
+	// Compute the lengths of the paths
+	// Mark the vertices that are used in paths
+	vector<bool> used(lengths.size());
+	for (ContigPaths::const_iterator pathIt = paths.begin();
+			pathIt != paths.end(); ++pathIt) {
+		const ContigPath& path = *pathIt;
+		size_t totalLength = 0;
+		for (ContigPath::const_iterator it = path.begin(); it != path.end(); ++it) {
+			if (it->ambiguous())
+				continue;
+			unsigned id = it->id();
+			assert(id < lengths.size());
+			totalLength += lengths[id];
+			used[id] = true;
+		}
+		h.insert(totalLength);
+	}
+
+	// Add the contigs that were not used in paths.
+	for (unsigned i = 0; i < lengths.size(); ++i) {
+		if (!used[i])
+			h.insert(lengths[i]);
+	}
+
+	return h;
+}
+
+/** Report assembly metrics. */
+static void
+reportAssemblyMetrics(const Lengths& lengths, const ContigPaths& paths)
+{
+	Histogram h = buildAssembledLengthHistogram(lengths, paths);
+	const unsigned STATS_MIN_LENGTH = opt::seedLen;
+	printContiguityStats(cerr, h, STATS_MIN_LENGTH, true, "\t", opt::genomeSize)
+		<< '\t' << opt::out << '\n';
+}
+
 /** Return all contigs that are tandem repeats, identified as those
  * contigs that appear more than once in a single path.
  */
@@ -699,7 +751,7 @@ static void outputPathGraph(PathGraph& pathGraph)
 }
 
 /** Sort and output the specified paths. */
-static void outputSortedPaths(const ContigPathMap& paths)
+static void outputSortedPaths(const Lengths& lengths, const ContigPathMap& paths)
 {
 	// Sort the paths.
 	vector<ContigPath> sortedPaths(paths.size());
@@ -715,6 +767,9 @@ static void outputSortedPaths(const ContigPathMap& paths)
 			it != sortedPaths.end(); ++it)
 		out << createContigName() << '\t' << *it << '\n';
 	assert_good(out, opt::out);
+
+	// Report assembly metrics.
+	reportAssemblyMetrics(lengths, sortedPaths);
 }
 
 /** Assemble the path overlap graph. */
@@ -753,7 +808,7 @@ static void assemblePathGraph(const Lengths& lengths,
 		cout << "Removing redundant contigs\n";
 	removeSubsumedPaths(lengths, paths);
 
-	outputSortedPaths(paths);
+	outputSortedPaths(lengths, paths);
 }
 
 /** Read a set of paths from the specified file. */
@@ -891,6 +946,12 @@ int main(int argc, char** argv)
 		istringstream arg(optarg != NULL ? optarg : "");
 		switch (c) {
 			case '?': die = true; break;
+			case 'G': {
+				double x;
+				arg >> x;
+				opt::genomeSize = x;
+				break;
+			}
 			case 'g': arg >> opt::graphPath; break;
 			case 'j': arg >> opt::threads; break;
 			case 'k': arg >> opt::k; break;
@@ -1051,7 +1112,7 @@ int main(int argc, char** argv)
 	}
 	originalPathMap.clear();
 
-	outputSortedPaths(resultsPathMap);
+	outputSortedPaths(lengths, resultsPathMap);
 	return 0;
 }
 


=====================================
Misc/samtobreak.hs
=====================================
--- a/Misc/samtobreak.hs
+++ b/Misc/samtobreak.hs
@@ -279,7 +279,7 @@ parseArgs = do
   where
   help = putStr (usageInfo usage options) >> exitSuccess
   tryHelp = "Try 'abyss-samtobreak --help' for more information."
-  version = "abyss-samtobreak (ABySS) 2.0.3\n"
+  version = "abyss-samtobreak (ABySS) 2.1.0\n"
   usage = "Usage: samtobreak [OPTION]... [FILE]...\n\
 \Calculate contig and scaffold contiguity and correctness metrics.\n"
 


=====================================
README.md
=====================================
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ Contents
 * [Assembling a paired-end library](#assembling-a-paired-end-library)
 * [Assembling multiple libraries](#assembling-multiple-libraries)
 * [Scaffolding](#scaffolding)
+* [Scaffolding with linked reads](#scaffolding-with-linked-reads)
 * [Rescaffolding with long sequences](#rescaffolding-with-long-sequences)
 * [Assembling using a Bloom filter de Bruijn graph](#assembling-using-a-bloom-filter-de-bruijn-graph)
 * [Assembling using a paired de Bruijn graph](#assembling-using-a-paired-de-bruijn-graph)
@@ -37,20 +38,29 @@ Contents
 Quick Start
 ===========
 
-## Install ABySS on Debian or Ubuntu
+## Install ABySS on Linux
 
-Run the command
+Install [Linuxbrew](http://linuxbrew.sh/), and run the command
 
-	sudo apt-get install abyss
+	brew install brewsci/bio/abyss
+
+## Install ABySS on macOS
+
+Install [Homebrew](https://brew.sh/), and run the command
+
+	brew install brewsci/bio/abyss
+
+## Install ABySS on Windows
+
+Install [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/) and [Linuxbrew](http://linuxbrew.sh/), and run the command
 
-or download and install the
-[Debian package](http://www.bcgsc.ca/platform/bioinfo/software/abyss).
+	brew install brewsci/bio/abyss
 
-## Install ABySS on Mac OS X
+## Install ABySS on Debian or Ubuntu
 
-Install [Homebrew](http://brew.sh/), and run the commands
+Run the command
 
-	brew install homebrew/science/abyss
+	sudo apt-get install abyss
 
 ## Assemble a small synthetic data set
 
@@ -66,19 +76,37 @@ Install [Homebrew](http://brew.sh/), and run the commands
 Dependencies
 ============
 
+Dependencies may be installed using the package manager [Homebrew](https://homebrew.sh) on macOS and [Linxubrew](http://linuxbrew.sh) on Linux and Windows, using Windows Subsystem for Linux.
+
+ABySS requires a C++ compiler that supports
+[OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
+
 ABySS requires the following libraries:
 
 * [Boost](http://www.boost.org/)
 * [Open MPI](http://www.open-mpi.org)
 * [sparsehash](https://code.google.com/p/sparsehash/)
-* [SQLite](http://www.sqlite.org/)
 
-ABySS requires a C++ compiler that supports
-[OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
+	brew install boost open-mpi google-sparsehash
 
 ABySS will receive an error when compiling with Boost 1.51.0 or 1.52.0
 since they contain a bug. Later versions of Boost compile without error.
 
+## Dependencies for linked reads
+
+- [ARCS](https://github.com/bcgsc/arcs) to scaffold
+- [Tigmint](https://github.com/bcgsc/tigmint) to correct assembly errors
+
+	brew install brewsci/bio/arcs brewsci/bio/links
+
+## Optional dependencies
+
+- [pigz](https://zlib.net/pigz/) for parallel gzip
+- [samtools](https://samtools.github.io) for reading BAM files
+- [zsh](https://sourceforge.net/projects/zsh/) for reporting time and memory usage
+
+	brew install pigz samtools zsh
+
 Compiling ABySS from GitHub
 ===========================
 
@@ -142,8 +170,7 @@ usage, although it will build without. sparsehash should be found in
 
 	./configure CPPFLAGS=-I/usr/local/include
 
-If SQLite is installed in non-default directories, its location can be
-specified to `configure`:
+If the optional dependency SQLite is installed in non-default directories, its location can be specified to `configure`:
 
 	./configure --with-sqlite=/opt/sqlite3
 
@@ -169,7 +196,7 @@ To assemble paired reads in two files named `reads1.fa` and
 `reads2.fa` into contigs in a file named `ecoli-contigs.fa`, run the
 command:
 
-	abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa'
+	abyss-pe name=ecoli k=96 in='reads1.fa reads2.fa'
 
 The parameter `in` specifies the input files to read, which may be in
 FASTA, FASTQ, qseq, export, SRA, SAM or BAM format and compressed with
@@ -208,7 +235,7 @@ fragment libraries and single-end reads. Note that the names of the libraries
 
 The command line to assemble this example data set is:
 
-	abyss-pe k=64 name=ecoli lib='pea peb' \
+	abyss-pe k=96 name=ecoli lib='pea peb' \
 		pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
 		se='se1.fa se2.fa'
 
@@ -228,13 +255,29 @@ Here's an example of assembling a data set with two paired-end
 libraries and two mate-pair libraries. Note that the names of the libraries
 (`pea`, `peb`, `mpa`, `mpb`) are arbitrary.
 
-	abyss-pe k=64 name=ecoli lib='pea peb' mp='mpc mpd' \
+	abyss-pe k=96 name=ecoli lib='pea peb' mp='mpc mpd' \
 		pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
 		mpc='mpc_1.fa mpc_2.fa' mpd='mpd_1.fa mpd_2.fa'
 
 The mate-pair libraries are used only for scaffolding and do not
 contribute towards the consensus sequence.
 
+Scaffolding with linked reads
+================================================================================
+
+ABySS can scaffold using linked reads from 10x Genomics Chromium. The barcodes must first be extracted from the read sequences and added to the `BX:Z` tag of the FASTQ header, typically using the `longranger basic` command of [Long Ranger](https://support.10xgenomics.com/genome-exome/software/overview/welcome) or [EMA preproc](https://github.com/arshajii/ema#readme). The linked reads are used to correct assembly errors, which requires that [Tigmint](https://github.com/bcgsc/tigmint). The linked reads are also used for scaffolding, which requires [ARCS](https://github.com/bcgsc/arcs). See [Dependencies](#dependencies) for installation instructions.
+
+ABySS can combine paired-end, mate-pair, and linked-read libraries. The `pe` and `lr` libraries will be used to build the de Bruijn graph. The `mp` libraries will be used for paired-end/mate-pair scaffolding. The `lr` libraries will be used for misassembly correction using Tigmint and scaffolding using ARCS.
+
+	abyss-pe k=96 name=hsapiens \
+		pe='pea' pea='lra.fastq.gz' \
+		mp='mpa' mpa='lra.fastq.gz' \
+		lr='lra' lra='lra.fastq.gz'
+
+ABySS performs better with a mixture of paired-end, mate-pair, and linked reads, but it is possible to assemble only linked reads using ABySS, though this mode of operation is experimental.
+
+	abyss-pe k=96 name=hsapiens lr='lra' lra='lra.fastq.gz'
+
 Rescaffolding with long sequences
 =================================
 
@@ -247,7 +290,7 @@ Similar to scaffolding, the names of the datasets can be specified with
 the `long` parameter. These scaffolds will be stored in the file
 `${name}-long-scaffs.fa`. The following is an example of an assembly with PET, MPET and an RNA-Seq assembly. Note that the names of the libraries are arbitrary.
 
-	abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
+	abyss-pe k=96 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
 		pe1='pe1_1.fa pe1_2.fa' pe2='pe2_1.fa pe2_2.fa' \
 		mp1='mp1_1.fa mp1_2.fa' mp2='mp2_1.fa mp2_2.fa' \
 		longa='longa.fa'
@@ -266,7 +309,7 @@ example, the following will run a E. coli assembly with an overall memory budget
 of 100 megabytes, 3 hash functions, a minimum k-mer count threshold of 3, with
 verbose logging enabled:
 
-	abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
+	abyss-pe name=ecoli k=96 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
 
 At the current time, the user must calculate suitable values for `B` and `H` on
 their own, and finding the best value for `kc` may require experimentation
@@ -291,7 +334,7 @@ To assemble using paired de Bruijn graph mode, specify both individual
 k-mer size (`K`) and k-mer pair span (`k`). For example, to assemble E.
 coli with a individual k-mer size of 16 and a k-mer pair span of 64:
 
-	abyss-pe name=ecoli K=16 k=64 in='reads1.fa reads2.fa'
+	abyss-pe name=ecoli K=16 k=96 in='reads1.fa reads2.fa'
 
 In this example, the size of the intervening gap between k-mer pairs is
 32 bp (64 - 2\*16). Note that the `k` parameter takes on a new meaning
@@ -308,7 +351,7 @@ respect to the original transcripts that were sequenced. In order to
 run ABySS in strand-specific mode, the `SS` parameter must be used as
 in the following example:
 
-	abyss-pe name=SS-RNA k=64 in='reads1.fa reads2.fa' SS=--SS
+	abyss-pe name=SS-RNA k=96 in='reads1.fa reads2.fa' SS=--SS
 
 The expected orientation for the read sequences with respect to the
 original RNA is RF. i.e. the first read in a read pair is always in
@@ -406,6 +449,8 @@ Parameters of the driver script, `abyss-pe`
  * `t`: maximum length of blunt contigs to trim [`k`]
  * `v`: use `v=-v` for verbose logging, `v=-vv` for extra verbose
  * `x`: spaced seed (Bloom filter assembly only)
+ * `lr_s`: minimum contig size required for building scaffolds with linked reads (bp) [`S`]
+ * `lr_n`: minimum number of barcodes required for building scaffolds with linked reads [`10`]
 
 Please see the
 [abyss-pe](http://manpages.ubuntu.com/abyss-pe.1.html)
@@ -414,7 +459,7 @@ manual page for more information on assembly parameters.
 Environment variables
 =====================
 
-`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=20`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
+`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=96`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
 
 	abyss-pe env [options]
 


=====================================
Scaffold/longseqdist.cpp
=====================================
--- a/Scaffold/longseqdist.cpp
+++ b/Scaffold/longseqdist.cpp
@@ -120,8 +120,7 @@ static void readAlignments(istream& in, Graph& g)
 	SAMRecord rec, prev;
 	vector<Alignment> recs;
 	int i = 0;
-	while (prev.isUnmapped() || prev.mapq == 0)
-		in >> prev;
+	while (in >> prev && (prev.isUnmapped() || prev.mapq == 0));
 	recs.push_back(prev);
 	while (in >> rec) {
 		if (rec.isUnmapped() || rec.mapq == 0)


=====================================
Scaffold/scaffold.cc
=====================================
--- a/Scaffold/scaffold.cc
+++ b/Scaffold/scaffold.cc
@@ -6,6 +6,7 @@
 #include "IOUtil.h"
 #include "Iterator.h"
 #include "Uncompress.h"
+#include "Common/UnorderedMap.h"
 #include "Graph/Assemble.h"
 #include "Graph/ContigGraph.h"
 #include "Graph/ContigGraphAlgorithms.h"
@@ -39,7 +40,7 @@ static const char VERSION_MESSAGE[] =
 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
 "Written by Shaun Jackman.\n"
 "\n"
-"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
+"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
 
 static const char USAGE_MESSAGE[] =
 "Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA|OVERLAP DIST...\n"
@@ -54,9 +55,14 @@ static const char USAGE_MESSAGE[] =
 " Options:\n"
 "\n"
 "  -n, --npairs=N        minimum number of pairs [0]\n"
-"  -s, --seed-length=N   minimum contig length [200]\n"
-"          or -s N0-N1   Find the value of s in [N0,N1]\n"
+"      or -n A-B:S       Find the value of n in [A,B] with step size S\n"
 "                        that maximizes the scaffold N50.\n"
+"                        Default value for the step size is 1, if unspecified.\n"
+"  -s, --seed-length=N   minimum contig length [1000]\n"
+"      or -s A-B         Find the value of s in [A,B]\n"
+"                        that maximizes the scaffold N50.\n"
+"      --grid            optimize using a grid search [default]\n"
+"      --line            optimize using a line search\n"
 "  -k, --kmer=N          length of a k-mer\n"
 "  -G, --genome-size=N   expected genome size. Used to calculate NG50\n"
 "                        and associated stats [disabled]\n"
@@ -84,12 +90,17 @@ namespace opt {
 
 	unsigned k; // used by ContigProperties
 
+	/** Optimization search strategy. */
+	static int searchStrategy;
+
 	/** Minimum number of pairs. */
-	static unsigned minNumPairs;
+	static unsigned minEdgeWeight;
+	static unsigned minEdgeWeightEnd;
+	static unsigned minEdgeWeightStep;
 
 	/** Minimum contig length. */
-	static unsigned minContigLength = 200;
-	static unsigned minContigLengthEnd;
+	static unsigned minContigLength = 1000;
+	static unsigned minContigLengthEnd = 1000;
 
 	/** Genome size. Used to calculate NG50. */
 	static long long unsigned genomeSize;
@@ -124,7 +135,9 @@ static const char shortopts[] = "G:g:k:n:o:s:v";
 
 enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
 	OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
-//enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP };
+
+/** Optimization search strategy. */
+enum { GRID_SEARCH, LINE_SEARCH };
 
 static const struct option longopts[] = {
 	{ "graph",       no_argument,       NULL, 'g' },
@@ -133,6 +146,8 @@ static const struct option longopts[] = {
 	{ "min-gap",     required_argument, NULL, OPT_MIN_GAP },
 	{ "max-gap",     required_argument, NULL, OPT_MAX_GAP },
 	{ "npairs",      required_argument, NULL, 'n' },
+	{ "grid",        no_argument,       &opt::searchStrategy, GRID_SEARCH },
+	{ "line",        no_argument,       &opt::searchStrategy, LINE_SEARCH },
 	{ "out",         required_argument, NULL, 'o' },
 	{ "seed-length", required_argument, NULL, 's' },
 	{ "complex",     no_argument, &opt::comp_trans, 1 },
@@ -171,16 +186,17 @@ struct InvalidEdge {
 
 /** Return whether the specified edges has sufficient support. */
 struct PoorSupport {
-	PoorSupport(Graph& g) : m_g(g) { }
+	PoorSupport(Graph& g, unsigned minEdgeWeight) : m_g(g), m_minEdgeWeight(minEdgeWeight) { }
 	bool operator()(graph_traits<Graph>::edge_descriptor e) const
 	{
-		return m_g[e].numPairs < opt::minNumPairs;
+		return m_g[e].numPairs < m_minEdgeWeight;
 	}
 	const Graph& m_g;
+	unsigned m_minEdgeWeight;
 };
 
 /** Remove short vertices and unsupported edges from the graph. */
-static void filterGraph(Graph& g, unsigned minContigLength)
+static void filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength)
 {
 	typedef graph_traits<Graph> GTraits;
 	typedef GTraits::vertex_descriptor V;
@@ -203,7 +219,7 @@ static void filterGraph(Graph& g, unsigned minContigLength)
 
 	// Remove poorly-supported edges.
 	unsigned numBefore = num_edges(g);
-	remove_edge_if(PoorSupport(g), static_cast<DG&>(g));
+	remove_edge_if(PoorSupport(g, minEdgeWeight), static_cast<DG&>(g));
 	unsigned numRemovedE = numBefore - num_edges(g);
 	if (opt::verbose > 0)
 		cerr << "Removed " << numRemovedE << " edges.\n";
@@ -527,6 +543,8 @@ static ContigPath addDistEst(const Graph& g0, const Graph& g1,
 		assert(!v.ambiguous());
 		pair<E, bool> e0 = edge(u, v, g0);
 		pair<E, bool> e1 = edge(u, v, g1);
+		if (!e0.second && !e1.second)
+			std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> " << get(vertex_name, g0, v) << '\n';
 		assert(e0.second || e1.second);
 		const EP& ep = e0.second ? g0[e0.first] : g1[e1.first];
 		if (!isOverlap(ep)) {
@@ -591,7 +609,10 @@ unsigned addLength(const Graph& g, It first, It last)
 /** A container of contig paths. */
 typedef vector<ContigPath> ContigPaths;
 
-/** Build the scaffold length histogram. */
+/**
+  * Build the scaffold length histogram.
+  * @param g The graph g is destroyed.
+  */
 static Histogram buildScaffoldLengthHistogram(
 		Graph& g, const ContigPaths& paths)
 {
@@ -647,17 +668,45 @@ static void addCntgStatsToDb(
 	}
 }
 
+/** Parameters of scaffolding. */
+struct ScaffoldParam {
+	ScaffoldParam(unsigned n, unsigned s) : n(n), s(s) { }
+	bool operator==(const ScaffoldParam& o) const { return n == o.n && s == o.s; }
+	unsigned n;
+	unsigned s;
+};
+
+NAMESPACE_STD_HASH_BEGIN
+	template <> struct hash<ScaffoldParam> {
+		size_t operator()(const ScaffoldParam& param) const
+		{
+			return hash<unsigned>()(param.n) ^ hash<unsigned>()(param.s);
+		}
+	};
+NAMESPACE_STD_HASH_END
+
+/** Result of scaffolding. */
+struct ScaffoldResult : ScaffoldParam{
+	ScaffoldResult() : ScaffoldParam(0, 0), n50(0) { }
+	ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics)
+		: ScaffoldParam(n, s), n50(n50), metrics(metrics) { }
+	unsigned n50;
+	std::string metrics;
+};
+
 /** Build scaffold paths.
  * @param output write the results
  * @return the scaffold N50
  */
-unsigned scaffold(const Graph& g0, unsigned minContigLength,
+ScaffoldResult
+scaffold(const Graph& g0,
+		unsigned minEdgeWeight, unsigned minContigLength,
 		bool output)
 {
 	Graph g(g0);
 
 	// Filter the graph.
-	filterGraph(g, minContigLength);
+	filterGraph(g, minEdgeWeight, minContigLength);
 	if (opt::verbose > 0)
 		printGraphStats(cerr, g);
 
@@ -737,45 +786,198 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
 		addToDb(db, "scaffolds_assembled", paths.size());
 	}
 
+	if (output) {
+		// Output the paths.
+		ofstream fout(opt::out.c_str());
+		ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
+		assert_good(out, opt::out);
+		g_contigNames.unlock();
+		for (vector<ContigPath>::const_iterator it = paths.begin();
+				it != paths.end(); ++it)
+			out << createContigName() << '\t'
+				<< addDistEst(g0, g, *it) << '\n';
+		assert_good(out, opt::out);
+
+		// Output the graph.
+		if (!opt::graphPath.empty()) {
+			ofstream out(opt::graphPath.c_str());
+			assert_good(out, opt::graphPath);
+			write_dot(out, g);
+			assert_good(out, opt::graphPath);
+		}
+	}
+
+	// Compute contiguity metrics.
+	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+	std::ostringstream ss;
+	Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths);
+	printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH,
+			false, "\t", opt::genomeSize)
+		<< "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n';
+	std::string metrics = ss.str();
+	addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH);
+
+	return ScaffoldResult(minEdgeWeight, minContigLength,
+			scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(),
+			metrics);
+}
+
+/** Memoize the optimization results so far. */
+typedef unordered_map<ScaffoldParam, ScaffoldResult> ScaffoldMemo;
+
+/** Build scaffold paths, memoized. */
+ScaffoldResult
+scaffold_memoized(const Graph& g, unsigned n, unsigned s, ScaffoldMemo& memo)
+{
+	ScaffoldParam param(n, s);
+	ScaffoldMemo::const_iterator it = memo.find(param);
+	if (it != memo.end()) {
+		// Clear the metrics string, so that this result is not listed
+		// multiple times in the final table of metrics.
+		ScaffoldResult result(it->second);
+		result.metrics.clear();
+		return result;
+	}
+
+	if (opt::verbose > 0)
+		std::cerr << "\nScaffolding with n=" << n << " s=" << s << "\n\n";
+	ScaffoldResult result = scaffold(g, n, s, false);
+	memo[param] = result;
+
+	// Print assembly metrics.
+	if (opt::verbose > 0) {
+		std::cerr << '\n';
+		const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+	}
+	std::cerr << result.metrics;
+	if (opt::verbose > 0)
+		cerr << '\n';
+	return result;
+}
+
+/** Find the value of n that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_n(const Graph& g,
+		std::pair<unsigned, unsigned> minEdgeWeight,
+		unsigned minContigLength,
+		ScaffoldMemo& memo)
+{
+	std::string metrics_table;
+	unsigned bestn = 0, bestN50 = 0;
+	for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
+		ScaffoldResult result = scaffold_memoized(g, n, minContigLength, memo);
+		metrics_table += result.metrics;
+		if (result.n50 > bestN50) {
+			bestN50 = result.n50;
+			bestn = n;
+		}
+	}
+
+	return ScaffoldResult(bestn, minContigLength, bestN50, metrics_table);
+}
+
+/** Find the value of s that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_s(const Graph& g,
+		unsigned minEdgeWeight,
+		std::pair<unsigned, unsigned> minContigLength,
+		ScaffoldMemo& memo)
+{
+	std::string metrics_table;
+	unsigned bests = 0, bestN50 = 0;
+	const double STEP = cbrt(10); // Three steps per decade.
+	unsigned ilast = (unsigned)round(
+			log(minContigLength.second) / log(STEP));
+	for (unsigned i = (unsigned)round(
+				log(minContigLength.first) / log(STEP));
+			i <= ilast; ++i) {
+		unsigned s = (unsigned)pow(STEP, (int)i);
+
+		// Round to 1 figure.
+		double nearestDecade = pow(10, floor(log10(s)));
+		s = unsigned(round(s / nearestDecade) * nearestDecade);
+
+		ScaffoldResult result = scaffold_memoized(g, minEdgeWeight, s, memo);
+		metrics_table += result.metrics;
+		if (result.n50 > bestN50) {
+			bestN50 = result.n50;
+			bests = s;
+		}
+	}
+
+	return ScaffoldResult(minEdgeWeight, bests, bestN50, metrics_table);
+}
+
+/** Find the values of n and s that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_grid_search(const Graph& g,
+		std::pair<unsigned, unsigned> minEdgeWeight,
+		std::pair<unsigned, unsigned> minContigLength)
+{
 	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
-	if (!output) {
-		static bool printHeader = true;
-		Histogram h = buildScaffoldLengthHistogram(g, paths);
-		printContiguityStats(cerr, h, STATS_MIN_LENGTH,
-				printHeader, "\t", opt::genomeSize)
-			<< "\ts=" << minContigLength << '\n';
-		if (opt::verbose == 0)
-			printHeader = false;
-		addCntgStatsToDb(h, STATS_MIN_LENGTH);
-		return h.trimLow(STATS_MIN_LENGTH).n50();
-	}
-
-	// Output the paths.
-	ofstream fout(opt::out.c_str());
-	ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
-	assert_good(out, opt::out);
-	g_contigNames.unlock();
-	for (vector<ContigPath>::const_iterator it = paths.begin();
-			it != paths.end(); ++it)
-		out << createContigName() << '\t'
-			<< addDistEst(g0, g, *it) << '\n';
-	assert_good(out, opt::out);
-
-	// Output the graph.
-	if (!opt::graphPath.empty()) {
-		ofstream out(opt::graphPath.c_str());
-		assert_good(out, opt::graphPath);
-		write_dot(out, g);
-		assert_good(out, opt::graphPath);
-	}
-
-	// Print assembly contiguity statistics.
-	Histogram h = buildScaffoldLengthHistogram(g, paths);
-	printContiguityStats(cerr, h, STATS_MIN_LENGTH,
-			true, "\t", opt::genomeSize)
-		<< "\ts=" << minContigLength << '\n';
-	addCntgStatsToDb(h, STATS_MIN_LENGTH);
-	return h.trimLow(STATS_MIN_LENGTH).n50();
+	if (opt::verbose == 0)
+		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+
+	ScaffoldMemo memo;
+	std::string metrics_table;
+	ScaffoldResult best(0, 0, 0, "");
+	for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
+		ScaffoldResult result = optimize_s(g, n, minContigLength, memo);
+		metrics_table += result.metrics;
+		if (result.n50 > best.n50)
+			best = result;
+	}
+
+	best.metrics = metrics_table;
+	return best;
+}
+
+/** Find the values of n and s that maximizes the scaffold N50. */
+static ScaffoldResult
+optimize_line_search(const Graph& g,
+		std::pair<unsigned, unsigned> minEdgeWeight,
+		std::pair<unsigned, unsigned> minContigLength)
+{
+	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+	if (opt::verbose == 0)
+		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+
+	ScaffoldMemo memo;
+	std::string metrics_table;
+	ScaffoldResult best(
+			(minEdgeWeight.first + minEdgeWeight.second) / 2,
+			minContigLength.second, 0, "");
+	// An upper limit on the number of iterations.
+	const unsigned MAX_ITERATIONS = 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep;
+	for (unsigned i = 0; i < MAX_ITERATIONS; ++i) {
+		// Optimize s.
+		if (opt::verbose > 0) {
+			std::cerr << "\nOptimizing s for n=" << best.n << "\n\n";
+			printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+		}
+		unsigned previous_s = best.s;
+		best = optimize_s(g, best.n, minContigLength, memo);
+		metrics_table += best.metrics;
+		if (best.s == previous_s)
+			break;
+
+		// Optimize n.
+		if (opt::verbose > 0) {
+			std::cerr << "\nOptimizing n for s=" << best.s << "\n\n";
+			printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+		}
+		unsigned previous_n = best.n;
+		best = optimize_n(g, minEdgeWeight, best.s, memo);
+		metrics_table += best.metrics;
+		if (best.n == previous_n)
+			break;
+	}
+
+	std::cerr << "\nLine search converged in " << memo.size() << " iterations.\n";
+
+	best.metrics = metrics_table;
+	return best;
 }
 
 /** Run abyss-scaffold. */
@@ -806,7 +1008,16 @@ int main(int argc, char** argv)
 			arg >> opt::graphPath;
 			break;
 		  case 'n':
-			arg >> opt::minNumPairs;
+			arg >> opt::minEdgeWeight;
+			if (arg.peek() == '-') {
+				arg >> expect("-") >> opt::minEdgeWeightEnd;
+				assert(opt::minEdgeWeight <= opt::minEdgeWeightEnd);
+			} else
+				opt::minEdgeWeightEnd = opt::minEdgeWeight;
+			if (arg.peek() == ':')
+				arg >> expect(":") >> opt::minEdgeWeightStep;
+			else
+				opt::minEdgeWeightStep = 1;
 			break;
 		  case 'o':
 			arg >> opt::out;
@@ -818,7 +1029,8 @@ int main(int argc, char** argv)
 				arg >> expect("-") >> opt::minContigLengthEnd;
 				assert(opt::minContigLength
 						<= opt::minContigLengthEnd);
-			}
+			} else
+				opt::minContigLengthEnd = opt::minContigLength;
 			break;
 		  case 'v':
 			opt::verbose++;
@@ -908,37 +1120,51 @@ int main(int argc, char** argv)
 	if (!opt::db.empty())
 		addToDb(db, "Edges_invalid", numRemoved);
 
-	if (opt::minContigLengthEnd == 0) {
-		scaffold(g, opt::minContigLength, true);
-		return 0;
-	}
-
-	// Find the value of s that maximizes the scaffold N50.
-	unsigned bests = 0, bestN50 = 0;
-	const double STEP = cbrt(10); // Three steps per decade.
-	unsigned ilast = (unsigned)round(
-			log(opt::minContigLengthEnd) / log(STEP));
-	for (unsigned i = (unsigned)round(
-				log(opt::minContigLength) / log(STEP));
-			i <= ilast; ++i) {
-		unsigned s = (unsigned)pow(STEP, (int)i);
-
-		// Round to 1 figure.
-		double nearestDecade = pow(10, floor(log10(s)));
-		s = unsigned(round(s / nearestDecade) * nearestDecade);
+	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
+	if (opt::minEdgeWeight == opt::minEdgeWeightEnd
+			&& opt::minContigLength == opt::minContigLengthEnd) {
+		ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true);
+		// Print assembly contiguity statistics.
+		if (opt::verbose > 0)
+			std::cerr << '\n';
+		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+		std::cerr << result.metrics;
+	} else {
+		ScaffoldResult best(0, 0, 0, "");
+		switch (opt::searchStrategy) {
+			case GRID_SEARCH:
+				best = optimize_grid_search(g,
+						std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
+						std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
+				break;
+			case LINE_SEARCH:
+				best = optimize_line_search(g,
+						std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
+						std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
+				break;
+			default:
+			    abort();
+				break;
+		}
 
-		unsigned n50 = scaffold(g, s, false);
 		if (opt::verbose > 0)
-			cerr << '\n';
-		if (n50 > bestN50) {
-			bestN50 = n50;
-			bests = s;
+			std::cerr << "\nScaffolding with n=" << best.n << " s=" << best.s << "\n\n";
+		ScaffoldResult result = scaffold(g, best.n, best.s, true);
+
+		// Print the table of all the parameter values attempted and their metrics.
+		if (opt::verbose > 0) {
+			std::cerr << '\n';
+			printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+			std::cerr << best.metrics;
 		}
-	}
 
-	bestN50 = scaffold(g, bests, true);
-	cerr << "Best scaffold N50 is " << bestN50
-		<< " at s=" << bests << ".\n";
+		std::cerr << '\n' << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s << ".\n";
+
+		// Print assembly contiguity statistics.
+		std::cerr << '\n';
+		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
+		std::cerr << result.metrics;
+	}
 
 	return 0;
 }


=====================================
SimpleGraph/SimpleGraph.cpp
=====================================
--- a/SimpleGraph/SimpleGraph.cpp
+++ b/SimpleGraph/SimpleGraph.cpp
@@ -30,7 +30,7 @@ static const char VERSION_MESSAGE[] =
 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
 "Written by Jared Simpson and Shaun Jackman.\n"
 "\n"
-"Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
+"Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
 
 static const char USAGE_MESSAGE[] =
 "Usage: " PROGRAM " -k<kmer> -o<out.path> [OPTION]... ADJ DIST\n"
@@ -44,6 +44,8 @@ static const char USAGE_MESSAGE[] =
 " Options:\n"
 "\n"
 "  -k, --kmer=KMER_SIZE  k-mer size\n"
+"  -n, --npairs=N        minimum number of pairs [0]\n"
+"  -s, --seed-length=N   minimum seed contig length [0]\n"
 "  -d, --dist-error=N    acceptable error of a distance estimate\n"
 "                        default is 6 bp\n"
 "      --max-cost=COST   maximum computational cost\n"
@@ -73,6 +75,12 @@ namespace opt {
 	static int verbose;
 	static string out;
 
+	/** Minimum number of pairs. */
+	static unsigned minEdgeWeight;
+
+	/** Minimum seed length. */
+	static unsigned minSeedLength;
+
 	/** The acceptable error of a distance estimate. */
 	unsigned distanceError = 6;
 
@@ -80,7 +88,7 @@ namespace opt {
  	int format = DIST; // used by Estimate
 }
 
-static const char shortopts[] = "d:j:k:o:v";
+static const char shortopts[] = "d:j:k:n:o:s:v";
 
 enum { OPT_HELP = 1, OPT_VERSION, OPT_MAX_COST,
 	OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
@@ -88,6 +96,8 @@ enum { OPT_HELP = 1, OPT_VERSION, OPT_MAX_COST,
 
 static const struct option longopts[] = {
 	{ "kmer",        required_argument, NULL, 'k' },
+	{ "npairs",      required_argument, NULL, 'n' },
+	{ "seed-length", required_argument, NULL, 's' },
 	{ "dist-error",  required_argument, NULL, 'd' },
 	{ "max-cost",    required_argument, NULL, OPT_MAX_COST },
 	{ "out",         required_argument, NULL, 'o' },
@@ -125,7 +135,9 @@ int main(int argc, char** argv)
 			case 'j': arg >> opt::threads; break;
 			case 'k': arg >> opt::k; break;
 			case OPT_MAX_COST: arg >> opt::maxCost; break;
+			case 'n': arg >> opt::minEdgeWeight; break;
 			case 'o': arg >> opt::out; break;
+			case 's': arg >> opt::minSeedLength; break;
 			case 'v': opt::verbose++; break;
 			case OPT_HELP:
 				cout << USAGE_MESSAGE;
@@ -248,6 +260,9 @@ static unsigned g_minNumPairs = UINT_MAX;
 static unsigned g_minNumPairsUsed = UINT_MAX;
 
 static struct {
+	unsigned seedTooShort;
+	unsigned noEdges;
+	unsigned edgesRemoved;
 	unsigned totalAttempted;
 	unsigned uniqueEnd;
 	unsigned noPossiblePaths;
@@ -631,6 +646,16 @@ static void handleEstimate(const Graph& g,
 	pthread_mutex_unlock(&coutMutex);
 }
 
+/** Return whether the specified edge has sufficient support. */
+struct PoorSupport {
+	PoorSupport(unsigned minEdgeWeight) : m_minEdgeWeight(minEdgeWeight) { }
+	bool operator()(const Estimates::value_type& estimate) const
+	{
+		return estimate.second.numPairs < m_minEdgeWeight;
+	}
+	const unsigned m_minEdgeWeight;
+};
+
 struct WorkerArg {
 	istream* in;
 	ostream* out;
@@ -642,6 +667,9 @@ struct WorkerArg {
 static void* worker(void* pArg)
 {
 	WorkerArg& arg = *static_cast<WorkerArg*>(pArg);
+	unsigned countSeedTooShort = 0;
+	unsigned countNoEdges = 0;
+	unsigned countEdgesRemoved = 0;
 	for (;;) {
 		/** Lock the input stream. */
 		static pthread_mutex_t inMutex = PTHREAD_MUTEX_INITIALIZER;
@@ -651,6 +679,26 @@ static void* worker(void* pArg)
 		pthread_mutex_unlock(&inMutex);
 		if (!good)
 			break;
+		if ((*arg.graph)[ContigNode(er.refID, false)].length < opt::minSeedLength) {
+			++countSeedTooShort;
+			continue;
+		}
+
+		// Remove edges with insufficient support.
+		for (unsigned i = 0; i < 2; ++i) {
+			Estimates& estimates = er.estimates[i];
+			if (estimates.empty())
+				continue;
+			unsigned sizeBefore = estimates.size();
+			estimates.erase(
+				remove_if(estimates.begin(), estimates.end(), PoorSupport(opt::minEdgeWeight)),
+				estimates.end());
+			unsigned sizeAfter = estimates.size();
+			unsigned edgesRemoved = sizeBefore - sizeAfter;
+			countEdgesRemoved += edgesRemoved;
+			if (sizeAfter == 0)
+				++countNoEdges;
+		}
 
 		// Flip the anterior distance estimates.
 		for (Estimates::iterator it = er.estimates[1].begin();
@@ -673,6 +721,14 @@ static void* worker(void* pArg)
 			pthread_mutex_unlock(&outMutex);
 		}
 	}
+
+	static pthread_mutex_t statsMutex = PTHREAD_MUTEX_INITIALIZER;
+	pthread_mutex_lock(&statsMutex);
+	stats.seedTooShort += countSeedTooShort;
+	stats.noEdges += countNoEdges;
+	stats.edgesRemoved += countEdgesRemoved;
+	pthread_mutex_unlock(&statsMutex);
+
 	return NULL;
 }
 
@@ -707,6 +763,9 @@ static void generatePathsThroughEstimates(const Graph& g,
 		cout << '\n';
 
 	cout <<
+		"Seed too short: " << stats.seedTooShort << "\n"
+		"Seeds with no edges: " << stats.noEdges << "\n"
+		"Edges removed: " << stats.edgesRemoved << "\n"
 		"Total paths attempted: " << stats.totalAttempted << "\n"
 		"Unique path: " << stats.uniqueEnd << "\n"
 		"No possible paths: " << stats.noPossiblePaths << "\n"


=====================================
bin/abyss-pe
=====================================
--- a/bin/abyss-pe
+++ b/bin/abyss-pe
@@ -3,14 +3,22 @@
 # Written by Shaun Jackman <sjackman at bcgsc.ca> and
 # Anthony Raymond <traymond at bcgsc.ca>.
 
+SHELL=bash -e -o pipefail
 ifneq ($(shell command -v zsh),)
 # Set pipefail to ensure that all commands of a pipe succeed.
 SHELL=zsh -e -o pipefail
 # Report run time and memory usage with zsh.
 export REPORTTIME=1
 export TIMEFMT=time user=%U system=%S elapsed=%E cpu=%P memory=%M job=%J
+endif
+
+# Record run time and memory usage in a file using GNU time.
+ifdef time
+ifneq ($(shell command -v gtime),)
+gtime=command gtime -v -o $@.time
 else
-SHELL=bash -e -o pipefail
+gtime=command time -v -o $@.time
+endif
 endif
 
 # Define this environment variable on Mac OS X to read
@@ -93,10 +101,10 @@ endif
 
 # Use pigz or bgzip for parallel compression if available.
 ifneq ($(shell command -v pigz),)
-gzip=pigz -p$t
+gzip=pigz -p$j
 else
 ifneq ($(shell command -v bgzip),)
-gzip=bgzip -@$t
+gzip=bgzip -@$j
 else
 gzip=gzip
 endif
@@ -123,6 +131,13 @@ MARKDOWN=pandoc
 
 map=$(foreach a,$(2),$(call $(1),$(a)))
 deref=$($1)
+
+ifdef lr
+ifndef lib
+lib:=$(pe) $(lr)
+endif
+endif
+
 ifdef lib
 in?=$(call map, deref, $(lib))
 else
@@ -131,6 +146,7 @@ lib?=$(name)
 $(lib)?=$(in)
 endif
 endif
+
 pe?=$(lib)
 mp?=$(pe)
 
@@ -141,6 +157,9 @@ endif
 ifdef se
 override se:=$(strip $(se))
 endif
+ifdef lr
+override lr_reads=$(strip $(call map, deref, $(lr)))
+endif
 
 # Graph file format
 graph?=dot
@@ -287,13 +306,16 @@ $(foreach i,$(pe),$(eval $i_n?=$n))
 override deopt=$v $(dbopt) -j$j -k$k $(DISTANCEEST_OPTIONS) -l$($*_l) -s$($*_s) -n$($*_n) $($*_de)
 
 # SimpleGraph parameters
-sgopt += $(dbopt)
+sgopt += $(dbopt) -s$s -n$n
 ifdef d
 sgopt += -d$d
 endif
 
 # MergePaths parameters
-mpopt += $v $(dbopt) -j$j -k$k
+mpopt += $v $(dbopt) -j$j -k$k -s$s
+ifdef G
+mpopt += -G$G
+endif
 
 # PathOverlap parameters
 poopt += $v $(dbopt) -k$k
@@ -346,9 +368,9 @@ ifndef k
 error::
 	@>&2 echo 'abyss-pe: missing parameter `k`'
 endif
-ifeq ($(lib)$(in)$(se),)
+ifeq ($(lib)$(in)$(se)$(lr),)
 error::
-	@>&2 echo 'abyss-pe: missing parameter `lib`, `in` or `se`'
+	@>&2 echo 'abyss-pe: missing parameter `lib`, `in`, `se`, or `lr`'
 endif
 
 default:
@@ -371,7 +393,7 @@ help:
 	@echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users at bcgsc.ca.'
 
 version:
-	@echo "abyss-pe (ABySS) 2.0.3"
+	@echo "abyss-pe (ABySS) 2.1.0"
 	@echo "Written by Shaun Jackman and Anthony Raymond."
 	@echo
 	@echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"
@@ -689,9 +711,106 @@ endif
 %-8.$g: %-7.$g %-7.path
 	PathOverlap --overlap $(poopt) --$g $^ >$@
 
+# Scaffold using linked reads
+ifdef lr
+
+# Tigmint
+
+# Options for mapping the reads to the draft assembly.
+lr_l=$l
+override lrmapopt=$v -j$j -l$(lr_l) $(LR_MAP_OPTIONS)
+
+# Options for abyss-scaffold
+lr_s=1000-100000
+lr_n=5-20
+
+# Minimum AS/Read length ratio
+tigmint_as=0.65
+
+# Maximum number of mismatches
+tigmint_nm=5
+
+# Minimum mapping quality threshold
+tigmint_mapq=0
+
+# Maximum distance between reads to be considered the same molecule
+tigmint_d=50000
+
+# Minimum number of spanning molecules
+tigmint_n=10
+
+# Size of the window that must be spanned by moecules
+tigmint_w=1000
+
+# Align paired-end reads to the draft genome, sort by BX tag,
+# and create molecule extents BED.
+%.lr.bed: %.fa
+	$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+	| samtools sort -@$j -tBX -l0 -T$$(mktemp -u -t $@.XXXXXX) \
+	| tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) - \
+	| sort -k1,1 -k2,2n -k3,3n >$@
+
+# Align paired-end reads to the draft genome and sort by BX tag.
+%.lr.sortbx.bam: %.fa
+	$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+	| samtools sort -@$j -tBX -T$$(mktemp -u -t $@.XXXXXX) -o $@
+
+# Filter the BAM file, create molecule extents BED.
+%.lr.bed: %.lr.sortbx.bam
+	$(gtime) tigmint-molecule -a $(tigmint_as) -n $(tigmint_nm) -q $(tigmint_mapq) -d $(tigmint_d) $< \
+	| sort -k1,1 -k2,2n -k3,3n >$@
+
+# Cut sequences at assembly errors.
+%.tigmint.fa: %.lr.bed %.fa %.fa.fai
+	$(gtime) tigmint-cut -p$j -n$(tigmint_n) -w$(tigmint_w) -o $@ $*.fa $<
+
+# ARCS
+arcs_c=2
+arcs_d=0
+arcs_e=30000
+arcs_l=0
+arcs_m=4-20000
+arcs_r=0.05
+arcs_s=98
+arcs_z=500
+
+# Align reads and create a graph of linked contigs using ARCS.
+%.arcs.dist.gv: %.fa
+	$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+	| abyss-fixmate-ssq --all --qname $v -l$(lr_l) $(FIXMATE_OPTIONS) \
+	| arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
+		-g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
+
+# Align paired-end reads to the draft genome and do not sort.
+%.lr.sortn.sam.gz: %.fa
+	$(gtime) $(align) $(lrmapopt) $(lr_reads) $< \
+	| abyss-fixmate --all --qname $v -l$(lr_l) $(FIXMATE_OPTIONS) \
+	| $(gzip) >$@
+
+# Create a graph of linked contigs using ARCS.
+%.arcs.dist.gv: %.lr.sortn.sam.gz
+	gunzip -c $< \
+	| arcs $v -c$(arcs_c) -d$(arcs_d) -e$(arcs_e) -l$(arcs_l) -m$(arcs_m) -r$(arcs_r) -s$(arcs_s) -z$(arcs_z) \
+		-g $*.arcs.dist.gv --tsv=$*.arcs.tsv --barcode-counts=$*.arcs.barcode-counts.tsv /dev/stdin
+
+# Scaffold using ARCS and abyss-scaffold.
+%.arcs.path: %.arcs.dist.gv
+	abyss-scaffold $(scopt) -s$(lr_s) -n$(lr_n) -g $@.dot $(LR_SCAFFOLD_OPTIONS) $< >$@
+
+# Create the FASTA file of ARCS scaffolds.
+%.arcs.fa: %.fa %.arcs.path
+	MergeContigs $(mcopt) -o $@ $^
+
+%-scaffolds.fa: %-8.tigmint.arcs.fa
+	ln -sf $< $@
+
+else
+
 %-scaffolds.fa: %-8.fa
 	ln -sf $< $@
 
+endif
+
 %-scaffolds.$g: %-8.$g
 	ln -sf $< $@
 
@@ -854,4 +973,4 @@ env:
 	@echo '[override], if variable was defined with an override directive in (this) makefile.'
 
 	@$(foreach var,$(varList),\
-		echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]" | grep -v "undefined";)
+		echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]";)


=====================================
configure.ac
=====================================
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
 AC_PREREQ(2.62)
-AC_INIT(ABySS, 2.0.3, abyss-users at bcgsc.ca, abyss,
+AC_INIT(ABySS, 2.1.0, abyss-users at bcgsc.ca, abyss,
 		http://www.bcgsc.ca/platform/bioinfo/software/abyss)
 m4_include(m4/m4_ax_pthread.m4)
 AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects)


=====================================
doc/ABYSS.1
=====================================
--- a/doc/ABYSS.1
+++ b/doc/ABYSS.1
@@ -1,4 +1,4 @@
-.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.0.3" "User Commands"
+.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.1.0" "User Commands"
 .SH NAME
 ABYSS \- assemble short reads into contigs
 .SH SYNOPSIS


=====================================
doc/abyss-pe.1
=====================================
--- a/doc/abyss-pe.1
+++ b/doc/abyss-pe.1
@@ -1,4 +1,4 @@
-.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.0.3" "User Commands"
+.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.1.0" "User Commands"
 .SH NAME
 abyss-pe - assemble reads into contigs
 .SH SYNOPSIS


=====================================
doc/abyss-tofastq.1
=====================================
--- a/doc/abyss-tofastq.1
+++ b/doc/abyss-tofastq.1
@@ -1,4 +1,4 @@
-.TH abyss-tofastq "1" "2015-May" "ABySS 2.0.3" "User Commands"
+.TH abyss-tofastq "1" "2015-May" "ABySS 2.1.0" "User Commands"
 .SH NAME
 abyss-tofastq \- convert various file formats to FASTQ format
 .br



View it on GitLab: https://salsa.debian.org/med-team/abyss/commit/e2ac24df5fbe2bfbc5e90d8f8536715303bda141

---
View it on GitLab: https://salsa.debian.org/med-team/abyss/commit/e2ac24df5fbe2bfbc5e90d8f8536715303bda141
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180420/ed472467/attachment-0001.html>


More information about the debian-med-commit mailing list