[med-svn] [abyss] 01/07: New upstream version 2.0.1
Andreas Tille
tille at debian.org
Thu Oct 6 04:57:19 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository abyss.
commit 95e61ae091136a9f5c0e9f513aa330e54098308d
Author: Andreas Tille <tille at debian.org>
Date: Thu Oct 6 06:40:38 2016 +0200
New upstream version 2.0.1
---
.gitignore | 12 +-
Assembly/Options.cc | 8 +-
Bloom/Bloom.h | 10 +-
Bloom/BloomFilter.h | 21 +-
Bloom/BloomFilterWindow.h | 21 +-
Bloom/CascadingBloomFilter.h | 9 +-
Bloom/CascadingBloomFilterWindow.h | 14 +-
Bloom/ConcurrentBloomFilter.h | 10 +-
Bloom/bloom.cc | 470 ++++++-
BloomDBG/HashAgnosticCascadingBloom.h | 145 +++
BloomDBG/LightweightKmer.h | 96 ++
BloomDBG/Makefile.am | 23 +
BloomDBG/MaskedKmer.h | 121 ++
BloomDBG/RollingBloomDBG.h | 486 ++++++++
BloomDBG/RollingHash.h | 289 +++++
BloomDBG/RollingHashIterator.h | 234 ++++
BloomDBG/SpacedSeed.h | 79 ++
BloomDBG/bloom-dbg.cc | 345 ++++++
BloomDBG/bloom-dbg.h | 1276 ++++++++++++++++++++
COPYRIGHT | 84 +-
ChangeLog | 57 +
Common/Kmer.h | 8 +-
Common/Sequence.h | 9 +-
DataBase/Makefile.am | 3 +
DataBase/db-csv.cc | 3 +-
DataLayer/fac.cc | 27 +-
Dockerfile | 20 +
Graph/BreadthFirstSearch.h | 3 +
Graph/ExtendPath.h | 482 ++++++--
Graph/Path.h | 13 +
.../Konnector/integration-tests.mk | 24 +-
Konnector/DBGBloomAlgorithms.h | 11 +-
Konnector/README.md | 176 +++
Konnector/konnector.cc | 690 +++++++----
Konnector/konnector.h | 239 +++-
LICENSE | 22 +-
LogKmerCount/CountingBloomFilter.h | 4 +-
LogKmerCount/plc.h | 6 +-
Makefile.am | 15 +-
ParseAligns/abyss-fixmate.cc | 1 -
README.css | 39 -
README.md | 156 +--
Scaffold/drawgraph.cc | 2 +-
Scaffold/scaffold.cc | 21 +-
Sealer/Makefile.am | 2 +-
Sealer/README.md | 16 +-
Sealer/sealer.cc | 92 +-
SimpleGraph/SimpleGraph.cpp | 2 +-
Unittest/BloomDBG/BloomDBGTest.cpp | 155 +++
.../BloomDBG/HashAgnosticCascadingBloomTest.cpp | 46 +
Unittest/BloomDBG/MaskedKmerTest.cpp | 26 +
Unittest/BloomDBG/RollingBloomDBGTest.cpp | 275 +++++
Unittest/BloomDBG/RollingHashIteratorTest.cpp | 116 ++
Unittest/BloomDBG/RollingHashTest.cpp | 195 +++
Unittest/BloomDBG/SpacedSeedTest.cpp | 26 +
Unittest/Graph/ExtendPathTest.cpp | 98 +-
Unittest/Makefile.am | 191 +--
bin/abyss-adjtodot.pl | 2 +-
bin/abyss-cstont | 2 +-
bin/abyss-dida | 2 +-
bin/abyss-fac.pl | 2 +-
bin/abyss-fatoagp | 23 +-
bin/abyss-joindist | 2 +-
bin/abyss-pe | 123 +-
bin/abyss-samtoafg | 2 +-
configure.ac | 29 +-
doc/ABYSS.1 | 2 +-
doc/abyss-pe.1 | 19 +-
doc/abyss-tofastq.1 | 2 +-
doc/flowchart.graffle | 2 +-
lib/bloomfilter/BloomFilter.hpp | 446 +++++++
lib/bloomfilter/Makefile.am | 1 +
lib/bloomfilter/README.md | 4 +
lib/rolling-hash/Makefile.am | 1 +
lib/rolling-hash/README.md | 2 +
lib/rolling-hash/rolling.h | 316 +++++
76 files changed, 7156 insertions(+), 850 deletions(-)
diff --git a/.gitignore b/.gitignore
index 5253d26..dd0a13a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,14 +1,23 @@
-*/Makefile.in
+Makefile
Makefile.in
+README.html
aclocal.m4
autom4te.cache
+config.guess
+config.h
config.h.in
+config.log
+config.status
+config.sub
configure
depcomp
install-sh
+lib*.a
missing
+stamp-h1
test-driver
_*
+*.o
*.swp
*.swo
*.swn
@@ -16,3 +25,4 @@ _*
tags
compile
*.orig
+.deps
diff --git a/Assembly/Options.cc b/Assembly/Options.cc
index 379dd1f..8ae5d6f 100644
--- a/Assembly/Options.cc
+++ b/Assembly/Options.cc
@@ -51,16 +51,16 @@ static const char USAGE_MESSAGE[] =
" -k, --kmer=N the length of a k-mer (when -K is not set)\n"
" or the span of a k-mer pair (when -K is set)\n"
" -K, --single-kmer=N the length of a single k-mer in a k-mer pair\n"
-" -t, --trim-length=N maximum length of dangling edges to trim\n"
+" -t, --trim-length=N maximum length of blunt contigs to trim [k]\n"
" -c, --coverage=FLOAT remove contigs with mean k-mer coverage\n"
" less than this threshold\n"
" -b, --bubbles=N pop bubbles shorter than N bp [3*k]\n"
" -b0, --no-bubbles do not pop bubbles\n"
-" -e, --erode=N erode bases at the ends of blunt contigs\n"
-" with coverage less than this threshold\n"
+" -e, --erode=N erode bases at the ends of blunt contigs with coverage\n"
+" less than this threshold [round(sqrt(median))]\n"
" -E, --erode-strand=N erode bases at the ends of blunt contigs\n"
" with coverage less than this threshold on\n"
-" either strand\n"
+" either strand [1 if sqrt(median) > 2 else 0]\n"
" --coverage-hist=FILE write the k-mer coverage histogram to FILE\n"
" -m, --mask-cov do not include kmers containing masked bases in\n"
" coverage calculations [experimental]\n"
diff --git a/Bloom/Bloom.h b/Bloom/Bloom.h
index 3b1c6fe..cd90dfb 100644
--- a/Bloom/Bloom.h
+++ b/Bloom/Bloom.h
@@ -40,12 +40,13 @@ namespace Bloom {
size_t fullBloomSize;
size_t startBitPos;
size_t endBitPos;
+ size_t hashSeed;
};
/** Print a progress message after loading this many seqs */
static const unsigned LOAD_PROGRESS_STEP = 100000;
/** file format version number */
- static const unsigned BLOOM_VERSION = 2;
+ static const unsigned BLOOM_VERSION = 4;
/** Return the hash value of this object. */
inline static size_t hash(const key_type& key)
@@ -55,7 +56,7 @@ namespace Bloom {
key_type copy(key);
copy.reverseComplement();
- return hashmem(&copy, sizeof copy);
+ return hashmem(&copy, sizeof copy, 0);
}
/** Return the hash value of this object given seed. */
@@ -137,13 +138,12 @@ namespace Bloom {
(void)writeHeader;
out << BLOOM_VERSION << '\n';
- assert(out);
out << Kmer::length() << '\n';
- assert(out);
out << header.fullBloomSize
<< '\t' << header.startBitPos
<< '\t' << header.endBitPos
<< '\n';
+ out << header.hashSeed << '\n';
assert(out);
}
@@ -180,6 +180,8 @@ namespace Bloom {
>> expect("\t") >> header.endBitPos
>> expect("\n");
+ in >> header.hashSeed >> expect("\n");
+
assert(in);
assert(header.startBitPos < header.fullBloomSize);
assert(header.endBitPos < header.fullBloomSize);
diff --git a/Bloom/BloomFilter.h b/Bloom/BloomFilter.h
index 620f045..91b1aa4 100644
--- a/Bloom/BloomFilter.h
+++ b/Bloom/BloomFilter.h
@@ -20,10 +20,11 @@ class BloomFilter
public:
/** Constructor. */
- BloomFilter() : m_size(0), m_array(NULL) { }
+ BloomFilter() : m_size(0), m_hashSeed(0), m_array(NULL) { }
/** Constructor. */
- BloomFilter(size_t n) : m_size(n)
+ BloomFilter(size_t n, size_t hashSeed=0) : m_size(n),
+ m_hashSeed(hashSeed)
{
m_array = new char[(n + 7)/8]();
}
@@ -70,7 +71,7 @@ class BloomFilter
/** Return whether the object is present in this set. */
bool operator[](const Bloom::key_type& key) const
{
- return (*this)[Bloom::hash(key) % m_size];
+ return (*this)[Bloom::hash(key, m_hashSeed) % m_size];
}
/** Add the object with the specified index to this set. */
@@ -83,7 +84,7 @@ class BloomFilter
/** Add the object to this set. */
void insert(const Bloom::key_type& key)
{
- insert(Bloom::hash(key) % m_size);
+ insert(Bloom::hash(key, m_hashSeed) % m_size);
}
/** Operator for reading a bloom filter from a stream. */
@@ -106,6 +107,16 @@ class BloomFilter
Bloom::FileHeader header = Bloom::readHeader(in);
assert(in);
+ if (m_hashSeed != header.hashSeed) {
+ if (readOp == BITWISE_OVERWRITE) {
+ m_hashSeed = header.hashSeed;
+ } else {
+ std::cerr << "error: can't union/intersect bloom filters with "
+ << "different hash seeds\n";
+ exit(EXIT_FAILURE);
+ }
+ }
+
if (m_size != header.fullBloomSize) {
if (readOp == BITWISE_OVERWRITE) {
resize(header.fullBloomSize);
@@ -128,6 +139,7 @@ class BloomFilter
header.fullBloomSize = m_size;
header.startBitPos = 0;
header.endBitPos = m_size - 1;
+ header.hashSeed = m_hashSeed;
Bloom::writeHeader(out, header);
assert(out);
@@ -149,6 +161,7 @@ class BloomFilter
protected:
size_t m_size;
+ size_t m_hashSeed;
char* m_array;
};
diff --git a/Bloom/BloomFilterWindow.h b/Bloom/BloomFilterWindow.h
index c16f8dd..521d2a3 100644
--- a/Bloom/BloomFilterWindow.h
+++ b/Bloom/BloomFilterWindow.h
@@ -27,8 +27,9 @@ public:
* @param startBitPos index of first bit in the window
* @param endBitPos index of last bit in the window
*/
- BloomFilterWindow(size_t fullBloomSize, size_t startBitPos, size_t endBitPos) :
- BloomFilter(endBitPos - startBitPos + 1),
+ BloomFilterWindow(size_t fullBloomSize, size_t startBitPos,
+ size_t endBitPos, size_t hashSeed=0) :
+ BloomFilter(endBitPos - startBitPos + 1, hashSeed),
m_fullBloomSize(fullBloomSize),
m_startBitPos(startBitPos),
m_endBitPos(endBitPos)
@@ -88,7 +89,7 @@ public:
/** Return whether the object is present in this set. */
bool operator[](const Bloom::key_type& key) const
{
- return (*this)[Bloom::hash(key) % m_fullBloomSize];
+ return (*this)[Bloom::hash(key, m_hashSeed) % m_fullBloomSize];
}
/** Add the object with the specified index to this set. */
@@ -101,7 +102,7 @@ public:
/** Add the object to this set. */
void insert(const Bloom::key_type& key)
{
- insert(Bloom::hash(key) % m_fullBloomSize);
+ insert(Bloom::hash(key, m_hashSeed) % m_fullBloomSize);
}
/** Operator for reading a bloom filter from a stream. */
@@ -128,6 +129,16 @@ public:
m_startBitPos = header.startBitPos;
m_endBitPos = header.endBitPos;
+ if (m_hashSeed != header.hashSeed) {
+ if (readOp == BITWISE_OVERWRITE) {
+ m_hashSeed = header.hashSeed;
+ } else {
+ std::cerr << "error: can't union/intersect bloom filters with "
+ << "different hash seed values\n";
+ exit(EXIT_FAILURE);
+ }
+ }
+
size_t bits = header.endBitPos - header.startBitPos + 1;
if (m_size != bits) {
@@ -152,6 +163,8 @@ public:
header.fullBloomSize = m_fullBloomSize;
header.startBitPos = m_startBitPos;
header.endBitPos = m_endBitPos;
+ header.hashSeed = m_hashSeed;
+
Bloom::writeHeader(out, header);
assert(out);
diff --git a/Bloom/CascadingBloomFilter.h b/Bloom/CascadingBloomFilter.h
index 1cf3f93..30b9844 100644
--- a/Bloom/CascadingBloomFilter.h
+++ b/Bloom/CascadingBloomFilter.h
@@ -18,11 +18,11 @@ class CascadingBloomFilter
CascadingBloomFilter() {}
/** Constructor */
- CascadingBloomFilter(size_t n, size_t max_count)
+ CascadingBloomFilter(size_t n, size_t max_count, size_t hashSeed=0) : m_hashSeed(hashSeed)
{
m_data.reserve(max_count);
for (unsigned i = 0; i < max_count; i++)
- m_data.push_back(new BloomFilter(n));
+ m_data.push_back(new BloomFilter(n, hashSeed));
}
/** Destructor */
@@ -68,7 +68,7 @@ class CascadingBloomFilter
bool operator[](const Bloom::key_type& key) const
{
assert(m_data.back() != NULL);
- return (*m_data.back())[Bloom::hash(key) % m_data.back()->size()];
+ return (*m_data.back())[Bloom::hash(key, m_hashSeed) % m_data.back()->size()];
}
/** Add the object with the specified index to this multiset. */
@@ -87,7 +87,7 @@ class CascadingBloomFilter
void insert(const Bloom::key_type& key)
{
assert(m_data.back() != NULL);
- insert(Bloom::hash(key) % m_data.back()->size());
+ insert(Bloom::hash(key, m_hashSeed) % m_data.back()->size());
}
/** Get the Bloom filter for a given level */
@@ -111,6 +111,7 @@ class CascadingBloomFilter
}
private:
+ size_t m_hashSeed;
std::vector<BloomFilter*> m_data;
};
diff --git a/Bloom/CascadingBloomFilterWindow.h b/Bloom/CascadingBloomFilterWindow.h
index 8aa4106..fa33910 100644
--- a/Bloom/CascadingBloomFilterWindow.h
+++ b/Bloom/CascadingBloomFilterWindow.h
@@ -17,13 +17,14 @@ class CascadingBloomFilterWindow : private CascadingBloomFilter
* @param endBitPos index of last bit in the window
* @param max_count the maximum count value of the Bloom filter
*/
- CascadingBloomFilterWindow(size_t fullBloomSize, size_t startBitPos, size_t endBitPos,
- unsigned max_count)
- : m_fullBloomSize(fullBloomSize)
+ CascadingBloomFilterWindow(size_t fullBloomSize, size_t startBitPos,
+ size_t endBitPos, unsigned max_count, size_t hashSeed=0)
+ : m_fullBloomSize(fullBloomSize), m_hashSeed(hashSeed)
{
m_data.reserve(max_count);
- for (unsigned i = 0; i < max_count; ++i)
- m_data.push_back(new BloomFilterWindow(fullBloomSize, startBitPos, endBitPos));
+ for (unsigned i = 0; i < max_count; i++)
+ m_data.push_back(new BloomFilterWindow(fullBloomSize,
+ startBitPos, endBitPos, hashSeed));
}
/** Return the size of the bit array. */
@@ -62,7 +63,7 @@ class CascadingBloomFilterWindow : private CascadingBloomFilter
void insert(const Bloom::key_type& key)
{
assert(m_data.back() != NULL);
- insert(Bloom::hash(key) % m_fullBloomSize);
+ insert(Bloom::hash(key, m_hashSeed) % m_fullBloomSize);
}
void write(std::ostream& out) const
@@ -87,6 +88,7 @@ class CascadingBloomFilterWindow : private CascadingBloomFilter
private:
size_t m_fullBloomSize;
+ size_t m_hashSeed;
std::vector<BloomFilterWindow*> m_data;
};
diff --git a/Bloom/ConcurrentBloomFilter.h b/Bloom/ConcurrentBloomFilter.h
index 3dcc68f..18e45a3 100644
--- a/Bloom/ConcurrentBloomFilter.h
+++ b/Bloom/ConcurrentBloomFilter.h
@@ -20,8 +20,9 @@ class ConcurrentBloomFilter
public:
/** Constructor */
- ConcurrentBloomFilter(BloomFilterType& bloom, size_t numLocks) :
- m_bloom(bloom), m_locks(numLocks)
+ ConcurrentBloomFilter(BloomFilterType& bloom, size_t numLocks,
+ size_t hashSeed=0) : m_bloom(bloom), m_locks(numLocks),
+ m_hashSeed(hashSeed)
{
m_windowSize = bloom.size() / numLocks;
// round down to the nearest byte boundary,
@@ -54,7 +55,7 @@ public:
/** Return whether the object is present in this set. */
bool operator[](const Bloom::key_type& key) const
{
- return *this[Bloom::hash(key) % m_bloom.size()];
+ return *this[Bloom::hash(key, m_hashSeed) % m_bloom.size()];
}
/** Add the object with the specified index to this set. */
@@ -69,7 +70,7 @@ public:
/** Add the object to this set. */
void insert(const Bloom::key_type& key)
{
- insert(Bloom::hash(key) % m_bloom.size());
+ insert(Bloom::hash(key, m_hashSeed) % m_bloom.size());
}
private:
@@ -90,6 +91,7 @@ private:
BloomFilterType& m_bloom;
std::vector<omp_lock_t> m_locks;
+ size_t m_hashSeed;
size_t m_windowSize;
};
diff --git a/Bloom/bloom.cc b/Bloom/bloom.cc
index 31f7eaa..d8efc2d 100644
--- a/Bloom/bloom.cc
+++ b/Bloom/bloom.cc
@@ -6,7 +6,12 @@
#include "Common/Options.h"
#include "Common/Kmer.h"
#include "Common/BitUtil.h"
+#include "Common/KmerIterator.h"
+#include "Graph/Path.h"
+#include "Graph/ExtendPath.h"
+#include "Konnector/DBGBloom.h"
#include "DataLayer/Options.h"
+#include "DataLayer/FastaReader.h"
#include "Common/StringUtil.h"
#include "Bloom/Bloom.h"
#include "Bloom/BloomFilter.h"
@@ -19,6 +24,7 @@
#include <iostream>
#include <fstream>
#include <sstream>
+#include <cmath>
#if _OPENMP
# include <omp.h>
@@ -41,6 +47,9 @@ static const char USAGE_MESSAGE[] =
"Usage 2: " PROGRAM " union [GLOBAL_OPTS] [COMMAND_OPTS] <OUTPUT_BLOOM_FILE> <BLOOM_FILE_1> <BLOOM_FILE_2> [BLOOM_FILE_3]...\n"
"Usage 3: " PROGRAM " intersect [GLOBAL_OPTS] [COMMAND_OPTS] <OUTPUT_BLOOM_FILE> <BLOOM_FILE_1> <BLOOM_FILE_2> [BLOOM_FILE_3]...\n"
"Usage 4: " PROGRAM " info [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE>\n"
+"Usage 5: " PROGRAM " compare [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE_1> <BLOOM_FILE_2>\n"
+"Usage 6: " PROGRAM " kmers [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE>\n"
+"Usage 7: " PROGRAM " trim [GLOBAL_OPTS] [COMMAND_OPTS] <BLOOM_FILE> <READS_FILE> [READS_FILE_2]... > trimmed.fq\n"
"Build and manipulate bloom filter files.\n"
"\n"
" Global options:\n"
@@ -55,6 +64,7 @@ static const char USAGE_MESSAGE[] =
" -b, --bloom-size=N size of bloom filter [500M]\n"
" -B, --buffer-size=N size of I/O buffer for each thread, in bytes [100000]\n"
" -j, --threads=N use N parallel threads [1]\n"
+" -h, --hash-seed=N seed for hash function [0]\n"
" -l, --levels=N build a cascading bloom filter with N levels\n"
" and output the last level\n"
" -L, --init-level='N=FILE' initialize level N of cascading bloom filter\n"
@@ -76,8 +86,23 @@ static const char USAGE_MESSAGE[] =
" Options for `" PROGRAM " union': (none)\n"
" Options for `" PROGRAM " intersect': (none)\n"
" Options for `" PROGRAM " info': (none)\n"
+" Options for `" PROGRAM " compare':\n"
"\n"
-"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
+" -m, --method=`String' choose distance calculation method \n"
+" [`jaccard'(default), `forbes', `czekanowski']\n"
+"\n"
+" Options for `" PROGRAM " kmers':\n"
+"\n"
+" -r, --inverse get k-mers that are *NOT* in the bloom filter\n"
+" --bed output k-mers in BED format\n"
+" --fasta output k-mers in FASTA format [default]\n"
+" --raw output k-mers in raw format (one per line)\n"
+"\n"
+" Options for `" PROGRAM " trim': (none)\n"
+"\n"
+"Report bugs to <" PACKAGE_BUGREPORT ">.\n";;
+
+enum OutputFormat { BED, FASTA, RAW };
namespace opt {
@@ -90,6 +115,9 @@ namespace opt {
/** The number of parallel threads. */
unsigned threads = 1;
+ /** Seed for Bloom filter hash function. */
+ size_t hashSeed = 0;
+
/** The size of a k-mer. */
unsigned k;
@@ -115,31 +143,49 @@ namespace opt {
/** Number of windows in complete bloom filter.
("N" for -w option) */
unsigned windows = 0;
+
+ /* Method for similarity or distance calculation.
+ -m option
+ */
+ string method("jaccard");
+
+ /* Inverse option to retrieve kmers which are not
+ in the filter
+ */
+ bool inverse = false;
+
+ OutputFormat format = FASTA;
}
-static const char shortopts[] = "b:B:j:k:l:L:n:q:vw:";
+static const char shortopts[] = "b:B:h:j:k:l:L:m:n:q:rvw:";
-enum { OPT_HELP = 1, OPT_VERSION };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_BED, OPT_FASTA, OPT_RAW };
static const struct option longopts[] = {
- { "bloom-size", required_argument, NULL, 'b' },
- { "buffer-size", required_argument, NULL, 'B' },
- { "threads", required_argument, NULL, 'j' },
- { "kmer", required_argument, NULL, 'k' },
- { "levels", required_argument, NULL, 'l' },
- { "init-level", required_argument, NULL, 'L' },
- { "chastity", no_argument, &opt::chastityFilter, 1 },
- { "no-chastity", no_argument, &opt::chastityFilter, 0 },
- { "trim-masked", no_argument, &opt::trimMasked, 1 },
- { "no-trim-masked", no_argument, &opt::trimMasked, 0 },
- { "num-locks", required_argument, NULL, 'n' },
- { "trim-quality", required_argument, NULL, 'q' },
+ { "bloom-size", required_argument, NULL, 'b' },
+ { "buffer-size", required_argument, NULL, 'B' },
+ { "hash-seed", required_argument, NULL, 'h' },
+ { "threads", required_argument, NULL, 'j' },
+ { "kmer", required_argument, NULL, 'k' },
+ { "levels", required_argument, NULL, 'l' },
+ { "init-level", required_argument, NULL, 'L' },
+ { "chastity", no_argument, &opt::chastityFilter, 1 },
+ { "no-chastity", no_argument, &opt::chastityFilter, 0 },
+ { "trim-masked", no_argument, &opt::trimMasked, 1 },
+ { "no-trim-masked", no_argument, &opt::trimMasked, 0 },
+ { "num-locks", required_argument, NULL, 'n' },
+ { "trim-quality", required_argument, NULL, 'q' },
{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
- { "verbose", no_argument, NULL, 'v' },
- { "help", no_argument, NULL, OPT_HELP },
- { "version", no_argument, NULL, OPT_VERSION },
- { "window", required_argument, NULL, 'w' },
+ { "verbose", no_argument, NULL, 'v' },
+ { "help", no_argument, NULL, OPT_HELP },
+ { "version", no_argument, NULL, OPT_VERSION },
+ { "window", required_argument, NULL, 'w' },
+ { "method", required_argument, NULL, 'm' },
+ { "inverse", required_argument, NULL, 'r' },
+ { "bed", no_argument, NULL, OPT_BED },
+ { "fasta", no_argument, NULL, OPT_FASTA },
+ { "raw", no_argument, NULL, OPT_RAW },
{ NULL, 0, NULL, 0 }
};
@@ -318,6 +364,8 @@ int build(int argc, char** argv)
opt::bloomSize = SIToBytes(arg); break;
case 'B':
arg >> opt::bufferSize; break;
+ case 'h':
+ arg >> opt::hashSeed; break;
case 'j':
arg >> opt::threads; break;
case 'l':
@@ -404,10 +452,10 @@ int build(int argc, char** argv)
if (opt::windows == 0) {
if (opt::levels == 1) {
- BloomFilter bloom(bits);
+ BloomFilter bloom(bits, opt::hashSeed);
#ifdef _OPENMP
ConcurrentBloomFilter<BloomFilter>
- cbf(bloom, opt::numLocks);
+ cbf(bloom, opt::numLocks, opt::hashSeed);
loadFilters(cbf, argc, argv);
#else
loadFilters(bloom, argc, argv);
@@ -416,11 +464,11 @@ int build(int argc, char** argv)
writeBloom(bloom, outputPath);
}
else {
- CascadingBloomFilter cascadingBloom(bits, opt::levels);
+ CascadingBloomFilter cascadingBloom(bits, opt::levels, opt::hashSeed);
initBloomFilterLevels(cascadingBloom);
#ifdef _OPENMP
ConcurrentBloomFilter<CascadingBloomFilter>
- cbf(cascadingBloom, opt::numLocks);
+ cbf(cascadingBloom, opt::numLocks, opt::hashSeed);
loadFilters(cbf, argc, argv);
#else
loadFilters(cascadingBloom, argc, argv);
@@ -441,14 +489,16 @@ int build(int argc, char** argv)
endBitPos = bits - 1;
if (opt::levels == 1) {
- BloomFilterWindow bloom(bits, startBitPos, endBitPos);
+ BloomFilterWindow bloom(bits, startBitPos,
+ endBitPos, opt::hashSeed);
loadFilters(bloom, argc, argv);
printBloomStats(cerr, bloom);
writeBloom(bloom, outputPath);
}
else {
CascadingBloomFilterWindow cascadingBloom(
- bits, startBitPos, endBitPos, opt::levels);
+ bits, startBitPos, endBitPos, opt::levels,
+ opt::hashSeed);
initBloomFilterLevels(cascadingBloom);
loadFilters(cascadingBloom, argc, argv);
printCascadingBloomStats(cerr, cascadingBloom);
@@ -544,6 +594,365 @@ int info(int argc, char** argv)
return 0;
}
+int compare(int argc, char ** argv){
+ parseGlobalOpts(argc, argv);
+ // Arg parser to get `m' option in case set
+ for (int c; (c = getopt_long(argc, argv,
+ shortopts, longopts, NULL)) != -1;) {
+ istringstream arg(optarg != NULL ? optarg : "");
+ switch (c) {
+ case '?':
+ cerr << PROGRAM ": unrecognized option: `-" << optopt
+ << "'" << endl;
+ dieWithUsageError();
+ case 'm':
+ arg >> opt::method; break;
+ break;
+ }
+ if (optarg != NULL && (!arg.eof() || arg.fail())) {
+ cerr << PROGRAM ": invalid option: `-"
+ << (char)c << optarg << "'\n";
+ exit(EXIT_FAILURE);
+ }
+ if (opt::method != "jaccard" && opt::method != "czekanowski" && opt::method != "forbes")
+ std::cerr << "Invalid method: " << opt::method << std::endl;
+ }
+
+
+ // Set method string
+ string method(opt::method);
+ if (opt::verbose)
+ std::cerr << "Computing distance for 2"
+ << " samples...\n";
+ // Get both paths and open istreams
+ BloomFilter bloomA;
+ string pathA(argv[optind]);
+ BloomFilter bloomB;
+ string pathB(argv[optind+1]);
+ if (opt::verbose)
+ std::cerr << "Loading bloom filters from "
+ << pathA << " and " << pathB << "...\n";
+ istream* inA = openInputStream(pathA);
+ istream* inB = openInputStream(pathB);
+ // Assert state of streams
+ assert_good(*inA, pathA);
+ assert_good(*inB, pathB);
+ // Not sure this conversion is needed, check docs
+ std::istream & tA = *inA;
+ std::istream & tB = *inB;
+ // Need to read header for bit start and end info
+ Bloom::FileHeader headerA = Bloom::readHeader(tA);
+ Bloom::FileHeader headerB = Bloom::readHeader(tB);
+ // Need to assert after every read operation
+ assert(tA);
+ assert(tB);
+
+ const size_t IO_BUFFER_SIZE = 32 * 1024;
+ unsigned char mask = 1;
+ // The number of total bits in the vector
+ size_t bitsA = headerA.endBitPos - headerA.startBitPos + 1;
+ size_t bitsB = headerB.endBitPos - headerB.startBitPos + 1;
+ // They need to be the same size to be comparable
+ if(bitsA != bitsB ) {
+ std::cerr << "Bit sizes of arrays not equal" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (opt::verbose)
+ std::cerr << "Bits: " << bitsA << std::endl;
+ /* As in Choi et al. (2010),
+ a - cases where both bits are set (1/1)
+ b - cases where bits are set in the first but nor the second (1/0)
+ c - cases where bits are set in the second but not the first (0/1)
+ d - cases where bits are not set in either (0/0)
+ */
+ unsigned long a = 0;
+ unsigned long b = 0;
+ unsigned long c = 0;
+ unsigned long d = 0;
+ // Iteratively compare bits
+ for(size_t i = 0; i < bitsA;){
+ char bufferA[IO_BUFFER_SIZE];
+ char bufferB[IO_BUFFER_SIZE];
+ // The number of bits in the buffer is its size * 8 except for the last iteration
+ size_t bitsRead = std::min(IO_BUFFER_SIZE * 8, bitsA - i);
+ size_t bytesRead = (bitsRead + 7)/8;
+ // Read bytes from the the istream and immediately assert
+ tA.read(bufferA, bytesRead);
+ tB.read(bufferB, bytesRead);
+ assert(tA);
+ assert(tB);
+ // For each byte in the buffer, compare bits
+ for(size_t j = 0; j < IO_BUFFER_SIZE; j++){
+ // Compare bit-wise
+ for(int bit = 0; bit < 8; bit++){
+ bool f = (bufferA[j] & (mask << bit)) != 0;
+ bool s = (bufferB[j] & (mask << bit)) != 0;
+ if( f == 1 && s == 1 ) {
+ a++;
+ } else if( f == 1 && s == 0) {
+ b++;
+ } else if( f == 0 && s == 1) {
+ c++;
+ } else d++;
+ }
+ }
+ i += bitsRead;
+ }
+ assert(tA);
+ assert(tB);
+ // Result output:
+ std::cout << "1/1: " << a << "\n1/0: " << b << "\n0/1: " << c << "\n0/0: " << d << std::endl;
+ if(method == "jaccard"){
+ float Dist = (float)a/(float)(a+b+c);
+ std::cout << "Jaccard similarity: " << Dist << std::endl;
+ }
+ if(method == "czekanowski"){
+ float Dist = (2*(float)a)/(float)((2*a)+b+c);
+ std::cout << "Czekanowski similarity: " << Dist << std::endl;
+ }
+ if(method == "forbes"){
+ float n = (float)(a + b + c + d);
+ float Dist = (n*a - ((a+b)*(a+c))) / (n*std::min(a+b,a+c) - ((a+b) * (a+c)));
+ std::cout << "Forbes similarity: " << Dist << std::endl;
+ }
+ // Check and clean up
+ assert_good(tA, pathA);
+ assert_good(tA, pathB);
+ closeInputStream(inA, pathA);
+ closeInputStream(inB, pathB);
+
+ return 1;
+}
+
+int memberOf(int argc, char ** argv){
+ // Initialise bloom and get globals
+ BloomFilter bloom;
+ parseGlobalOpts(argc, argv);
+ // Arg parser to get `m' option in case set
+ for (int c; (c = getopt_long(argc, argv,
+ shortopts, longopts, NULL)) != -1;) {
+ istringstream arg(optarg != NULL ? optarg : "");
+ switch (c) {
+ case '?':
+ cerr << PROGRAM ": unrecognized option: `-" << optopt
+ << "'" << endl;
+ dieWithUsageError();
+ case 'r':
+ opt::inverse = true; break;
+ break;
+ case OPT_BED:
+ opt::format = BED;
+ break;
+ case OPT_FASTA:
+ opt::format = FASTA;
+ break;
+ case OPT_RAW:
+ opt::format = RAW;
+ break;
+ }
+ if (optarg != NULL && (!arg.eof() || arg.fail())) {
+ cerr << PROGRAM ": invalid option: `-"
+ << (char)c << optarg << "'\n";
+ exit(EXIT_FAILURE);
+ }
+ }
+ string path = argv[optind];
+ string fasta = argv[++optind];
+ unsigned k = opt::k;
+ if (opt::verbose)
+ std::cerr << "Loading bloom filter from `"
+ << path << "'...\n";
+
+ istream* in = openInputStream(path);
+ assert_good(*in, path);
+ *in >> bloom;
+
+ assert(!fasta.empty());
+ if (opt::verbose)
+ std::cerr << "Reading `" << fasta << "'...\n";
+ FastaReader _in(fasta.c_str(), FastaReader::FOLD_CASE);
+
+ size_t seqCount=0;
+ for (FastaRecord rec; _in >> rec; ++seqCount) {
+ string& seq = rec.seq;
+ if (seq.size() < k)
+ continue;
+ for (size_t i = 0; i < seq.size() - k + 1; ++i) {
+ string kmer = seq.substr(i, k);
+ size_t pos = kmer.find_last_not_of("ACGTacgt");
+ if (pos != string::npos) {
+ i += pos;
+ continue;
+ }
+ if (bloom[Kmer(kmer)] || opt::inverse) {
+ if (opt::format == FASTA) {
+ cout << ">" << rec.id << ":seq:" << seqCount
+ << ":kmer:" << i << "\n";
+ } else if (opt::format == BED) {
+ cout << rec.id
+ << "\t" << i
+ << "\t" << i + k - 1
+ << "\t";
+ }
+ cout << kmer << "\n";
+ }
+ }
+ if (opt::verbose && seqCount % 1000 == 0)
+ cerr << "processed " << seqCount << " sequences" << endl;
+ }
+ assert(_in.eof());
+ if (opt::verbose)
+ cerr << "processed " << seqCount << " sequences" << endl;
+
+ return 0;
+}
+
+/**
+ * Calculate number of bases to trim from left end of sequence.
+ */
+int calcLeftTrim(const Sequence& seq, unsigned k, const BloomFilter& bloom,
+ size_t minBranchLen)
+{
+ // Boost graph interface for Bloom filter
+ DBGBloom<BloomFilter> g(bloom);
+
+ // if this is the first k-mer we have found in
+ // Bloom filter, starting from the left end
+ // of the sequence
+ bool firstKmerMatch = true;
+
+ KmerIterator it(seq, k);
+ for (; it != KmerIterator::end(); ++it) {
+
+ const Kmer& kmer = *it;
+
+ // assume k-mers not present in Bloom filter are
+ // due to sequencing errors and should be trimmed
+ if (!bloom[kmer])
+ continue;
+
+ // in degree, disregarding false branches
+ unsigned inDegree = trueBranches(kmer, REVERSE, g,
+ minBranchLen).size();
+ // out degree, disregarding false branches
+ unsigned outDegree = trueBranches(kmer, FORWARD, g,
+ minBranchLen).size();
+
+ if (firstKmerMatch) {
+ bool leftTip = (inDegree == 0 && outDegree == 1);
+ bool rightTip = (inDegree == 1 && outDegree == 0);
+ if (!leftTip && !rightTip)
+ break;
+ } else if (inDegree != 1 || outDegree != 1) {
+ // end of linear path
+ break;
+ }
+
+ firstKmerMatch = false;
+
+ } // for each k-mer (left to right)
+
+ if (it.pos() == 0)
+ return 0;
+
+ return k + it.pos() - 1;
+}
+
+/**
+ * Trim reads that corresponds to tips in the Bloom filter
+ * de Bruijn graph.
+ */
+int trim(int argc, char** argv)
+{
+ // parse command line opts
+ parseGlobalOpts(argc, argv);
+ unsigned k = opt::k;
+
+ // arg 1: Bloom filter
+ // args 2-n: FASTA/FASTQ files
+ if (argc - optind < 2) {
+ cerr << PROGRAM ": missing arguments\n";
+ dieWithUsageError();
+ }
+
+ // load Bloom filter de Bruijn graph
+ string bloomPath(argv[optind++]);
+ if (opt::verbose)
+ cerr << "Loading bloom filter from `"
+ << bloomPath << "'...\n";
+
+ BloomFilter bloom;
+ istream *in = openInputStream(bloomPath);
+ assert_good(*in, bloomPath);
+ bloom.read(*in);
+ assert_good(*in, bloomPath);
+
+ if (opt::verbose)
+ printBloomStats(cerr, bloom);
+
+ // Calculate min length threshold for a "true branch"
+ // (not due to Bloom filter false positives)
+ const double falseBranchProbability = 0.0001;
+ const size_t minBranchLen =
+ (size_t)ceil(log(falseBranchProbability)/log(bloom.FPR()));
+
+ if (opt::verbose >= 2)
+ cerr << "min length threshold for true branches (k-mers): "
+ << minBranchLen << endl;
+
+ size_t readCount = 0;
+
+ // trim reads and print to STDOUT
+ for (int i = optind; i < argc; ++i) {
+
+ if (opt::verbose)
+ cerr << "Reading `" << argv[i] << "'..." << endl;
+
+ FastaReader in(argv[i], FastaReader::FOLD_CASE);
+ for (FastqRecord rec; in >> rec; ++readCount) {
+
+ Sequence& seq = rec.seq;
+ string& qual = rec.qual;
+
+ // can't trim if read length < k; just echo
+ // back to STDOUT
+ if (seq.size() < k) {
+ cout << rec;
+ continue;
+ }
+
+ // start pos for trimmed read
+ unsigned startPos = calcLeftTrim(seq, k, bloom, minBranchLen);
+ // end pos for trimmed read
+ unsigned endPos = seq.length() - 1 -
+ calcLeftTrim(reverseComplement(seq), k, bloom, minBranchLen);
+
+ // if whole read was trimmed away
+ if (endPos < startPos)
+ continue;
+
+ // output trimmed read
+ unsigned trimmedLen = endPos - startPos + 1;
+ seq = seq.substr(startPos, trimmedLen);
+ qual = qual.substr(startPos, trimmedLen);
+ cout << rec;
+
+ if (opt::verbose && (readCount+1) % 100000 == 0)
+ cerr << "Processed " << (readCount+1) << " reads"
+ << endl;
+
+ } // for each read
+ assert(in.eof());
+
+ } // for each input FASTA/FASTQ file
+
+ if (opt::verbose)
+ cerr << "Processed " << readCount << " reads" << endl;
+
+ // success
+ return 0;
+}
+
int main(int argc, char** argv)
{
if (argc < 2)
@@ -572,6 +981,17 @@ int main(int argc, char** argv)
else if (command == "info") {
return info(argc, argv);
}
+ else if (command == "compare") {
+ return compare(argc, argv);
+ }
+ else if (command == "kmers" || command == "getKmers") {
+ return memberOf(argc, argv);
+ }
+ else if (command == "trim") {
+ return trim(argc, argv);
+ }
+ cerr << PROGRAM ": unrecognized command: `" << command
+ << "'" << endl;
dieWithUsageError();
}
diff --git a/BloomDBG/HashAgnosticCascadingBloom.h b/BloomDBG/HashAgnosticCascadingBloom.h
new file mode 100644
index 0000000..559a4c6
--- /dev/null
+++ b/BloomDBG/HashAgnosticCascadingBloom.h
@@ -0,0 +1,145 @@
+/**
+ * A cascading Bloom filter
+ * Copyright 2015 Shaun Jackman, Ben Vandervalk.
+ */
+#ifndef HASH_AGNOSTIC_CASCADING_BLOOM_H
+#define HASH_AGNOSTIC_CASCADING_BLOOM_H 1
+
+#include "lib/bloomfilter/BloomFilter.hpp"
+#include <vector>
+
+/**
+ * An implementation of a Cascading Bloom filter.
+ * A Cascading Bloom filter implements a crude
+ * counting mechanism using an array of _l_ Bloom
+ * filters; we say that such a Bloom filter has
+ * l _levels_. Each time an element is inserted, we
+ * check for its presence in each level, and then
+ * insert the element into the first Bloom filter
+ * where the element is not already present.
+ *
+ * We use the Cascading Bloom filter to filter
+ * out error k-mers from the de Bruijn graph, since
+ * these k-mers typically only occur once in
+ * the data.
+ */
+class HashAgnosticCascadingBloom
+{
+ public:
+
+ /** Default constructor. Creates zero levels; most members
+ * (size(), popcount(), contains()) must not be called until a
+ * properly-sized object has been constructed or assigned. */
+ HashAgnosticCascadingBloom() : m_k(0), m_hashes(0) {}
+
+ /**
+ * Constructor.
+ * @param size size of the Bloom filters (in bits)
+ * @param hashes number of hash functions
+ * @param levels number of levels in Cascading Bloom filter
+ * @param k k-mer size
+ */
+ HashAgnosticCascadingBloom(size_t size, unsigned hashes,
+ size_t levels, unsigned k) : m_k(k), m_hashes(hashes)
+ {
+ m_data.reserve(levels);
+ for (unsigned i = 0; i < levels; i++)
+ m_data.push_back(new BloomFilter(size, hashes, k));
+ }
+
+ /** Destructor. Frees the BloomFilter owned at each level. */
+ /* NOTE(review): this class owns raw BloomFilter pointers and
+ * deletes them here, but declares no copy constructor or
+ * assignment operator (rule of three); copying an instance
+ * would cause a double delete. Consider disallowing copies. */
+ ~HashAgnosticCascadingBloom()
+ {
+ typedef std::vector<BloomFilter*>::iterator Iterator;
+ for (Iterator i = m_data.begin(); i != m_data.end(); i++) {
+ assert(*i != NULL);
+ delete *i;
+ }
+ }
+
+ /** Return k-mer size used by Bloom filter. */
+ unsigned getKmerSize() const { return m_k; }
+
+ /** Return number of hash functions used by Bloom filter */
+ unsigned getHashNum() const { return m_hashes; }
+
+ /** Return the size of the bit array.
+ * Precondition: at least one level exists; calling this on a
+ * default-constructed object dereferences an empty vector. */
+ size_t size() const
+ {
+ assert(m_data.back() != NULL);
+ return m_data.back()->getPop() >= 0, m_data.back()->getFilterSize();
+ }
+
+ /** Return the number of elements with count >= levels. */
+ size_t popcount() const
+ {
+ assert(m_data.back() != NULL);
+ return m_data.back()->getPop();
+ }
+
+ /** Return the estimated false positive rate of the last level */
+ double FPR() const
+ {
+ return pow((double)popcount()/size(), m_hashes);
+ }
+
+ /**
+ * Return true if the element with the given hash values
+ * has count >= levels (i.e. it is present in the last level).
+ */
+ bool contains(const std::vector<size_t>& hashes) const
+ {
+ assert(m_data.back() != NULL);
+ return m_data.back()->contains(hashes);
+ }
+
+ /**
+ * Return true if the element with the given hash values
+ * has count >= levels (i.e. it is present in the last level).
+ */
+ bool contains(const size_t hashes[]) const
+ {
+ assert(m_data.back() != NULL);
+ return m_data.back()->contains(hashes);
+ }
+
+ /** Increment the count of the element with the given hash values:
+ * insert it into the first level that does not already contain it. */
+ void insert(const std::vector<size_t>& hashes)
+ {
+ for (unsigned i = 0; i < m_data.size(); ++i) {
+ assert(m_data.at(i) != NULL);
+ if (!(*m_data[i]).contains(hashes)) {
+ m_data[i]->insert(hashes);
+ break;
+ }
+ }
+ }
+
+ /** Increment the count of the element with the given hash values:
+ * insert it into the first level that does not already contain it. */
+ void insert(const size_t hashes[])
+ {
+ for (unsigned i = 0; i < m_data.size(); ++i) {
+ assert(m_data.at(i) != NULL);
+ if (!(*m_data[i]).contains(hashes)) {
+ m_data[i]->insert(hashes);
+ break;
+ }
+ }
+ }
+
+ /** Get the Bloom filter for a given level */
+ BloomFilter& getBloomFilter(unsigned level)
+ {
+ assert(m_data.at(level) != NULL);
+ return *m_data.at(level);
+ }
+
+ private:
+
+ /** k-mer length */
+ unsigned m_k;
+ /** number of hash functions */
+ unsigned m_hashes;
+ /** the array of Bloom filters, one per level (owned) */
+ std::vector<BloomFilter*> m_data;
+};
+
+#endif
diff --git a/BloomDBG/LightweightKmer.h b/BloomDBG/LightweightKmer.h
new file mode 100644
index 0000000..72c5723
--- /dev/null
+++ b/BloomDBG/LightweightKmer.h
@@ -0,0 +1,96 @@
+#ifndef LIGHTWEIGHT_KMER_H
+#define LIGHTWEIGHT_KMER_H 1
+
+#include <algorithm>
+#include <cstring>
+#include <boost/shared_array.hpp>
+
+/**
+ * Class that stores a shared pointer to a k-mer (char array).
+ *
+ * I implemented this class because I observed that storing and
+ * copying the full char array between data structures was hurting
+ * performance and using a lot of memory.
+ *
+ * Having a lightweight k-mer representation is particularly
+ * important when using it as the `vertex_descriptor` in a Boost graph.
+ */
+class LightweightKmer
+{
+private:
+
+ /** Shared pointer to k-mer data */
+ boost::shared_array<char> m_kmer;
+
+public:
+
+ /** Default constructor. Leaves the k-mer pointer null. */
+ LightweightKmer() {}
+
+ /** Constructor. Copies Kmer::length() bases from `kmer`.
+ * NOTE(review): this header uses Kmer (and, in operator==,
+ * MaskedKmer) without including their headers itself; it
+ * relies on the including file providing them -- confirm
+ * include order at call sites. */
+ LightweightKmer(const char* kmer) : m_kmer(new char[Kmer::length()])
+ {
+ const unsigned k = Kmer::length();
+ std::copy(kmer, kmer + k, m_kmer.get());
+ }
+
+ /** Get pointer to raw char array for k-mer */
+ char* c_str() { return (char*)m_kmer.get(); }
+
+ /** Get pointer to raw char array for k-mer (read-only) */
+ const char* c_str() const { return (const char*)m_kmer.get(); }
+
+ /** Shift the k-mer one base (SENSE shifts the array left,
+ * ANTISENSE shifts it right) and set the new incoming base.
+ * Note: copies of a LightweightKmer share the underlying
+ * array (boost::shared_array), so this mutates all copies. */
+ void shift(extDirection dir, char charIn = 'A')
+ {
+ const unsigned k = Kmer::length();
+ assert(k >= 2);
+ if (dir == SENSE) {
+ memmove(m_kmer.get(), m_kmer.get() + 1, k - 1);
+ } else {
+ memmove(m_kmer.get() + 1, m_kmer.get(), k - 1);
+ }
+ setLastBase(dir, charIn);
+ }
+
+ /** Change the last (SENSE) or first (ANTISENSE) base of the k-mer */
+ void setLastBase(extDirection dir, char base)
+ {
+ const unsigned k = Kmer::length();
+ unsigned pos = (dir == SENSE) ? k - 1 : 0;
+ setBase(pos, base);
+ }
+
+ /** Change a base within the k-mer */
+ void setBase(unsigned pos, char base)
+ {
+ assert(pos < Kmer::length());
+ *(m_kmer.get() + pos) = base;
+ }
+
+ /** Get the base (ACGT) at a given position */
+ char getBase(unsigned pos) const
+ {
+ return *(m_kmer.get() + pos);
+ }
+
+ /** Equality operator. Honors the global spaced seed
+ * (MaskedKmer::mask()): positions masked with '0' are ignored. */
+ bool operator==(const LightweightKmer& o) const
+ {
+ unsigned k = Kmer::length();
+ const std::string& spacedSeed = MaskedKmer::mask();
+
+ if (spacedSeed.empty()) {
+ return !memcmp(m_kmer.get(), o.m_kmer.get(), k);
+ } else {
+ assert(spacedSeed.length() == k);
+ for (unsigned i = 0; i < k; ++i) {
+ if (spacedSeed.at(i) != '0' && getBase(i) != o.getBase(i))
+ return false;
+ }
+ return true;
+ }
+ }
+};
+
+#endif
diff --git a/BloomDBG/Makefile.am b/BloomDBG/Makefile.am
new file mode 100644
index 0000000..652c48e
--- /dev/null
+++ b/BloomDBG/Makefile.am
@@ -0,0 +1,23 @@
+# Build rules for abyss-bloom-dbg, the Bloom filter de Bruijn
+# graph assembler.
+bin_PROGRAMS = abyss-bloom-dbg
+
+abyss_bloom_dbg_CPPFLAGS = -I$(top_srcdir) \
+ -I$(top_srcdir)/Common \
+ -I$(top_srcdir)/DataLayer
+
+# Enable OpenMP for multi-threaded operation.
+abyss_bloom_dbg_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
+
+abyss_bloom_dbg_LDADD = \
+ $(top_builddir)/DataLayer/libdatalayer.a \
+ $(top_builddir)/Common/libcommon.a
+
+# Headers are listed so `make dist` includes them in the tarball.
+abyss_bloom_dbg_SOURCES = bloom-dbg.cc \
+ bloom-dbg.h \
+ MaskedKmer.h \
+ SpacedSeed.h \
+ HashAgnosticCascadingBloom.h \
+ LightweightKmer.h \
+ RollingBloomDBG.h \
+ RollingHash.h \
+ RollingHashIterator.h \
+ $(top_srcdir)/lib/bloomfilter/BloomFilter.hpp \
+ $(top_srcdir)/lib/rolling-hash/rolling.h
diff --git a/BloomDBG/MaskedKmer.h b/BloomDBG/MaskedKmer.h
new file mode 100644
index 0000000..4336c04
--- /dev/null
+++ b/BloomDBG/MaskedKmer.h
@@ -0,0 +1,121 @@
+#ifndef MASKED_KMER_H
+#define MASKED_KMER_H 1
+
+#include "Common/Kmer.h"
+#include "Common/Hash.h"
+#include "Common/Sequence.h"
+#include <iostream>
+#include <string>
+#include <cstdlib>
+
+class MaskedKmer : public Kmer
+{
+public:
+
+ /** Default constructor */
+ MaskedKmer() : Kmer() {}
+
+ /**
+ * Constructor.
+ * @param seq k-mer sequence
+ */
+ explicit MaskedKmer(const Sequence& seq) : Kmer(seq) {}
+
+ /** Set global k-mer mask (a.k.a. spaced seed).
+ * An invalid seed pattern is a fatal error (exits the process). */
+ static void setMask(const std::string& kmerMask)
+ {
+ /* setLength() must be called before setMask() */
+ assert(length() > 0);
+
+ /* set global bitmask */
+ mask() = kmerMask;
+
+ /* empty mask is equivalent to string of '1's */
+ if (kmerMask.empty())
+ return;
+
+ /* check for valid spaced seed pattern */
+ if (mask().length() != length()) {
+ std::cerr << "error: spaced seed must be exactly k bits long\n";
+ exit(EXIT_FAILURE);
+ } else if (mask().find_first_not_of("01") != std::string::npos) {
+ std::cerr << "error: spaced seed must contain only '0's or '1's\n";
+ exit(EXIT_FAILURE);
+ } else if (*mask().begin() != '1' || *mask().rbegin() != '1') {
+ std::cerr << "error: spaced seed must begin and end with '1's\n";
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ /** Get global k-mer mask (shared, mutable function-local singleton) */
+ static std::string& mask()
+ {
+ static std::string s_kmerMask;
+ return s_kmerMask;
+ }
+
+ /** Compare this k-mer to another, ignoring masked-out ('0')
+ * positions when a spaced seed is active. */
+ int compare(const Kmer& other) const
+ {
+ if (!mask().empty()) {
+ /* compare masked copies so '0' positions cannot differ */
+ Kmer kmer1(*this), kmer2(other);
+ maskKmer(kmer1);
+ maskKmer(kmer2);
+ return kmer1.compare(kmer2);
+ }
+ return Kmer::compare(other);
+ }
+
+ /** Equality operator */
+ bool operator==(const MaskedKmer& other) const
+ {
+ return compare(other) == 0;
+ }
+
+ /** Inequality operator */
+ bool operator!=(const MaskedKmer& other) const
+ {
+ return compare(other) != 0;
+ }
+
+ /** Less-than operator */
+ bool operator<(const MaskedKmer& other) const
+ {
+ return compare(other) < 0;
+ }
+
+ /** Mask out don't-care positions by changing them to 'A',
+ * so that masked k-mers compare and hash identically. */
+ static void maskKmer(Kmer& kmer)
+ {
+ if (mask().empty())
+ return;
+
+ assert(mask().length() == length());
+ for(size_t i = 0; i < mask().length(); ++i) {
+ if (mask().at(i) == '0')
+ kmer.set(i, baseToCode('A'));
+ }
+ }
+};
+
+/** Return the reverse complement of the specified k-mer.
+ * Returns a copy; the argument is not modified. */
+static inline MaskedKmer reverseComplement(const MaskedKmer& seq)
+{
+ MaskedKmer rc(seq);
+ rc.reverseComplement();
+ return rc;
+}
+
+/** Define default hash function for use with STL containers.
+ * Hashes a masked copy so that k-mers equal under the spaced
+ * seed also hash equally. */
+NAMESPACE_STD_HASH_BEGIN
+template <> struct hash<MaskedKmer> {
+ size_t operator()(const MaskedKmer& kmer) const
+ {
+ MaskedKmer kmerCopy(kmer);
+ MaskedKmer::maskKmer(kmerCopy);
+ return kmerCopy.getHashCode();
+ }
+};
+NAMESPACE_STD_HASH_END
+
+#endif
diff --git a/BloomDBG/RollingBloomDBG.h b/BloomDBG/RollingBloomDBG.h
new file mode 100644
index 0000000..b671522
--- /dev/null
+++ b/BloomDBG/RollingBloomDBG.h
@@ -0,0 +1,486 @@
+/**
+ * de Bruijn Graph data structure using a Bloom filter
+ * Copyright 2015 Shaun Jackman, Ben Vandervalk
+ */
+
+#ifndef ROLLING_BLOOM_DBG_H
+#define ROLLING_BLOOM_DBG_H 1
+
+#include "Assembly/SeqExt.h" // for NUM_BASES
+#include "Common/Hash.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "Graph/Properties.h"
+#include "BloomDBG/RollingHash.h"
+#include "BloomDBG/LightweightKmer.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdlib> // for abort
+#include <fstream>
+#include <string>
+#include <utility> // for std::pair
+#include <vector>
+#include <iostream>
+
+#define BASE_CHARS "ACGT"
+
+using boost::graph_traits;
+
+/**
+ * Represents a vertex in the de Bruijn graph: a k-mer bundled
+ * with its precomputed rolling hash state, so neighbours can be
+ * enumerated without rehashing the whole k-mer.
+ */
+struct RollingBloomDBGVertex
+{
+private:
+
+ LightweightKmer m_kmer;
+ RollingHash m_rollingHash;
+
+public:
+
+ RollingBloomDBGVertex() {}
+
+ RollingBloomDBGVertex(const char* kmer, const RollingHash rollingHash)
+ : m_kmer(kmer), m_rollingHash(rollingHash) {}
+
+ const LightweightKmer& kmer() const { return m_kmer; };
+ const RollingHash& rollingHash() const { return m_rollingHash; }
+
+ /** Deep copy. Needed because LightweightKmer copies share the
+ * underlying char array; clone() allocates a fresh one. */
+ RollingBloomDBGVertex clone() const {
+ return RollingBloomDBGVertex(m_kmer.c_str(), m_rollingHash);
+ }
+
+ /** Roll the vertex one base in the given direction, updating
+ * both the hash state and the stored k-mer bytes. */
+ void shift(extDirection dir, char charIn = 'A')
+ {
+ if (dir == SENSE) {
+ m_rollingHash.rollRight(m_kmer.c_str(), charIn);
+ } else {
+ m_rollingHash.rollLeft(charIn, m_kmer.c_str());
+ }
+ m_kmer.shift(dir, charIn);
+ }
+
+ /** Replace the last (SENSE) or first (ANTISENSE) base.
+ * NOTE(review): delegates to RollingHash::setBase, which
+ * appears to also write the new base into the k-mer array --
+ * confirm against lib/rolling-hash/rolling.h. */
+ void setLastBase(extDirection dir, char base)
+ {
+ const unsigned k = Kmer::length();
+ if (dir == SENSE) {
+ m_rollingHash.setBase(m_kmer.c_str(), k-1, base);
+ } else {
+ m_rollingHash.setBase(m_kmer.c_str(), 0, base);
+ }
+ }
+
+ /**
+ * Comparison operator that takes spaced seed bitmask into account.
+ * Compares the canonical hash values first as a cheap reject.
+ */
+ bool operator==(const RollingBloomDBGVertex& o) const
+ {
+ /* do fast comparison first */
+ if (m_rollingHash != o.m_rollingHash)
+ return false;
+
+ return m_kmer == o.m_kmer;
+ }
+
+ /**
+ * Inequality operator that takes spaced seed bitmask into account.
+ */
+ bool operator!=(const RollingBloomDBGVertex& o) const
+ {
+ return !(*this == o);
+ }
+};
+
+NAMESPACE_STD_HASH_BEGIN
+template <> struct hash<RollingBloomDBGVertex> {
+ /**
+ * Hash function for graph vertex type (vertex_descriptor).
+ * Reuses the precomputed canonical rolling hash seed, so no
+ * rehash of the k-mer bases is needed.
+ */
+ size_t operator()(const RollingBloomDBGVertex& vertex) const
+ {
+ return vertex.rollingHash().getHashSeed();
+ }
+};
+NAMESPACE_STD_HASH_END
+
+/**
+ * A de Bruijn graph represented implicitly by a Bloom filter:
+ * a k-mer is a vertex iff its hash values are all present in
+ * the filter. Boost graph concepts are provided by the
+ * graph_traits specialization and free functions below.
+ */
+template <typename BF>
+class RollingBloomDBG: public BF {
+ public:
+ /** The bundled vertex properties. */
+ typedef no_property vertex_bundled;
+ typedef no_property vertex_property_type;
+
+ /** The bundled edge properties. */
+ typedef no_property edge_bundled;
+ typedef no_property edge_property_type;
+
+ /** The bloom filter (reference; not owned) */
+ const BF& m_bloom;
+
+ RollingBloomDBG(const BF& bloom) : m_bloom(bloom) {}
+
+ private:
+ /** Copy constructor (declared but not defined: copying is
+ * disallowed). */
+ RollingBloomDBG(const RollingBloomDBG<BF>&);
+
+}; // class RollingBloomDBG
+
+// Graph
+
+namespace boost {
+
+/** Graph traits */
+template <typename BF>
+struct graph_traits< RollingBloomDBG<BF> > {
+ // Graph
+
+ /**
+ * Identifier for accessing a vertex in the graph:
+ * a k-mer together with its precomputed rolling hash state.
+ * (NOTE(review): an earlier revision apparently used a pair
+ * whose second member held the hash values; some free
+ * functions later in this file still reference u.first /
+ * u.second.)
+ */
+ typedef RollingBloomDBGVertex vertex_descriptor;
+ typedef boost::directed_tag directed_category;
+ struct traversal_category
+ : boost::adjacency_graph_tag,
+ boost::bidirectional_graph_tag,
+ boost::vertex_list_graph_tag
+ { };
+ typedef boost::disallow_parallel_edge_tag edge_parallel_category;
+
+ // IncidenceGraph
+ typedef std::pair<vertex_descriptor, vertex_descriptor>
+ edge_descriptor;
+ typedef unsigned degree_size_type;
+
+ // VertexListGraph
+ typedef size_t vertices_size_type;
+ typedef void vertex_iterator;
+
+ // EdgeListGraph
+ typedef size_t edges_size_type;
+ typedef void edge_iterator;
+
+// AdjacencyGraph
+
+/** Iterate through the adjacent vertices of a vertex. */
+struct adjacency_iterator
+ : public std::iterator<std::input_iterator_tag, vertex_descriptor>
+{
+ /** Skip to the next edge that is present, i.e. the next of the
+ * four candidate successor k-mers found in the Bloom filter. */
+ void next()
+ {
+ for (; m_i < NUM_BASES; ++m_i) {
+ m_v.setLastBase(SENSE, BASE_CHARS[m_i]);
+ if (vertex_exists(m_v, *m_g))
+ break;
+ }
+ }
+
+ public:
+
+ adjacency_iterator() { }
+
+ /** End-iterator constructor (m_i == NUM_BASES). */
+ adjacency_iterator(const RollingBloomDBG<BF>& g) : m_g(&g), m_i(NUM_BASES) { }
+
+ /** Begin-iterator constructor; m_v is a deep copy of u
+ * shifted one base in the SENSE direction. */
+ adjacency_iterator(const RollingBloomDBG<BF>& g, const vertex_descriptor& u)
+ : m_g(&g), m_u(u), m_v(u.clone()), m_i(0)
+ {
+ m_v.shift(SENSE);
+ next();
+ }
+
+ /** Note: returns a reference to internal state that is
+ * overwritten by operator++. */
+ const vertex_descriptor& operator*() const
+ {
+ assert(m_i < NUM_BASES);
+ return m_v;
+ }
+
+ /** Note: compares only the base index, which suffices for
+ * comparing against the end iterator of the same vertex. */
+ bool operator==(const adjacency_iterator& it) const
+ {
+ return m_i == it.m_i;
+ }
+
+ bool operator!=(const adjacency_iterator& it) const
+ {
+ return !(*this == it);
+ }
+
+ adjacency_iterator& operator++()
+ {
+ assert(m_i < NUM_BASES);
+ ++m_i;
+ next();
+ return *this;
+ }
+
+ adjacency_iterator operator++(int)
+ {
+ adjacency_iterator it = *this;
+ ++*this;
+ return it;
+ }
+
+ private:
+ const RollingBloomDBG<BF>* m_g;
+ vertex_descriptor m_u;
+ vertex_descriptor m_v;
+ short unsigned m_i;
+}; // adjacency_iterator
+
+/** IncidenceGraph: iterate over outgoing edges of a vertex. */
+struct out_edge_iterator
+ : public std::iterator<std::input_iterator_tag, edge_descriptor>
+{
+ /** Skip to the next edge that is present. */
+ void next()
+ {
+ for (; m_i < NUM_BASES; ++m_i) {
+ m_v.setLastBase(SENSE, BASE_CHARS[m_i]);
+ if (vertex_exists(m_v, *m_g))
+ break;
+ }
+ }
+
+ public:
+ out_edge_iterator() { }
+
+ /** End-iterator constructor (m_i == NUM_BASES). */
+ out_edge_iterator(const RollingBloomDBG<BF>& g) : m_g(&g), m_i(NUM_BASES) { }
+
+ out_edge_iterator(const RollingBloomDBG<BF>& g, const vertex_descriptor& u)
+ : m_g(&g), m_u(u), m_v(u.clone()), m_i(0)
+ {
+ m_v.shift(SENSE);
+ next();
+ }
+
+ /** Unlike adjacency_iterator, returns a value; m_v is deep
+ * copied so the edge remains valid after increment. */
+ edge_descriptor operator*() const
+ {
+ assert(m_i < NUM_BASES);
+ return edge_descriptor(m_u, m_v.clone());
+ }
+
+ bool operator==(const out_edge_iterator& it) const
+ {
+ return m_i == it.m_i;
+ }
+
+ bool operator!=(const out_edge_iterator& it) const
+ {
+ return !(*this == it);
+ }
+
+ out_edge_iterator& operator++()
+ {
+ assert(m_i < NUM_BASES);
+ ++m_i;
+ next();
+ return *this;
+ }
+
+ out_edge_iterator operator++(int)
+ {
+ out_edge_iterator it = *this;
+ ++*this;
+ return it;
+ }
+
+ private:
+ const RollingBloomDBG<BF>* m_g;
+ vertex_descriptor m_u;
+ vertex_descriptor m_v;
+ unsigned m_i;
+}; // out_edge_iterator
+
+/** BidirectionalGraph: iterate over incoming edges of a vertex. */
+struct in_edge_iterator
+ : public std::iterator<std::input_iterator_tag, edge_descriptor>
+{
+ /** Skip to the next edge that is present. */
+ void next()
+ {
+ for (; m_i < NUM_BASES; ++m_i) {
+ m_v.setLastBase(ANTISENSE, BASE_CHARS[m_i]);
+ if (vertex_exists(m_v, *m_g))
+ break;
+ }
+ }
+
+ public:
+ in_edge_iterator() { }
+
+ /** End-iterator constructor (m_i == NUM_BASES). */
+ in_edge_iterator(const RollingBloomDBG<BF>& g) : m_g(&g), m_i(NUM_BASES) { }
+
+ in_edge_iterator(const RollingBloomDBG<BF>& g, const vertex_descriptor& u)
+ : m_g(&g), m_u(u), m_v(u.clone()), m_i(0)
+ {
+ m_v.shift(ANTISENSE);
+ next();
+ }
+
+ /** The incoming edge runs from the candidate predecessor m_v
+ * to the anchor vertex m_u. */
+ edge_descriptor operator*() const
+ {
+ assert(m_i < NUM_BASES);
+ return edge_descriptor(m_v.clone(), m_u);
+ }
+
+ bool operator==(const in_edge_iterator& it) const
+ {
+ return m_i == it.m_i;
+ }
+
+ bool operator!=(const in_edge_iterator& it) const
+ {
+ return !(*this == it);
+ }
+
+ in_edge_iterator& operator++()
+ {
+ assert(m_i < NUM_BASES);
+ ++m_i;
+ next();
+ return *this;
+ }
+
+ in_edge_iterator operator++(int)
+ {
+ in_edge_iterator it = *this;
+ ++*this;
+ return it;
+ }
+
+ private:
+ const RollingBloomDBG<BF>* m_g;
+ vertex_descriptor m_u;
+ vertex_descriptor m_v;
+ unsigned m_i;
+}; // in_edge_iterator
+
+}; // graph_traits<RollingBloomDBG>
+
+} // namespace boost
+
+// Subgraph
+
+/** Return whether this vertex exists in the subgraph, i.e.
+ * whether all of its hash values are found in the Bloom filter.
+ * (Bloom filter semantics: may report false positives.) */
+template <typename Graph>
+static inline bool
+vertex_exists(const typename graph_traits<Graph>::vertex_descriptor& u, const Graph& g)
+{
+ size_t hashes[MAX_HASHES];
+ u.rollingHash().getHashes(hashes);
+ return g.m_bloom.contains(hashes);
+}
+
+/** AdjacencyGraph: return the range of successors of vertex u. */
+template <typename Graph>
+static inline
+std::pair<typename graph_traits<Graph>::adjacency_iterator,
+ typename graph_traits<Graph>::adjacency_iterator>
+adjacent_vertices(
+ const typename graph_traits<Graph>::vertex_descriptor& u, const Graph& g)
+{
+ typedef typename graph_traits<Graph>::adjacency_iterator adjacency_iterator;
+ return std::make_pair(adjacency_iterator(g, u), adjacency_iterator(g));
+}
+
+// IncidenceGraph
+/** Return the number of successor vertices of u (at most
+ * NUM_BASES; costs up to four Bloom filter queries). */
+template <typename Graph>
+static inline
+typename graph_traits<Graph>::degree_size_type
+out_degree(
+ const typename graph_traits<Graph>::vertex_descriptor& u,
+ const Graph& g)
+{
+ typedef typename graph_traits<Graph>::adjacency_iterator Ait;
+ std::pair<Ait, Ait> adj = adjacent_vertices(u, g);
+ return std::distance(adj.first, adj.second);
+}
+
+/** IncidenceGraph: return the range of outgoing edges of u.
+ * NOTE(review): the stray `typename` before std::pair is
+ * inconsistent with the sibling functions; some compilers
+ * accept it, but it should probably be removed. */
+template <typename Graph>
+static inline typename
+std::pair<typename graph_traits<Graph>::out_edge_iterator,
+ typename graph_traits<Graph>::out_edge_iterator>
+out_edges(
+ const typename graph_traits<Graph>::vertex_descriptor& u,
+ const Graph& g)
+{
+ typedef typename graph_traits<Graph>::out_edge_iterator Oit;
+ return std::make_pair(Oit(g, u), Oit(g));
+}
+
+// BidirectionalGraph
+/** Return the range of incoming edges of u. */
+template <typename Graph>
+static inline
+std::pair<typename graph_traits<Graph>::in_edge_iterator,
+ typename graph_traits<Graph>::in_edge_iterator>
+in_edges(
+ const typename graph_traits<Graph>::vertex_descriptor& u,
+ const Graph& g)
+{
+ typedef typename graph_traits<Graph>::in_edge_iterator Iit;
+ return std::make_pair(Iit(g, u), Iit(g));
+}
+
+/** Return the number of predecessor vertices of u. */
+template <typename Graph>
+static inline
+typename graph_traits<Graph>::degree_size_type
+in_degree(const typename graph_traits<Graph>::vertex_descriptor& u,
+ const Graph& g)
+{
+ //return out_degree(reverseComplement(u), g);
+ typedef typename graph_traits<Graph>::in_edge_iterator Iit;
+ std::pair<Iit, Iit> it = in_edges(u, g);
+ return std::distance(it.first, it.second);
+}
+
+// PropertyGraph
+
+/** Return the reverse complement of the specified k-mer.
+ * NOTE(review): u.first / u.second assume a pair-based
+ * vertex_descriptor from an earlier revision; with
+ * RollingBloomDBGVertex this will not compile when
+ * instantiated (templates only error on use). */
+template <typename Graph>
+static inline
+typename graph_traits<Graph>::vertex_descriptor
+get(vertex_complement_t, const Graph&,
+ typename graph_traits<Graph>::vertex_descriptor u)
+{
+ typedef typename graph_traits<Graph>::vertex_descriptor V;
+ return V(reverseComplement(u.first), u.second);
+}
+
+/** Return the name of the specified vertex.
+ * NOTE(review): u.first assumes a pair-based vertex_descriptor
+ * from an earlier revision; with RollingBloomDBGVertex this
+ * will not compile when instantiated. */
+template <typename Graph>
+static inline
+MaskedKmer get(vertex_name_t, const Graph&,
+ typename graph_traits<Graph>::vertex_descriptor u)
+{
+ return u.first;
+}
+
+/** Vertices are never marked removed in a Bloom filter graph;
+ * always returns false. */
+template <typename Graph>
+static inline
+bool
+get(vertex_removed_t, const Graph&,
+ typename graph_traits<Graph>::vertex_descriptor)
+{
+ return false;
+}
+
+/** Vertices carry no bundled properties.
+ * NOTE(review): the parameter type is edge_descriptor although
+ * the tag is vertex_bundle_t -- likely a copy-paste slip from
+ * the edge_bundle_t overload below. */
+template <typename Graph>
+static inline
+no_property
+get(vertex_bundle_t, const Graph&,
+ typename graph_traits<Graph>::edge_descriptor)
+{
+ return no_property();
+}
+
+/** Edges carry no bundled properties. */
+template <typename Graph>
+static inline
+no_property
+get(edge_bundle_t, const Graph&,
+ typename graph_traits<Graph>::edge_descriptor)
+{
+ return no_property();
+}
+
+#endif
diff --git a/BloomDBG/RollingHash.h b/BloomDBG/RollingHash.h
new file mode 100644
index 0000000..f7f4215
--- /dev/null
+++ b/BloomDBG/RollingHash.h
@@ -0,0 +1,289 @@
+#ifndef ABYSS_ROLLING_HASH_H
+#define ABYSS_ROLLING_HASH_H 1
+
+#include "config.h"
+#include "lib/rolling-hash/rolling.h"
+#include "BloomDBG/MaskedKmer.h"
+#include <string>
+#include <vector>
+#include <cassert>
+#include <boost/dynamic_bitset.hpp>
+#include <cstring>
+
+class RollingHash
+{
+private:
+
+ /**
+ * Determine the canonical hash value, given hash values for
+ * forward and reverse-complement of the same k-mer. The
+ * canonical value is the minimum of the two, making the hash
+ * strand-independent.
+ */
+ uint64_t canonicalHash(uint64_t hash, uint64_t rcHash) const
+ {
+ return (rcHash < hash) ? rcHash : hash;
+ }
+
+public:
+
+ /**
+ * Default constructor.
+ * NOTE(review): m_hash is left uninitialized until reset() or
+ * one of the roll*/setBase methods is called; getHashSeed() on
+ * a default-constructed object reads an indeterminate value.
+ */
+ RollingHash() : m_numHashes(0), m_k(0), m_hash1(0), m_rcHash1(0) {}
+
+ /**
+ * Constructor. Construct RollingHash object when initial k-mer
+ * is unknown.
+ * @param numHashes number of pseudo-independent hash values to compute
+ * for each k-mer
+ * @param k k-mer length
+ */
+ RollingHash(unsigned numHashes, unsigned k) : m_numHashes(numHashes),
+ m_k(k), m_hash1(0), m_rcHash1(0) {}
+
+ /**
+ * Constructor. Construct RollingHash object while specifying
+ * initial k-mer to be hashed.
+ * @param kmer initial k-mer for initializing hash value(s)
+ * @param numHashes number of pseudo-independent hash values to compute
+ * for each k-mer
+ * @param k k-mer length
+ */
+ RollingHash(const std::string& kmer, unsigned numHashes, unsigned k)
+ : m_numHashes(numHashes), m_k(k), m_hash1(0), m_rcHash1(0)
+ {
+ /* init rolling hash state */
+ reset(kmer);
+ }
+
+ /**
+ * Initialize hash state from sequence. Dispatches to the
+ * masked variant when a global spaced seed is active.
+ * @param kmer k-mer used to initialize hash state
+ */
+ void reset(const std::string& kmer)
+ {
+ if (!MaskedKmer::mask().empty())
+ resetMasked(kmer.c_str());
+ else
+ resetUnmasked(kmer);
+ }
+
+ /**
+ * Initialize hash values from current k-mer. When computing the hash
+ * value, mask out "don't care" positions as per the active
+ * k-mer mask.
+ */
+ void resetMasked(const char* kmer)
+ {
+ const std::string& spacedSeed = MaskedKmer::mask();
+ assert(spacedSeed.length() == m_k);
+
+ /* compute first hash function for k-mer */
+ uint64_t hash1 = getFhval(m_hash1, spacedSeed.c_str(), kmer, m_k);
+
+ /* compute first hash function for reverse complement of k-mer */
+ uint64_t rcHash1 = getRhval(m_rcHash1, spacedSeed.c_str(), kmer, m_k);
+
+ m_hash = canonicalHash(hash1, rcHash1);
+ }
+
+ /**
+ * Initialize hash values from sequence.
+ * @param kmer k-mer used to initialize hash state
+ */
+ void resetUnmasked(const std::string& kmer)
+ {
+ /* compute first hash function for k-mer */
+ m_hash1 = getFhval(kmer.c_str(), m_k);
+
+ /* compute first hash function for reverse complement
+ * of k-mer */
+ m_rcHash1 = getRhval(kmer.c_str(), m_k);
+
+ m_hash = canonicalHash(m_hash1, m_rcHash1);
+ }
+
+ /**
+ * Compute hash values for next k-mer to the right and
+ * update internal state.
+ * @param kmer current k-mer
+ * @param charIn new base rolled in on the right
+ */
+ void rollRight(const char* kmer, char charIn)
+ {
+ if (!MaskedKmer::mask().empty())
+ rollRightMasked(kmer, charIn);
+ else
+ rollRightUnmasked(kmer, charIn);
+ }
+
+ /**
+ * Compute hash values for next k-mer to the right and
+ * update internal state. When computing the new hash, mask
+ * out "don't care" positions according to the active
+ * k-mer mask.
+ * @param kmer current k-mer
+ * @param charIn new base rolled in on the right
+ */
+ void rollRightMasked(const char* kmer, char charIn)
+ {
+ const std::string& spacedSeed = MaskedKmer::mask();
+ m_hash = rollHashesRight(m_hash1, m_rcHash1, spacedSeed.c_str(),
+ kmer, charIn, m_k);
+ }
+
+ /**
+ * Compute hash values for next k-mer to the right and
+ * update internal state.
+ * @param kmer current k-mer
+ * @param charIn new base rolled in on the right
+ */
+ void rollRightUnmasked(const char* kmer, char charIn)
+ {
+ /* update first hash function */
+ rollHashesRight(m_hash1, m_rcHash1, kmer[0], charIn, m_k);
+ m_hash = canonicalHash(m_hash1, m_rcHash1);
+ }
+
+ /**
+ * Compute hash values for next k-mer to the left and
+ * update internal state.
+ * @param charIn new base rolled in on the left
+ * @param kmer current k-mer
+ */
+ void rollLeft(char charIn, const char* kmer)
+ {
+ if (!MaskedKmer::mask().empty())
+ rollLeftMasked(charIn, kmer);
+ else
+ rollLeftUnmasked(charIn, kmer);
+ }
+
+ /**
+ * Compute hash values for next k-mer to the left and
+ * update internal state. When computing the new hash, mask
+ * out "don't care" positions according to the active
+ * k-mer mask.
+ * @param charIn new base rolled in on the left
+ * @param kmer current k-mer
+ */
+ void rollLeftMasked(char charIn, const char* kmer)
+ {
+ const std::string& spacedSeed = MaskedKmer::mask();
+ m_hash = rollHashesLeft(m_hash1, m_rcHash1, spacedSeed.c_str(),
+ kmer, charIn, m_k);
+ }
+
+ /**
+ * Compute hash values for next k-mer to the left and
+ * update internal state.
+ * @param charIn new base rolled in on the left
+ * @param kmer current k-mer
+ */
+ void rollLeftUnmasked(char charIn, const char* kmer)
+ {
+ /* update first hash function */
+ rollHashesLeft(m_hash1, m_rcHash1, charIn, kmer[m_k-1], m_k);
+ m_hash = canonicalHash(m_hash1, m_rcHash1);
+ }
+
+ /**
+ * Get the seed hash value for the current k-mer. The seed hash
+ * value is used to calculate multiple pseudo-independent
+ * hash functions.
+ */
+ size_t getHashSeed() const
+ {
+ return (size_t)m_hash;
+ }
+
+ /**
+ * Get hash values for current k-mer.
+ *
+ * @param hashes array for returned hash values; must have room
+ * for m_numHashes (<= MAX_HASHES) entries
+ */
+ void getHashes(size_t hashes[]) const
+ {
+ uint64_t tmpHashes[MAX_HASHES];
+ multiHash(tmpHashes, m_hash, m_numHashes, m_k);
+ for (unsigned i = 0; i < m_numHashes; ++i) {
+ hashes[i] = (size_t)tmpHashes[i];
+ }
+ }
+
+ /** Equality operator.
+ * Note: m_numHashes is deliberately not compared; objects with
+ * the same k and seed but different hash counts compare equal. */
+ bool operator==(const RollingHash& o) const
+ {
+ /**
+ * Note: If hash seeds are equal, then the values
+ * for all hash functions will also be equal, since
+ * the hash values are calculated from the
+ * seed in a deterministic manner. In practice seed
+ * collision is very unlikely, though!
+ */
+ return m_k == o.m_k && getHashSeed() == o.getHashSeed();
+ }
+
+ /** Inequality operator */
+ bool operator!=(const RollingHash& o) const
+ {
+ return !(*this == o);
+ }
+
+ /**
+ * Set the base at a given position in the k-mer and update the hash
+ * value accordingly.
+ * @param kmer pointer to the k-mer char array
+ * @param pos position of the base to be changed
+ * @param base new value for the base
+ */
+ void setBase(char* kmer, unsigned pos, char base)
+ {
+ if (!MaskedKmer::mask().empty())
+ setBaseMasked(kmer, pos, base);
+ else
+ setBaseUnmasked(kmer, pos, base);
+ }
+
+ /**
+ * Set the base at a given position in the k-mer and update the hash
+ * value accordingly.
+ * NOTE(review): the global ::setBase appears to also write
+ * `base` into kmer[pos] -- confirm against
+ * lib/rolling-hash/rolling.h.
+ * @param kmer pointer to the k-mer char array
+ * @param pos position of the base to be changed
+ * @param base new value for the base
+ */
+ void setBaseMasked(char* kmer, unsigned pos, char base)
+ {
+ const std::string& spacedSeed = MaskedKmer::mask();
+ assert(spacedSeed.length() == m_k);
+ m_hash = ::setBase(m_hash1, m_rcHash1, spacedSeed.c_str(), kmer,
+ pos, base, m_k);
+ }
+
+ /**
+ * Set the base at a given position in the k-mer and update the hash
+ * value accordingly.
+ * @param kmer pointer to the k-mer char array
+ * @param pos position of the base to be changed
+ * @param base new value for the base
+ */
+ void setBaseUnmasked(char* kmer, unsigned pos, char base)
+ {
+ m_hash = ::setBase(m_hash1, m_rcHash1, kmer, pos, base, m_k);
+ }
+
+private:
+
+ /** number of hash functions */
+ unsigned m_numHashes;
+ /** k-mer length */
+ unsigned m_k;
+ /** value of first hash function for current k-mer */
+ uint64_t m_hash1;
+ /** value of first hash function for current k-mer, after
+ * reverse-complementing */
+ uint64_t m_rcHash1;
+ /** current canonical hash value */
+ uint64_t m_hash;
+};
+
+#endif
diff --git a/BloomDBG/RollingHashIterator.h b/BloomDBG/RollingHashIterator.h
new file mode 100644
index 0000000..9a03163
--- /dev/null
+++ b/BloomDBG/RollingHashIterator.h
@@ -0,0 +1,234 @@
+#ifndef ROLLING_HASH_ITERATOR_H
+#define ROLLING_HASH_ITERATOR_H 1
+
+#include <cstring>
+#include <vector>
+#include <cassert>
+#include <limits>
+#include <string>
+#include <algorithm>
+#include <cctype>
+#include <deque>
+#include "BloomDBG/RollingHash.h"
+
+/**
+ * Permitted characters in k-mers. All k-mers containing
+ * other characters will be skipped.
+ */
+#define ACGT_CHARS "ACGT"
+
+/**
+ * Iterate over hash values for k-mers in a
+ * given DNA sequence.
+ *
+ * This implementation uses a rolling hash
+ * function to efficiently calculate
+ * hash values for successive k-mers.
+ */
+class RollingHashIterator
+{
+private:
+
+ /**
+ * Advance iterator right to the next valid k-mer.
+ */
+ void next()
+ {
+ if (m_seq.length() < m_k) {
+ m_pos = std::numeric_limits<std::size_t>::max();
+ return;
+ }
+
+ const std::string& spacedSeed = MaskedKmer::mask();
+
+ while(m_pos < m_seq.length() - m_k + 1) {
+
+ /* skip k-mers with non-ACGT chars in unmasked positions */
+
+ while (!m_badCharPos.empty() && m_badCharPos.front() < m_pos)
+ m_badCharPos.pop_front();
+
+ if (!m_badCharPos.empty() && m_badCharPos.front() < m_pos + m_k) {
+ /* empty spaced seed is equivalent to a string of '1's */
+ if (spacedSeed.empty()) {
+ m_rollNextHash = false;
+ m_pos = m_badCharPos.front() + 1;
+ continue;
+ }
+ bool goodKmer = true;
+ assert(spacedSeed.length() == m_k);
+ for (size_t i = 0; i < m_badCharPos.size() &&
+ m_badCharPos.at(i) < m_pos + m_k; ++i) {
+ size_t kmerPos = m_badCharPos.at(i) - m_pos;
+ if (spacedSeed.at(kmerPos) == '1') {
+ goodKmer = false;
+ break;
+ }
+ }
+ if (!goodKmer) {
+ m_rollNextHash = false;
+ ++m_pos;
+ continue;
+ }
+ }
+
+ /* we are positioned at the next valid k-mer */
+
+ if (!m_rollNextHash) {
+ /* we don't have hash values for the
+ * preceding k-mer, so we must compute
+ * the hash values from scratch */
+ m_rollingHash.reset(m_seq.substr(m_pos, m_k));
+ m_rollNextHash = true;
+ } else {
+ /* compute new hash values based on
+ * hash values of preceding k-mer */
+ assert(m_pos > 0);
+ m_rollingHash.rollRight(m_seq.c_str() + m_pos - 1,
+ m_seq[m_pos + m_k - 1]);
+ }
+ m_rollingHash.getHashes(m_hashes);
+ return;
+
+ }
+
+ /* there are no more valid k-mers */
+ m_pos = std::numeric_limits<std::size_t>::max();
+ }
+
+public:
+
+ /**
+ * Default constructor. Creates an iterator pointing to
+ * the end of the iterator range.
+ */
+ RollingHashIterator() : m_numHashes(0), m_k(0),
+ m_rollingHash(m_numHashes, m_k),
+ m_pos(std::numeric_limits<std::size_t>::max()) {}
+
+ /**
+ * Constructor.
+ * @param seq DNA sequence to be hashed
+ * @param k k-mer size
+ * for each k-mer
+ */
+ RollingHashIterator(const std::string& seq, unsigned numHashes, unsigned k)
+ : m_seq(seq), m_numHashes(numHashes), m_k(k),
+ m_rollingHash(m_numHashes, m_k), m_rollNextHash(false), m_pos(0)
+ {
+ init();
+ }
+
+ /**
+ * Initialize internal state of iterator.
+ */
+ void init()
+ {
+ /* note: empty spaced seed indicates no masking (string of '1's) */
+ assert(MaskedKmer::mask().empty() || MaskedKmer::mask().length() == m_k);
+
+ /* convert sequence to upper case */
+ std::transform(m_seq.begin(), m_seq.end(), m_seq.begin(), ::toupper);
+
+ /* record positions of non-ACGT chars */
+ size_t i = m_seq.find_first_not_of(ACGT_CHARS);
+ while (i != std::string::npos) {
+ m_badCharPos.push_back(i);
+ i = m_seq.find_first_not_of(ACGT_CHARS, i + 1);
+ }
+
+ /* find first "good" k-mer in sequence */
+ next();
+ }
+
+ /** get reference to hash values for current k-mer */
+ const size_t* operator*() const
+ {
+ assert(m_pos + m_k <= m_seq.length());
+ return m_hashes;
+ }
+
+ /** test equality with another iterator */
+ bool operator==(const RollingHashIterator& it) const
+ {
+ return m_pos == it.m_pos;
+ }
+
+ /** test inequality with another iterator */
+ bool operator!=(const RollingHashIterator& it) const
+ {
+ return !(*this == it);
+ }
+
+ /** pre-increment operator */
+ RollingHashIterator& operator++()
+ {
+ ++m_pos;
+ next();
+ return *this;
+ }
+
+ /** post-increment operator */
+ RollingHashIterator operator++(int)
+ {
+ RollingHashIterator it = *this;
+ ++*this;
+ return it;
+ }
+
+ /** iterator pointing to one past last element */
+ static const RollingHashIterator end()
+ {
+ return RollingHashIterator();
+ }
+
+ /** return position of current k-mer */
+ unsigned pos() const
+ {
+ return m_pos;
+ }
+
+ /** return k-mer at current position */
+ std::string kmer(bool mask=false) const
+ {
+ std::string kmer(m_seq, m_pos, m_k);
+ const std::string& spacedSeed = MaskedKmer::mask();
+ if (mask && !spacedSeed.empty()) {
+ assert(spacedSeed.length() == m_k);
+ for(size_t i = 0; i < spacedSeed.length(); ++i) {
+ if (spacedSeed.at(i) == '0')
+ kmer.at(i) = 'N';
+ }
+ }
+ return kmer;
+ }
+
+ /** return RollingHash object for current state */
+ RollingHash rollingHash()
+ {
+ return m_rollingHash;
+ }
+
+private:
+
+ /** DNA sequence being hashed */
+ std::string m_seq;
+ /** number of hash functions */
+ unsigned m_numHashes;
+ /** hash values */
+ size_t m_hashes[MAX_HASHES];
+ /** k-mer size */
+ unsigned m_k;
+ /** internal state for rolling hash */
+ RollingHash m_rollingHash;
+ /** true whenever we can "roll" the hash values for
+ * the current k-mer to compute the hash values for the
+ * next k-mer */
+ bool m_rollNextHash;
+ /** position of current k-mer */
+ size_t m_pos;
+ /** positions of non-ACGT chars in sequence */
+ std::deque<size_t> m_badCharPos;
+};
+
+#endif
diff --git a/BloomDBG/SpacedSeed.h b/BloomDBG/SpacedSeed.h
new file mode 100644
index 0000000..80848e6
--- /dev/null
+++ b/BloomDBG/SpacedSeed.h
@@ -0,0 +1,79 @@
+#ifndef SPACED_SEED_H
+#define SPACED_SEED_H
+
+#include <string>
+#include <cassert>
+#include <algorithm>
+
namespace SpacedSeed {

	/**
	 * Generate a spaced seed pattern (bitmask) for two equal-size
	 * k-mers separated by a gap: K '1's, then (k - 2K) '0's, then
	 * K '1's.
	 *
	 * @param k width of spaced seed pattern
	 * @param K size of the individual k-mers. K must be <= k/2.
	 * @return spaced seed pattern for gapped k-mer pair
	 */
	static inline std::string kmerPair(unsigned k, unsigned K)
	{
		assert(K <= k/2);
		std::string pattern;
		pattern.reserve(k);
		pattern.append(K, '1');
		pattern.append(k - 2 * K, '0');
		pattern.append(K, '1');
		return pattern;
	}

	/**
	 * Generate a Quadratic Residue (QR) seed. The background theory
	 * for QR seeds is described in:
	 *
	 * Egidi, Lavinia, and Giovanni Manzini. "Multiple seeds
	 * sensitivity using a single seed with threshold." Journal of
	 * bioinformatics and computational biology 13.04 (2015): 1550011.
	 *
	 * @param len desired length of QR seed. `len` must
	 * be prime and >= 11.
	 * @return a QR seed represented as a std::string
	 * of 0's and 1's
	 */
	static inline std::string qrSeed(unsigned len)
	{
		assert(len >= 11);
		std::string pattern(len, '1');
		/* every quadratic residue mod `len` becomes a masked ('0')
		 * position; iterating over j and marking j*j % len covers
		 * exactly the residues */
		for (size_t j = 1; j < len; ++j)
			pattern.at(j * j % len) = '0';
		return pattern;
	}

	/**
	 * Generate a spaced seed pattern (bitmask) for two equal-length
	 * Quadratic Residue (QR) seeds separated by a gap. The first
	 * QR seed is in the usual orientation and the second QR is reversed,
	 * so that the overall pattern is symmetric.
	 *
	 * @param k width of the spaced seed pattern
	 * @param qrSeedLen width of the individual QR seeds.
	 * qrSeedLen must be a prime number >= 11 and must also be <= k/2.
	 * @return spaced seed pattern for gapped QR seed pair
	 */
	static inline std::string qrSeedPair(unsigned k, unsigned qrSeedLen)
	{
		assert(qrSeedLen <= k/2);
		const std::string fwd = SpacedSeed::qrSeed(qrSeedLen);
		const std::string rev(fwd.rbegin(), fwd.rend());
		/* forward seed, all-'0' gap, mirrored seed */
		return fwd + std::string(k - 2 * qrSeedLen, '0') + rev;
	}

}
+
+#endif
diff --git a/BloomDBG/bloom-dbg.cc b/BloomDBG/bloom-dbg.cc
new file mode 100644
index 0000000..2c6a05b
--- /dev/null
+++ b/BloomDBG/bloom-dbg.cc
@@ -0,0 +1,345 @@
+#include "config.h"
+
+#include "BloomDBG/bloom-dbg.h"
+#include "BloomDBG/HashAgnosticCascadingBloom.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "BloomDBG/SpacedSeed.h"
+#include "Common/StringUtil.h"
+#include "Common/Options.h"
+#include "DataLayer/Options.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <getopt.h>
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+#include <iomanip>
+#include <cstring>
+#include <limits>
+#include <string>
+
+#if _OPENMP
+# include <omp.h>
+#endif
+
+using namespace std;
+
+#define PROGRAM "abyss-bloom-dbg"
+
+static const char VERSION_MESSAGE[] =
+ PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
+ "Written by Ben Vandervalk, Shaun Jackman, Hamid Mohamadi,\n"
+ "Justin Chu, and Anthony Raymond.\n"
+ "\n"
+ "Copyright 2015 Canada's Michael Smith Genome Science Centre\n";
+
+static const char USAGE_MESSAGE[] =
+"Usage: " PROGRAM " -b <bloom_size> -H <bloom_hashes> -k <kmer_size> \\\n"
+" -G <genome_size> [options] <FASTQ> [FASTQ]... > assembly.fasta\n"
+"\n"
+"Perform a de Bruijn graph assembly of the given FASTQ files.\n"
+"\n"
+"Basic Options:\n"
+"\n"
+" -b --bloom-size=N Bloom filter memory size with unit suffix\n"
+" 'k', 'M', or 'G' [required]\n"
+" --chastity discard unchaste reads [default]\n"
+" --no-chastity do not discard unchaste reads\n"
+" -g --graph=FILE write de Bruijn graph to FILE (GraphViz)\n"
+" --help display this help and exit\n"
+" -H --num-hashes=N number of Bloom filter hash functions [1]\n"
+" -j, --threads=N use N parallel threads [1]\n"
+" --trim-masked trim masked bases from the ends of reads\n"
+" --no-trim-masked do not trim masked bases from the ends\n"
+" of reads [default]\n"
+" -k, --kmer=N the size of a k-mer [required]\n"
+" --kc=N use a cascading Bloom filter with N levels,\n"
+" instead of a counting Bloom filter [2]\n"
+" -o, --out=FILE write the contigs to FILE [STDOUT]\n"
+" -q, --trim-quality=N trim bases from the ends of reads whose\n"
+" quality is less than the threshold\n"
+" -Q, --mask-quality=N mask all low quality bases as `N'\n"
+" --standard-quality zero quality is `!' (33), typically\n"
+" for FASTQ and SAM files [default]\n"
+" --illumina-quality zero quality is `@' (64), typically\n"
+" for qseq and export files\n"
+" -t, --trim-length max branch length to trim, in k-mers [k]\n"
+" -v, --verbose display verbose output\n"
+" --version output version information and exit\n"
+"\n"
+"Spaced Seed Options:\n"
+"\n"
+" -K, --single-kmer=N use a spaced seed that consists of two k-mers\n"
+" separated by a gap. K must be chosen such that\n"
+" K <= k/2\n"
+" --qr-seed=N use a spaced seed than consists of two mirrored\n"
+" QR seeds separated by a gap. The following must\n"
+" hold: (a) N must be prime, (b) N >= 11,\n"
+" (c) N <= k/2\n"
+" -s, --spaced-seed=STR bitmask indicating k-mer positions to be\n"
+" ignored during hashing. The pattern must be\n"
+" symmetric\n"
+"\n"
+"Debugging Options:\n"
+"\n"
+" -C, --cov-track=FILE WIG track with 0/1 indicating k-mers with\n"
+" coverage above the -c threshold. A reference\n"
+" must also be specified with -R.\n"
+" -T, --trace-file=FILE write debugging info about extension of\n"
+" each read to FILE\n"
+" -R, --ref=FILE specify a reference genome. FILE may be\n"
+" FASTA, FASTQ, SAM, or BAM and may be gzipped."
+"\n"
+"Example:\n"
+"\n"
+" Assemble a genome using a k-mer size of 50bp. Allocate a 1GB\n"
+" Bloom filter with 2 hash functions and require that a k-mer\n"
+" occurs 3 times or more to be included in the assembly. (The k-mer\n"
+" count threshold filters out k-mers containing sequencing errors.)\n"
+"\n"
+" $ " PROGRAM " -k50 -b1G -H2 --kc=3 reads1.fq.gz reads2.fq.gz > assembly.fa\n"
+"\n"
+"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
+
+/** Assembly params (stores command-line options) */
+BloomDBG::AssemblyParams params;
+
+static const char shortopts[] = "b:C:g:H:j:k:K:o:q:Q:R:s:t:T:v";
+
+enum { OPT_HELP = 1, OPT_VERSION, QR_SEED, MIN_KMER_COV };
+
+static const struct option longopts[] = {
+ { "bloom-size", required_argument, NULL, 'b' },
+ { "min-coverage", required_argument, NULL, 'c' },
+ { "cov-track", required_argument, NULL, 'C' },
+ { "chastity", no_argument, &opt::chastityFilter, 1 },
+ { "no-chastity", no_argument, &opt::chastityFilter, 0 },
+ { "graph", required_argument, NULL, 'g' },
+ { "num-hashes", required_argument, NULL, 'H' },
+ { "help", no_argument, NULL, OPT_HELP },
+ { "threads", required_argument, NULL, 'j' },
+ { "trim-masked", no_argument, &opt::trimMasked, 1 },
+ { "no-trim-masked", no_argument, &opt::trimMasked, 0 },
+ { "kmer", required_argument, NULL, 'k' },
+ { "kc", required_argument, NULL, MIN_KMER_COV },
+ { "single-kmer", required_argument, NULL, 'K' },
+ { "out", required_argument, NULL, 'o' },
+ { "trim-quality", required_argument, NULL, 'q' },
+ { "mask-quality", required_argument, NULL, 'Q' },
+ { "standard-quality", no_argument, &opt::qualityOffset, 33 },
+ { "illumina-quality", no_argument, &opt::qualityOffset, 64 },
+ { "qr-seed", required_argument, NULL, QR_SEED },
+ { "ref", required_argument, NULL, 'R' },
+ { "spaced-seed", no_argument, NULL, 's' },
+ { "trim-length", no_argument, NULL, 't' },
+ { "trace-file", no_argument, NULL, 'T'},
+ { "verbose", no_argument, NULL, 'v' },
+ { "version", no_argument, NULL, OPT_VERSION },
+ { NULL, 0, NULL, 0 }
+};
+
+/**
+ * Create a de novo genome assembly using a Bloom filter de
+ * Bruijn graph.
+ */
+int main(int argc, char** argv)
+{
+ bool die = false;
+
+ for (int c; (c = getopt_long(argc, argv,
+ shortopts, longopts, NULL)) != -1;) {
+ istringstream arg(optarg != NULL ? optarg : "");
+ switch (c) {
+ case '?':
+ die = true; break;
+ case 'b':
+ params.bloomSize = SIToBytes(arg); break;
+ case 'C':
+ arg >> params.covTrackPath; break;
+ case 'g':
+ arg >> params.graphPath; break;
+ case 'H':
+ arg >> params.numHashes; break;
+ case 'j':
+ arg >> params.threads; break;
+ case 'k':
+ arg >> params.k; break;
+ case 'K':
+ params.resetSpacedSeedParams();
+ arg >> params.K;
+ break;
+ case 'o':
+ arg >> params.outputPath; break;
+ case 'q':
+ arg >> opt::qualityThreshold; break;
+ case 'R':
+ arg >> params.refPath; break;
+ case 's':
+ params.resetSpacedSeedParams();
+ arg >> params.spacedSeed;
+ break;
+ case 't':
+ arg >> params.trim; break;
+ case 'T':
+ arg >> params.tracePath; break;
+ case 'Q':
+ arg >> opt::internalQThreshold; break;
+ case 'v':
+ ++params.verbose; break;
+ case OPT_HELP:
+ cout << USAGE_MESSAGE;
+ exit(EXIT_SUCCESS);
+ case MIN_KMER_COV:
+ arg >> params.minCov; break;
+ case OPT_VERSION:
+ cout << VERSION_MESSAGE;
+ exit(EXIT_SUCCESS);
+ case QR_SEED:
+ params.resetSpacedSeedParams();
+ arg >> params.qrSeedLen;
+ break;
+ }
+ if (optarg != NULL && (!arg.eof() || arg.fail())) {
+ cerr << PROGRAM ": invalid option: `-"
+ << (char)c << optarg << "'\n";
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (params.bloomSize == 0) {
+ cerr << PROGRAM ": missing mandatory option `-b'\n";
+ die = true;
+ }
+
+ if (params.k == 0) {
+ cerr << PROGRAM ": missing mandatory option `-k'\n";
+ die = true;
+ }
+
+ if (params.k > 0 && params.K > 0 && params.K > params.k/2) {
+ cerr << PROGRAM ": value of `-K' must be <= k/2\n";
+ die = true;
+ }
+
+ if (params.numHashes > MAX_HASHES) {
+ cerr << PROGRAM ": number of hash functions (`-H`) must "
+ "be <= " << MAX_HASHES << " (set by `configure` option "
+ "--enable-max-hashes=N)\n";
+ die = true;
+ }
+
+ if (params.k > 0 && params.qrSeedLen > 0 &&
+ (params.qrSeedLen < 11 || params.qrSeedLen > params.k/2)) {
+ cerr << PROGRAM ": value of `--qr-seed' must be >= 11 and <= k/2\n";
+ die = true;
+ }
+
+ if (!params.covTrackPath.empty() && params.refPath.empty()) {
+ cerr << PROGRAM ": you must specify a reference with `-r' "
+ "when using `-C'\n";
+ die = true;
+ }
+
+ if (params.trim == std::numeric_limits<unsigned>::max()) {
+ params.trim = params.k;
+ }
+
+ if (argc - optind < 1) {
+ cerr << PROGRAM ": missing input file arguments\n";
+ die = true;
+ }
+
+ if (die) {
+ cerr << "Try `" << PROGRAM
+ << " --help' for more information.\n";
+ exit(EXIT_FAILURE);
+ }
+
+ assert(params.initialized());
+
+#if _OPENMP
+ if (params.threads > 0)
+ omp_set_num_threads(params.threads);
+#endif
+
+ /* set global variable for k-mer length */
+ MaskedKmer::setLength(params.k);
+
+ /* set global variable for spaced seed */
+ if (params.K > 0)
+ MaskedKmer::setMask(SpacedSeed::kmerPair(params.k, params.K));
+ else if (params.qrSeedLen > 0)
+ MaskedKmer::setMask(SpacedSeed::qrSeedPair(params.k, params.qrSeedLen));
+ else
+ MaskedKmer::setMask(params.spacedSeed);
+
+ if (params.verbose && !MaskedKmer::mask().empty())
+ cerr << "Using spaced seed " << MaskedKmer::mask() << endl;
+
+ /* print contigs to STDOUT unless -o option was set */
+ ofstream outputFile;
+ if (!params.outputPath.empty()) {
+ outputFile.open(params.outputPath.c_str());
+ assert_good(outputFile, params.outputPath);
+ }
+ ostream& out = params.outputPath.empty() ? cout : outputFile;
+
+ /* BloomFilter class requires size to be a multiple of 64 */
+ const size_t bitsPerByte = 8;
+ /*
+ * Note: it is (params.minCov + 1) here because we use an additional
+ * Bloom filter in BloomDBG::assemble() to track the set of
+ * assembled k-mers.
+ */
+ size_t bloomLevelSize = BloomDBG::roundUpToMultiple(
+ params.bloomSize * bitsPerByte / (params.minCov + 1), (size_t)64);
+
+ /* use cascading Bloom filter to remove error k-mers */
+ HashAgnosticCascadingBloom cascadingBloom(
+ bloomLevelSize, params.numHashes, params.minCov, params.k);
+
+ /* load reads into Bloom filter */
+ for (int i = optind; i < argc; ++i) {
+ /*
+ * Debugging feature: If there is a ':'
+ * separating the list of input read files into
+ * two parts, use the first set of files
+ * to load the Bloom filter and the second
+ * set of files for the assembly (read extension).
+ */
+ if (strcmp(argv[i],":") == 0) {
+ optind = i + 1;
+ break;
+ }
+ BloomDBG::loadFile(cascadingBloom, argv[i], params.verbose);
+ }
+ if (params.verbose)
+ cerr << "Bloom filter FPR: " << setprecision(3)
+ << cascadingBloom.FPR() * 100 << "%" << endl;
+
+ if (!params.covTrackPath.empty()) {
+ assert(!params.refPath.empty());
+ BloomDBG::writeCovTrack(cascadingBloom, params);
+ }
+
+ /* second pass through FASTA files for assembling */
+ BloomDBG::assemble(argc - optind, argv + optind,
+ cascadingBloom, params, out);
+
+ /* generate de Bruijn graph in GraphViz format (optional) */
+ if (!params.graphPath.empty()) {
+ ofstream graphOut(params.graphPath.c_str());
+ assert_good(graphOut, params.graphPath);
+ BloomDBG::outputGraph(argc - optind, argv + optind,
+ cascadingBloom, params, graphOut);
+ assert_good(graphOut, params.graphPath);
+ graphOut.close();
+ assert_good(graphOut, params.graphPath);
+ }
+
+ /* cleanup */
+ if (!params.outputPath.empty())
+ outputFile.close();
+
+ return EXIT_SUCCESS;
+}
diff --git a/BloomDBG/bloom-dbg.h b/BloomDBG/bloom-dbg.h
new file mode 100644
index 0000000..819dd0a
--- /dev/null
+++ b/BloomDBG/bloom-dbg.h
@@ -0,0 +1,1276 @@
+#ifndef BLOOM_DBG_H
+#define BLOOM_DBG_H 1
+
+#include "BloomDBG/RollingHashIterator.h"
+#include "Common/Uncompress.h"
+#include "Common/IOUtil.h"
+#include "DataLayer/FastaReader.h"
+#include "Graph/Path.h"
+#include "Graph/ExtendPath.h"
+#include "Graph/BreadthFirstSearch.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "BloomDBG/RollingHash.h"
+#include "BloomDBG/RollingBloomDBG.h"
+#include "Common/UnorderedSet.h"
+#include "DataLayer/FastaConcat.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+#include <limits>
+#include <string>
+
+#if _OPENMP
+# include <omp.h>
+#endif
+
+namespace BloomDBG {
+
+ /**
+ * Type for a vertex in the de Bruijn graph.
+ */
+ typedef RollingBloomDBGVertex Vertex;
+
+ /**
+ * Parameters controlling assembly.
+ */
+ struct AssemblyParams
+ {
+ /** Bloom filter size (in bits) */
+ size_t bloomSize;
+
+ /** minimum k-mer coverage threshold */
+ unsigned minCov;
+
+ /** WIG track containing 0/1 for sufficient k-mer cov */
+ std::string covTrackPath;
+
+ /** path for output GraphViz file */
+ string graphPath;
+
+ /** num Bloom filter hash functions */
+ unsigned numHashes;
+
+ /** the number of parallel threads. */
+ unsigned threads;
+
+ /** the size of a k-mer. */
+ unsigned k;
+
+ /** the size of a single k-mer in a k-mer pair */
+ unsigned K;
+
+ /** reference genome */
+ std::string refPath;
+
+ /** Quadratic Residue (QR) seed length */
+ unsigned qrSeedLen;
+
+ /** spaced seed */
+ string spacedSeed;
+
+ /** maximum length of branches to trim */
+ unsigned trim;
+
+ /** verbose level for progress messages */
+ int verbose;
+
+ /** output contigs path (empty string indicates STDOUT) */
+ std::string outputPath;
+
+ /** output path for trace file (-T) option */
+ std::string tracePath;
+
+ /** Default constructor */
+ AssemblyParams() : bloomSize(0), minCov(2), graphPath(),
+ numHashes(1), threads(1), k(0), K(0), qrSeedLen(0),
+ spacedSeed(), trim(std::numeric_limits<unsigned>::max()),
+ verbose(0), outputPath(), tracePath() {}
+
+ /** Return true if all required members are initialized */
+ bool initialized() const {
+ return bloomSize > 0 && k > 0 &&
+ trim != std::numeric_limits<unsigned>::max();
+ }
+
+ /** Reset all spaced seed params to their default values */
+ void resetSpacedSeedParams() {
+ spacedSeed.clear();
+ K = 0;
+ qrSeedLen = 0;
+ }
+ };
+
+ /**
+ * Round up `num` to the nearest multiple of `base`.
+ */
+ template <typename T>
+ inline static T roundUpToMultiple(T num, T base)
+ {
+ if (base == 0)
+ return num;
+ T remainder = num % base;
+ if (remainder == 0)
+ return num;
+ return num + base - remainder;
+ }
+
+	/**
+	 * Load DNA sequence into Bloom filter using rolling hash.
+	 *
+	 * K-mers containing non-ACGT characters (in unmasked positions,
+	 * when a spaced seed is active) are skipped by the iterator and
+	 * therefore not inserted.
+	 *
+	 * @param bloom target Bloom filter
+	 * @param seq DNA sequence
+	 */
+	template <typename BF>
+	inline static void loadSeq(BF& bloom, const std::string& seq)
+	{
+		/* k-mer size and hash count come from the target Bloom filter */
+		const unsigned k = bloom.getKmerSize();
+		const unsigned numHashes = bloom.getHashNum();
+		for (RollingHashIterator it(seq, numHashes, k);
+			it != RollingHashIterator::end(); ++it) {
+			bloom.insert(*it);
+		}
+	}
+
+	/**
+	 * Load sequence contents of FASTA file into Bloom filter using
+	 * rolling hash.
+	 *
+	 * Reads are pulled from the shared FastaReader in batches inside
+	 * an `omp critical` section; each thread then hashes and inserts
+	 * its own batch concurrently, so the reader itself never runs in
+	 * parallel.
+	 *
+	 * @param bloom target Bloom filter
+	 * @param path path to FASTA file
+	 * @param verbose if true, print progress messages to STDERR
+	 */
+	template <typename BF>
+	inline static void loadFile(BF& bloom, const std::string& path,
+		bool verbose = false)
+	{
+		/* batch size limit, in total bases per batch */
+		const size_t BUFFER_SIZE = 1000000;
+		/* progress message interval, in reads */
+		const size_t LOAD_PROGRESS_STEP = 10000;
+
+		assert(!path.empty());
+		if (verbose)
+			std::cerr << "Reading `" << path << "'..." << std::endl;
+
+		FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
+		uint64_t readCount = 0;
+#pragma omp parallel
+		/* NOTE(review): this constructs BUFFER_SIZE empty strings and
+		 * immediately clears them; a `reserve`-style pre-allocation
+		 * was probably intended -- harmless but wasteful */
+		for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
+			buffer.clear();
+			size_t bufferSize = 0;
+			bool good = true;
+			/* read one batch of sequences; single reader at a time */
+#pragma omp critical(in)
+			for (; good && bufferSize < BUFFER_SIZE;) {
+				std::string seq;
+				good = in >> seq;
+				if (good) {
+					buffer.push_back(seq);
+					bufferSize += seq.length();
+				}
+			}
+			if (buffer.size() == 0)
+				break;
+			for (size_t j = 0; j < buffer.size(); j++) {
+				loadSeq(bloom, buffer.at(j));
+				if (verbose)
+#pragma omp critical(cerr)
+				{
+					/* readCount is only maintained when verbose */
+					readCount++;
+					if (readCount % LOAD_PROGRESS_STEP == 0)
+						std::cerr << "Loaded " << readCount
+							<< " reads into Bloom filter\n";
+				}
+			}
+		}
+		assert(in.eof());
+		if (verbose) {
+			std::cerr << "Loaded " << readCount << " reads from `"
+				<< path << "` into Bloom filter\n";
+		}
+	}
+
+	/**
+	 * Return true if all of the k-mers in `seq` are contained in `bloom`
+	 * and false otherwise.
+	 *
+	 * K-mers containing non-ACGT chars are not visited by the iterator,
+	 * so the count check at the end rejects sequences containing any
+	 * such k-mer.
+	 */
+	template <typename BloomT>
+	inline static bool allKmersInBloom(const Sequence& seq, const BloomT& bloom)
+	{
+		const unsigned k = bloom.getKmerSize();
+		const unsigned numHashes = bloom.getHashNum();
+		assert(seq.length() >= k);
+		/* counts only the k-mers the iterator actually visited */
+		unsigned validKmers = 0;
+		for (RollingHashIterator it(seq, numHashes, k);
+			it != RollingHashIterator::end(); ++it, ++validKmers) {
+			if (!bloom.contains(*it))
+				return false;
+		}
+		/* if we skipped over k-mers containing non-ACGT chars */
+		if (validKmers < seq.length() - k + 1)
+			return false;
+		return true;
+	}
+
+ /**
+ * Add all k-mers of a DNA sequence to a Bloom filter.
+ */
+ template <typename BloomT>
+ inline static void addKmersToBloom(const Sequence& seq, BloomT& bloom)
+ {
+ const unsigned k = bloom.getKmerSize();
+ const unsigned numHashes = bloom.getHashNum();
+ for (RollingHashIterator it(seq, numHashes, k);
+ it != RollingHashIterator::end(); ++it) {
+ bloom.insert(*it);
+ }
+ }
+
+	/**
+	 * Translate a DNA sequence to an equivalent path in the
+	 * de Bruijn graph.
+	 *
+	 * @param seq DNA sequence (must be at least k bases long)
+	 * @param k k-mer size
+	 * @param numHashes number of Bloom filter hash functions
+	 * @return path whose vertices are the consecutive k-mers of `seq`
+	 */
+	inline static Path<Vertex>
+	seqToPath(const Sequence& seq, unsigned k, unsigned numHashes)
+	{
+		Path<Vertex> path;
+		assert(seq.length() >= k);
+		for (RollingHashIterator it(seq, numHashes, k);
+			it != RollingHashIterator::end(); ++it) {
+			path.push_back(Vertex(it.kmer().c_str(), it.rollingHash()));
+		}
+		return path;
+	}
+
+	/**
+	 * Translate a path in the de Bruijn graph to an equivalent
+	 * DNA sequence.
+	 *
+	 * Successive vertices overlap by k-1 bases, so each vertex's k-mer
+	 * is overlaid at its offset. When a spaced seed is active, masked
+	 * ('0') positions are skipped and may remain 'N' in the result.
+	 */
+	inline static Sequence pathToSeq(const Path<Vertex>& path, unsigned k)
+	{
+		assert(path.size() > 0);
+		assert(k > 0);
+
+		/* an empty mask means every k-mer position is significant */
+		const std::string& spacedSeed = MaskedKmer::mask();
+		assert(spacedSeed.empty() || spacedSeed.length() == k);
+		Sequence seq;
+		seq.resize(path.size() + k - 1, 'N');
+
+		for (size_t i = 0; i < path.size(); ++i) {
+			std::string kmer(path.at(i).kmer().c_str());
+			for (size_t j = 0; j < k; ++j) {
+				if (spacedSeed.empty() || spacedSeed.at(j) == '1') {
+					/* overlapping k-mers must agree; a mismatch
+					 * indicates an inconsistent path (warn, then
+					 * keep the newer base) */
+					if (seq.at(i + j) != 'N' && seq.at(i + j) != kmer.at(j)) {
+						std::cerr
+							<< "warning: inconsistent DBG path detected "
+							"at position " << i + j << ": "
+							<< seq.substr(0, i + j)
+							<< " (orig base: '" << seq.at(i + j) << "'"
+							<< ", new base: '" << kmer.at(j) << "')"
+							<< std::endl;
+					}
+					seq.at(i + j) = kmer.at(j);
+				}
+			}
+		}
+
+		return seq;
+	}
+
+ /**
+ * Results for the extension of a read segment.
+ * Each instance represents a row in the trace file generated
+ * by the '-T' option for abyss-bloom-dbg.
+ */
+ struct SeqExtensionResult
+ {
+ /** FASTA ID for origin read */
+ std::string readId;
+ /**
+ * Index of this segment within the read. (Prior to extension,
+ * each read is split into segments at branching k-mers.)
+ */
+ unsigned readSegmentId;
+ /** Total number of segments after splitting the read */
+ unsigned numReadSegments;
+ /** True if leftwards sequence extension was attempted */
+ bool extendedLeft;
+ /** True if rightwards sequence extension was attempted */
+ bool extendedRight;
+ /** Result code for attempted left sequence extension (e.g. DEAD END) */
+ PathExtensionResult leftExtensionResult;
+	/** Result code for attempted right sequence extension (e.g. DEAD END) */
+ PathExtensionResult rightExtensionResult;
+ /** Original length of the read segment prior to extension */
+ unsigned origLength;
+ /** length of left extension (bp) */
+ unsigned leftExtensionLength;
+ /** length of right extension (bp) */
+ unsigned rightExtensionLength;
+ /** total length of extended sequence (bp) */
+ unsigned extendedLength;
+ /**
+ * True if the extended sequence was excluded from the output contigs
+ * because it was redundant. (An identical sequence was generated
+ * when extending a previous read.)
+ */
+ bool redundantContig;
+ /** Contig ID assigned to extended segment */
+ size_t contigID;
+
+ SeqExtensionResult() :
+ readId(),
+ readSegmentId(std::numeric_limits<unsigned>::max()),
+ numReadSegments(std::numeric_limits<unsigned>::max()),
+ extendedLeft(false),
+ extendedRight(false),
+ leftExtensionResult(DEAD_END),
+ rightExtensionResult(DEAD_END),
+ origLength(std::numeric_limits<unsigned>::max()),
+ leftExtensionLength(std::numeric_limits<unsigned>::max()),
+ rightExtensionLength(std::numeric_limits<unsigned>::max()),
+ extendedLength(std::numeric_limits<unsigned>::max()),
+ redundantContig(false),
+ contigID(std::numeric_limits<size_t>::max()) {}
+
+ bool initialized() const
+ {
+ return !readId.empty() &&
+ readSegmentId != std::numeric_limits<unsigned>::max() &&
+ numReadSegments != std::numeric_limits<unsigned>::max() &&
+ origLength != std::numeric_limits<unsigned>::max() &&
+ leftExtensionLength != std::numeric_limits<unsigned>::max() &&
+ rightExtensionLength != std::numeric_limits<unsigned>::max() &&
+ extendedLength != std::numeric_limits<unsigned>::max();
+ }
+
+ static std::ostream& printHeaders(std::ostream& out)
+ {
+ out << "read_id\t"
+ << "read_segment_id\t"
+ << "num_read_segments\t"
+ << "left_extension_result\t"
+ << "right_extension_result\t"
+ << "orig_length\t"
+ << "left_extension_len\t"
+ << "right_extension_len\t"
+ << "extended_length\t"
+ << "redundant_contig\t"
+ << "contig_id\n";
+ return out;
+ }
+
+ friend std::ostream& operator <<(std::ostream& out,
+ const SeqExtensionResult& o)
+ {
+ if (o.redundantContig) {
+ out << o.readId << '\t'
+ << o.readSegmentId << '\t'
+ << o.numReadSegments << '\t'
+ << "-\t"
+ << "-\t"
+ << o.origLength << '\t'
+ << "-\t"
+ << "-\t"
+ << "-\t"
+ << "true" << '\t'
+ << "-\n";
+ } else {
+ out << o.readId << '\t'
+ << o.readSegmentId << '\t'
+ << o.numReadSegments << '\t';
+ if (o.extendedLeft)
+ out << pathExtensionResultStr(o.leftExtensionResult) << '\t';
+ else
+ out << "-\t";
+ if (o.extendedRight)
+ out << pathExtensionResultStr(o.rightExtensionResult) << '\t';
+ else
+ out << "-\t";
+ out << o.origLength << '\t';
+ if (o.extendedLeft)
+ out << o.leftExtensionLength << '\t';
+ else
+ out << "-\t";
+ if (o.extendedRight)
+ out << o.rightExtensionLength << '\t';
+ else
+ out << "-\t";
+ out << o.extendedLength << '\t'
+ << "false" << '\t'
+ << o.contigID << '\n';
+ }
+ return out;
+ }
+ };
+
+	/**
+	 * Extend a sequence left (REVERSE) or right (FORWARD) within the de Bruijn
+	 * graph until either a branching point or a dead-end is encountered.
+	 *
+	 * @param seq [in,out] sequence to extend; replaced with the
+	 * extended sequence on return
+	 * @param dir direction of extension (FORWARD or REVERSE)
+	 * @param k k-mer size
+	 * @param numHashes number of Bloom filter hash functions
+	 * @param minBranchLen minimum length for a neighbouring branch to
+	 * be considered a true branch
+	 * @param graph the de Bruijn graph
+	 * @return result code describing why the extension stopped
+	 */
+	template <typename GraphT>
+	inline static PathExtensionResult extendSeq(Sequence& seq, Direction dir,
+		unsigned k, unsigned numHashes, unsigned minBranchLen, const GraphT& graph)
+	{
+		assert(seq.length() >= k);
+
+		/* Convert sequence to path in DBG */
+		Path<Vertex> path = seqToPath(seq, k, numHashes);
+
+		/* Extend path */
+		ExtendPathParams params;
+		params.trimLen = minBranchLen - 1;
+		params.maxLen = NO_LIMIT;
+		PathExtensionResult result =
+			extendPath(path, dir, graph, params);
+
+		/* Convert extended path back to sequence */
+		Sequence extendedSeq = pathToSeq(path, k);
+
+		/*
+		 * If a spaced seed is in effect, short paths may result in
+		 * sequences containing 'N's. However, since we only extend
+		 * "perfect reads", we can replace the 'N's with the correct
+		 * bases by overlaying the seed sequence.
+		 */
+		if (dir == FORWARD) {
+			overlaySeq(seq, extendedSeq, 0);
+		} else {
+			assert(dir == REVERSE);
+			/* original seq sits at the tail of the extended seq */
+			overlaySeq(seq, extendedSeq, extendedSeq.length() - seq.length());
+		}
+
+		/*
+		 * Replace orig seq with extended version.
+		 */
+		seq = extendedSeq;
+
+		/* Return true if sequence was successfully extended */
+		return result;
+	}
+
+
+	/**
+	 * Counters for tracking assembly statistics and producing
+	 * progress messages.
+	 */
+	struct AssemblyCounters
+	{
+		/* number of reads that were successfully extended */
+		size_t readsExtended;
+		/* total number of reads processed so far */
+		size_t readsProcessed;
+		/* total length of assembled sequence (bp) */
+		size_t basesAssembled;
+		/* counter used to assign contig IDs */
+		size_t contigID;
+
+		/* all counters start at zero */
+		AssemblyCounters() : readsExtended(0), readsProcessed(0),
+			basesAssembled(0), contigID(0) {}
+	};
+
+	/** Print an intermediate progress message during assembly.
+	 * Takes the counters by value (a cheap, all-size_t struct).
+	 * NOTE(review): if readsProcessed is 0 the percentage is a
+	 * float division by zero (inf/nan) -- confirm callers invoke
+	 * this only after at least one read has been processed. */
+	void printProgressMessage(AssemblyCounters counters)
+	{
+#pragma omp critical(cerr)
+		std::cerr
+			<< "Extended " << counters.readsExtended
+			<< " of " << counters.readsProcessed
+			<< " reads (" << std::setprecision(3) << (float)100
+			* counters.readsExtended / counters.readsProcessed
+			<< "%), assembled " << counters.basesAssembled
+			<< " bp so far" << std::endl;
+	}
+
+	/**
+	 * Split a path at branching k-mers (degree > 2).
+	 *
+	 * A branching vertex ends the current segment and also starts the
+	 * next one, so branch points are shared between adjacent segments.
+	 *
+	 * @param path path to split
+	 * @param dbg the de Bruijn graph
+	 * @param minBranchLen minimum length for a neighbouring branch to
+	 * count as a true branch (shorter branches are treated as tips)
+	 * @return the list of path segments
+	 */
+	template <typename GraphT>
+	inline static std::vector<
+		Path<typename boost::graph_traits<GraphT>::vertex_descriptor> >
+	splitPath(const Path<typename boost::graph_traits<GraphT>::vertex_descriptor>& path,
+		const GraphT& dbg, unsigned minBranchLen)
+	{
+		assert(path.size() > 0);
+
+		typedef typename boost::graph_traits<GraphT>::vertex_descriptor V;
+		typedef typename Path<V>::const_iterator PathIt;
+
+		std::vector< Path<V> > splitPaths;
+		Path<V> currentPath;
+		for (PathIt it = path.begin(); it != path.end(); ++it) {
+			currentPath.push_back(*it);
+			unsigned inDegree =
+				trueBranches(*it, REVERSE, dbg, minBranchLen).size();
+			unsigned outDegree =
+				trueBranches(*it, FORWARD, dbg, minBranchLen).size();
+			if (inDegree > 1 || outDegree > 1) {
+				/* we've hit a branching point -- end the current
+				 * path and start a new one */
+				splitPaths.push_back(currentPath);
+				currentPath.clear();
+				currentPath.push_back(*it);
+			}
+		}
+		/* keep the trailing segment (or the whole path if unbranched) */
+		if (currentPath.size() > 1 || splitPaths.empty())
+			splitPaths.push_back(currentPath);
+
+		assert(splitPaths.size() >= 1);
+		return splitPaths;
+	}
+
+ /**
+ * Split a sequence at branching k-mers (degree > 2).
+ * Branching k-mers are shared between the resulting sequence
+ * segments.
+ */
+ template <typename GraphT>
+ inline static std::vector<Sequence>
+ splitSeq(const Sequence& seq, unsigned k, unsigned numHashes,
+ const GraphT& dbg, unsigned minBranchLen)
+ {
+ assert(seq.length() >= k);
+
+ typedef typename boost::graph_traits<GraphT>::vertex_descriptor V;
+ typedef typename Path<V>::const_iterator PathIt;
+
+ std::vector<Sequence> segments;
+ Path<V> path = seqToPath(seq, k, numHashes);
+ PathIt start = path.begin();
+ PathIt end = path.begin();
+
+ for (; end != path.end(); ++end) {
+ std::vector<V> inBranches = trueBranches(*end, REVERSE, dbg,
+ minBranchLen-1);
+ unsigned inDegree = inBranches.size();
+ /*
+ * Tricky: Include the read itself in the list of valid
+ * incoming branches, even if it is shorter than trimLen.
+ */
+ if (end > path.begin() && std::find(inBranches.begin(),
+ inBranches.end(), *(end - 1)) == inBranches.end()) {
+ inDegree++;
+ }
+ std::vector<V> outBranches = trueBranches(*end, FORWARD, dbg,
+ minBranchLen-1);
+ unsigned outDegree = outBranches.size();
+ /*
+ * Tricky: Include the read itself in the list of valid
+ * outgoing branches, even if it is shorter than trimLen.
+ */
+ if (end < path.end() - 1 && std::find(outBranches.begin(),
+ outBranches.end(), *(end + 1)) == outBranches.end()) {
+ outDegree++;
+ }
+ if (inDegree > 1 || outDegree > 1) {
+ /* we've hit a branching point -- end the current
+ * segment and start a new one */
+ Sequence segment = seq.substr(start - path.begin(),
+ end - start + k);
+ segments.push_back(segment);
+ start = end;
+ }
+ }
+ if (segments.empty() || segments.back().length() > k) {
+ Sequence segment = seq.substr(start - path.begin(),
+ end - start + k);
+ segments.push_back(segment);
+ }
+
+ assert(segments.size() >= 1);
+ return segments;
+ }
+
+ /**
+ * Trim a sequence down to the longest contiguous subsequence
+ * of "good" k-mers. If the sequence has length < k or contains
+ * no good k-mers, the trimmed sequence will be the empty string.
+ *
+ * @param seq the DNA sequence to be trimmed
+ * @param goodKmerSet Bloom filter containing "good" k-mers
+ */
+ template <typename BloomT>
+ static inline void trimSeq(Sequence& seq, const BloomT& goodKmerSet)
+ {
+ const unsigned k = goodKmerSet.getKmerSize();
+ const unsigned numHashes = goodKmerSet.getHashNum();
+
+ if (seq.length() < k) {
+ seq.clear();
+ return;
+ }
+
+ const unsigned UNSET = UINT_MAX;
+ unsigned prevPos = UNSET;
+ unsigned matchStart = UNSET;
+ unsigned matchLen = 0;
+ unsigned maxMatchStart = UNSET;
+ unsigned maxMatchLen = 0;
+
+ /* note: RollingHashIterator skips over k-mer
+ * positions with non-ACGT chars */
+ for (RollingHashIterator it(seq, numHashes, k);
+ it != RollingHashIterator::end(); prevPos=it.pos(),++it) {
+ if (!goodKmerSet.contains(*it) ||
+ (prevPos != UNSET && it.pos() - prevPos > 1)) {
+ /* end any previous match */
+ if (matchStart != UNSET && matchLen > maxMatchLen) {
+ maxMatchLen = matchLen;
+ maxMatchStart = matchStart;
+ }
+ matchStart = UNSET;
+ matchLen = 0;
+ }
+ if (goodKmerSet.contains(*it)) {
+ /* initiate or extend match */
+ if (matchStart == UNSET)
+ matchStart = it.pos();
+ matchLen++;
+ }
+ }
+ /* handles case last match extends to end of seq */
+ if (matchStart != UNSET && matchLen > maxMatchLen) {
+ maxMatchLen = matchLen;
+ maxMatchStart = matchStart;
+ }
+ /* if there were no matching k-mers */
+ if (maxMatchLen == 0) {
+ seq.clear();
+ return;
+ }
+ /* trim read down to longest matching subseq */
+ seq = seq.substr(maxMatchStart, maxMatchLen + k - 1);
+ }
+
+ /**
+ * Ensure that branching k-mers are not repeated in the output
+ * contigs by selectively trimming contig ends.
+ *
+ * The idea is to keep a branch k-mer if the edge leading to it
+ * is unambiguous. For example, in the diagram below the contig
+ * generated from the right side would include the branching k-mer
+ * k5, whereas the two contigs entering on the left would discard it:
+ *
+ * ...-k1-k2
+ * \
+ * k5-k6-...
+ * /
+ * ...-k3-k4
+ *
+ * @param seq the contig to trim
+ * @param k k-mer size
+ * @param numHashes number of Bloom filter hash functions
+ * @param minBranchLen minimum length of a "true" branch (shorter
+ * branches are assumed to be caused by sequencing errors or
+ * Bloom filter false positives).
+ */
+ template <typename GraphT>
+ inline static void trimBranchKmers(Sequence& seq,
+ unsigned k, unsigned numHashes, unsigned minBranchLen,
+ const GraphT& dbg)
+ {
+ assert(seq.length() >= k);
+
+ if (seq.length() == k)
+ return;
+
+ Sequence firstKmer = seq.substr(0, k);
+ Vertex vFirst(firstKmer.c_str(), RollingHash(firstKmer, numHashes, k));
+ unsigned outDegree = trueDegree(vFirst, FORWARD, dbg, minBranchLen - 1);
+ if (outDegree > 1)
+ seq.erase(0, 1);
+
+ if (seq.length() == k)
+ return;
+
+ Sequence lastKmer = seq.substr(seq.length()-k);
+ Vertex vLast(lastKmer.c_str(), RollingHash(lastKmer, numHashes, k));
+ unsigned inDegree = trueDegree(vLast, REVERSE, dbg, minBranchLen - 1);
+ if (inDegree > 1)
+ seq.erase(seq.length()-1, 1);
+ }
+
+ /**
+ * Append a contig to the output FASTA stream.
+ */
+ inline static void printContig(const Sequence& seq,
+ size_t contigID, const std::string& readID, unsigned k,
+ std::ostream& out)
+ {
+ assert(seq.length() >= k);
+
+ FastaRecord contig;
+
+ /* set FASTA id */
+ std::ostringstream id;
+ id << contigID;
+
+ /* add FASTA comment indicating extended read id */
+ std::ostringstream comment;
+ comment << "read:" << readID;
+ assert(id.good());
+ contig.id = id.str();
+ contig.comment = comment.str();
+
+ /* set seq (in canonical orientation) */
+ Sequence rcSeq = reverseComplement(seq);
+ contig.seq = (seq < rcSeq) ? seq : rcSeq;
+
+ /* output FASTA record */
+ out << contig;
+ assert(out);
+ }
+
+ /**
+ * Trim contiguous stretches of previously-assembled k-mers from
+ * both ends of a contig.
+ *
+ * @param seq contig to be trimmed
+ * @param assembledKmerSet Bloom filter of k-mers from previously
+ * assembled contigs
+ */
+ template <typename BloomT>
+ inline static void trimContigOverlaps(Sequence &seq,
+ const BloomT& assembledKmerSet)
+ {
+ const unsigned k = assembledKmerSet.getKmerSize();
+ const unsigned numHashes = assembledKmerSet.getHashNum();
+
+ /* trim previously assembled k-mers from start of sequence */
+ RollingHashIterator fwd(seq, numHashes, k);
+ for (; fwd != RollingHashIterator::end(); ++fwd) {
+ if (!assembledKmerSet.contains(*fwd))
+ break;
+ }
+ if (fwd.pos() > 0)
+ seq.erase(0, fwd.pos());
+
+ /* trim previously assembled k-mers from end of sequence */
+ Sequence rcSeq = reverseComplement(seq);
+ RollingHashIterator rev(rcSeq, numHashes, k);
+ for (; rev != RollingHashIterator::end(); ++rev) {
+ if (!assembledKmerSet.contains(*rev))
+ break;
+ }
+ if (rev.pos() > 0)
+ rcSeq.erase(0, rev.pos());
+
+ /* flip seq back to original orientation */
+ seq = reverseComplement(rcSeq);
+
+ assert(seq.length() >= k);
+ }
+
+ /**
+ * Split a read at branching points in the de Bruijn graph and
+ * then extend each segment left and right, up to the next
+ * branching point or dead end.
+ *
+ * @param read read to be assembled
+ * @param dbg Boost graph interface to de Bruijn graph
+ * @param assembledKmerSet Bloom filter containing k-mers of
+ * previously assembled contigs
+ * @param params command line options for the assembly
+ * (e.g. k-mer coverage threshold)
+ * @param counters counter variables used for generating assembly
+ * progress messages.
+ * @param out output stream for contigs
+ * @param traceOut output stream for trace file (-T option)
+ */
+ template <typename GraphT, typename BloomT>
+ inline static void extendRead(const FastaRecord& read,
+ const GraphT& dbg, BloomT& assembledKmerSet,
+ const AssemblyParams& params, AssemblyCounters& counters,
+ std::ostream& out, std::ostream& traceOut)
+ {
+ const unsigned k = params.k;
+ const unsigned numHashes = params.numHashes;
+ const unsigned minBranchLen = params.trim + 1;
+
+ if (params.verbose >= 2) {
+#pragma omp critical(cerr)
+ std::cerr << "Extending read: " << read.id << std::endl;
+ }
+
+ /* split read at branching points (prevents over-assembly) */
+ std::vector<Sequence> segments = splitSeq(read.seq, k,
+ numHashes, dbg, minBranchLen);
+
+ for (std::vector<Sequence>::iterator it = segments.begin();
+ it != segments.end(); ++it) {
+
+ Sequence& seq = *it;
+
+ /*
+ * track results of sequence extension attempt for
+ * trace file ('-T' option).
+ */
+ SeqExtensionResult traceResult;
+ traceResult.readId = read.id;
+ traceResult.readSegmentId = it - segments.begin() + 1;
+ traceResult.numReadSegments = segments.size();
+ traceResult.origLength = seq.length();
+ traceResult.leftExtensionLength = 0;
+ traceResult.rightExtensionLength = 0;
+ traceResult.redundantContig = true;
+
+ /*
+ * extend first and last segments only, since
+ * internal segments are bounded by branching
+ * points.
+ */
+ if (it == segments.begin()) {
+ traceResult.extendedLeft = true;
+ traceResult.leftExtensionResult = extendSeq(seq,
+ REVERSE, k, numHashes, minBranchLen, dbg);
+ traceResult.leftExtensionLength =
+ seq.length() - traceResult.origLength;
+ }
+ if (it == segments.end() - 1) {
+ unsigned origLength = seq.length();
+ traceResult.extendedRight = true;
+ traceResult.rightExtensionResult = extendSeq(seq,
+ FORWARD, k, numHashes, minBranchLen, dbg);
+ traceResult.rightExtensionLength =
+ seq.length() - origLength;
+ }
+ traceResult.extendedLength = seq.length();
+
+ /* ensure branching k-mers are included only once in output */
+ trimBranchKmers(seq, k, numHashes, minBranchLen, dbg);
+
+ /*
+ * check assembledKmerSet again to prevent race
+ * condition. (Otherwise, the same contig may be
+ * generated multiple times.)
+ */
+#pragma omp critical(assembledKmerSet)
+ if (!allKmersInBloom(seq, assembledKmerSet)) {
+
+ /* trim previously assembled k-mers from both ends */
+ trimContigOverlaps(seq, assembledKmerSet);
+
+ /* mark remaining k-mers as assembled */
+ addKmersToBloom(seq, assembledKmerSet);
+
+ /* add contig to output FASTA */
+ printContig(seq, counters.contigID, read.id, k, out);
+
+ /* update counters / trace results */
+ traceResult.redundantContig = false;
+ traceResult.contigID = counters.contigID;
+ counters.basesAssembled += seq.length();
+ counters.contigID++;
+ }
+
+ /* trace file output ('-T' option) */
+#pragma omp critical(traceOut)
+ if (!params.tracePath.empty()) {
+ assert(traceResult.initialized());
+ traceOut << traceResult;
+ assert_good(traceOut, params.tracePath);
+ }
+
+ } /* for each read segment */
+ }
+
+ /**
+ * Perform a Bloom-filter-based de Bruijn graph assembly.
+ * Contigs are generated by extending reads left/right within
+ * the de Bruijn graph, up to the next branching point or dead end.
+ * Short branches due to Bloom filter false positives are
+ * ignored.
+ *
+ * @param argc number of input FASTA files
+ * @param argv array of input FASTA filenames
+ * @param goodKmerSet Bloom filter containing k-mers that
+ * occur more than once in the input data
+ * @param params command line options for the assembly,
+ * including the verbosity level for progress messages
+ * (printed to STDERR)
+ * @param out output stream for contigs (FASTA)
+ */
+ template <typename BloomT>
+ inline static void assemble(int argc, char** argv, const BloomT& goodKmerSet,
+ const AssemblyParams& params, std::ostream& out)
+ {
+ assert(params.initialized());
+
+ /* per-thread I/O buffer (size is in bases) */
+ const size_t SEQ_BUFFER_SIZE = 1000000;
+
+ /* print progress message after processing this many reads */
+ const unsigned progressStep = 1000;
+ const unsigned k = goodKmerSet.getKmerSize();
+
+ /* trace file output ('-T' option) */
+ std::ofstream traceOut;
+ if (!params.tracePath.empty()) {
+ traceOut.open(params.tracePath.c_str());
+ assert_good(traceOut, params.tracePath);
+ SeqExtensionResult::printHeaders(traceOut);
+ assert_good(traceOut, params.tracePath);
+ }
+
+ /* k-mers in previously assembled contigs */
+ BloomFilter assembledKmerSet(goodKmerSet.size(),
+ goodKmerSet.getHashNum(), goodKmerSet.getKmerSize());
+ /* counters for progress messages */
+ AssemblyCounters counters;
+
+ /* Boost graph API over Bloom filter */
+ RollingBloomDBG<BloomT> graph(goodKmerSet);
+
+ if (params.verbose)
+ std::cerr << "Trimming branches " << params.trim
+ << " k-mers or shorter" << std::endl;
+
+ FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
+#pragma omp parallel
+ for (std::vector<FastaRecord> buffer;;) {
+
+ /* read sequences in batches to reduce I/O contention */
+ buffer.clear();
+ size_t bufferSize;
+ bool good = true;
+#pragma omp critical(in)
+ for (bufferSize = 0; bufferSize < SEQ_BUFFER_SIZE;) {
+ FastaRecord rec;
+ good = in >> rec;
+ if (!good)
+ break;
+ buffer.push_back(rec);
+ bufferSize += rec.seq.length();
+ }
+ if (buffer.size() == 0)
+ break;
+
+ for (std::vector<FastaRecord>::iterator it = buffer.begin();
+ it != buffer.end(); ++it) {
+
+ const FastaRecord& rec = *it;
+ bool skip = false;
+
+ /* we can't extend reads shorter than k */
+ if (rec.seq.length() < k)
+ skip = true;
+
+ /* only extend error-free reads */
+ if (!skip && !allKmersInBloom(rec.seq, goodKmerSet))
+ skip = true;
+
+ /* skip reads in previously assembled regions */
+ if (!skip && allKmersInBloom(rec.seq, assembledKmerSet))
+ skip = true;
+
+ /* extend the read left and right within the DBG */
+ if (!skip) {
+ extendRead(rec, graph, assembledKmerSet, params,
+ counters, out, traceOut);
+#pragma omp atomic
+ counters.readsExtended++;
+ }
+
+#pragma omp atomic
+ counters.readsProcessed++;
+ if (params.verbose && counters.readsProcessed % progressStep == 0)
+ printProgressMessage(counters);
+
+ } /* for each read */
+
+ } /* for each batch of reads (parallel) */
+
+ assert(in.eof());
+ if (!params.tracePath.empty()) {
+ traceOut.close();
+ assert_good(traceOut, params.tracePath);
+ }
+
+ if (params.verbose) {
+ printProgressMessage(counters);
+ std::cerr << "Assembly complete" << std::endl;
+ }
+ }
+
+ /**
+ * Visitor class that outputs visited nodes/edges in GraphViz format during
+ * a breadth first traversal. An instance of this class may be passed
+ * as an argument to the `breadthFirstSearch` function.
+ */
+ template <typename GraphT>
+ class GraphvizBFSVisitor
+ {
+ typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
+ typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
+
+ public:
+
+ /** Constructor */
+ GraphvizBFSVisitor(std::ostream& out) :
+ m_out(out), m_nodesVisited(0), m_edgesVisited(0)
+ {
+ /* start directed graph (GraphViz) */
+ m_out << "digraph g {\n";
+ }
+
+ /** Destructor */
+ ~GraphvizBFSVisitor()
+ {
+ /* end directed graph (GraphViz) */
+ m_out << "}\n";
+ }
+
+ /** Invoked when a vertex is initialized */
+ void initialize_vertex(const VertexT&, const GraphT&) {}
+
+ /** Invoked when a vertex is visited for the first time */
+ void discover_vertex(const VertexT& v, const GraphT&)
+ {
+ ++m_nodesVisited;
+ /* declare vertex (GraphViz) */
+ m_out << '\t' << v.kmer().c_str() << ";\n";
+ }
+
+ /** Invoked each time a vertex is visited */
+ void examine_vertex(const VertexT&, const GraphT&) {}
+
+ /**
+ * Invoked when all of a vertex's outgoing edges have been
+ * traversed.
+ */
+ void finish_vertex(const VertexT&, const GraphT&) {}
+
+ /**
+ * Invoked when an edge is traversed. (Each edge
+ * in the graph is traversed exactly once.)
+ */
+ void examine_edge(const EdgeT& e, const GraphT& g)
+ {
+ ++m_edgesVisited;
+ const VertexT& u = source(e, g);
+ const VertexT& v = target(e, g);
+
+ /* declare edge (GraphViz) */
+ m_out << '\t' << u.kmer().c_str() << " -> "
+ << v.kmer().c_str() << ";\n";
+ }
+
+ /**
+ * Invoked when an edge is traversed to a "gray" vertex.
+ * A vertex is gray when some but not all of its outgoing edges
+ * have been traversed.
+ */
+ void gray_target(const EdgeT&, const GraphT&) {}
+
+ /**
+ * Invoked when an edge is traversed to a "black" vertex.
+ * A vertex is black when all of its outgoing edges have
+ * been traversed.
+ */
+ void black_target(const EdgeT&, const GraphT&) {}
+
+ /**
+ * Invoked when an edge is traversed to a "gray" or
+ * "black" vertex.
+ */
+ void non_tree_edge(const EdgeT&, const GraphT&) {}
+
+ /**
+ * Invoked when an edge is traversed to a "white" vertex.
+ * A vertex is white if it has not previously been visited.
+ */
+ void tree_edge(const EdgeT&, const GraphT&) {}
+
+ /** Return number of distinct nodes visited */
+ size_t getNumNodesVisited() const
+ {
+ return m_nodesVisited;
+ }
+
+ /** Get number of distinct edges visited */
+ size_t getNumEdgesVisited() const
+ {
+ return m_edgesVisited;
+ }
+
+ protected:
+
+ /** output stream for GraphViz serialization */
+ std::ostream& m_out;
+ /** number of nodes visited so far */
+ size_t m_nodesVisited;
+ /** number of edges visited so far */
+ size_t m_edgesVisited;
+ };
+
+ /**
+ * Output a GraphViz serialization of the de Bruijn graph
+ * using FASTA files and a Bloom filter as input.
+ *
+ * @param argc number of input FASTA files
+ * @param argv array of input FASTA filenames
+ * @param kmerSet Bloom filter containing valid k-mers
+ * @param params assembly options (the verbose flag enables progress messages on STDERR)
+ * @param out output stream for GraphViz serialization
+ */
+ template <typename BloomT>
+ static inline void outputGraph(int argc, char** argv,
+ const BloomT& kmerSet, const AssemblyParams& params,
+ std::ostream& out)
+ {
+ assert(params.initialized());
+
+ typedef RollingBloomDBG<BloomT> GraphT;
+
+ /* interval for progress messages */
+ const unsigned progressStep = 1000;
+ const unsigned k = kmerSet.getKmerSize();
+ const unsigned numHashes = kmerSet.getHashNum();
+
+ /* counter for progress messages */
+ size_t readsProcessed = 0;
+
+ /* Boost graph API over rolling hash Bloom filter */
+ GraphT dbg(kmerSet);
+
+ /* Marks visited nodes in breadth-first traversal */
+ DefaultColorMap<GraphT> colorMap;
+
+ /* BFS Visitor -- generates GraphViz output as nodes
+ * and edges are traversed. */
+ GraphvizBFSVisitor<GraphT> visitor(out);
+
+ if (params.verbose)
+ std::cerr << "Generating GraphViz output..." << std::endl;
+
+ FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
+ for (FastaRecord rec;;) {
+ bool good;
+ good = in >> rec;
+ if (!good)
+ break;
+ Sequence& seq = rec.seq;
+
+ /* Trim down to longest subsequence of "good" k-mers */
+ trimSeq(seq, kmerSet);
+ if (seq.length() > 0) {
+
+ /* BFS traversal in forward dir */
+ std::string startKmer = seq.substr(0, k);
+ Vertex start(startKmer.c_str(),
+ RollingHash(startKmer, numHashes, k));
+ breadthFirstSearch(dbg, start, visitor, colorMap);
+
+ /* BFS traversal in reverse dir */
+ Sequence rcSeq = reverseComplement(seq);
+ std::string rcStartKmer = rcSeq.substr(0, k);
+ Vertex rcStart(rcStartKmer.c_str(),
+ RollingHash(rcStartKmer, numHashes, k));
+ breadthFirstSearch(dbg, rcStart, visitor, colorMap);
+
+ }
+
+ if (++readsProcessed % progressStep == 0 && params.verbose) {
+ std::cerr << "processed " << readsProcessed
+ << " (k-mers visited: " << visitor.getNumNodesVisited()
+ << ", edges visited: " << visitor.getNumEdgesVisited()
+ << ")" << std::endl;
+ }
+ }
+ assert(in.eof());
+ if (params.verbose) {
+ std::cerr << "processed " << readsProcessed
+ << " reads (k-mers visited: " << visitor.getNumNodesVisited()
+ << ", edges visited: " << visitor.getNumEdgesVisited()
+ << ")" << std::endl;
+ std::cerr << "GraphViz generation complete" << std::endl;
+ }
+ }
+
+ /**
+ * Write a single block of a 'variableStep' WIG file.
+ *
+ * @param chr chromosome name
+ * @param start start coordinate of block
+ * @param length length of block
+ * @param val value of block
+ * @param out output stream for WIG file
+ * @param outPath path for output WIG file
+ */
+ static inline void outputWigBlock(const std::string& chr, size_t start,
+ size_t length, unsigned val, ostream& out, const std::string& outPath)
+ {
+ assert(length > 0);
+ out << "variableStep chrom=" << chr
+ << " span=" << length << "\n";
+ out << start << ' ' << val << '\n';
+ assert_good(out, outPath);
+ }
+
+ /**
+ * Write a WIG file for a reference genome, using the values 0 and 1
+ * to indicate whether or not a given k-mer had sufficient coverage
+ * in the reads to exceed the minimum coverage threshold.
+ *
+ * @param goodKmerSet Bloom filter of k-mers that exceed the
+ * minimum coverage threshold
+ * @param params encapsulates all command line options for the
+ * assembly, including the reference genome and the output path
+ * for the WIG file.
+ */
+ template <class BloomT>
+ static inline void writeCovTrack(const BloomT& goodKmerSet,
+ const AssemblyParams& params)
+ {
+ assert(!params.covTrackPath.empty());
+ assert(!params.refPath.empty());
+
+ const unsigned k = goodKmerSet.getKmerSize();
+ const unsigned numHashes = goodKmerSet.getHashNum();
+
+ std::ofstream covTrack(params.covTrackPath.c_str());
+ assert_good(covTrack, params.covTrackPath);
+
+ if (params.verbose)
+ std::cerr << "Writing 0/1 k-mer coverage track for `"
+ << params.refPath << "` to `"
+ << params.covTrackPath << "`" << std::endl;
+
+ FastaReader ref(params.refPath.c_str(), FastaReader::FOLD_CASE);
+ for (FastaRecord rec; ref >> rec;) {
+ std::string chr = rec.id;
+ bool firstVal = true;
+ size_t blockStart = 1;
+ size_t blockLength = 0;
+ uint8_t blockVal = 0;
+ for (RollingHashIterator it(rec.seq, numHashes, k);
+ it != RollingHashIterator::end(); ++it) {
+ uint8_t val = goodKmerSet.contains(*it) ? 1 : 0;
+ if (firstVal) {
+ firstVal = false;
+ /* WIG standard uses 1-based coords */
+ blockStart = it.pos() + 1;
+ blockLength = 1;
+ blockVal = val;
+ } else if (val != blockVal) {
+ assert(firstVal == false);
+ outputWigBlock(chr, blockStart, blockLength, blockVal,
+ covTrack, params.covTrackPath);
+ /* WIG standard uses 1-based coords */
+ blockStart = it.pos() + 1;
+ blockLength = 1;
+ blockVal = val;
+ } else {
+ blockLength++;
+ }
+ }
+ /* output last block */
+ if (blockLength > 0) {
+ outputWigBlock(chr, blockStart, blockLength, blockVal,
+ covTrack, params.covTrackPath);
+ }
+ }
+ assert(ref.eof());
+
+ assert_good(covTrack, params.covTrackPath);
+ covTrack.close();
+ }
+
+} /* BloomDBG namespace */
+
+#endif
diff --git a/COPYRIGHT b/COPYRIGHT
index 0c4d7a2..8dcf6fd 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -3,14 +3,10 @@ Upstream-Name: ABySS
Upstream-Contact: Shaun Jackman <sjackman at gmail.com>
Source: https://github.com/bcgsc/abyss
-License: GPL-NC-3+
- You may use, redistribute and modify this software for non-commercial
- purposes under the terms of the GNU General Public License as
- published by the Free Software Foundation, either version 3 of the
- License, or (at your option) any later version.
- .
- To license ABySS for commercial purposes, please contact
- Patrick Rebstein <prebstein at bccancer.bc.ca>
+License: GPL-3
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, version 3.
.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -18,17 +14,26 @@ License: GPL-NC-3+
GNU General Public License for more details.
.
You should have received a copy of the GNU General Public License
- along with this software. If not, see <http://www.gnu.org/licenses/>.
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
.
- Debian may redistribute this software package.
+ For commercial licensing options, please contact
+ Patrick Rebstein <prebstein at bccancer.bc.ca>
Files: *
-Copyright: Copyright 2014 Canada's Michael Smith Genome Sciences Centre
-License: GPL-NC-3+
+Copyright: Copyright 2016 British Columbia Cancer Agency Branch
+License: GPL-3
-Files: Common/* DataLayer/* DistanceEst/* FMIndex/* Map/* ParseAligns/*
-Copyright: Copyright 2014 Canada's Michael Smith Genome Sciences Centre
-License: GPL-3+
+Files: Layout/*
+Copyright: Copyright 2012 Shaun Jackman
+License: GPL-3
+
+Files: lib/bloomfilter/*
+Copyright: Copyright 2016 Justin Chu
+License: GPL-3
+
+Files: lib/rolling-hash/*
+Copyright: Copyright 2016 Hamid Mohamadi
+License: GPL-3
Files: Common/cholesky.hpp
Copyright: Copyright 2005 Gunter Winkler, Konstantin Kutzkow
@@ -38,10 +43,6 @@ Files: Common/city.cc Common/city.h
Copyright: Copyright 2011 Google, Inc.
License: Expat
-Files: Layout/*
-Copyright: Copyright 2012 Shaun Jackman
-License: GPL-3+
-
Files: dialign/*
Copyright: Copyright 2008 Amarendran R. Subramanian
License: LGPL-2.1+
@@ -127,22 +128,31 @@ License: Expat
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
-License: GPL-3+
- This program is free software; you can redistribute it
- and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 3 of the License, or (at your option) any later
- version.
- .
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
- PURPOSE. See the GNU General Public License for more
- details.
- .
- You should have received a copy of the GNU General Public License
- along with this software. If not, see <http://www.gnu.org/licenses/>.
+Files: lib/gtest-*/*
+Copyright: Copyright 2008 Google Inc.
+License: BSD-3-clause
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
.
- On Debian systems, the full text of the GNU General Public
- License version 3 can be found in the file
- `/usr/share/common-licenses/GPL-3'.
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ .
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ChangeLog b/ChangeLog
index 8316e1c..368e132 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,60 @@
+2016-09-14 Ben Vandervalk <benv at bcgsc.ca>
+
+ * Release version 2.0.1
+ * Resolve licensing issues by switching to standard GPL-3 license
+
+2016-08-30 Ben Vandervalk <benv at bcgsc.ca>
+
+ * Release version 2.0.0
+ * New Bloom filter mode for assembly => assemble large genomes
+ with minimal memory (e.g. 34G for H. sapiens)
+ * Update param defaults for modern Illumina data
+ * Make sqlite3 an optional dependency
+
+ abyss-bloom:
+ * New 'compare' command for bitwise comparison of Bloom filters
+ (thanks to @bschiffthaler!)
+ * New 'kmers' command for printing k-mers that match a Bloom filter
+ (thanks to @bschiffthaler!)
+
+ abyss-bloom-dbg:
+ * New preunitig assembler that uses Bloom filter
+ * Add 'B' param (Bloom filter size) to 'abyss-pe' command to enable
+ Bloom filter mode
+ * See README.md and '--help' for further instructions
+
+ abyss-fatoagp:
+ * Mask scaftigs shorter than 50bp with 'N's (short scaftigs
+ were causing problems with NCBI submission)
+
+ abyss-pe:
+ * Update default parameter values for modern Illumina data
+ * Change 'l=k' => 'l=40'
+ * Change 's=200' => 's=1000'
+ * Change 'S=s' => 'S=1000-10000' (do a param sweep of 'S')
+ * Use 'DistanceEst --mean' for scaffolding stage, instead of
+ the default '--mle'
+
+ abyss-sealer:
+ * New '--max-gap-length' ('-G') option to replace unintuitive
+ '--max-frag'; use of '--max-frag' is now deprecated
+ * Require user to explicitly specify Bloom filter size (e.g.
+ '-b40G')
+ * Report false positive rate (FPR) when building/loading Bloom
+ filters
+ * Don't require input FASTQ files when using pre-built Bloom
+ filter files
+
+ konnector:
+ * Fix bug causing output read 2 file to be empty
+ * New percent sequence identity options ('-x' and '-X')
+ * New '--alt-paths-mode' option to output alternate connecting
+ paths between read pairs
+
+ README.md:
+ * Fixes to documentation of ABYSS and abyss-pe parameters
+ (thanks to @nsoranzo!)
+
2015-05-28 Ben Vandervalk <benv at bcgsc.ca>
* Release version 1.9.0
diff --git a/Common/Kmer.h b/Common/Kmer.h
index f1a58a3..017f73c 100644
--- a/Common/Kmer.h
+++ b/Common/Kmer.h
@@ -109,13 +109,13 @@ class Kmer
return out << o.str();
}
- private:
- uint8_t shiftAppend(uint8_t base);
- uint8_t shiftPrepend(uint8_t base);
-
uint8_t at(unsigned i) const;
void set(unsigned i, uint8_t base);
+ protected:
+ uint8_t shiftAppend(uint8_t base);
+ uint8_t shiftPrepend(uint8_t base);
+
static uint8_t leftShiftByte(char* pSeq,
unsigned byteNum, unsigned index, uint8_t base);
static uint8_t rightShiftByte(char* pSeq,
diff --git a/Common/Sequence.h b/Common/Sequence.h
index b9283b9..1c3a152 100644
--- a/Common/Sequence.h
+++ b/Common/Sequence.h
@@ -104,10 +104,10 @@ static inline bool ambiguityIsSubset(char a, char b)
* @param maskNew output bases that have been changed or added
* to target in lowercase.
*/
-static inline void overlaySeq(Sequence& overlay, Sequence& target,
+static inline void overlaySeq(const Sequence& overlay, Sequence& target,
int shift, bool maskNew = false)
{
- Sequence::iterator src = overlay.begin();
+ Sequence::const_iterator src = overlay.begin();
Sequence::iterator dest;
if (shift < 0) {
@@ -125,8 +125,9 @@ static inline void overlaySeq(Sequence& overlay, Sequence& target,
for (; src != overlay.end(); ++src, ++dest) {
assert(dest != target.end());
if (maskNew && *src != *dest)
- *src = tolower(*src);
- *dest = *src;
+ *dest = tolower(*src);
+ else
+ *dest = *src;
}
}
diff --git a/DataBase/Makefile.am b/DataBase/Makefile.am
index e4c7cca..6a055ce 100644
--- a/DataBase/Makefile.am
+++ b/DataBase/Makefile.am
@@ -3,7 +3,10 @@ libdb_a_SOURCES = DB.cc DB.h Options.h
libdb_a_CPPFLAGS = -I$(top_srcdir)
libdb_a_LIBADD = $(top_builddir)/Common/libcommon.a
+if HAVE_SQLITE3
bin_PROGRAMS = abyss-db-csv
+endif
+
abyss_db_csv_SOURCES = DB.cc DB.h db-csv.cc
abyss_db_csv_CPPFLAGS = -I$(top_srcdir)
abyss_db_csv_LDADD = -lsqlite3
diff --git a/DataBase/db-csv.cc b/DataBase/db-csv.cc
index 05cac19..fc43c2f 100644
--- a/DataBase/db-csv.cc
+++ b/DataBase/db-csv.cc
@@ -19,8 +19,7 @@ typedef vector<string> vs;
static bool existFile(const char* f)
{
- ifstream file(f);
- return file;
+ return (bool)ifstream(f);
}
template <typename D>
diff --git a/DataLayer/fac.cc b/DataLayer/fac.cc
index f03af8f..fdf3232 100644
--- a/DataLayer/fac.cc
+++ b/DataLayer/fac.cc
@@ -29,8 +29,8 @@ static const char USAGE_MESSAGE[] =
"\n"
" Options:\n"
"\n"
-" -e, --exp-size=N expected genome size. Will calculate NG50\n"
-" and associated stats\n"
+" -G, -e, --genome-size=N expected genome size. Used to calculate NG50\n"
+" and associated stats [disabled]\n"
" -s, -t, --min-length=N ignore sequences shorter than N bp [500]\n"
" -d, --delimiter=S use S for the field delimiter [\\t]\n"
" -j, --jira output JIRA format\n"
@@ -50,7 +50,7 @@ static const char USAGE_MESSAGE[] =
namespace opt {
static unsigned minLength = 500;
- static long long unsigned expSize = 0;
+ static long long unsigned genomeSize;
static string delimiter = "\t";
static int format;
static int verbose;
@@ -58,12 +58,12 @@ namespace opt {
}
enum { TAB, JIRA, MMD };
-static const char shortopts[] = "d:jms:t:e:v";
+static const char shortopts[] = "d:e:G:jms:t:v";
enum { OPT_HELP = 1, OPT_VERSION };
static const struct option longopts[] = {
- { "exp-size", no_argument, NULL, 'e' },
+ { "genome-size", required_argument, NULL, 'G' },
{ "min-length", no_argument, NULL, 's' },
{ "delimiter", required_argument, NULL, 'd' },
{ "jira", no_argument, NULL, 'j' },
@@ -109,7 +109,7 @@ static void printContiguityStatistics(const char* path)
<< "n" << sep
<< "n:" << opt::minLength << sep
<< "L50" << sep;
- if (opt::expSize > 0)
+ if (opt::genomeSize > 0)
cout << "n:NG50" << sep
<< "NG50" << sep;
cout << "min" << sep
@@ -126,7 +126,7 @@ static void printContiguityStatistics(const char* path)
cout << "n" << sep
<< "n:" << opt::minLength << sep
<< "L50" << sep;
- if (opt::expSize > 0)
+ if (opt::genomeSize > 0)
cout << "n:NG50" << sep
<< "NG50" << sep;
cout << "min" << sep
@@ -137,7 +137,7 @@ static void printContiguityStatistics(const char* path)
<< "max" << sep
<< "sum" << sep
<< "name" << '\n';
- if (opt::expSize > 0)
+ if (opt::genomeSize > 0)
cout << "------" << sep
<< "------" << sep;
cout << "------" << sep
@@ -157,7 +157,7 @@ static void printContiguityStatistics(const char* path)
if (opt::format == JIRA)
cout << '|';
printContiguityStats(cout, h, opt::minLength,
- printHeader, opt::delimiter, opt::expSize)
+ printHeader, opt::delimiter, opt::genomeSize)
<< opt::delimiter << path;
if (opt::format == JIRA)
cout << opt::delimiter;
@@ -189,9 +189,14 @@ int main(int argc, char** argv)
opt::delimiter = "\t|";
opt::format = MMD;
break;
+ case 'G':
case 'e':
- arg >> opt::expSize;
- break;
+ {
+ double x;
+ arg >> x;
+ opt::genomeSize = x;
+ break;
+ }
case 's': case 't':
arg >> opt::minLength;
break;
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..dbcf383
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM ubuntu:latest
+MAINTAINER Shaun Jackman <sjackman at gmail.com>
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ make openmpi-bin ssh
+ADD . /tmp/abyss
+RUN apt-get install -y --no-install-recommends \
+ automake g++ libboost-dev libopenmpi-dev libsparsehash-dev \
+ && cd /tmp/abyss \
+ && ./autogen.sh \
+ && mkdir build && cd build \
+ && ../configure --with-mpi=/usr/lib/openmpi \
+ && make install-strip \
+ && rm -rf /tmp/abyss \
+ && apt-get autoremove -y binutils \
+ automake g++ libboost-dev libopenmpi-dev libsparsehash-dev
+ENV SHELL=/bin/bash
+ENTRYPOINT ["abyss-pe"]
+CMD ["help"]
diff --git a/Graph/BreadthFirstSearch.h b/Graph/BreadthFirstSearch.h
index a36a567..40a04f0 100644
--- a/Graph/BreadthFirstSearch.h
+++ b/Graph/BreadthFirstSearch.h
@@ -29,6 +29,9 @@ template <class IncidenceGraph, class Buffer, class BFSVisitor,
typedef color_traits<ColorValue> Color;
typename GTraits::out_edge_iterator ei, ei_end;
+ if (get(color, s) == Color::black())
+ return;
+
put(color, s, Color::gray()); vis.discover_vertex(s, g);
Q.push(s);
while (! Q.empty()) {
diff --git a/Graph/ExtendPath.h b/Graph/ExtendPath.h
index acef178..50d471a 100644
--- a/Graph/ExtendPath.h
+++ b/Graph/ExtendPath.h
@@ -1,3 +1,4 @@
+
#ifndef _EXTENDPATH_H_
#define _EXTENDPATH_H_
@@ -10,22 +11,95 @@
#include <cassert>
#include <cstdio>
#include <iostream>
+#include <algorithm>
+
+/**
+ * Parameters for path extension.
+ */
+struct ExtendPathParams
+{
+ /* ignore branches shorter than or equal to this length */
+ unsigned trimLen;
+ /* maximum length after extension */
+ unsigned maxLen;
+ /*
+ * if true, multiple incoming branches > trimLen
+ * will cause a path extension to halt
+ */
+ bool lookBehind;
+
+ /* constructor */
+ ExtendPathParams() : trimLen(0), maxLen(NO_LIMIT), lookBehind(true) {}
+};
/**
* The result of attempting to extend a path.
*/
enum PathExtensionResult {
+ /** path could not be extended because of a dead end */
DEAD_END,
+ /** path could not be extended because of a branching point */
BRANCHING_POINT,
+ /** path could not be extended because of a cycle */
CYCLE,
+ /** path could not be extended because of caller-specified length limit */
LENGTH_LIMIT,
+ /** path was extended up to a dead end */
EXTENDED_TO_DEAD_END,
+ /** path was extended up to a branching point */
EXTENDED_TO_BRANCHING_POINT,
+ /** path was extended up to a cycle */
EXTENDED_TO_CYCLE,
+ /** path was extended up to caller-specified length limit */
EXTENDED_TO_LENGTH_LIMIT
};
/**
+ * Translate path extension result code to a string.
+ */
+static inline const char* pathExtensionResultStr(PathExtensionResult result)
+{
+ switch(result) {
+ case DEAD_END:
+ return "DEAD_END";
+ case BRANCHING_POINT:
+ return "BRANCHING_POINT";
+ case CYCLE:
+ return "CYCLE";
+ case LENGTH_LIMIT:
+ return "LENGTH_LIMIT";
+ case EXTENDED_TO_DEAD_END:
+ return "EXTENDED_TO_DEAD_END";
+ case EXTENDED_TO_BRANCHING_POINT:
+ return "EXTENDED_TO_BRANCHING_POINT";
+ case EXTENDED_TO_CYCLE:
+ return "EXTENDED_TO_CYCLE";
+ case EXTENDED_TO_LENGTH_LIMIT:
+ return "EXTENDED_TO_LENGTH_LIMIT";
+ default:
+ assert(false);
+ }
+}
+
+/**
+ * Return true if the path extension result code indicates
+ * that the path was successfully extended by one or more nodes.
+ */
+static inline bool pathExtended(PathExtensionResult result)
+{
+ switch(result) {
+ case DEAD_END:
+ case BRANCHING_POINT:
+ case CYCLE:
+ case LENGTH_LIMIT:
+ return false;
+ default:
+ return true;
+ }
+ assert(false);
+}
+
+/**
* The result of attempting to extend a path
* by a single neighbouring vertex.
*/
@@ -36,9 +110,65 @@ enum SingleExtensionResult {
};
/**
+ * Return true if there is a path of at least depthLimit vertices
+ * that extends from given vertex u, otherwise return false.
+ * Implemented using a bounded depth first search.
+ *
+ * @param start starting vertex for traversal
+ * @param dir direction for traversal (FORWARD or REVERSE)
+ * @param depth depth of current vertex u
+ * @param depthLimit maximum depth to probe
+ * @param g graph to use for traversal
+ * @param visited vertices that have already been visited by the DFS
+ * @return true if at least one path of depth >= depthLimit
+ * extends from u in direction dir, false otherwise
+ */
+template <class Graph>
+static inline bool lookAhead(
+ const typename boost::graph_traits<Graph>::vertex_descriptor& u,
+ Direction dir, unsigned depth, unsigned depthLimit,
+ unordered_set< typename boost::graph_traits<Graph>::vertex_descriptor,
+ hash<typename boost::graph_traits<Graph>::vertex_descriptor> >& visited, const Graph& g)
+{
+ typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+ typedef typename boost::graph_traits<Graph>::out_edge_iterator OutEdgeIter;
+ typedef typename boost::graph_traits<Graph>::in_edge_iterator InEdgeIter;
+
+ OutEdgeIter oei, oei_end;
+ InEdgeIter iei, iei_end;
+
+ visited.insert(u);
+ if (depth == depthLimit)
+ return true;
+
+ if (dir == FORWARD) {
+ for (boost::tie(oei, oei_end) = out_edges(u, g);
+ oei != oei_end; ++oei) {
+ const V& v = target(*oei, g);
+ if (visited.find(v) == visited.end()) {
+ if(lookAhead(v, dir, depth+1, depthLimit, visited, g))
+ return true;
+ }
+ }
+ } else {
+ assert(dir == REVERSE);
+ for (boost::tie(iei, iei_end) = in_edges(u, g);
+ iei != iei_end; ++iei) {
+ const V& v = source(*iei, g);
+ if (visited.find(v) == visited.end()) {
+ if(lookAhead(v, dir, depth+1, depthLimit, visited, g))
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/**
* Return true if there is a path of at least 'depth' vertices
* that extends from given vertex v, otherwise return false.
- * Implemented using a bounded breadth first search.
+ * Implemented using a bounded depth first search.
*
* @param start starting vertex for traversal
* @param dir direction for traversal (FORWARD or REVERSE)
@@ -47,39 +177,141 @@ enum SingleExtensionResult {
* @return true if at least one path with length >= len
* extends from v in direction dir, false otherwise
*/
-template <class BidirectionalGraph>
+template <class Graph>
static inline bool lookAhead(
- typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor start,
- Direction dir, unsigned depth, const BidirectionalGraph& g)
+ const typename boost::graph_traits<Graph>::vertex_descriptor& start,
+ Direction dir, unsigned depth, const Graph& g)
+{
+ typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+ unordered_set< V, hash<V> > visited;
+ return lookAhead(start, dir, 0, depth, visited, g);
+}
+
+/**
+ * Return neighbour vertices that begin branches that are longer than trimLen.
+ *
+ * @param u root vertex
+ * @param dir direction for neighbours (FORWARD or REVERSE)
+ * @param g graph
+ * @param trimLen ignore all branches less than or equal to this length
+ * @return std::vector of neighbour vertices that start branches that are
+ * greater than trimLen vertices in length
+ */
+template <class BidirectionalGraph>
+static inline std::vector<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>
+trueBranches(const typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor& u,
+ Direction dir, const BidirectionalGraph& g, unsigned trimLen=0)
+{
+ typedef BidirectionalGraph G;
+ typedef boost::graph_traits<G> graph_traits;
+ typedef typename graph_traits::vertex_descriptor V;
+
+ typename graph_traits::out_edge_iterator oei, oei_end;
+ typename graph_traits::in_edge_iterator iei, iei_end;
+
+ std::vector<V> branchRoots;
+
+ if (dir == FORWARD) {
+ for (boost::tie(oei, oei_end) = out_edges(u, g);
+ oei != oei_end; ++oei) {
+ const V& v = target(*oei, g);
+ if (lookAhead(v, dir, trimLen, g))
+ branchRoots.push_back(v);
+ }
+ } else {
+ assert(dir == REVERSE);
+ for (boost::tie(iei, iei_end) = in_edges(u, g);
+ iei != iei_end; ++iei) {
+ const V& v = source(*iei, g);
+ if (lookAhead(v, dir, trimLen, g)) {
+ branchRoots.push_back(v);
+ }
+ }
+ }
+
+ return branchRoots;
+}
+
+/**
+ * Return the in/out degree of a vertex, disregarding branches
+ * <= trimLen.
+ *
+ * @param u the vertex of interest
+ * @param dir FORWARD for out degree, REVERSE for in degree
+ * @param g the graph
+ * @param trimLen branches less then or equal to this length
+ * are ignored (unless they are the only option)
+ * @return the in/out degree of u, ignoring branches <= trimLen
+ */
+template <typename Graph>
+static inline unsigned trueDegree(
+ const typename boost::graph_traits<Graph>::vertex_descriptor& u,
+ Direction dir, const Graph& g, unsigned trimLen=0)
+{
+ typedef boost::graph_traits<Graph> graph_traits;
+ typedef typename graph_traits::vertex_descriptor V;
+
+ unsigned degree = (dir == FORWARD) ? out_degree(u, g) : in_degree(u, g);
+ if (degree <= 1)
+ return degree;
+
+ std::vector<V> branches = trueBranches(u, dir, g, trimLen);
+ /*
+ * Note: If branches.size() == 0, we know from above that
+ * we must have 2 or more short branches. This situation typically occurs
+ * near coverage gaps, where one of the branches is the correct choice.
+ * (During path extension, our heuristic is to choose the longest branch
+ * and to continue extending.)
+ */
+ if (branches.size() == 0)
+ return 1;
+
+ return branches.size();
+}
+
+/**
+ * Return the depth of the graph from the given source vertex,
+ * i.e. the distance of the furthest node. The depth is measured
+ * by means of an exhaustive breadth first search.
+ *
+ * @param root starting vertex for traversal
+ * @param dir direction for traversal (FORWARD or REVERSE)
+ * @param g graph to use for traversal
+ * @return the distance of the furthest vertex from root
+ */
+template <typename Graph>
+static inline size_t depth(
+ typename boost::graph_traits<Graph>::vertex_descriptor root,
+ Direction dir, const Graph& g)
{
- typedef typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor V;
- typedef typename boost::graph_traits<BidirectionalGraph>::out_edge_iterator OutEdgeIter;
- typedef typename boost::graph_traits<BidirectionalGraph>::in_edge_iterator InEdgeIter;
+ typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+ typedef typename boost::graph_traits<Graph>::out_edge_iterator OutEdgeIter;
+ typedef typename boost::graph_traits<Graph>::in_edge_iterator InEdgeIter;
OutEdgeIter oei, oei_end;
InEdgeIter iei, iei_end;
unordered_set<V, hash<V> > visited;
- typedef unordered_map<V, unsigned> DepthMap;
+ typedef unordered_map<V, size_t> DepthMap;
DepthMap depthMap;
std::deque<V> q;
- q.push_back(start);
+ q.push_back(root);
- visited.insert(start);
+ visited.insert(root);
std::pair<typename DepthMap::iterator, bool> inserted =
- depthMap.insert(std::make_pair(start, 0));
+ depthMap.insert(std::make_pair(root, 0));
assert(inserted.second);
+ size_t maxDepth = 0;
while (!q.empty()) {
- V u = q.front();
- q.pop_front();
+ V& u = q.front();
visited.insert(u);
typename DepthMap::const_iterator it = depthMap.find(u);
assert(it != depthMap.end());
- unsigned uDepth = it->second;
- if (uDepth == depth)
- return true;
+ size_t depth = it->second;
+ if (depth > maxDepth)
+ maxDepth = depth;
if (dir == FORWARD) {
for (boost::tie(oei, oei_end) = out_edges(u, g);
oei != oei_end; ++oei) {
@@ -87,7 +319,7 @@ static inline bool lookAhead(
if (visited.find(v) == visited.end()) {
visited.insert(v);
std::pair<typename DepthMap::iterator, bool> inserted =
- depthMap.insert(std::make_pair(v, uDepth+1));
+ depthMap.insert(std::make_pair(v, depth+1));
assert(inserted.second);
q.push_back(v);
}
@@ -100,50 +332,69 @@ static inline bool lookAhead(
if (visited.find(v) == visited.end()) {
visited.insert(v);
std::pair<typename DepthMap::iterator, bool> inserted =
- depthMap.insert(std::make_pair(v, uDepth+1));
+ depthMap.insert(std::make_pair(v, depth+1));
assert(inserted.second);
q.push_back(v);
}
}
}
+ q.pop_front();
}
- return false;
+ return maxDepth;
}
-template <class BidirectionalGraph>
-static inline std::vector<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>
-trueBranches(typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor& u,
- Direction dir, const BidirectionalGraph& g, unsigned trimLen=0)
+/**
+ * Return the neighbor vertex corresponding to the longest branch. If there
+ * are no neighbour vertices, an assertion will be thrown. If there
+ * is a tie between branch lengths, the "winning" branch is chosen arbitrarily.
+ *
+ * @param u root vertex
+ * @param dir direction of branches to consider (FORWARD or REVERSE)
+ * @param g the graph
+ * @return the vertex at the head of the longest branch
+ */
+template <typename Graph>
+inline static typename boost::graph_traits<Graph>::vertex_descriptor
+longestBranch(const typename boost::graph_traits<Graph>::vertex_descriptor& u,
+ Direction dir, const Graph& g)
{
- typedef BidirectionalGraph G;
- typedef boost::graph_traits<G> graph_traits;
- typedef typename graph_traits::vertex_descriptor V;
-
- typename graph_traits::out_edge_iterator oei, oei_end;
- typename graph_traits::in_edge_iterator iei, iei_end;
-
- std::vector<V> branchRoots;
+ typedef typename boost::graph_traits<Graph>::vertex_descriptor V;
+ typedef typename boost::graph_traits<Graph>::out_edge_iterator OutEdgeIter;
+ typedef typename boost::graph_traits<Graph>::in_edge_iterator InEdgeIter;
+ OutEdgeIter oei, oei_end;
+ InEdgeIter iei, iei_end;
+ size_t maxDepth = 0;
+ unsigned degree = 0;
+ /* note: had to initialize to prevent compiler warnings */
+ V longestBranch = u;
if (dir == FORWARD) {
for (boost::tie(oei, oei_end) = out_edges(u, g);
- oei != oei_end; ++oei) {
+ oei != oei_end; ++oei) {
+ degree++;
const V& v = target(*oei, g);
- if (lookAhead(v, dir, trimLen, g))
- branchRoots.push_back(v);
+ size_t d = depth(v, dir, g);
+ if (d >= maxDepth) {
+ maxDepth = d;
+ longestBranch = v;
+ }
}
} else {
assert(dir == REVERSE);
for (boost::tie(iei, iei_end) = in_edges(u, g);
- iei != iei_end; ++iei) {
+ iei != iei_end; ++iei) {
+ degree++;
const V& v = source(*iei, g);
- if (lookAhead(v, dir, trimLen, g)) {
- branchRoots.push_back(v);
+ size_t d = depth(v, dir, g);
+ if (d >= maxDepth) {
+ maxDepth = d;
+ longestBranch = v;
}
}
}
-
- return branchRoots;
+ assert(degree > 0);
+ return longestBranch;
}
/**
@@ -153,14 +404,13 @@ trueBranches(typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor
* @param path the path to extend (a list of vertices)
* @param dir direction of extension (FORWARD or REVERSE)
* @param g the graph to use for traversal
- * @param trimLen ignore neighbour vertices with branches
- * shorter than this length [0]
+ * @param params parameters controlling extension (e.g. trimLen)
* @return PathExtensionResult: NO_EXTENSION, HIT_BRANCHING_POINT, or EXTENDED
*/
template <class BidirectionalGraph>
static inline SingleExtensionResult extendPathBySingleVertex(
Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
- Direction dir, const BidirectionalGraph& g, unsigned trimLen = 0)
+ Direction dir, const BidirectionalGraph& g, const ExtendPathParams& params)
{
typedef BidirectionalGraph G;
typedef boost::graph_traits<G> graph_traits;
@@ -172,38 +422,76 @@ static inline SingleExtensionResult extendPathBySingleVertex(
assert(dir == FORWARD || dir == REVERSE);
V& u = (dir == FORWARD) ? path.back() : path.front();
- unsigned degree = (dir == FORWARD) ? out_degree(u, g) : in_degree(u, g);
- if (degree == 0) {
+ unsigned outDegree = (dir == FORWARD) ? out_degree(u, g) : in_degree(u, g);
+ if (outDegree == 0) {
return SE_DEAD_END;
- } else if (degree == 1) {
- const V& v = (dir == FORWARD) ?
- target(*(out_edges(u, g).first), g) :
- source(*(in_edges(u, g).first), g);
+ }
+
+ unsigned inDegree = 0;
+ if (params.lookBehind)
+ inDegree = (dir == FORWARD) ? in_degree(u, g) : out_degree(u, g);
+
+ if ((!params.lookBehind || inDegree <= 1) && outDegree == 1) {
if (dir == FORWARD) {
+ const V& v = target(*(out_edges(u, g).first), g);
path.push_back(v);
} else {
assert(dir == REVERSE);
+ const V& v = source(*(in_edges(u, g).first), g);
path.push_front(v);
}
return SE_EXTENDED;
- } else {
- std::vector<V> neighbours = trueBranches(u, dir, g, trimLen);
- if (neighbours.empty()) {
- return SE_DEAD_END;
- } else if (neighbours.size() == 1) {
- if (dir == FORWARD) {
- path.push_back(neighbours.front());
- } else {
- assert(dir == REVERSE);
- path.push_front(neighbours.front());
+ }
+
+ Direction otherDir = (dir == FORWARD) ? REVERSE : FORWARD;
+ std::vector<V> longBranchesOut = trueBranches(u, dir, g, params.trimLen);
+ std::vector<V> longBranchesIn;
+
+ if (params.lookBehind) {
+ longBranchesIn = trueBranches(u, otherDir, g, params.trimLen);
+ /*
+ * Tricky: Make sure the path we are extending
+ * is treated as a valid incoming branch, even if it is less
+ * than trimLen. This can happen if we seeded the path on
+ * an error branch or a branch that has a coverage gap.
+ */
+ if (path.size() > 1) {
+ const V& predecessor = (dir == FORWARD) ?
+ *(path.rbegin() + 1) : *(path.begin() + 1);
+ if (std::find(longBranchesIn.begin(), longBranchesIn.end(),
+ predecessor) == longBranchesIn.end()) {
+ longBranchesIn.push_back(predecessor);
}
- return SE_EXTENDED;
- } else {
- assert(neighbours.size() > 1);
- return SE_BRANCHING_POINT;
}
}
+
+ if ((params.lookBehind && longBranchesIn.size() > 1) ||
+ longBranchesOut.size() > 1)
+ return SE_BRANCHING_POINT;
+
+ if (longBranchesOut.size() == 0) {
+ /*
+ * If we have multiple branches that are shorter
+ * than the trim length then choose the longest one.
+ * (This type of situation usually occurs near
+ * coverage gaps.)
+ */
+ V v = longestBranch(u, dir, g);
+ if (dir == FORWARD)
+ path.push_back(v);
+ else
+ path.push_front(v);
+
+ return SE_EXTENDED;
+ }
+
+ if (dir == FORWARD)
+ path.push_back(longBranchesOut.front());
+ else
+ path.push_front(longBranchesOut.front());
+
+ return SE_EXTENDED;
}
/**
@@ -212,16 +500,18 @@ static inline SingleExtensionResult extendPathBySingleVertex(
* @param path path to extend (modified by this function)
* @param dir direction to extend path (FORWARD or REVERSE)
* @param g graph in which to perform the extension
- * @param trimLen ignore branches less than this length when
- * detecting branch points [0]
+ * @param visited set of previously visited vertices (used
+ * to detect cycles in the de Bruijn graph)
+ * @param params parameters controlling extension (e.g. trimLen)
* @return PathExtensionResult: NO_EXTENSION, HIT_BRANCHING_POINT,
* or EXTENDED.
*/
template <class BidirectionalGraph>
-PathExtensionResult extendPath(
+static inline PathExtensionResult extendPath(
Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
- Direction dir, const BidirectionalGraph& g, unsigned trimLen = 0,
- unsigned maxLen = NO_LIMIT)
+ Direction dir, const BidirectionalGraph& g,
+ unordered_set<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& visited,
+ const ExtendPathParams& params)
{
typedef BidirectionalGraph G;
typedef boost::graph_traits<G> graph_traits;
@@ -229,20 +519,19 @@ PathExtensionResult extendPath(
typename graph_traits::out_edge_iterator oei, oei_end;
typename graph_traits::in_edge_iterator iei, iei_end;
- assert(path.size() > 0 && path.size() <= maxLen);
+ assert(path.size() > 0);
size_t origPathLen = path.size();
- /* track visited nodes to avoid infinite traversal of cycles */
- unordered_set<V> visited;
- visited.insert(path.begin(), path.end());
+ if (path.size() != NO_LIMIT && path.size() >= params.maxLen)
+ return LENGTH_LIMIT;
SingleExtensionResult result = SE_EXTENDED;
bool detectedCycle = false;
while (result == SE_EXTENDED && !detectedCycle &&
- path.size() < maxLen)
+ path.size() < params.maxLen)
{
- result = extendPathBySingleVertex(path, dir, g, trimLen);
+ result = extendPathBySingleVertex(path, dir, g, params);
if (result == SE_EXTENDED) {
std::pair<typename unordered_set<V>::iterator,bool> inserted;
if (dir == FORWARD) {
@@ -275,7 +564,7 @@ PathExtensionResult extendPath(
return EXTENDED_TO_BRANCHING_POINT;
} else {
assert(result == SE_EXTENDED &&
- path.size() == maxLen);
+ path.size() == params.maxLen);
return EXTENDED_TO_LENGTH_LIMIT;
}
} else {
@@ -287,10 +576,53 @@ PathExtensionResult extendPath(
} else if (result == SE_BRANCHING_POINT) {
return BRANCHING_POINT;
} else {
- assert(origPathLen >= maxLen);
+ assert(origPathLen >= params.maxLen);
return LENGTH_LIMIT;
}
}
}
+/**
+ * Extend a path up to the next branching point in the graph.
+ *
+ * @param path path to extend (modified by this function)
+ * @param dir direction to extend path (FORWARD or REVERSE)
+ * @param g graph in which to perform the extension
+ * @param params parameters controlling extension (e.g. trimLen)
+ * @return PathExtensionResult: NO_EXTENSION, HIT_BRANCHING_POINT,
+ * or EXTENDED.
+ */
+template <class BidirectionalGraph>
+PathExtensionResult extendPath(
+ Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
+ Direction dir, const BidirectionalGraph& g, const ExtendPathParams& params)
+{
+ typedef typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor V;
+
+ /* track visited nodes to avoid infinite traversal of cycles */
+ unordered_set<V> visited;
+ visited.insert(path.begin(), path.end());
+
+ return extendPath(path, dir, g, visited, params);
+}
+
+/**
+ * Extend a path up to the next branching point in the graph.
+ *
+ * @param path path to extend (modified by this function)
+ * @param dir direction to extend path (FORWARD or REVERSE)
+ * @param g graph in which to perform the extension
+ * @return PathExtensionResult: NO_EXTENSION, HIT_BRANCHING_POINT,
+ * or EXTENDED.
+ */
+template <class BidirectionalGraph>
+PathExtensionResult extendPath(
+ Path<typename boost::graph_traits<BidirectionalGraph>::vertex_descriptor>& path,
+ Direction dir, const BidirectionalGraph& g)
+{
+ /* default extension params */
+ ExtendPathParams params;
+ return extendPath(path, dir, g, params);
+}
+
#endif
diff --git a/Graph/Path.h b/Graph/Path.h
index 33c893d..2a0abaf 100644
--- a/Graph/Path.h
+++ b/Graph/Path.h
@@ -5,6 +5,7 @@
#include <sstream>
#include <climits>
#include <deque>
+#include <cassert>
enum PathSearchResult {
FOUND_PATH = 0,
@@ -26,6 +27,18 @@ const char* PathSearchResultLabel[] = {
enum Direction { FORWARD = 0, REVERSE };
+inline static const char* directionStr(Direction dir)
+{
+ switch(dir) {
+ case FORWARD:
+ return "FORWARD";
+ case REVERSE:
+ return "REVERSE";
+ default:
+ assert(false);
+ }
+}
+
const unsigned NO_LIMIT = UINT_MAX;
template <class Vertex> class Path : public std::deque<Vertex>
diff --git a/Unittest/Konnector/integration-tests.mk b/IntegrationTest/Konnector/integration-tests.mk
similarity index 94%
rename from Unittest/Konnector/integration-tests.mk
rename to IntegrationTest/Konnector/integration-tests.mk
index 18e9c07..758f445 100755
--- a/Unittest/Konnector/integration-tests.mk
+++ b/IntegrationTest/Konnector/integration-tests.mk
@@ -88,7 +88,7 @@ $(tmpdir)/test_reference.fa: | $(tmpdir)
$(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq: $(tmpdir)/test_reference.fa
wgsim -S 0 -e $* -N $N -r 0 -R 0 $< $(tmpdir)/e$*_1.fq $(tmpdir)/e$*_2.fq
-$(tmpdir)/e%_merged.fa $(tmpdir)/e%_reads_1.fq $(tmpdir)/e%_reads_2.fq: $(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq
+$(tmpdir)/e%_pseudoreads.fa $(tmpdir)/e%_reads_1.fq $(tmpdir)/e%_reads_2.fq: $(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq
/usr/bin/time -v $(konnector) $(k_opts) -b$b -o $(tmpdir)/e$* $(K_OPTS) $^
$(tmpdir)/e%_l2.bloom: $(tmpdir) $(tmpdir)/e%_1.fq $(tmpdir)/e%_2.fq
@@ -113,7 +113,7 @@ $(tmpdir)/e%_reads_1of3.fq \
# run_test
#------------------------------------------------------------
-run_test: $(tmpdir) $(tmpdir)/e$e_merged.fa
+run_test: $(tmpdir) $(tmpdir)/e$e_pseudoreads.fa
@echo '------------------'
@echo '$@: PASSED'
@echo '------------------'
@@ -123,12 +123,12 @@ run_test: $(tmpdir) $(tmpdir)/e$e_merged.fa
#------------------------------------------------------------
save_and_load_test: $(tmpdir)/e$e_l2.bloom \
- $(tmpdir)/e$e_merged.fa \
+ $(tmpdir)/e$e_pseudoreads.fa \
$(tmpdir)/e$e_reads_1.fq \
$(tmpdir)/e$e_reads_2.fq
/usr/bin/time -v $(konnector) $(k_opts) -o $(tmpdir)/e$e_loaded \
-i $(tmpdir)/e$e_l2.bloom $(K_OPTS) $(tmpdir)/e$e_1.fq $(tmpdir)/e$e_2.fq
- diff $(tmpdir)/e$e_merged.fa $(tmpdir)/e$e_loaded_merged.fa
+ diff $(tmpdir)/e$e_pseudoreads.fa $(tmpdir)/e$e_loaded_pseudoreads.fa
diff $(tmpdir)/e$e_reads_1.fq $(tmpdir)/e$e_loaded_reads_1.fq
diff $(tmpdir)/e$e_reads_2.fq $(tmpdir)/e$e_loaded_reads_2.fq
@echo '------------------'
@@ -142,7 +142,7 @@ save_and_load_test: $(tmpdir)/e$e_l2.bloom \
HALF_FASTQ_LINES:=$(shell echo '$N * 2 * 4 / 2' | bc)
interleaved_files_test: $(tmpdir)/e$e_l2.bloom \
- $(tmpdir)/e$e_merged.fa \
+ $(tmpdir)/e$e_pseudoreads.fa \
$(tmpdir)/e$e_interleaved_a.fq \
$(tmpdir)/e$e_interleaved_b.fq
/usr/bin/time -v $(konnector) $(k_opts) -I -b$b \
@@ -150,7 +150,7 @@ interleaved_files_test: $(tmpdir)/e$e_l2.bloom \
$(K_OPTS) \
$(tmpdir)/e$e_interleaved_a.fq \
$(tmpdir)/e$e_interleaved_b.fq
- diff $(tmpdir)/e$e_merged.fa $(tmpdir)/e$e_interleaved_merged.fa
+ diff $(tmpdir)/e$e_pseudoreads.fa $(tmpdir)/e$e_interleaved_pseudoreads.fa
diff $(tmpdir)/e$e_reads_1.fq $(tmpdir)/e$e_interleaved_reads_1.fq
diff $(tmpdir)/e$e_reads_2.fq $(tmpdir)/e$e_interleaved_reads_2.fq
@echo '------------------'
@@ -355,16 +355,16 @@ abyss_bloom_multithreaded_test: $(tmpdir) $(tmpdir)/e$e_1.fq $(tmpdir)/e$e_2.fq
konnector_multithreaded_test: $(tmpdir)/e$e_1.fq $(tmpdir)/e$e_2.fq
/usr/bin/time -v $(konnector) $(k_opts) -o $(tmpdir)/e$e_singlethreaded \
$(K_OPTS) -j1 $^
- cat $(tmpdir)/e$e_singlethreaded_merged.fa | \
+ cat $(tmpdir)/e$e_singlethreaded_pseudoreads.fa | \
paste - - | sort | tr '\t' '\n' \
- > $(tmpdir)/e$e_singlethreaded_merged.sorted.fa
+ > $(tmpdir)/e$e_singlethreaded_pseudoreads.sorted.fa
/usr/bin/time -v $(konnector) $(k_opts) -o $(tmpdir)/e$e_multithreaded \
$(K_OPTS) -j10 $^
- cat $(tmpdir)/e$e_multithreaded_merged.fa | \
+ cat $(tmpdir)/e$e_multithreaded_pseudoreads.fa | \
paste - - | sort | tr '\t' '\n' \
- > $(tmpdir)/e$e_multithreaded_merged.sorted.fa
- diff $(tmpdir)/e$e_singlethreaded_merged.sorted.fa \
- $(tmpdir)/e$e_multithreaded_merged.sorted.fa
+ > $(tmpdir)/e$e_multithreaded_pseudoreads.sorted.fa
+ diff $(tmpdir)/e$e_singlethreaded_pseudoreads.sorted.fa \
+ $(tmpdir)/e$e_multithreaded_pseudoreads.sorted.fa
@echo '------------------'
@echo '$@: PASSED'
@echo '------------------'
diff --git a/Konnector/DBGBloomAlgorithms.h b/Konnector/DBGBloomAlgorithms.h
index c1e7aa6..57f0270 100644
--- a/Konnector/DBGBloomAlgorithms.h
+++ b/Konnector/DBGBloomAlgorithms.h
@@ -36,22 +36,21 @@ static inline Sequence pathToSeq(Path<Kmer> path)
* is no sequence of matches of length numMatchesThreshold,
* use the longest sequence of matching kmers instead.
*
- * The default behaviour of this method is to choose
- * the last kmer in the sequence that is present in the
- * Bloom filter de Bruijn graph.
- *
* @param seq sequence in which to find start kmer
* @param k kmer size
* @param g de Bruijn graph
* @param numMatchesThreshold if we encounter a sequence
* of numMatchesThreshold consecutive kmers in the Bloom filter,
* choose the kmer at the beginning of that sequence
+ * @param anchorToEnd if true, all k-mers from end of sequence
+ * up to the chosen k-mer must be matches. (This option is used when
+ * we wish to preserve the original sequences of the reads.)
* @return position of chosen start kmer
*/
template<typename Graph>
static inline unsigned getStartKmerPos(const Sequence& seq,
unsigned k, Direction dir, const Graph& g,
- unsigned numMatchesThreshold=1)
+ unsigned numMatchesThreshold=1, bool anchorToEnd=false)
{
assert(numMatchesThreshold > 0);
@@ -86,6 +85,8 @@ static inline unsigned getStartKmerPos(const Sequence& seq,
maxMatchPos = i - inc;
maxMatchLen = matchCount;
}
+ if (anchorToEnd)
+ break;
matchCount = 0;
} else {
matchCount++;
diff --git a/Konnector/README.md b/Konnector/README.md
new file mode 100644
index 0000000..e27ad72
--- /dev/null
+++ b/Konnector/README.md
@@ -0,0 +1,176 @@
+---
+title: konnector
+author: Ben Vandervalk, Shaun Jackman, Tony Raymond, Hamid Mohamadi, Justin Chu
+date: 2015-06-30
+header: ABySS
+footer: ABySS
+section: 1
+---
+
+NAME
+====
+
+konnector - merges paired-end sequences by finding connecting paths in the de Bruijn graph
+
+SYNOPSIS
+========
+
+`konnector -k <kmer_size> -o <output_prefix> [options]... <FASTQ> [FASTQ]...`
+
+DESCRIPTION
+===========
+
+Konnector generates long pseudo-reads by finding connecting paths between paired-end reads within the de Bruijn graph. This can be thought of as a targeted de novo assembly in the neighbourhood of the paired-end reads. An additional feature of Konnector is the ability to extend the pseudo-reads and unmerged reads outwards, until it encounters a branching point or dead end in the de Bruijn graph.
+
+Konnector uses a Bloom filter representation of the de Bruijn graph to minimize memory requirements, as described in: Chikhi, Rayan, and Guillaume Rizk. "Space-efficient and exact de Bruijn graph representation based on a Bloom filter." Algorithms for Molecular Biology 8.22 (2013):1.
+
+OPTIONS
+=======
+
+Required Options
+----------------
+```
+-k, --kmer=N the size of a k-mer [required]
+-o, --output-prefix=FILE prefix of output FASTA files [required]
+```
+
+Bloom Filter Options
+--------------------
+```
+-b, --bloom-size=N size of bloom filter [500M]
+-c, --min-coverage=N kmer coverage threshold for error correction [2].
+ This option specifies the number of levels in the
+ cascading Bloom filter; it has no effect if the Bloom
+ filter is loaded from an external file.
+-i, --input-bloom=FILE load bloom filter from FILE; Bloom filter files can
+ be created separately with the 'abyss-bloom' program
+```
+
+Graph Search Limits
+-------------------
+```
+-B, --max-branches=N max branches in de Bruijn graph traversal;
+ use 'nolimit' for no limit [350]
+-f, --min-frag=N min fragment size in base pairs [0]
+-F, --max-frag=N max fragment size in base pairs [1000]
+-P, --max-paths=N merge at most N alternate paths; use 'nolimit'
+ for no limit [2]
+```
+
+Sequence Identity Limits
+------------------------
+```
+-m, --read-mismatches=N max mismatches between paths and reads; use
+ 'nolimit' for no limit [nolimit]
+-M, --max-mismatches=N max mismatches between all alternate paths;
+ use 'nolimit' for no limit [2]
+-x, --read-identity=N min percent seq identity between consensus seq
+ and reads [0]
+-X, --path-identity=N min percent seq identity across alternate
+ connecting paths [0]
+```
+
+Input Options
+-------------
+```
+-q, --trim-quality=N trim bases from the ends of reads whose
+ quality is less than the threshold
+ --standard-quality zero quality is `!' (33), typically
+ for FASTQ and SAM files [default]
+ --illumina-quality zero quality is `@' (64), typically
+ for qseq and export files
+ --chastity discard unchaste reads [default]
+ --no-chastity do not discard unchaste reads
+ --trim-masked trim masked bases from the ends of reads
+ --no-trim-masked do not trim masked bases from the ends
+ of reads [default]
+-I, --interleaved input reads files are interleaved
+```
+
+Output Options
+--------------
+```
+ --fastq output merged reads in FASTQ format
+ (default is FASTA); bases that are corrected
+ or inserted by konnector are assigned a
+ fixed quality score determined by -Q
+-Q, --corrected-qual quality score for bases corrected or inserted
+ by konnector; only relevant when --fastq is
+ in effect [40]
+ --mask mask new and changed bases as lower case
+ --no-mask do not mask bases [default]
+-p, --alt-paths-mode output a separate pseudoread for each alternate
+ path connecting a read pair (default is to create
+ a consensus sequence of all connecting paths).
+ The limit on the number of alternate paths is
+ specified by the '--max-paths' option.
+ The sequence IDs for alternate paths are named:
+ ${orig_read_id}_1, ${orig_read_id}_2, ...
+    --preserve-reads     don't correct any bases within the reads [disabled]
+-v, --verbose display verbose output
+```
+
+Debugging Options
+-----------------
+```
+-d, --dot-file=FILE write graph traversals to a DOT file
+-r, --read-name=STR only process reads with names that contain STR
+-t, --trace-file=FILE write graph search stats to FILE
+```
+
+Sequence Extension Options
+--------------------------
+```
+-D, --dup-bloom-size=N use an additional Bloom filter to avoid
+ assembling the same region of the genome
+ multiple times. This option is highly
+ recommended whenever -E (--extend) is used
+ and has no effect otherwise. As a rule of
+ thumb, the Bloom filter size should be
+ about twice the target genome size [disabled]
+-E, --extend in addition to connecting read pairs,
+ extend the merged reads outwards to the next
+                          dead end or branching point in the de Bruijn
+ graph. For read pairs that were not successfully
+ connected, trim the single-end reads at both ends
+ and extend them independently.
+```
+
+Other Options
+-------------
+```
+-e, --fix-errors find and fix single-base errors when reads
+ have no kmers in bloom filter [disabled]
+-j, --threads=N use N parallel threads [1]
+-n --no-limits disable all limits; equivalent to
+ '-B nolimit -m nolimit -M nolimit -P nolimit'
+ --help display this help and exit
+ --version output version information and exit
+```
+
+OUTPUT FILES
+============
+
+`$PREFIX` in the filenames below is determined by the `-o` option.
+
+Without `--extend`:
+
+ * `$PREFIX_pseudoreads.fa`: Pseudo-reads created by connecting paired-end reads.
+ * `$PREFIX_reads_1.fq`: Read 1 from read pairs that could not be connected.
+ * `$PREFIX_reads_2.fq`: Read 2 from read pairs that could not be connected.
+
+With `--extend`:
+
+  * `$PREFIX_pseudoreads.fa`: Pseudo-reads created by connecting paired-end reads, which may or may not be extended. Also contains single-end reads from read pairs that could not be connected, but which could be trimmed and/or extended.
+ * `$PREFIX_reads_1.fq`: Read 1 from read pairs that could not be connected and which could not be trimmed (because they contain no "good" k-mers).
+ * `$PREFIX_reads_2.fq`: Read 2 from read pairs that could not be connected and which could not be trimmed (because they contain no "good" k-mers).
+
+AUTHORS
+=======
+
+Ben Vandervalk, Shaun Jackman, Tony Raymond, Hamid Mohamadi, Justin Chu.
+
+REPORTING BUGS
+==============
+
+Report bugs to <abyss-users at bcgsc.ca>.
diff --git a/Konnector/konnector.cc b/Konnector/konnector.cc
index 1c8f80d..e63cff1 100644
--- a/Konnector/konnector.cc
+++ b/Konnector/konnector.cc
@@ -25,6 +25,7 @@
#include <getopt.h>
#include <iostream>
#include <cstring>
+#include <algorithm>
#if _OPENMP
# include <omp.h>
@@ -73,6 +74,8 @@ static const char USAGE_MESSAGE[] =
" dead end or branching point in the de Brujin\n"
" graph. If the reads were not successfully\n"
" connected, extend them inwards as well.\n"
+" --fastq output merged reads in FASTQ format\n"
+" (default is FASTA)\n"
" -f, --min-frag=N min fragment size in base pairs [0]\n"
" -F, --max-frag=N max fragment size in base pairs [1000]\n"
" -i, --input-bloom=FILE load bloom filter from FILE\n"
@@ -91,37 +94,48 @@ static const char USAGE_MESSAGE[] =
" -n --no-limits disable all limits; equivalent to\n"
" '-B nolimit -m nolimit -M nolimit -P nolimit'\n"
" -o, --output-prefix=FILE prefix of output FASTA files [required]\n"
+" --preserve-reads don't correct any bases within the reads [disabled]\n"
+" -p, --alt-paths-mode output a separate pseudoread for each alternate\n"
+" path connecting a read pair (default is to create\n"
+" a consensus sequence of all connecting paths)\n"
" -P, --max-paths=N merge at most N alternate paths; use 'nolimit'\n"
" for no limit [2]\n"
" -q, --trim-quality=N trim bases from the ends of reads whose\n"
" quality is less than the threshold\n"
-" --standard-quality zero quality is `!' (33)\n"
-" default for FASTQ and SAM files\n"
-" --illumina-quality zero quality is `@' (64)\n"
-" default for qseq and export files\n"
+" --standard-quality zero quality is `!' (33), typically\n"
+" for FASTQ and SAM files [default]\n"
+" --illumina-quality zero quality is `@' (64), typically\n"
+" for qseq and export files\n"
+" -Q, --corrected-qual quality score for bases corrected or inserted\n"
+" by konnector; only relevant when --fastq is\n"
+" in effect [40]\n"
" -r, --read-name=STR only process reads with names that contain STR\n"
" -s, --search-mem=N mem limit for graph searches; multiply by the\n"
" number of threads (-j) to get the total mem used\n"
" for graph traversal [500M]\n"
" -t, --trace-file=FILE write graph search stats to FILE\n"
" -v, --verbose display verbose output\n"
+" -x, --read-identity=N min percent seq identity between consensus seq\n"
+" and reads [0]\n"
+" -X, --path-identity=N min percent seq identity across alternate\n"
+" connecting paths [0]\n"
" --help display this help and exit\n"
" --version output version information and exit\n"
"\n"
"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
const unsigned g_progressStep = 1000;
-/*
+/**
* ignore branches less than this length
*(false positive branches)
*/
const unsigned g_trimLen = 3;
-
/*
- * Bloom filter use to keep track of portions
+ * Bloom filter to keep track of portions
* of genome that have already been assembled.
- * This Bloom filter is only used when the
- * -E (--extend) option is in effect.
+ * This Bloom filter is only used when both
+ * the --extend and --dup-bloom-size options
+ * are in effect.
*/
BloomFilter g_dupBloom;
@@ -165,6 +179,11 @@ namespace opt {
*/
bool extend = false;
+ /**
+ * Output pseudo-reads in FASTQ format.
+ */
+ bool fastq = false;
+
/** The size of a k-mer. */
unsigned k;
@@ -177,9 +196,26 @@ namespace opt {
/** Bloom filter input file */
static string inputBloomPath;
+ /**
+ * Do not correct bases in input read sequences.
+ */
+ static bool preserveReads = false;
+
+ /**
+ * Output separate sequence for each alternate path
+ * between read pairs
+ */
+ static bool altPathsMode = false;
+
/** Max paths between read 1 and read 2 */
unsigned maxPaths = 2;
+ /**
+ * Quality score for bases that are corrected
+ * or inserted by konnector.
+ */
+ uint8_t correctedQual = 40;
+
/** Prefix for output files */
static string outputPrefix;
@@ -201,6 +237,17 @@ namespace opt {
/** Max mismatches between consensus and original reads */
static unsigned maxReadMismatches = NO_LIMIT;
+ /**
+ * Min percent seq identity between consensus seq
+ * and input reads
+ */
+ static float minReadIdentity = 0.0f;
+
+ /**
+ * Min percent seq identity between all alternate
+ * paths
+ */
+ static float minPathIdentity = 0.0f;
}
/** Counters */
@@ -220,14 +267,13 @@ static struct {
size_t readPairsMerged;
size_t skipped;
/* counts below are used only when -E is enabled */
- size_t mergedAndExtended;
size_t mergedAndSkipped;
- size_t singleEndCorrected;
+ size_t singleEndExtended;
} g_count;
-static const char shortopts[] = "b:B:c:d:D:eEf:F:i:Ij:k:lm:M:no:P:q:r:s:t:v";
+static const char shortopts[] = "b:B:c:d:D:eEf:F:i:Ij:k:lm:M:no:p:P:q:Q:r:s:t:vx:X:";
-enum { OPT_HELP = 1, OPT_VERSION };
+enum { OPT_FASTQ = 1, OPT_HELP, OPT_PRESERVE_READS, OPT_VERSION };
static const struct option longopts[] = {
{ "bloom-size", required_argument, NULL, 'b' },
@@ -253,37 +299,35 @@ static const struct option longopts[] = {
{ "output-prefix", required_argument, NULL, 'o' },
{ "read-mismatches", required_argument, NULL, 'm' },
{ "max-mismatches", required_argument, NULL, 'M' },
+ { "alt-paths-mode", no_argument, NULL, 'p' },
{ "max-paths", required_argument, NULL, 'P' },
{ "trim-quality", required_argument, NULL, 'q' },
+ { "corrected-qual", required_argument, NULL, 'Q' },
{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
{ "read-name", required_argument, NULL, 'r' },
{ "search-mem", required_argument, NULL, 's' },
{ "trace-file", required_argument, NULL, 't' },
{ "verbose", no_argument, NULL, 'v' },
+ { "read-identity", required_argument, NULL, 'x' },
+ { "path-identity", required_argument, NULL, 'X' },
+ { "fastq", no_argument, NULL, OPT_FASTQ },
{ "help", no_argument, NULL, OPT_HELP },
+ { "preserve-reads", no_argument, NULL, OPT_PRESERVE_READS },
{ "version", no_argument, NULL, OPT_VERSION },
{ NULL, 0, NULL, 0 }
};
/**
* Return true if the Bloom filter contains all of the
- * kmers in the given sequence.
+ * "good" kmers in the given sequence.
*/
-static bool bloomContainsSeq(const BloomFilter& bloom, const Sequence& seq)
+static inline bool isSeqRedundant(const BloomFilter& assembledKmers,
+ const BloomFilter& goodKmers, Sequence seq)
{
- if (containsAmbiguityCodes(seq)) {
- Sequence seqCopy = seq;
- flattenAmbiguityCodes(seqCopy, false);
- for (KmerIterator it(seqCopy, opt::k); it != KmerIterator::end();
- ++it) {
- if (!bloom[*it])
- return false;
- }
- return true;
- }
+ flattenAmbiguityCodes(seq, false);
for (KmerIterator it(seq, opt::k); it != KmerIterator::end(); ++it) {
- if (!bloom[*it])
+ if (goodKmers[*it] && !assembledKmers[*it])
return false;
}
return true;
@@ -292,18 +336,133 @@ static bool bloomContainsSeq(const BloomFilter& bloom, const Sequence& seq)
/**
* Load the kmers of a given sequence into a Bloom filter.
*/
-static inline void loadSeq(BloomFilter& bloom, unsigned k, const Sequence& seq)
+static inline void addKmers(BloomFilter& bloom,
+ const BloomFilter& goodKmers, unsigned k,
+ const Sequence& seq)
{
if (containsAmbiguityCodes(seq)) {
- Sequence seqCopy = seq;
- Sequence rc = reverseComplement(seqCopy);
- flattenAmbiguityCodes(seqCopy, false);
- flattenAmbiguityCodes(rc, false);
- Bloom::loadSeq(bloom, k, seqCopy);
- Bloom::loadSeq(bloom, k, rc);
+ Sequence flattened = seq;
+ Sequence rcFlattened = reverseComplement(seq);
+ flattenAmbiguityCodes(flattened, false);
+ flattenAmbiguityCodes(rcFlattened, false);
+ for (KmerIterator it(flattened, k);
+ it != KmerIterator::end();++it) {
+ if (goodKmers[*it])
+ bloom.insert(*it);
+ }
+ for (KmerIterator it(rcFlattened, k);
+ it != KmerIterator::end(); ++it) {
+ if (goodKmers[*it])
+ bloom.insert(*it);
+ }
+ return;
} else {
- Bloom::loadSeq(bloom, k, seq);
+ for (KmerIterator it(seq, k);
+ it != KmerIterator::end(); ++it) {
+ if (goodKmers[*it])
+ bloom.insert(*it);
+ }
+ }
+}
+
+enum ExtendResult { ER_NOT_EXTENDED, ER_REDUNDANT, ER_EXTENDED };
+
+/**
+ * Calculate quality string for a pseudo-read. A base will
+ * have a score of CORRECTED_BASE_QUAL if it was corrected
+ * by konnector or added by konnector (in the gap between
+ * paired-end reads). For bases that are unchanged from the
+ * input reads, the original quality score is used. In the
+ * case that the two input read(s) overlap and both provide
+ * a correct base call, the maximum of the two quality scores
+ * is used.
+ */
+static inline std::string calcQual(const FastqRecord& read1,
+ const FastqRecord& read2, Sequence& merged)
+{
+ unsigned char correctedQual = opt::qualityOffset + opt::correctedQual;
+ std::string qual(merged.length(), correctedQual);
+
+ /*
+ * In the case that the input files are FASTA,
+ * the quality strings for read1 / read2 will be
+ * empty, so just return a uniform quality string.
+ */
+ if (read1.qual.empty() || read2.qual.empty())
+ return qual;
+
+ Sequence r1 = read1.seq, r2 = reverseComplement(read2.seq);
+ std::string r1qual = read1.qual, r2qual = read2.qual;
+ std::reverse(r2qual.begin(), r2qual.end());
+ assert(r1.length() <= merged.length());
+ assert(r2.length() <= merged.length());
+
+ /* region covered only by read 1 */
+ unsigned r2offset = merged.length() - r2.length();
+ for (unsigned r1pos = 0; r1pos < r1.length() && r1pos < r2offset;
+ ++r1pos) {
+ if (r1.at(r1pos) == merged.at(r1pos)) {
+ qual.at(r1pos) = r1qual.at(r1pos);
+ } else {
+ //r1Corrected.at(i) = true;
+ qual.at(r1pos) = correctedQual;
+ }
+ }
+
+ /* region where read 1 and read 2 overlap */
+ for (unsigned r1pos = r2offset; r1pos < r1.length(); ++r1pos) {
+ unsigned r2pos = r1pos - r2offset;
+ if (r1.at(r1pos) != merged.at(r1pos) ||
+ r2.at(r2pos) != merged.at(r1pos)) {
+ qual.at(r1pos) = correctedQual;
+ } else {
+ assert(r1.at(r1pos) == r2.at(r2pos));
+ qual.at(r1pos) = max(r1qual.at(r1pos), r2qual.at(r2pos));
+ }
+ }
+
+ /* region covered only by read 2 */
+ for (unsigned r1pos = max(r2offset, (unsigned)r1.length());
+ r1pos < merged.length(); ++r1pos) {
+ unsigned r2pos = r1pos - r2offset;
+ if (r2.at(r2pos) == merged.at(r1pos)) {
+ qual.at(r1pos) = r2qual.at(r2pos);
+ } else {
+ qual.at(r1pos) = correctedQual;
+ }
+ }
+
+ return qual;
+}
+
+static inline string calcQual(const FastqRecord& orig,
+ const Sequence& extended, unsigned extendedLeft,
+ unsigned extendedRight)
+{
+ assert(extended.length() == orig.seq.length() +
+ extendedLeft + extendedRight);
+
+ unsigned char correctedQual = opt::qualityOffset + opt::correctedQual;
+ string qual(extended.length(), correctedQual);
+
+ /*
+ * In the case that the input files are FASTA,
+ * the quality strings for read1 / read2 will be
+ * empty, so just return a uniform quality string.
+ */
+ if (orig.qual.empty())
+ return qual;
+
+ unsigned offset = extendedLeft;
+ for (unsigned i = 0; i < orig.seq.length(); ++i) {
+ assert(offset + i < extended.length());
+ assert(i < orig.seq.length());
+ assert(i < orig.qual.length());
+ if (orig.seq.at(i) == extended.at(offset + i))
+ qual.at(offset + i) = orig.qual.at(i);
}
+
+ return qual;
}
/**
@@ -318,45 +477,46 @@ static inline void loadSeq(BloomFilter& bloom, unsigned k, const Sequence& seq)
* (or both) directions, false otherwise
*/
template <typename Graph>
-static bool extendRead(Sequence& seq, unsigned k, const Graph& g)
+static bool extendRead(FastqRecord& rec, unsigned k, const Graph& g)
{
- ExtendSeqResult result;
- bool extended = false;
+ unsigned extendedLeft = 0, extendedRight = 0;
+ Sequence extendedSeq = rec.seq;
/*
* offset start pos to reduce chance of hitting
* a dead end on a false positive kmer
*/
const unsigned runLengthHint = 3;
- unsigned startPos = getStartKmerPos(seq, k, FORWARD, g,
+ unsigned startPos = getStartKmerPos(extendedSeq, k, FORWARD, g,
runLengthHint);
if (startPos != NO_MATCH) {
- assert(startPos <= seq.length() - k);
- result = extendSeq(seq, FORWARD, startPos, k, g,
- NO_LIMIT, g_trimLen, opt::mask);
- if (result == ES_EXTENDED_TO_DEAD_END ||
- result == ES_EXTENDED_TO_BRANCHING_POINT ||
- result == ES_EXTENDED_TO_CYCLE) {
- extended = true;
- }
+ assert(startPos <= extendedSeq.length() - k);
+ unsigned lengthBefore = extendedSeq.length();
+ extendSeq(extendedSeq, FORWARD, startPos, k, g,
+ NO_LIMIT, g_trimLen, opt::mask,
+ !opt::altPathsMode, opt::preserveReads);
+ extendedRight = extendedSeq.length() - lengthBefore;
}
- startPos = getStartKmerPos(seq, k, REVERSE, g, runLengthHint);
+ startPos = getStartKmerPos(extendedSeq, k, REVERSE, g, runLengthHint);
if (startPos != NO_MATCH) {
- assert(startPos <= seq.length() - k);
- result = extendSeq(seq, REVERSE, startPos, k, g,
- NO_LIMIT, g_trimLen, opt::mask);
- if (result == ES_EXTENDED_TO_DEAD_END ||
- result == ES_EXTENDED_TO_BRANCHING_POINT ||
- result == ES_EXTENDED_TO_CYCLE) {
- extended = true;
- }
+ assert(startPos <= extendedSeq.length() - k);
+ unsigned lengthBefore = extendedSeq.length();
+ extendSeq(extendedSeq, REVERSE, startPos, k, g,
+ NO_LIMIT, g_trimLen, opt::mask,
+ !opt::altPathsMode, opt::preserveReads);
+ extendedLeft = extendedSeq.length() - lengthBefore;
}
- return extended;
-}
+ if (extendedLeft > 0 || extendedRight > 0) {
+ rec.qual = calcQual(rec, extendedSeq,
+ extendedLeft, extendedRight);
+ rec.seq = extendedSeq;
+ return true;
+ }
-enum ExtendResult { ER_NOT_EXTENDED, ER_REDUNDANT, ER_EXTENDED };
+ return false;
+}
/**
* Attempt to extend a merged read (a.k.a. pseudoread)
@@ -369,11 +529,14 @@ enum ExtendResult { ER_NOT_EXTENDED, ER_REDUNDANT, ER_EXTENDED };
* @return ExtendResult (ER_NOT_EXTENDED, ER_EXTENDED,
* ER_REDUNDANT)
*/
-template <typename Graph>
+template <typename Graph, typename BloomT1, typename BloomT2>
static inline ExtendResult
-extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
+extendReadIfNonRedundant(FastqRecord& seq, BloomT1& assembledKmers,
+ const BloomT2& goodKmers, unsigned k, const Graph& g)
{
+ bool extended = false;
bool redundant = false;
+
if (opt::dupBloomSize > 0) {
/*
* Check to see if the current pseudoread
@@ -381,12 +544,12 @@ extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
* that has already been assembled.
*/
#pragma omp critical(dupBloom)
- redundant = bloomContainsSeq(g_dupBloom, seq);
+ redundant = isSeqRedundant(assembledKmers, goodKmers, seq);
if (redundant)
return ER_REDUNDANT;
}
- Sequence origSeq = seq;
- bool extended = extendRead(seq, k, g);
+ Sequence origSeq = seq.seq;
+ extended = extendRead(seq, k, g);
if (opt::dupBloomSize > 0) {
/*
* mark the extended read as an assembled
@@ -395,8 +558,8 @@ extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
#pragma omp critical(dupBloom)
{
/* must check again to avoid race conditions */
- if (!bloomContainsSeq(g_dupBloom, origSeq))
- loadSeq(g_dupBloom, opt::k, seq);
+ if (!isSeqRedundant(assembledKmers, goodKmers, origSeq))
+ addKmers(assembledKmers, goodKmers, k, seq.seq);
else
redundant = true;
}
@@ -410,6 +573,22 @@ extendReadIfNonRedundant(Sequence& seq, unsigned k, const Graph& g)
return ER_NOT_EXTENDED;
}
+static inline FastqRecord connectingSeq(const FastqRecord& mergedSeq,
+ unsigned startKmerPos, unsigned goalKmerPos)
+{
+ FastqRecord rec;
+
+ unsigned start = startKmerPos;
+ unsigned end = mergedSeq.seq.length() - 1 - goalKmerPos;
+ assert(start <= end);
+
+ rec.id = mergedSeq.id;
+ rec.seq = mergedSeq.seq.substr(start, end - start + 1);
+ rec.qual = mergedSeq.qual.substr(start, end - start + 1);
+
+ return rec;
+}
+
/**
* Print progress stats about reads merged/extended so far.
*/
@@ -419,7 +598,7 @@ static inline void printProgressMessage()
<< g_count.readPairsProcessed << " read pairs";
if (opt::extend) {
- cerr << ", corrected/extended " << g_count.singleEndCorrected << " of "
+ cerr << ", corrected/extended " << g_count.singleEndExtended << " of "
<< (g_count.readPairsProcessed - g_count.uniquePath -
g_count.multiplePaths) * 2
<< " unmerged reads";
@@ -436,34 +615,100 @@ static inline void printProgressMessage()
<< ")\n";
}
-
-/**
- * For a successfully merged read pair, get the sequence
- * representing the connecting path between the two reads.
- */
-template <typename Bloom>
-static inline string getConnectingSeq(ConnectPairsResult& result,
- unsigned k, const Bloom& bloom)
+static inline void updateCounters(const ConnectPairsParams& params,
+ const ConnectPairsResult& result)
{
- assert(result.pathResult == FOUND_PATH);
- (void)bloom;
+ switch (result.pathResult) {
+ case NO_PATH:
+ assert(result.mergedSeqs.empty());
+ if (result.foundStartKmer && result.foundGoalKmer)
+#pragma omp atomic
+ ++g_count.noPath;
+ else
+#pragma omp atomic
+ ++g_count.noStartOrGoalKmer;
+ break;
+
+ case FOUND_PATH:
+ assert(!result.mergedSeqs.empty());
+ if (result.pathMismatches > params.maxPathMismatches ||
+ result.pathIdentity < params.minPathIdentity) {
+#pragma omp atomic
+ ++g_count.tooManyMismatches;
+ } else if (result.readMismatches > params.maxReadMismatches ||
+ result.readIdentity < params.minReadIdentity) {
+#pragma omp atomic
+ ++g_count.tooManyReadMismatches;
+ } else {
+ if (result.mergedSeqs.size() == 1)
+#pragma omp atomic
+ ++g_count.uniquePath;
+ else
+#pragma omp atomic
+ ++g_count.multiplePaths;
+ }
+ break;
+
+ case TOO_MANY_PATHS:
+#pragma omp atomic
+ ++g_count.tooManyPaths;
+ break;
- vector<FastaRecord>& paths = result.mergedSeqs;
- assert(paths.size() > 0);
+ case TOO_MANY_BRANCHES:
+#pragma omp atomic
+ ++g_count.tooManyBranches;
+ break;
- Sequence& seq = (paths.size() == 1) ?
- paths.front().seq : result.consensusSeq.seq;
+ case PATH_CONTAINS_CYCLE:
+#pragma omp atomic
+ ++g_count.containsCycle;
+ break;
- /*
- * initialize sequence to the chars between the
- * start and goal kmers of the path search.
- */
- int startPos = result.startKmerPos;
- int endPos = seq.length() - result.goalKmerPos - k;
- assert(startPos >= 0 && startPos <=
- (int)(seq.length() - k + 1));
+ case EXCEEDED_MEM_LIMIT:
+#pragma omp atomic
+ ++g_count.exceededMemLimit;
+ break;
+ }
+}
- return seq.substr(startPos, endPos - startPos + k);
+static inline void outputRead(const FastqRecord& read, ostream& out,
+ bool fastq = true)
+{
+ if (fastq)
+ out << read;
+ else
+ out << (FastaRecord)read;
+}
+
+static inline bool exceedsMismatchThresholds(const ConnectPairsParams& params,
+ const ConnectPairsResult& result)
+{
+ return (result.pathMismatches > params.maxPathMismatches ||
+ result.pathIdentity < params.minPathIdentity ||
+ result.readMismatches > params.maxReadMismatches ||
+ result.readIdentity < params.minReadIdentity);
+}
+
+/**
+ * Correct and extend an unmerged single-end read.
+ * @return true if the read was modified, false otherwise
+ */
+template <typename Graph, typename BloomT1, typename BloomT2>
+static inline bool correctAndExtend(FastqRecord& read,
+ BloomT1& assembledKmers, const BloomT2& goodKmers,
+ unsigned k, const Graph& g, bool preserveRead=false)
+{
+ bool corrected = false;
+ if (!preserveRead)
+ corrected = trimRead(read, k, g);
+ if (preserveRead || corrected) {
+ ExtendResult extendResult =
+ extendReadIfNonRedundant(read, assembledKmers,
+ goodKmers, k, g);
+ if (extendResult == ER_EXTENDED)
+ return true;
+ }
+ return corrected;
}
/** Connect a read pair. */
@@ -489,15 +734,31 @@ static void connectPair(const Graph& g,
return;
}
+ /* Search for connecting paths between read pair */
+
ConnectPairsResult result =
connectPairs(opt::k, read1, read2, g, params);
- vector<FastaRecord>& paths = result.mergedSeqs;
- bool mergedSeqRedundant = false;
- bool read1Corrected = false;
- bool read1Redundant = false;
- bool read2Corrected = false;
- bool read2Redundant = false;
+ /* Calculate quality strings for merged reads */
+
+ vector<FastqRecord> paths;
+ FastqRecord consensus;
+ if (result.pathResult == FOUND_PATH) {
+ for (unsigned i = 0; i < result.mergedSeqs.size(); ++i) {
+ FastqRecord fastq;
+ fastq.id = result.mergedSeqs.at(i).id;
+ fastq.seq = result.mergedSeqs.at(i).seq;
+ fastq.qual = calcQual(read1, read2, result.mergedSeqs.at(i).seq);
+ paths.push_back(fastq);
+ }
+ consensus.id = result.consensusSeq.id;
+ consensus.seq = result.consensusSeq.seq;
+ consensus.qual = calcQual(read1, read2, result.consensusSeq.seq);
+ }
+
+ bool outputRead1 = false;
+ bool outputRead2 = false;
+ std::vector<bool> pathRedundant;
/*
* extend reads inwards or outwards up to the
@@ -506,58 +767,66 @@ static void connectPair(const Graph& g,
*/
if (opt::extend) {
ExtendResult extendResult;
- if (result.pathResult == FOUND_PATH
- && result.pathMismatches <= params.maxPathMismatches
- && result.readMismatches <= params.maxReadMismatches) {
+ if (result.pathResult == FOUND_PATH &&
+ !exceedsMismatchThresholds(params, result)) {
+ /* we found at least one connecting path */
assert(paths.size() > 0);
- Sequence& seq = (paths.size() == 1) ?
- paths.front().seq : result.consensusSeq.seq;
- seq = getConnectingSeq(result, opt::k, bloom);
- extendResult = extendReadIfNonRedundant(
- seq, opt::k, g);
- if (extendResult == ER_REDUNDANT) {
+ if (opt::altPathsMode) {
+ /* extend each alternate path independently */
+ for (unsigned i = 0; i < paths.size(); ++i) {
+ if (!opt::preserveReads)
+ paths.at(i) = connectingSeq(paths.at(i),
+ result.startKmerPos, result.goalKmerPos);
+ extendResult = extendReadIfNonRedundant(
+ paths.at(i), g_dupBloom, bloom, opt::k, g);
+ pathRedundant.push_back(extendResult == ER_REDUNDANT);
+ }
+ } else {
+ /* extend consensus sequence for all paths */
+ if (!opt::preserveReads)
+ consensus = connectingSeq(consensus,
+ result.startKmerPos, result.goalKmerPos);
+ extendResult = extendReadIfNonRedundant(
+ consensus, g_dupBloom, bloom, opt::k, g);
+ pathRedundant.push_back(extendResult == ER_REDUNDANT);
+ }
+ if (std::find(pathRedundant.begin(), pathRedundant.end(),
+ false) == pathRedundant.end()) {
#pragma omp atomic
g_count.mergedAndSkipped++;
- mergedSeqRedundant = true;
- } else if (extendResult == ER_EXTENDED) {
-#pragma omp atomic
- g_count.mergedAndExtended++;
}
} else {
/*
* read pair could not be merged, so try
- * to extend each read individually (in
+ * to correct and extend each read individually (in
* both directions).
*/
-//std::cerr << "correcting " << read1.id << " (read 1)" << std::endl;
- read1Corrected = correctAndExtendSeq(read1.seq,
- opt::k, g, read1.seq.length(), g_trimLen,
- opt::mask);
-
- if (read1Corrected) {
+ if (correctAndExtend(read1, g_dupBloom, bloom,
+ opt::k, g, opt::preserveReads)) {
+ /* avoid duplicate read IDs */
+ if (!endsWith(read1.id, "/1")) {
+ read1.id.append("/1");
+ read1.comment.clear();
+ }
+ outputRead1 = true;
#pragma omp atomic
- g_count.singleEndCorrected++;
- extendResult = extendReadIfNonRedundant(read1.seq,
- opt::k, g);
- if (extendResult == ER_REDUNDANT)
- read1Redundant = true;
+ g_count.singleEndExtended++;
}
-//std::cerr << "correcting " << read2.id << " (read 2)" << std::endl;
- read2Corrected = correctAndExtendSeq(read2.seq,
- opt::k, g, read2.seq.length(), g_trimLen,
- opt::mask);
-
- if (read2Corrected) {
+ if (correctAndExtend(read2, g_dupBloom, bloom,
+ opt::k, g, opt::preserveReads)) {
+ /* avoid duplicate read IDs */
+ if (!endsWith(read2.id, "/2")) {
+ read2.id.append("/2");
+ read2.comment.clear();
+ }
+ outputRead2 = true;
#pragma omp atomic
- g_count.singleEndCorrected++;
- extendResult = extendReadIfNonRedundant(read2.seq,
- opt::k, g);
- if (extendResult == ER_REDUNDANT)
- read2Redundant = true;
+ g_count.singleEndExtended++;
}
+
}
}
@@ -568,106 +837,42 @@ static void connectPair(const Graph& g,
assert_good(traceStream, opt::tracefilePath);
}
- switch (result.pathResult) {
+ /* update stats regarding merge successes / failures */
- case NO_PATH:
- assert(paths.empty());
- if (result.foundStartKmer && result.foundGoalKmer)
-#pragma omp atomic
- ++g_count.noPath;
- else {
-#pragma omp atomic
- ++g_count.noStartOrGoalKmer;
- }
- break;
+ updateCounters(params, result);
- case FOUND_PATH:
- assert(!paths.empty());
- if (result.pathMismatches > params.maxPathMismatches ||
- result.readMismatches > params.maxReadMismatches) {
- if (result.pathMismatches > params.maxPathMismatches)
-#pragma omp atomic
- ++g_count.tooManyMismatches;
- else
- ++g_count.tooManyReadMismatches;
- if (opt::extend) {
- if (read1Corrected || read2Corrected)
-#pragma omp critical(mergedStream)
- {
- if (read1Corrected && !read1Redundant)
- mergedStream << (FastaRecord)read1;
- if (read2Corrected && !read2Redundant)
- mergedStream << (FastaRecord)read2;
- }
- if (!read1Corrected || !read2Corrected)
-#pragma omp critical(readStream)
- {
- if (!read1Corrected)
- read1Stream << (FastaRecord)read1;
- if (!read2Corrected)
- read1Stream << (FastaRecord)read2;
- }
- } else
-#pragma omp critical(readStream)
- {
- read1Stream << read1;
- read2Stream << read2;
- }
- }
- else if (paths.size() > 1) {
-#pragma omp atomic
- ++g_count.multiplePaths;
- if (!mergedSeqRedundant)
+	/* output merged / unmerged reads */
+
+ if (result.pathResult == FOUND_PATH &&
+ !exceedsMismatchThresholds(params, result)) {
+ assert(!paths.empty());
+ if (opt::altPathsMode) {
#pragma omp critical(mergedStream)
- mergedStream << result.consensusSeq;
+ for (unsigned i = 0; i < paths.size(); ++i) {
+ if (opt::dupBloomSize == 0 || !pathRedundant.at(i))
+ outputRead(paths.at(i), mergedStream, opt::fastq);
}
- else {
-#pragma omp atomic
- ++g_count.uniquePath;
- if (!mergedSeqRedundant)
+ } else if (opt::dupBloomSize == 0 || !pathRedundant.front()) {
#pragma omp critical(mergedStream)
- mergedStream << paths.front();
- }
- break;
-
- case TOO_MANY_PATHS:
-#pragma omp atomic
- ++g_count.tooManyPaths;
- break;
-
- case TOO_MANY_BRANCHES:
-#pragma omp atomic
- ++g_count.tooManyBranches;
- break;
-
- case PATH_CONTAINS_CYCLE:
-#pragma omp atomic
- ++g_count.containsCycle;
- break;
-
- case EXCEEDED_MEM_LIMIT:
-#pragma omp atomic
- ++g_count.exceededMemLimit;
- break;
- }
-
- if (result.pathResult != FOUND_PATH) {
+ outputRead(consensus, mergedStream, opt::fastq);
+ }
+ } else {
if (opt::extend) {
- if (read1Corrected || read2Corrected)
+ if (outputRead1 || outputRead2)
#pragma omp critical(mergedStream)
{
- if (read1Corrected && !read1Redundant)
- mergedStream << (FastaRecord)read1;
- if (read2Corrected && !read2Redundant)
- mergedStream << (FastaRecord)read2;
+ if (outputRead1)
+ outputRead(read1, mergedStream, opt::fastq);
+ if (outputRead2)
+ outputRead(read2, mergedStream, opt::fastq);
}
- if (!read1Corrected || !read2Corrected)
+ if (!outputRead1 || !outputRead2)
#pragma omp critical(readStream)
{
- if (!read1Corrected)
- read1Stream << (FastaRecord)read1;
- if (!read2Corrected)
- read1Stream << (FastaRecord)read2;
+ if (!outputRead1)
+ read1Stream << read1;
+ if (!outputRead2)
+ read2Stream << read2;
}
} else
#pragma omp critical(readStream)
@@ -784,21 +989,33 @@ int main(int argc, char** argv)
setMaxOption(opt::maxMismatches, arg); break;
case 'o':
arg >> opt::outputPrefix; break;
+ case 'p':
+ opt::altPathsMode = true; break;
case 'P':
setMaxOption(opt::maxPaths, arg); break;
case 'q':
arg >> opt::qualityThreshold; break;
+ case 'Q':
+ arg >> opt::correctedQual; break;
case 'r':
arg >> opt::readName; break;
case 's':
opt::searchMem = SIToBytes(arg); break;
case 't':
arg >> opt::tracefilePath; break;
+ case 'x':
+ arg >> opt::minReadIdentity; break;
+ case 'X':
+ arg >> opt::minPathIdentity; break;
case 'v':
opt::verbose++; break;
+ case OPT_FASTQ:
+ opt::fastq = true; break;
case OPT_HELP:
cout << USAGE_MESSAGE;
exit(EXIT_SUCCESS);
+ case OPT_PRESERVE_READS:
+ opt::preserveReads = true; break;
case OPT_VERSION:
cout << VERSION_MESSAGE;
exit(EXIT_SUCCESS);
@@ -848,6 +1065,15 @@ int main(int argc, char** argv)
seqanTests();
#endif
+ /*
+ * We need to set a default quality score offset
+ * in order to generate quality scores
+ * for bases that are corrected/inserted by
+ * konnector (--fastq option).
+ */
+ if (opt::qualityOffset == 0)
+ opt::qualityOffset = 33;
+
assert(opt::bloomSize > 0);
if (opt::dupBloomSize > 0)
@@ -925,10 +1151,11 @@ int main(int argc, char** argv)
*/
string mergedOutputPath(opt::outputPrefix);
- if (opt::extend)
- mergedOutputPath.append("_pseudoreads.fa");
+ mergedOutputPath.append("_pseudoreads");
+ if (opt::fastq)
+ mergedOutputPath.append(".fq");
else
- mergedOutputPath.append("_merged.fa");
+ mergedOutputPath.append(".fa");
ofstream mergedStream(mergedOutputPath.c_str());
assert_good(mergedStream, mergedOutputPath);
@@ -939,18 +1166,12 @@ int main(int argc, char** argv)
*/
string read1OutputPath(opt::outputPrefix);
- if (opt::extend)
- read1OutputPath.append("_reads_1.fa");
- else
- read1OutputPath.append("_reads_1.fq");
+ read1OutputPath.append("_reads_1.fq");
ofstream read1Stream(read1OutputPath.c_str());
assert_good(read1Stream, read1OutputPath);
string read2OutputPath(opt::outputPrefix);
- if (opt::extend)
- read2OutputPath.append("_reads_2.fa");
- else
- read2OutputPath.append("_reads_2.fq");
+ read2OutputPath.append("_reads_2.fq");
ofstream read2Stream(read2OutputPath.c_str());
assert_good(read2Stream, read2OutputPath);
@@ -964,10 +1185,13 @@ int main(int argc, char** argv)
params.maxPaths = opt::maxPaths;
params.maxBranches = opt::maxBranches;
params.maxPathMismatches = opt::maxMismatches;
+ params.minPathIdentity = opt::minPathIdentity;
params.maxReadMismatches = opt::maxReadMismatches;
+ params.minReadIdentity = opt::minReadIdentity;
params.kmerMatchesThreshold = 3;
params.fixErrors = opt::fixErrors;
params.maskBases = opt::mask;
+ params.preserveReads = opt::preserveReads;
params.memLimit = opt::searchMem;
params.dotPath = opt::dotPath;
params.dotStream = opt::dotPath.empty() ? NULL : &dotStream;
@@ -1037,9 +1261,9 @@ int main(int argc, char** argv)
<< "%)\n";
if (opt::extend) {
cerr << "Unmerged reads corrected/extended: "
- << g_count.singleEndCorrected
+ << g_count.singleEndExtended
<< " (" << setprecision(3) << (float)100
- * g_count.singleEndCorrected / ((g_count.readPairsProcessed -
+ * g_count.singleEndExtended / ((g_count.readPairsProcessed -
g_count.uniquePath - g_count.multiplePaths) * 2)
<< "%)\n";
}
diff --git a/Konnector/konnector.h b/Konnector/konnector.h
index e78fac6..2325af9 100644
--- a/Konnector/konnector.h
+++ b/Konnector/konnector.h
@@ -26,7 +26,16 @@ struct ConnectPairsResult
unsigned k;
std::string readNamePrefix;
PathSearchResult pathResult;
+ /** alternate connecting sequence(s) for read pair */
+ std::vector<Sequence> connectingSeqs;
+ /** read pairs joined with alternate connecting sequence(s) */
std::vector<FastaRecord> mergedSeqs;
+ /** consensus sequence for alternate connecting sequences */
+ Sequence consensusConnectingSeq;
+ /**
+ * consensus sequence for read pairs joined by
+ * alternate connecting sequences
+ */
FastaRecord consensusSeq;
bool foundStartKmer;
bool foundGoalKmer;
@@ -37,7 +46,9 @@ struct ConnectPairsResult
unsigned maxDepthVisitedForward;
unsigned maxDepthVisitedReverse;
unsigned pathMismatches;
+ float pathIdentity;
unsigned readMismatches;
+ float readIdentity;
size_t memUsage;
ConnectPairsResult() :
@@ -52,7 +63,9 @@ struct ConnectPairsResult
maxDepthVisitedForward(0),
maxDepthVisitedReverse(0),
pathMismatches(0),
+ pathIdentity(0.0f),
readMismatches(0),
+ readIdentity(0.0f),
memUsage(0)
{}
@@ -70,7 +83,9 @@ struct ConnectPairsResult
<< "max_depth_forward" << "\t"
<< "max_depth_reverse" << "\t"
<< "path_mismatches" << "\t"
+ << "path_identity" << "\t"
<< "read_mismatches" << "\t"
+ << "read_identity" << "\t"
<< "mem_usage" << "\n";
return out;
}
@@ -105,7 +120,9 @@ struct ConnectPairsResult
<< o.maxDepthVisitedForward << "\t"
<< o.maxDepthVisitedReverse << "\t"
<< o.pathMismatches << "\t"
+ << std::setprecision(3) << o.pathIdentity << "\t"
<< o.readMismatches << "\t"
+ << std::setprecision(3) << o.readIdentity << "\t"
<< o.memUsage << "\n";
return out;
@@ -119,10 +136,13 @@ struct ConnectPairsParams {
unsigned maxPaths;
unsigned maxBranches;
unsigned maxPathMismatches;
+ float minPathIdentity;
unsigned maxReadMismatches;
+ float minReadIdentity;
unsigned kmerMatchesThreshold;
bool fixErrors;
bool maskBases;
+ bool preserveReads;
size_t memLimit;
std::string dotPath;
std::ofstream* dotStream;
@@ -133,10 +153,13 @@ struct ConnectPairsParams {
maxPaths(NO_LIMIT),
maxBranches(NO_LIMIT),
maxPathMismatches(NO_LIMIT),
+ minPathIdentity(0.0f),
maxReadMismatches(NO_LIMIT),
+ minReadIdentity(0.0f),
kmerMatchesThreshold(1),
fixErrors(false),
maskBases(false),
+ preserveReads(false),
memLimit(std::numeric_limits<std::size_t>::max()),
dotStream(NULL)
{}
@@ -230,10 +253,10 @@ static inline ConnectPairsResult connectPairs(
const unsigned numMatchesThreshold = 3;
unsigned startKmerPos = getStartKmerPos(read1, k, FORWARD, g,
- numMatchesThreshold);
+ numMatchesThreshold, params.preserveReads);
unsigned goalKmerPos = getStartKmerPos(read2, k, FORWARD, g,
- numMatchesThreshold);
+ numMatchesThreshold, params.preserveReads);
const FastaRecord* pRead1 = &read1;
const FastaRecord* pRead2 = &read2;
@@ -298,48 +321,101 @@ static inline ConnectPairsResult connectPairs(
result.maxDepthVisitedReverse = visitor.getMaxDepthVisited(REVERSE);
result.memUsage = visitor.approxMemUsage();
- // write traversal graph to dot file (-d option)
-
if (result.pathResult == FOUND_PATH) {
- // build sequences for connecting paths
-
- std::string seqPrefix = pRead1->seq.substr(0, startKmerPos);
- std::string seqSuffix = reverseComplement(pRead2->seq.substr(0, goalKmerPos));
- for (unsigned i = 0; i < paths.size(); i++) {
- FastaRecord mergedSeq;
- std::stringstream index;
- index << i;
- assert(index);
- mergedSeq.id = result.readNamePrefix + "_" + index.str();
- mergedSeq.seq = seqPrefix + pathToSeq(paths[i]) + seqSuffix;
- result.mergedSeqs.push_back(mergedSeq);
+ /* build sequences for connecting paths */
+
+ std::string seqPrefix, seqSuffix;
+
+ if (params.preserveReads) {
+ seqPrefix = pRead1->seq;
+ seqSuffix = reverseComplement(pRead2->seq);
+ unsigned trimLeft = pRead1->seq.length() - startKmerPos;
+ unsigned trimRight = pRead2->seq.length() - goalKmerPos;
+ for (unsigned i = 0; i < paths.size(); i++) {
+ Sequence connectingSeq = pathToSeq(paths[i]);
+ /*
+ * If the input reads overlap, we must fail because
+ * there's no way to preserve the original read
+ * sequences in the merged read (the reads may disagree
+ * in the region of overlap)
+ */
+ if (trimLeft + trimRight > connectingSeq.length()) {
+ result.pathResult = NO_PATH;
+ return result;
+ }
+ connectingSeq = connectingSeq.substr(trimLeft,
+ connectingSeq.length() - trimLeft - trimRight);
+ result.connectingSeqs.push_back(connectingSeq);
+ }
+ } else {
+ seqPrefix = pRead1->seq.substr(0, startKmerPos);
+ seqSuffix = reverseComplement(pRead2->seq.substr(0, goalKmerPos));
+ for (unsigned i = 0; i < paths.size(); i++)
+ result.connectingSeqs.push_back(pathToSeq(paths[i]));
}
- // calc consensus seq and mismatch stats
+ unsigned readPairLength = read1.seq.length() + read2.seq.length();
if (paths.size() == 1) {
+ /* found a unique path between the reads */
+
+ FastaRecord mergedSeq;
+ mergedSeq.id = result.readNamePrefix;
+ mergedSeq.seq = seqPrefix + result.connectingSeqs.front() + seqSuffix;
result.readMismatches =
- maskNew(read1, read2, result.mergedSeqs.front(), params.maskBases);
+ maskNew(read1, read2, mergedSeq, params.maskBases);
+ result.pathIdentity = 100.0f;
+ result.readIdentity = 100.0f * (float)(readPairLength -
+ result.readMismatches) / readPairLength;
+
+ result.mergedSeqs.push_back(mergedSeq);
+ result.consensusSeq = mergedSeq;
+ result.consensusConnectingSeq = result.connectingSeqs.front();
} else {
+ /*
+ * multiple paths were found, so build a consensus
+ * sequence using multiple sequence alignment.
+ */
+
NWAlignment aln;
unsigned matches, size;
- boost::tie(matches, size) = align(result.mergedSeqs, aln);
+ boost::tie(matches, size) = align(result.connectingSeqs, aln);
assert(size >= matches);
result.pathMismatches = size - matches;
-
+ result.consensusConnectingSeq = aln.match_align;
+ result.pathIdentity = 100.0f *
+ (float)(result.consensusConnectingSeq.length()
+ - result.pathMismatches) / result.consensusConnectingSeq.length();
result.consensusSeq.id = result.readNamePrefix;
- result.consensusSeq.seq = aln.match_align;
+ result.consensusSeq.seq = seqPrefix + result.consensusConnectingSeq +
+ seqSuffix;
result.readMismatches =
maskNew(read1, read2, result.consensusSeq, params.maskBases);
+ result.readIdentity = 100.0f * (float)(readPairLength -
+ result.readMismatches) / readPairLength;
+
+ unsigned i = 1;
+ for (std::vector<Sequence>::iterator it = result.connectingSeqs.begin();
+ it != result.connectingSeqs.end(); ++it) {
+ FastaRecord mergedSeq;
+ std::ostringstream id;
+ id << result.readNamePrefix << '_' << i++;
+ mergedSeq.id = id.str();
+ mergedSeq.seq = seqPrefix + *it + seqSuffix;
+ result.mergedSeqs.push_back(mergedSeq);
+ }
}
+ assert(result.connectingSeqs.size() == result.mergedSeqs.size());
}
+ /* write traversal graph to dot file (-d option) */
+
if (!params.dotPath.empty()) {
HashGraph<Kmer> traversalGraph;
visitor.getTraversalGraph(traversalGraph);
@@ -369,7 +445,8 @@ static inline Kmer getHeadKmer(const Sequence& seq, Direction dir,
template <typename Graph>
static inline bool extendSeqThroughBubble(Sequence& seq,
Direction dir, unsigned startKmerPos, unsigned k,
- const Graph& g, unsigned trimLen=0, bool maskNew=false)
+ const Graph& g, unsigned trimLen=0, bool maskNew=false,
+ bool preserveSeq=false)
{
assert(seq.length() >= k);
assert(dir == FORWARD || dir == REVERSE);
@@ -388,6 +465,11 @@ static inline bool extendSeqThroughBubble(Sequence& seq,
return false;
}
+ std::string headKmer = seq.substr(startKmerPos, k);
+ if (headKmer.find_first_not_of("AGCTagct") !=
+ std::string::npos)
+ return false;
+
Kmer head(seq.substr(startKmerPos, k));
std::vector<Kmer> buds = trueBranches(head, dir, g, trimLen);
@@ -406,8 +488,14 @@ static inline bool extendSeqThroughBubble(Sequence& seq,
path1.push_back(head);
path2.push_back(head);
}
- extendPath(path1, dir, g, trimLen, k+2);
- extendPath(path2, dir, g, trimLen, k+2);
+
+ ExtendPathParams params;
+ params.trimLen = trimLen;
+ params.maxLen = k + 2;
+ params.lookBehind = true;
+
+ extendPath(path1, dir, g, params);
+ extendPath(path2, dir, g, params);
/* paths lengths not k+1 -- not a simple bubble */
if (path1.size() != k+2 || path2.size() != k+2)
@@ -432,10 +520,33 @@ static inline bool extendSeqThroughBubble(Sequence& seq,
Sequence& consensus = alignment.match_align;
if (dir == FORWARD) {
- overlaySeq(consensus, seq, startKmerPos, maskNew);
+ if (preserveSeq) {
+ /*
+ * make sure bubble extends beyond end of
+ * original sequence
+ */
+ assert(startKmerPos + consensus.length()
+ > seq.length());
+ overlaySeq(consensus.substr(seq.length() - startKmerPos),
+ seq, seq.length(), maskNew);
+ } else {
+ overlaySeq(consensus, seq, startKmerPos, maskNew);
+ }
} else {
- overlaySeq(consensus, seq,
- -consensus.length() + startKmerPos + k, maskNew);
+ if (preserveSeq) {
+ /*
+ * make sure bubble extends beyond end of
+ * original sequence
+ */
+ assert(consensus.length() > startKmerPos + k);
+ consensus = consensus.substr(0,
+ consensus.length() - startKmerPos - k);
+ overlaySeq(consensus, seq, -consensus.length(),
+ maskNew);
+ } else {
+ overlaySeq(consensus, seq,
+ -consensus.length() + startKmerPos + k, maskNew);
+ }
}
return true;
@@ -530,7 +641,8 @@ template <typename Graph>
static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
unsigned startKmerPos, unsigned k, const Graph& g,
unsigned maxLen=NO_LIMIT, unsigned trimLen=0,
- bool maskNew=false)
+ bool maskNew=false, bool popBubbles=true,
+ bool preserveSeq=false)
{
if (seq.length() < k)
return ES_NO_START_KMER;
@@ -587,8 +699,12 @@ static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
(int)(maxLen - startKmerPos - k + 1));
}
- pathResult = extendPath(path, FORWARD, g, trimLen,
- maxPathLen);
+ ExtendPathParams params;
+ params.trimLen = trimLen;
+ params.maxLen = maxPathLen;
+ params.lookBehind = false;
+
+ pathResult = extendPath(path, FORWARD, g, params);
/*
 * give up if we don't extend beyond end
@@ -622,35 +738,46 @@ static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
pathResult == EXTENDED_TO_LENGTH_LIMIT))
{
std::string pathSeq = pathToSeq(path);
- overlaySeq(pathSeq, seq, seq.length() - k + 1, maskNew);
+ if (preserveSeq)
+ overlaySeq(pathSeq.substr(k), seq,
+ seq.length(), maskNew);
+ else
+ overlaySeq(pathSeq, seq,
+ seq.length() - k + 1, maskNew);
}
/*
* extend through simple bubbles
*/
done = true;
- if (seq.length() < maxLen &&
+ if (popBubbles && seq.length() < maxLen &&
(pathResult == BRANCHING_POINT ||
pathResult == EXTENDED_TO_BRANCHING_POINT)) {
startKmerPos = startKmerPos + path.size() - 1;
assert(startKmerPos < seq.length() - k + 1);
if (extendSeqThroughBubble(seq, FORWARD, startKmerPos,
- k, g, trimLen, maskNew)) {
+ k, g, trimLen, maskNew, preserveSeq)) {
/* make sure we don't exceed extension limit */
if (seq.length() > maxLen)
seq = seq.substr(0, maxLen);
/* check for cycle */
- Path<Kmer> bubblePath = seqToPath(seq.substr(startKmerPos), k);
- for (Path<Kmer>::iterator it = bubblePath.begin();
- it != bubblePath.end(); ++it) {
- if (visited.containsKmer(*it)) {
+ for (unsigned i = startKmerPos + 1;
+ i < seq.length() - k + 1; ++i) {
+ std::string kmerStr = seq.substr(i, k);
+ size_t pos = kmerStr.find_first_not_of("AGCTagct");
+ if (pos != std::string::npos) {
+ i += pos;
+ continue;
+ }
+ Kmer kmer(kmerStr);
+ if (visited.containsKmer(kmer)) {
pathResult = EXTENDED_TO_CYCLE;
- bubblePath.erase(it, bubblePath.end());
+ seq.erase(i);
break;
}
- visited.addKmer(*it);
+ visited.addKmer(kmer);
}
/* set up for another round of extension */
@@ -715,23 +842,15 @@ static inline ExtendSeqResult extendSeq(Sequence& seq, Direction dir,
return result;
}
-/**
- * Correct the given sequence using the Bloom filter de Bruijn
- * graph. The correction is performed by finding the longest
- * stretch of good kmers in the sequence and extending that
- * region both left and right.
- */
template <typename Graph>
-static inline bool correctAndExtendSeq(Sequence& seq,
- unsigned k, const Graph& g, unsigned maxLen=NO_LIMIT,
- unsigned trimLen=0, bool maskNew=false)
+static inline bool trimRead(FastqRecord& read,
+ unsigned k, const Graph& g)
{
+ Sequence& seq = read.seq;
+
if (seq.size() < k)
return false;
- if (maxLen < seq.length())
- maxLen = seq.length();
-
/*
* find longest stretch of contiguous kmers
* in de Bruijn graph
@@ -772,22 +891,8 @@ static inline bool correctAndExtendSeq(Sequence& seq,
assert(maxMatchStart != UNSET);
assert(maxMatchLen > 0);
- unsigned maxMatchSeqLen = maxMatchLen+k-1;
- unsigned seedSeqLen = std::min(2*k-1, maxMatchSeqLen);
-
- Sequence correctedSeq = seq.substr(
- maxMatchStart + maxMatchSeqLen - seedSeqLen,
- std::string::npos);
-
- extendSeq(correctedSeq, REVERSE, correctedSeq.length()-k, k, g, 2*k,
- trimLen, maskNew);
- if (correctedSeq.length() < 2*k)
- return false;
-
- correctedSeq = correctedSeq.substr(0, k);
- extendSeq(correctedSeq, FORWARD, 0, k, g, 2*k+1, trimLen, maskNew);
-
- seq = correctedSeq;
+ read.seq = read.seq.substr(maxMatchStart, maxMatchLen);
+ read.qual = read.qual.substr(maxMatchStart, maxMatchLen);
return true;
}
diff --git a/LICENSE b/LICENSE
index f13b0d6..542147f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,24 +1,22 @@
ABySS
-Copyright 2014 Canada's Michael Smith Genome Sciences Centre
+Copyright 2016 British Columbia Cancer Agency Branch
-You may use, redistribute and modify this software for non-commercial
-purposes under the terms of the GNU General Public License as
-published by the Free Software Foundation, either version 3 of the
-License, or (at your option) any later version.
-
-To license ABySS for commercial purposes, please contact
-Patrick Rebstein <prebstein at bccancer.bc.ca>
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-The complete text of the GNU General Public License version 3 follows
-and is also available from the Free Software Foundation web site:
-http://www.gnu.org/licenses/
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+For commercial licensing options, please contact
+Patrick Rebstein <prebstein at bccancer.bc.ca>
-See the file `COPYRIGHT` for details of the copyright and license of
+See the file COPYRIGHT for details of the copyright and license of
each individual file included with this software.
GNU GENERAL PUBLIC LICENSE
diff --git a/LogKmerCount/CountingBloomFilter.h b/LogKmerCount/CountingBloomFilter.h
index ee09ae0..3dbfea7 100644
--- a/LogKmerCount/CountingBloomFilter.h
+++ b/LogKmerCount/CountingBloomFilter.h
@@ -6,9 +6,9 @@
#define COUNTINGBLOOMFILTER_H 1
#include "Bloom/Bloom.h"
-#include <vector>
-#include <math.h>
#include <cassert>
+#include <cmath>
+#include <vector>
/** A counting Bloom filter. */
template<typename NumericType>
diff --git a/LogKmerCount/plc.h b/LogKmerCount/plc.h
index 07fe2c1..b07a020 100644
--- a/LogKmerCount/plc.h
+++ b/LogKmerCount/plc.h
@@ -5,10 +5,10 @@
* Exponent = 5 bits
* Copyright 2014 bcgsc
*/
-#include <stdint.h>
-#include <stdlib.h>
-#include <math.h>
+#include <cmath>
+#include <cstdlib>
#include <iostream>
+#include <stdint.h>
using namespace std;
diff --git a/Makefile.am b/Makefile.am
index 6314e11..190a219 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -12,9 +12,13 @@ dist_doc_DATA = \
CITATION.bib CITATION.md \
COPYRIGHT \
LICENSE \
- README.css README.html README.md
+ README.md
-EXTRA_DIST=doxygen.conf
+if HAVE_PANDOC
+dist_doc_DATA += README.html
+endif
+
+EXTRA_DIST=autogen.sh doxygen.conf
SUBDIRS = \
bin \
@@ -29,6 +33,7 @@ SUBDIRS = \
Align \
ABYSS $(Parallel) \
Bloom \
+ BloomDBG \
Konnector \
Consensus \
DAssembler \
@@ -51,11 +56,13 @@ SUBDIRS = \
GapFiller \
Sealer \
AdjList \
+ lib/bloomfilter \
+ lib/rolling-hash \
$(GTest) \
$(UnitTest)
-%.html: %.md
- -multimarkdown $< >$@
+%.html: $(srcdir)/%.md
+ -pandoc -s -o $@ $<
clean-local:
rm -f README.html
diff --git a/ParseAligns/abyss-fixmate.cc b/ParseAligns/abyss-fixmate.cc
index 0c97f92..39eee77 100644
--- a/ParseAligns/abyss-fixmate.cc
+++ b/ParseAligns/abyss-fixmate.cc
@@ -17,7 +17,6 @@
#include <boost/unordered_map.hpp>
#include "DataBase/Options.h"
#include "DataBase/DB.h"
-#include <math.h>
using namespace std;
diff --git a/README.css b/README.css
deleted file mode 100644
index 2ed6ef2..0000000
--- a/README.css
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Font */
-body { font: 12pt Georgia, Palatino, Times, serif; }
-h1, h2, h3, h4 {
- font-family: Verdana, Helvetica, Arial, sans-serif;
- font-weight: normal;
-}
-h1 { font-size: 18pt; }
-h2, h3, h4 { font-size: 14pt; }
-a { text-decoration: none; }
-code { font: 12pt Courier, monospace; }
-pre { font: 12pt Courier, monospace; }
-
-/* Colour and border */
-a {
- color: #222222;
- border-bottom: 1pt dashed #888888;
-}
-a:hover {
- color: #ffffff;
- background: #222222;
-}
-pre {
- background-color: #dddddd;
- border: #777777 1pt solid;
-}
-
-/* Layout */
-p {
- text-align: justify;
- min-width: 18pc;
- max-width: 42pc;
-}
-pre {
- word-wrap: break-word;
- max-width: 42pc;
- margin: 1pc;
- padding-left: 1pc;
- padding-right: 1pc;
-}
diff --git a/README.md b/README.md
index ed06ef3..4765389 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,3 @@
-Title: ABySS README
-Author: Shaun Jackman, Anthony Raymond
-Affiliation: Canada's Michael Smith Genome Sciences Centre
-CSS: README.css
-
ABySS
=====
@@ -22,6 +17,7 @@ Contents
* [Assembling multiple libraries](#assembling-multiple-libraries)
* [Scaffolding](#scaffolding)
* [Rescaffolding with long sequences](#rescaffolding-with-long-sequences)
+* [Assembling using a Bloom filter de Bruijn graph](#assembling-using-a-bloom-filter-de-bruijn-graph)
* [Assembling using a paired de Bruijn graph](#assembling-using-a-paired-de-bruijn-graph)
* [Assembling a strand-specific RNA-Seq library](#assembling-a-strand-specific-rna-seq-library)
* [Optimizing the parameter k](#optimizing-the-parameter-k)
@@ -69,9 +65,10 @@ Dependencies
ABySS requires the following libraries:
-* [Boost](http://www.boost.org)
-* [sparsehash](http://code.google.com/p/sparsehash)
+* [Boost](http://www.boost.org/)
* [Open MPI](http://www.open-mpi.org)
+* [sparsehash](https://code.google.com/p/sparsehash/)
+* [SQLite](http://www.sqlite.org/)
ABySS requires a C++ compiler that supports
[OpenMP](http://www.openmp.org) such as [GCC](http://gcc.gnu.org).
@@ -137,6 +134,11 @@ usage, although it will build without. sparsehash should be found in
./configure CPPFLAGS=-I/usr/local/include
+If SQLite is installed in non-default directories, its location can be
+specified to `configure`:
+
+ ./configure --with-sqlite=/opt/sqlite3
+
The default maximum k-mer size is 64 and may be decreased to reduce
memory usage or increased at compile time. This value must be a
multiple of 32 (i.e. 32, 64, 96, 128, etc):
@@ -187,22 +189,23 @@ single-end assembly must be well over the fragment-size to obtain an
accurate empirical distribution.
Here's an example scenario of assembling a data set with two different
-fragment libraries and single-end reads:
+fragment libraries and single-end reads. Note that the names of the libraries
+(`pea` and `peb`) are arbitrary.
- * Library `pe200` has reads in two files,
- `pe200_1.fa` and `pe200_2.fa`.
- * Library `pe500` has reads in two files,
- `pe500_1.fa` and `pe500_2.fa`.
+ * Library `pea` has reads in two files,
+ `pea_1.fa` and `pea_2.fa`.
+ * Library `peb` has reads in two files,
+ `peb_1.fa` and `peb_2.fa`.
* Single-end reads are stored in two files, `se1.fa` and `se2.fa`.
The command line to assemble this example data set is:
- abyss-pe k=64 name=ecoli lib='pe200 pe500' \
- pe200='pe200_1.fa pe200_2.fa' pe500='pe500_1.fa pe500_2.fa' \
+ abyss-pe k=64 name=ecoli lib='pea peb' \
+ pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
se='se1.fa se2.fa'
The empirical distribution of fragment sizes will be stored in two
-files named `pe200-3.hist` and `pe500-3.hist`. These files may be
+files named `pea-3.hist` and `peb-3.hist`. These files may be
plotted to check that the empirical distribution agrees with the
expected distribution. The assembled contigs will be stored in
`${name}-contigs.fa`.
@@ -214,11 +217,12 @@ Long-distance mate-pair libraries may be used to scaffold an assembly.
Specify the names of the mate-pair libraries using the parameter `mp`.
The scaffolds will be stored in the file `${name}-scaffolds.fa`.
Here's an example of assembling a data set with two paired-end
-libraries and two mate-pair libraries:
+libraries and two mate-pair libraries. Note that the names of the libraries
+(`pea`, `peb`, `mpa`, `mpb`) are arbitrary.
- abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' \
- pe1='pe1_1.fa pe1_2.fa' pe2='pe2_1.fa pe2_2.fa' \
- mp1='mp1_1.fa mp1_2.fa' mp2='mp2_1.fa mp2_2.fa'
+ abyss-pe k=64 name=ecoli lib='pea peb' mp='mpc mpd' \
+ pea='pea_1.fa pea_2.fa' peb='peb_1.fa peb_2.fa' \
+ mpc='mpc_1.fa mpc_2.fa' mpd='mpd_1.fa mpd_2.fa'
The mate-pair libraries are used only for scaffolding and do not
contribute towards the consensus sequence.
@@ -233,12 +237,32 @@ can be linked unambiguously when considering all BWA-MEM alignments.
Similar to scaffolding, the names of the datasets can be specified with
the `long` parameter. These scaffolds will be stored in the file
-`${name}-trans-scaffs.fa`. The following is an example of an assembly with PET, MPET and an RNA-Seq assembly:
+`${name}-trans-scaffs.fa`. The following is an example of an assembly with PET, MPET and an RNA-Seq assembly. Note that the names of the libraries are arbitrary.
- abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long=long1 \
+ abyss-pe k=64 name=ecoli lib='pe1 pe2' mp='mp1 mp2' long='longa' \
pe1='pe1_1.fa pe1_2.fa' pe2='pe2_1.fa pe2_2.fa' \
mp1='mp1_1.fa mp1_2.fa' mp2='mp2_1.fa mp2_2.fa' \
- long1=long1.fa
+ longa='longa.fa'
+
+Assembling using a Bloom filter de Bruijn graph
+=========================================
+
+Assemblies may be performed using a _Bloom filter de Bruijn graph_, which
+typically reduces memory requirements by an order of magnitude. In order to
+assemble in Bloom filter mode, the user must specify 3 additional parameters:
+`B` (Bloom filter size), `H` (number of Bloom filter hash functions), and `kc`
+(minimum k-mer count threshold). Valid size units for the `B` parameter are 'k',
+'M', 'G'. If no unit is specified, bytes are assumed. For example, the following
+will run an E. coli assembly with a Bloom filter size of 100 MB, 3 hash
+functions, a minimum k-mer count threshold of 3, and verbose logging:
+
+ abyss-pe name=ecoli k=64 in='reads1.fa reads2.fa' B=100M H=3 kc=3 v=-v
+
+At the current time, the user must calculate suitable values for `B` and `H` on
+their own, and finding the best value for `kc` may require experimentation
+(optimal values are typically in the range of 2-4). Users are recommended to
+target a Bloom filter false positive rate (FPR) that is less than 5%, as
+reported by the assembly log when using the `v=-v` option (verbose level 1).
Assembling using a paired de Bruijn graph
=========================================
@@ -265,7 +289,7 @@ Assembling a strand-specific RNA-Seq library
============================================
Strand-specific RNA-Seq libraries can be assembled such that the
-resulting unitigs, conitgs and scaffolds are oriented correctly with
+resulting unitigs, contigs and scaffolds are oriented correctly with
respect to the original transcripts that were sequenced. In order to
run ABySS in strand-specific mode, the `SS` parameter must be used as
in the following example:
@@ -281,18 +305,17 @@ Optimizing the parameter k
To find the optimal value of `k`, run multiple assemblies and inspect
the assembly contiguity statistics. The following shell snippet will
-assemble for every value of `k` from 20 to 40.
+assemble for every eighth value of `k` from 50 to 90.
- export k
- for k in {20..40}; do
+ for k in `seq 50 8 90`; do
mkdir k$k
- abyss-pe -C k$k name=ecoli in=../reads.fa
+ abyss-pe -C k$k name=ecoli k=$k in=../reads.fa
done
abyss-fac k*/ecoli-contigs.fa
-The default maximum value for `k` is 64. This limit may be changed at
+The default maximum value for `k` is 96. This limit may be changed at
compile time using the `--enable-maxk` option of configure. It may be
-decreased to 32 to decrease memory usage or increased to 96.
+decreased to 32 to decrease memory usage or increased to larger values.
Parallel processing
===================
@@ -321,12 +344,11 @@ ABySS integrates well with cluster job schedulers, such as:
* Load Sharing Facility (LSF)
* IBM LoadLeveler
-For example, to submit an array of jobs to assemble every odd value of
-`k` between 51 and 63 using 64 processes for each job:
+For example, to submit an array of jobs to assemble every eighth value of
+`k` between 50 and 90 using 64 processes for each job:
- mkdir k{51..63}
- qsub -N ecoli -pe openmpi 64 -t 51-63:2 \
- <<<'abyss-pe -C k$SGE_TASK_ID in=/data/reads.fa'
+ qsub -N ecoli -pe openmpi 64 -t 50-90:8 \
+ <<<'mkdir k$SGE_TASK_ID && abyss-pe -C k$SGE_TASK_ID in=/data/reads.fa'
Using the DIDA alignment framework
=================================
@@ -346,34 +368,41 @@ Assembly Parameters
Parameters of the driver script, `abyss-pe`
* `a`: maximum number of branches of a bubble [`2`]
- * `b`: maximum length of a bubble (bp) [`10000`]
+ * `b`: maximum length of a bubble (bp) [`""`]
+ * `B`: Bloom filter size (e.g. "100M")
* `c`: minimum mean k-mer coverage of a unitig [`sqrt(median)`]
* `d`: allowable error of a distance estimate (bp) [`6`]
- * `e`: minimum erosion k-mer coverage [`sqrt(median)`]
- * `E`: minimum erosion k-mer coverage per strand [`1`]
+ * `e`: minimum erosion k-mer coverage [`round(sqrt(median))`]
+ * `E`: minimum erosion k-mer coverage per strand [1 if sqrt(median) > 2 else 0]
+ * `G`: genome size, used to calculate NG50 [disabled]
+ * `H`: number of Bloom filter hash functions [1]
* `j`: number of threads [`2`]
* `k`: size of k-mer (when `K` is not set) or the span of a k-mer pair (when `K` is set)
+ * `kc`: minimum k-mer count threshold for Bloom filter assembly [`2`]
* `K`: the length of a single k-mer in a k-mer pair (bp)
- * `l`: minimum alignment length of a read (bp) [`k`]
+ * `l`: minimum alignment length of a read (bp) [`40`]
* `m`: minimum overlap of two unitigs (bp) [`30`]
* `n`: minimum number of pairs required for building contigs [`10`]
* `N`: minimum number of pairs required for building scaffolds [`n`]
+ * `np`: number of MPI processes [`1`]
* `p`: minimum sequence identity of a bubble [`0.9`]
* `q`: minimum base quality [`3`]
- * `s`: minimum unitig size required for building contigs (bp) [`200`]
- * `S`: minimum contig size required for building scaffolds (bp) [`s`]
- * `t`: minimum tip size (bp) [`2k`]
+ * `s`: minimum unitig size required for building contigs (bp) [`1000`]
+ * `S`: minimum contig size required for building scaffolds (bp) [`1000-10000`]
+ * `t`: maximum length of blunt contigs to trim [`k`]
* `v`: use `v=-v` for verbose logging, `v=-vv` for extra verbose [`disabled`]
+ * `x`: spaced seed (Bloom filter assembly only)
Please see the
[abyss-pe](http://manpages.ubuntu.com/abyss-pe.1.html)
manual page for more information on assembly parameters.
-Possibly, `abyss-pe` parameters can have same names as existing environment variables'. The parameters then cannot be used until the environment variables are unset. To detect such occasions, run the command:
+Environment variables
+=====================
- abyss-pe env [options]
+`abyss-pe` configuration variables may be set on the command line or from the environment, for example with `export k=20`. It can happen that `abyss-pe` picks up such variables from your environment that you had not intended, and that can cause trouble. To troubleshoot that situation, use the `abyss-pe env` command to print the values of all the `abyss-pe` configuration variables:
-Above command will report all `abyss-pe` parameters that are set from various origins. However it will not operate ABySS programs.
+ abyss-pe env [options]
ABySS programs
==============
@@ -408,23 +437,12 @@ ABySS programs
* `abyss-scaffold`: scaffold contigs using distance estimates
* `abyss-todot`: convert graph formats and merge graphs
-For a flowchart showing the relationship between these programs,
-see doc/flowchart.pdf.
+This [flowchart](https://github.com/bcgsc/abyss/blob/master/doc/flowchart.pdf) shows the ABySS assembly pipeline and its intermediate files.
Export to SQLite Database
=========================
-ABySS has a built-in support for SQLite database. With this option activated, it exports log values into a SQLite file and/or `.csv` files at runtime.
-
-## Activating the functionality
-
-Download SQLite [here](http://www.sqlite.org/download.html) and install. (See [Quick Start](#quick-start) for details)
-
-To compile ABySS with SQLite, add configure flag `--with-sqlite` to the steps in [Compiling ABySS from GiHub](#compiling-abyss-from-github) / [Compiling ABySS from source](#compiling-abyss-from-source).
-
- ./configure [other options] --with-sqlite=/path/to/sqlite3/
- make
- sudo make install
+ABySS has built-in support for a SQLite database to export log values into a SQLite file and/or `.csv` files at runtime.
## Database parameters
Of `abyss-pe`:
@@ -433,9 +451,9 @@ Of `abyss-pe`:
* `strain`: name of strain to archive [ ]
* `library`: name of library to archive [ ]
-For example, to export data of species 'Ecoli', strain 'O121' and library 'pe200' into your SQLite database repository named '/abyss/test.sqlite':
+For example, to export data of species 'Ecoli', strain 'O121' and library 'pea' into your SQLite database repository named '/abyss/test.sqlite':
- abyss-pe db=/abyss/test.sqlite species=Ecoli strain=O121 library=pe200 [other options]
+ abyss-pe db=/abyss/test.sqlite species=Ecoli strain=O121 library=pea [other options]
## Helper programs
Found in your `path`:
@@ -461,7 +479,7 @@ Publications
## [ABySS](http://genome.cshlp.org/content/19/6/1117)
Simpson, Jared T., Kim Wong, Shaun D. Jackman, Jacqueline E. Schein,
-Steven JM Jones, and İnanç Birol.
+Steven JM Jones, and Inanc Birol.
**ABySS: a parallel assembler for short read sequence data**.
*Genome research* 19, no. 6 (2009): 1117-1123.
[doi:10.1101/gr.089532.108](http://dx.doi.org/10.1101/gr.089532.108)
@@ -488,6 +506,8 @@ Support
[Ask a question](https://www.biostars.org/p/new/post/?tag_val=abyss,assembly)
on [Biostars](https://www.biostars.org/t/abyss/).
+[Create a new issue](https://github.com/bcgsc/abyss/issues) on GitHub.
+
Subscribe to the
[ABySS mailing list]
(http://groups.google.com/group/abyss-users),
@@ -501,15 +521,11 @@ For questions related to transcriptome assembly, contact the
Authors
=======
-- **[Shaun Jackman](http://sjackman.ca)**
- — [GitHub/sjackman](https://github.com/sjackman)
- — [@sjackman](https://twitter.com/sjackman)
-- **Tony Raymond** — [GitHub/traymond](https://github.com/traymond)
-- **Ben Vandervalk** — [GitHub/benvvalk ](https://github.com/benvvalk)
-- **Jared Simpson** — [GitHub/jts](https://github.com/jts)
-
-Supervised by [**Dr. İnanç Birol**](http://www.bcgsc.ca/faculty/inanc-birol).
++ **[Shaun Jackman](http://sjackman.ca)** - [GitHub/sjackman](https://github.com/sjackman) - [@sjackman](https://twitter.com/sjackman)
++ **Tony Raymond** - [GitHub/traymond](https://github.com/traymond)
++ **Ben Vandervalk** - [GitHub/benvvalk ](https://github.com/benvvalk)
++ **Jared Simpson** - [GitHub/jts](https://github.com/jts)
-Copyright 2014 Canada's Michael Smith Genome Sciences Centre
+Supervised by [**Dr. Inanc Birol**](http://www.bcgsc.ca/faculty/inanc-birol).
-[![githalytics.com](https://cruel-carlota.pagodabox.com/af4811df3b40b7d096f6085db2969f0e "githalytics.com")](http://githalytics.com/sjackman/abyss)
+Copyright 2016 Canada's Michael Smith Genome Sciences Centre
diff --git a/Scaffold/drawgraph.cc b/Scaffold/drawgraph.cc
index 6d39178..6b155e4 100644
--- a/Scaffold/drawgraph.cc
+++ b/Scaffold/drawgraph.cc
@@ -293,7 +293,7 @@ int main(int argc, char** argv)
for (tie(uit, ulast) = vertices(g); uit != ulast; ++uit) {
V u = *uit;
size_t ui = get(vertex_index, g, u);
- double x1 = isnan(b[ui]) ? 0 : b[ui];
+ double x1 = std::isnan(b[ui]) ? 0 : b[ui];
sorted.push_back(std::make_pair(x1, u));
}
sort(sorted.begin(), sorted.end());
diff --git a/Scaffold/scaffold.cc b/Scaffold/scaffold.cc
index d1ef9f0..e021b0f 100644
--- a/Scaffold/scaffold.cc
+++ b/Scaffold/scaffold.cc
@@ -58,6 +58,8 @@ static const char USAGE_MESSAGE[] =
" or -s N0-N1 Find the value of s in [N0,N1]\n"
" that maximizes the scaffold N50.\n"
" -k, --kmer=N length of a k-mer\n"
+" -G, --genome-size=N expected genome size. Used to calculate NG50\n"
+" and associated stats [disabled]\n"
" --min-gap=N minimum scaffold gap length to output [50]\n"
" --max-gap=N maximum scaffold gap length to output [inf]\n"
" --complex remove complex transitive edges\n"
@@ -89,6 +91,9 @@ namespace opt {
static unsigned minContigLength = 200;
static unsigned minContigLengthEnd;
+ /** Genome size. Used to calculate NG50. */
+ static long long unsigned genomeSize;
+
/** Minimum scaffold gap length to output. */
static int minGap = 50;
@@ -115,7 +120,7 @@ namespace opt {
static int comp_trans;
}
-static const char shortopts[] = "g:k:n:o:s:v";
+static const char shortopts[] = "G:g:k:n:o:s:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
@@ -124,6 +129,7 @@ enum { OPT_HELP = 1, OPT_VERSION, OPT_MIN_GAP, OPT_MAX_GAP, OPT_COMP,
static const struct option longopts[] = {
{ "graph", no_argument, NULL, 'g' },
{ "kmer", required_argument, NULL, 'k' },
+ { "genome-size", required_argument, NULL, 'G' },
{ "min-gap", required_argument, NULL, OPT_MIN_GAP },
{ "max-gap", required_argument, NULL, OPT_MAX_GAP },
{ "npairs", required_argument, NULL, 'n' },
@@ -736,7 +742,7 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
static bool printHeader = true;
Histogram h = buildScaffoldLengthHistogram(g, paths);
printContiguityStats(cerr, h, STATS_MIN_LENGTH,
- printHeader)
+ printHeader, "\t", opt::genomeSize)
<< "\ts=" << minContigLength << '\n';
if (opt::verbose == 0)
printHeader = false;
@@ -765,7 +771,9 @@ unsigned scaffold(const Graph& g0, unsigned minContigLength,
// Print assembly contiguity statistics.
Histogram h = buildScaffoldLengthHistogram(g, paths);
- printContiguityStats(cerr, h, STATS_MIN_LENGTH) << '\n';
+ printContiguityStats(cerr, h, STATS_MIN_LENGTH,
+ true, "\t", opt::genomeSize)
+ << "\ts=" << minContigLength << '\n';
addCntgStatsToDb(h, STATS_MIN_LENGTH);
return h.trimLow(STATS_MIN_LENGTH).n50();
}
@@ -787,6 +795,13 @@ int main(int argc, char** argv)
case 'k':
arg >> opt::k;
break;
+ case 'G':
+ {
+ double x;
+ arg >> x;
+ opt::genomeSize = x;
+ break;
+ }
case 'g':
arg >> opt::graphPath;
break;
diff --git a/Sealer/Makefile.am b/Sealer/Makefile.am
index 30704e1..eca0b31 100644
--- a/Sealer/Makefile.am
+++ b/Sealer/Makefile.am
@@ -26,4 +26,4 @@ abyss_sealer_SOURCES = sealer.cc \
# Convert the README.md to a man page using Pandoc
abyss-sealer.1: README.md
- -pandoc -s -o $@ $<
+ pandoc -s -o $@ $<
diff --git a/Sealer/README.md b/Sealer/README.md
index 190aa86..d1aab26 100644
--- a/Sealer/README.md
+++ b/Sealer/README.md
@@ -15,11 +15,11 @@ abyss-sealer - Close gaps within scaffolds
Synopsis
================================================================================
-`abyss-sealer -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...`
+`abyss-sealer -b <Bloom filter size> -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...`
For example:
-`abyss-sealer -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa`
+`abyss-sealer -b20G -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa`
Description
===========
@@ -34,7 +34,7 @@ See ABySS installation instructions.
How to run as stand-alone application
=====================================
-`abyss-sealer [-k values...] [-o outputprefix] [-S assembly file] [options...] [reads...]`
+`abyss-sealer [-b bloom filter size] [-k values...] [-o outputprefix] [-S assembly file] [options...] [reads...]`
Sealer requires the following information to run:
- draft assembly
@@ -47,13 +47,17 @@ Sample commands
Without pre-built bloom filters:
-`abyss-sealer -k90 -k80 -o run1 -S test.fa read1.fa.gz read2.fa.gz`
+`abyss-sealer -b20G -k90 -k80 -o run1 -S test.fa read1.fa.gz read2.fa.gz`
With pre-built bloom filters:
`abyss-sealer -k90 -k80 -o run1 -S test.fa -i k90.bloom -i k80.bloom read1.fa.gz read2.fa.gz`
-Note: when using pre-built bloom filters, Sealer must be compiled with the same `maxk` value that the bloom filter was built with. For example, if a bloom filter was built with a `maxk`of 64, Sealer must be compiled with a `maxk` of 64 as well. If different values are used between the pre-built bloom filter and Sealer, any sequences generated will be nonsensical and incorrect.
+Reusable Bloom filters can be pre-built with `abyss-bloom build`, e.g.:
+
+`abyss-bloom build -vv -k90 -j12 -b20G -l2 k90.bloom read1.fa.gz read2.fa.gz`
+
+Note: when using pre-built bloom filters generated by `abyss-bloom build`, Sealer must be compiled with the same `maxk` value that `abyss-bloom` was compiled with. For example, if a Bloom filter was built with a `maxk` of 64, Sealer must be compiled with a `maxk` of 64 as well. If different values are used between the pre-built bloom filter and Sealer, any sequences generated will be nonsensical and incorrect.
Suggested parameters for first run
==================================
@@ -141,7 +145,7 @@ Parameters of `abyss-sealer`
* `-D`,`--flank-distance=N`: distance of flank from gap [0]
* `-j`,`--threads=N`: use N parallel threads [1]
* `-k`,`--kmer=N`: the size of a k-mer
-* `-b`,`--bloom-size=N`: size of bloom filter [500M]
+* `-b`,`--bloom-size=N`: size of bloom filter. Required when not using pre-built Bloom filter(s).
* `-B`,`--max-branches=N`: max branches in de Bruijn graph traversal; use 'nolimit' for no limit [1000]
* `-d`,`--dot-file=FILE`: write graph traversals to a DOT file
* `-e`,`--fix-errors`: find and fix single-base errors when reads have no kmers in bloom filter [disabled]
diff --git a/Sealer/sealer.cc b/Sealer/sealer.cc
index 420deeb..9c439a7 100644
--- a/Sealer/sealer.cc
+++ b/Sealer/sealer.cc
@@ -60,10 +60,10 @@ PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
"Copyright 2014 Canada's Michael Smith Genome Science Centre\n";
static const char USAGE_MESSAGE[] =
-"Usage: " PROGRAM " -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...\n"
-"i.e. abyss-sealer -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa\n\n"
+"Usage: " PROGRAM " -b <Bloom filter size> -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...\n"
+"i.e. abyss-sealer -b20G -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa\n\n"
"Close gaps by using left and right flanking sequences of gaps as 'reads' for Konnector\n"
-"and performing multiple runs with each of the supplied K values..\n"
+"and performing multiple runs with each of the supplied K values.\n"
"\n"
" Options:\n"
"\n"
@@ -71,16 +71,18 @@ static const char USAGE_MESSAGE[] =
" -S, --input-scaffold=FILE load scaffold from FILE\n"
" -L, --flank-length=N length of flanks to be used as pseudoreads [100]\n"
" -D, --flank-distance=N distance of flank from gap [0]\n"
+" -G, --max-gap-length=N max gap size to fill in bp [800]; runtime increases\n"
+" exponentially with respect to this parameter\n"
" -j, --threads=N use N parallel threads [1]\n"
" -k, --kmer=N the size of a k-mer\n"
-" -b, --bloom-size=N size of bloom filter [500M]\n"
+" -b, --bloom-size=N size of Bloom filter (e.g. '40G'). Required\n"
+" when not using pre-built Bloom filter(s)\n"
+" (-i option)\n"
" -B, --max-branches=N max branches in de Bruijn graph traversal;\n"
" use 'nolimit' for no limit [1000]\n"
" -d, --dot-file=FILE write graph traversals to a DOT file\n"
" -e, --fix-errors find and fix single-base errors when reads\n"
" have no kmers in bloom filter [disabled]\n"
-" -f, --min-frag=N min fragment size in base pairs [0]\n"
-" -F, --max-frag=N max fragment size in base pairs [1000]\n"
" -i, --input-bloom=FILE load bloom filter from FILE\n"
" --mask mask new and changed bases as lower case\n"
" --no-mask do not mask bases [default]\n"
@@ -113,6 +115,22 @@ static const char USAGE_MESSAGE[] =
" --help display this help and exit\n"
" --version output version information and exit\n"
"\n"
+" Deprecated Options:\n"
+"\n"
+" -f, --min-frag=N min fragment size in base pairs\n"
+" -F, --max-frag=N max fragment size in base pairs\n"
+"\n"
+" Note: --max-frag was formerly used to determine the maximum gap\n"
+" size that abyss-sealer would attempt to close, according to the formula\n"
+" max_gap_size = max_frag - 2 * flank_length, where flank_length is\n"
+"   determined by the -L option. --max-frag is kept only for backwards\n"
+"   compatibility and is superseded by the more intuitive -G (--max-gap-length)\n"
+" option. Similarly, --min-frag determines the minimum gap size to close,\n"
+" according to the formula min_gap_size = min_frag - 2 * flank_length, where\n"
+" a negative gap size indicates an overlap between gap flanks. Normally the\n"
+" user would not want to specify a minimum gap size and so it is recommended to\n"
+" leave --min-frag unset.\n"
+"\n"
"Report bugs to <" PACKAGE_BUGREPORT ">.\n";
namespace opt {
@@ -123,6 +141,9 @@ namespace opt {
/** Distance of flank from gap. */
unsigned flankDistance = 0;
+ /** Max gap size to fill */
+ unsigned maxGapLength = 800;
+
/** scaffold file input. */
static string inputScaffold;
@@ -130,7 +151,7 @@ namespace opt {
static unsigned threads = 1;
/** The size of the bloom filter in bytes. */
- size_t bloomSize = 500 * 1024 * 1024;
+ size_t bloomSize = 0;
/** The maximum count value of the BLoom filter. */
unsigned max_count = 2;
@@ -163,7 +184,7 @@ namespace opt {
unsigned minFrag = 0;
/** The maximum fragment size */
- unsigned maxFrag = 1000;
+ unsigned maxFrag = 0;
/** Bloom filter input file */
static string inputBloomPath;
@@ -218,7 +239,7 @@ struct Counters {
size_t skipped;
};
-static const char shortopts[] = "S:L:D:b:B:d:ef:F:i:Ij:k:lm:M:no:P:q:r:s:t:v";
+static const char shortopts[] = "S:L:D:b:B:d:ef:F:G:i:Ij:k:lm:M:no:P:q:r:s:t:v";
enum { OPT_HELP = 1, OPT_VERSION };
@@ -228,6 +249,7 @@ static const struct option longopts[] = {
{ "input-scaffold", required_argument, NULL, 'S' },
{ "flank-length", required_argument, NULL, 'L' },
{ "flank-distance", required_argument, NULL, 'D' },
+ { "max-gap-length", required_argument, NULL, 'G' },
{ "bloom-size", required_argument, NULL, 'b' },
{ "max-branches", required_argument, NULL, 'B' },
{ "dot-file", required_argument, NULL, 'd' },
@@ -266,7 +288,7 @@ struct Coord
{
int start;
int end;
-
+
Coord() { }
Coord(int start, int end) : start(start), end(end) { }
@@ -701,6 +723,8 @@ int main(int argc, char** argv)
arg >> opt::flankLength; break;
case 'D':
arg >> opt::flankDistance; break;
+ case 'G':
+ arg >> opt::maxGapLength; break;
case 'b':
opt::bloomSize = SIToBytes(arg); break;
case 'B':
@@ -769,6 +793,14 @@ int main(int argc, char** argv)
}
}
+ /* translate --max-frag to --max-gap-length for backwards compatibility */
+ if (opt::maxFrag > 0) {
+ if ((int)opt::maxFrag < 2 * opt::flankLength)
+ opt::maxGapLength = 0;
+ else
+ opt::maxGapLength = opt::maxFrag - 2 * opt::flankLength;
+ }
+
if (opt::inputScaffold.empty()) {
cerr << PROGRAM ": missing mandatory option `-S'\n";
die = true;
@@ -779,14 +811,36 @@ int main(int argc, char** argv)
die = true;
}
+ if (opt::bloomFilterPaths.size() < opt::kvector.size()
+ && opt::bloomSize == 0)
+ {
+ cerr << PROGRAM ": missing mandatory option `-b' (Bloom filter size)\n"
+ << "Here are some guidelines for sizing the Bloom filter:\n"
+ << " * E. coli (~5 Mbp genome), 615X coverage: -b500M\n"
+ << " * S. cerevisiae (~12 Mbp genome), 25X coverage: -b500M\n"
+ << " * C. elegans (~100 Mbp genome), 89X coverage: -b1200M\n"
+ << " * H. sapiens (~3 Gbp genome), 71X coverage: -b40G\n";
+ die = true;
+ }
+
if (opt::outputPrefix.empty()) {
cerr << PROGRAM ": missing mandatory option `-o'\n";
die = true;
}
- if (argc - optind < 1) {
+ if (opt::bloomFilterPaths.size() > opt::kvector.size()) {
+ cerr << PROGRAM ": you must specify a k-mer size (-k) for each Bloom "
+ " filter file (-i)\n";
+ die = true;
+ } else if (opt::bloomFilterPaths.size() < opt::kvector.size()
+ && argc - optind < 1) {
cerr << PROGRAM ": missing input file arguments\n";
die = true;
+ } else if (opt::bloomFilterPaths.size() == opt::kvector.size()
+ && argc - optind > 0) {
+ cerr << PROGRAM ": input FASTA/FASTQ args should be omitted when using "
+ "pre-built Bloom filters (-i) for all k-mer sizes\n";
+ die = true;
}
if (die) {
@@ -806,8 +860,6 @@ int main(int argc, char** argv)
seqanTests();
#endif
- assert(opt::bloomSize > 0);
-
ofstream dotStream;
if (!opt::dotPath.empty()) {
if (opt::verbose)
@@ -848,7 +900,7 @@ int main(int argc, char** argv)
ConnectPairsParams params;
params.minMergedSeqLen = opt::minFrag;
- params.maxMergedSeqLen = opt::maxFrag;
+ params.maxMergedSeqLen = opt::maxGapLength + 2 * opt::flankLength;
params.maxPaths = opt::maxPaths;
params.maxBranches = opt::maxBranches;
params.maxPathMismatches = opt::maxMismatches;
@@ -893,12 +945,12 @@ int main(int argc, char** argv)
map<FastaRecord, Gap>::iterator read2_it;
string read1OutputPath(opt::outputPrefix);
- read1OutputPath.append("_flanks_1.fq");
+ read1OutputPath.append("_flanks_1.fa");
ofstream read1Stream(read1OutputPath.c_str());
assert_good(read1Stream, read1OutputPath);
string read2OutputPath(opt::outputPrefix);
- read2OutputPath.append("_flanks_2.fq");
+ read2OutputPath.append("_flanks_2.fa");
ofstream read2Stream(read2OutputPath.c_str());
assert_good(read2Stream, read2OutputPath);
@@ -934,7 +986,7 @@ int main(int argc, char** argv)
BloomFilter* bloom;
CascadingBloomFilter* cascadingBloom = NULL;
- if (!opt::bloomFilterPaths.empty() && i <= opt::bloomFilterPaths.size()) {
+ if (!opt::bloomFilterPaths.empty() && i < opt::bloomFilterPaths.size()) {
temp = "Loading bloom filter from `" + opt::bloomFilterPaths.at(i) + "'...\n";
printLog(logStream, temp);
@@ -963,6 +1015,12 @@ int main(int argc, char** argv)
bloom = &cascadingBloom->getBloomFilter(opt::max_count - 1);
}
+ assert(bloom != NULL);
+
+ if (opt::verbose)
+ cerr << "Bloom filter FPR: " << setprecision(3)
+ << 100 * bloom->FPR() << "%\n";
+
DBGBloom<BloomFilter> g(*bloom);
temp = "Starting K run with k = " + IntToString(opt::k) + "\n";
diff --git a/SimpleGraph/SimpleGraph.cpp b/SimpleGraph/SimpleGraph.cpp
index 373ec70..0a9ebd3 100644
--- a/SimpleGraph/SimpleGraph.cpp
+++ b/SimpleGraph/SimpleGraph.cpp
@@ -647,7 +647,7 @@ static void* worker(void* pArg)
static pthread_mutex_t inMutex = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&inMutex);
EstimateRecord er;
- bool good = (*arg.in) >> er;
+ bool good = bool((*arg.in) >> er);
pthread_mutex_unlock(&inMutex);
if (!good)
break;
diff --git a/Unittest/BloomDBG/BloomDBGTest.cpp b/Unittest/BloomDBG/BloomDBGTest.cpp
new file mode 100644
index 0000000..155ae38
--- /dev/null
+++ b/Unittest/BloomDBG/BloomDBGTest.cpp
@@ -0,0 +1,155 @@
+#include "Common/Sequence.h"
+#include "BloomDBG/bloom-dbg.h"
+#include "BloomDBG/MaskedKmer.h"
+#include "BloomDBG/RollingHash.h"
+#include "BloomDBG/RollingBloomDBG.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+using namespace std;
+typedef RollingBloomDBG<BloomFilter> Graph;
+typedef graph_traits<Graph> GraphTraits;
+
+/* each vertex is represented by
+ * std::pair<MaskedKmer, vector<size_t>>, where 'string' is the
+ * k-mer and 'vector<size_t>' is the associated set of
+ * hash values */
+typedef graph_traits<Graph>::vertex_descriptor V;
+
+/** Convert a path in the de Bruijn graph to a sequence */
+TEST(BloomDBG, pathToSeq)
+{
+ const string inputSeq = "ACGTAC";
+ const string spacedSeed = "10001";
+ const unsigned k = 5;
+ const unsigned numHashes = 2;
+
+ MaskedKmer::setLength(k);
+ MaskedKmer::setMask(spacedSeed);
+
+ Path<BloomDBG::Vertex> path =
+ BloomDBG::seqToPath(inputSeq, k, numHashes);
+ ASSERT_EQ(2U, path.size());
+
+ string outputSeq = BloomDBG::pathToSeq(path, k);
+ ASSERT_EQ("ACNNAC", outputSeq);
+}
+
+/** Split a sequence at branching k-mers in the de Bruijn graph */
+TEST(BloomDBG, splitSeq)
+{
+ const size_t bloomSize = 100000;
+ const unsigned k = 5;
+ const unsigned numHashes = 2;
+ const unsigned minBranchLen = 1;
+ size_t hashes[MAX_HASHES];
+
+ /* it is important to reset these, since they persist between tests */
+ MaskedKmer::setLength(k);
+ MaskedKmer::mask().clear();
+
+ /*
+ * Test graph (k=5):
+ *
+ * GACTC-ACTCG-CTCGG
+ *
+ * Input sequence (horizontal path above):
+ *
+ * GACTCGG
+ */
+
+ BloomFilter bloom1(bloomSize, numHashes, k);
+
+ RollingHash("GACTC", numHashes, k).getHashes(hashes);
+ bloom1.insert(hashes);
+ RollingHash("ACTCG", numHashes, k).getHashes(hashes);
+ bloom1.insert(hashes);
+ RollingHash("CTCGG", numHashes, k).getHashes(hashes);
+ bloom1.insert(hashes);
+
+ Sequence seq1 = "GACTCGG";
+
+ Graph graph1(bloom1);
+ vector<Sequence> segments1 = BloomDBG::splitSeq(seq1, k,
+ numHashes, graph1, minBranchLen);
+
+ V GACTC(V("GACTC", RollingHash("GACTC", numHashes, k)));
+
+ ASSERT_EQ(1U, out_degree(GACTC, graph1));
+ ASSERT_EQ(1U, segments1.size());
+ ASSERT_EQ("GACTCGG", segments1.front());
+
+ /*
+ * Test graph (k=5):
+ *
+ * ACTCT
+ * /
+ * GACTC-ACTCG-CTCGG
+ * /
+ * TCTCG
+ *
+ * Input sequence (horizontal path above):
+ *
+ * GACTCGG
+ */
+
+ BloomFilter bloom2(bloomSize, numHashes, k);
+
+ RollingHash("GACTC", numHashes, k).getHashes(hashes);
+ bloom2.insert(hashes);
+ RollingHash("ACTCT", numHashes, k).getHashes(hashes);
+ bloom2.insert(hashes);
+ RollingHash("ACTCG", numHashes, k).getHashes(hashes);
+ bloom2.insert(hashes);
+ RollingHash("CTCGG", numHashes, k).getHashes(hashes);
+ bloom2.insert(hashes);
+ RollingHash("TCTCG", numHashes, k).getHashes(hashes);
+ bloom2.insert(hashes);
+
+ Sequence seq2 = "GACTCGG";
+
+ Graph graph2(bloom2);
+ vector<Sequence> segments2 = BloomDBG::splitSeq(seq2, k,
+ numHashes, graph2, minBranchLen);
+
+ ASSERT_EQ(3U, segments2.size());
+ ASSERT_EQ("GACTC", segments2.at(0));
+ ASSERT_EQ("GACTCGG", segments2.at(1));
+ ASSERT_EQ("CTCGG", segments2.at(2));
+
+ /*
+ * Test graph (k=5):
+ *
+ * TACTC CTCGA
+ * \ /
+ * GACTC-ACTCG-CTCGG
+ *
+ * Input sequence (horizontal path above):
+ *
+ * ACTCG
+ */
+
+ BloomFilter bloom3(bloomSize, numHashes, k);
+
+	RollingHash("TACTC", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("GACTC", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("ACTCG", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("CTCGA", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+	RollingHash("CTCGG", numHashes, k).getHashes(hashes);
+	bloom3.insert(hashes);
+
+ Sequence seq3 = "ACTCG";
+
+ Graph graph3(bloom3);
+ vector<Sequence> segments3 = BloomDBG::splitSeq(seq3, k,
+ numHashes, graph3, minBranchLen);
+
+ ASSERT_EQ(1U, segments3.size());
+ ASSERT_EQ("ACTCG", segments3.front());
+}
diff --git a/Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp b/Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp
new file mode 100644
index 0000000..5acfb53
--- /dev/null
+++ b/Unittest/BloomDBG/HashAgnosticCascadingBloomTest.cpp
@@ -0,0 +1,46 @@
+#include "BloomDBG/RollingHashIterator.h"
+#include "BloomDBG/HashAgnosticCascadingBloom.h"
+
+#include <gtest/gtest.h>
+
+using namespace std;
+typedef uint64_t hash_t;
+
+TEST(HashAgnosticCascadingBloom, base)
+{
+ const unsigned bloomSize = 1000;
+ const unsigned numHashes = 1;
+ const unsigned numLevels = 2;
+ const unsigned k = 16;
+
+ HashAgnosticCascadingBloom x(bloomSize, numHashes, numLevels, k);
+ EXPECT_EQ(x.size(), bloomSize);
+
+ const char* a = "AGATGTGCTGCCGCCT";
+ const char* b = "TGGACAGCGTTACCTC";
+ const char* c = "TAATAACAGTCCCTAT";
+ const char* d = "GATCGTGGCGGGCGAT";
+
+ RollingHashIterator itA(a, numHashes, k);
+ RollingHashIterator itB(b, numHashes, k);
+ RollingHashIterator itC(c, numHashes, k);
+ RollingHashIterator itD(d, numHashes, k);
+ size_t hash;
+
+ x.insert(*itA);
+ EXPECT_EQ(x.popcount(), 0U);
+ EXPECT_FALSE(x.contains(&hash));
+ x.insert(*itA);
+ EXPECT_EQ(x.popcount(), 1U);
+ EXPECT_TRUE(x.contains(*itA));
+ x.insert(*itB);
+ EXPECT_EQ(x.popcount(), 1U);
+ EXPECT_FALSE(x.contains(*itB));
+ x.insert(*itC);
+ EXPECT_EQ(x.popcount(), 1U);
+ EXPECT_FALSE(x.contains(*itC));
+ x.insert(*itB);
+ EXPECT_EQ(x.popcount(), 2U);
+ EXPECT_TRUE(x.contains(*itB));
+ EXPECT_FALSE(x.contains(*itD));
+}
diff --git a/Unittest/BloomDBG/MaskedKmerTest.cpp b/Unittest/BloomDBG/MaskedKmerTest.cpp
new file mode 100644
index 0000000..441954e
--- /dev/null
+++ b/Unittest/BloomDBG/MaskedKmerTest.cpp
@@ -0,0 +1,26 @@
+#include "BloomDBG/MaskedKmer.h"
+
+#include <gtest/gtest.h>
+
+using namespace std;
+
+TEST(MaskedKmerTest, trivialMask)
+{
+ MaskedKmer::setLength(4);
+
+ MaskedKmer kmer1("ACGT");
+ MaskedKmer kmer2("ACGT");
+
+ ASSERT_EQ(kmer1, kmer2);
+}
+
+TEST(MaskedKmerTest, nonTrivialMask)
+{
+ MaskedKmer::setLength(4);
+ MaskedKmer::setMask("1001");
+
+ MaskedKmer kmer1("ACGT");
+ MaskedKmer kmer2("ATTT");
+
+ ASSERT_EQ(kmer1, kmer2);
+}
diff --git a/Unittest/BloomDBG/RollingBloomDBGTest.cpp b/Unittest/BloomDBG/RollingBloomDBGTest.cpp
new file mode 100644
index 0000000..39bf88d
--- /dev/null
+++ b/Unittest/BloomDBG/RollingBloomDBGTest.cpp
@@ -0,0 +1,275 @@
+#include "BloomDBG/RollingBloomDBG.h"
+#include "lib/bloomfilter/BloomFilter.hpp"
+#include "Common/UnorderedSet.h"
+
+#include <gtest/gtest.h>
+#include <string>
+
+using namespace std;
+using namespace boost;
+
+typedef RollingBloomDBG<BloomFilter> Graph;
+typedef graph_traits<Graph> GraphTraits;
+typedef graph_traits<Graph>::vertex_descriptor V;
+
+/** Test fixture for RollingBloomDBG tests. */
+class RollingBloomDBGTest : public ::testing::Test
+{
+protected:
+
+ const unsigned m_k;
+ const unsigned m_bloomSize;
+ const unsigned m_numHashes;
+ BloomFilter m_bloom;
+ Graph m_graph;
+
+ RollingBloomDBGTest() : m_k(5), m_bloomSize(100000), m_numHashes(2),
+ m_bloom(m_bloomSize, m_numHashes, m_k), m_graph(m_bloom)
+ {
+ MaskedKmer::setLength(m_k);
+
+ /*
+ * Test de Bruijn graph:
+ *
+ * CGACT ACTCT
+ * \ /
+ * GACTC
+ * / \
+ * TGACT ACTCG
+ *
+ * Note: No unexpected edges
+ * are created by the reverse
+ * complements of these k-mers.
+ */
+
+ size_t hashes[MAX_HASHES];
+ RollingHash("CGACT", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("TGACT", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("GACTC", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("ACTCT", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("ACTCG", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ }
+
+};
+
+TEST_F(RollingBloomDBGTest, out_edge_iterator)
+{
+ /* TEST: check that "GACTC" has the expected outgoing edges */
+
+ const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+ const V ACTCT("ACTCT", RollingHash("ACTCT", m_numHashes, m_k));
+ const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+ unordered_set<V> expectedNeighbours;
+ expectedNeighbours.insert(ACTCT);
+ expectedNeighbours.insert(ACTCG);
+
+ ASSERT_EQ(2u, out_degree(GACTC, m_graph));
+ GraphTraits::out_edge_iterator ei, ei_end;
+ boost::tie(ei, ei_end) = out_edges(GACTC, m_graph);
+ ASSERT_NE(ei_end, ei);
+ unordered_set<V>::iterator neighbour =
+ expectedNeighbours.find(target(*ei, m_graph));
+ EXPECT_NE(expectedNeighbours.end(), neighbour);
+ expectedNeighbours.erase(neighbour);
+ ei++;
+ ASSERT_NE(ei_end, ei);
+ neighbour = expectedNeighbours.find(target(*ei, m_graph));
+ ASSERT_NE(expectedNeighbours.end(), neighbour);
+ ei++;
+ ASSERT_EQ(ei_end, ei);
+}
+
+TEST_F(RollingBloomDBGTest, adjacency_iterator)
+{
+ /* TEST: check that "GACTC" has the expected outgoing edges */
+
+ const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+ const V ACTCT("ACTCT", RollingHash("ACTCT", m_numHashes, m_k));
+ const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+ unordered_set<V> expectedNeighbours;
+ expectedNeighbours.insert(ACTCT);
+ expectedNeighbours.insert(ACTCG);
+
+ ASSERT_EQ(2u, out_degree(GACTC, m_graph));
+ GraphTraits::adjacency_iterator ai, ai_end;
+ boost::tie(ai, ai_end) = adjacent_vertices(GACTC, m_graph);
+ ASSERT_NE(ai_end, ai);
+ unordered_set<V>::iterator neighbour =
+ expectedNeighbours.find(*ai);
+ EXPECT_NE(expectedNeighbours.end(), neighbour);
+ expectedNeighbours.erase(neighbour);
+ ai++;
+ ASSERT_NE(ai_end, ai);
+ neighbour = expectedNeighbours.find(*ai);
+ ASSERT_NE(expectedNeighbours.end(), neighbour);
+ ai++;
+ ASSERT_EQ(ai_end, ai);
+}
+
+TEST_F(RollingBloomDBGTest, in_edges)
+{
+ /* TEST: check that "GACTC" has the expected ingoing edges */
+
+ const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+ const V CGACT("CGACT", RollingHash("CGACT", m_numHashes, m_k));
+ const V TGACT("TGACT", RollingHash("TGACT", m_numHashes, m_k));
+
+ unordered_set<V> expectedNeighbours;
+ expectedNeighbours.insert(CGACT);
+ expectedNeighbours.insert(TGACT);
+
+ ASSERT_EQ(2u, in_degree(GACTC, m_graph));
+ GraphTraits::in_edge_iterator ei, ei_end;
+ boost::tie(ei, ei_end) = in_edges(GACTC, m_graph);
+ ASSERT_NE(ei_end, ei);
+ unordered_set<V>::iterator neighbour =
+ expectedNeighbours.find(source(*ei, m_graph));
+ EXPECT_NE(expectedNeighbours.end(), neighbour);
+ expectedNeighbours.erase(neighbour);
+ ei++;
+ ASSERT_NE(ei_end, ei);
+ neighbour = expectedNeighbours.find(source(*ei, m_graph));
+ ASSERT_NE(expectedNeighbours.end(), neighbour);
+ ei++;
+ ASSERT_EQ(ei_end, ei);
+}
+
+TEST_F(RollingBloomDBGTest, pathTraversal)
+{
+ /*
+ * Walk a simple path:
+ *
+ * CGACT-GACTC-ACTCG
+ */
+
+ BloomFilter bloom(m_bloomSize, m_numHashes, m_k);
+ Graph graph(bloom);
+
+ const V CGACT("CGACT", RollingHash("CGACT", m_numHashes, m_k));
+ const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+ const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+ size_t hashes[MAX_HASHES];
+ CGACT.rollingHash().getHashes(hashes);
+ bloom.insert(hashes);
+ GACTC.rollingHash().getHashes(hashes);
+ bloom.insert(hashes);
+ ACTCG.rollingHash().getHashes(hashes);
+ bloom.insert(hashes);
+
+ /* step one */
+
+ V v = CGACT;
+ ASSERT_EQ(1u, out_degree(v, graph));
+ GraphTraits::out_edge_iterator ei, ei_end;
+ boost::tie(ei, ei_end) = out_edges(v, graph);
+ ASSERT_NE(ei_end, ei);
+ ASSERT_EQ(CGACT, source(*ei, graph));
+ ASSERT_EQ(GACTC, target(*ei, graph));
+ v = target(*ei, graph);
+ ++ei;
+ ASSERT_EQ(ei_end, ei);
+
+ /* step two */
+
+ ASSERT_EQ(1u, out_degree(v, graph));
+ boost::tie(ei, ei_end) = out_edges(v, graph);
+ ASSERT_NE(ei_end, ei);
+ ASSERT_EQ(GACTC, source(*ei, graph));
+ ASSERT_EQ(ACTCG, target(*ei, graph));
+ v = target(*ei, graph);
+ ++ei;
+ ASSERT_EQ(ei_end, ei);
+}
+
+/** Test fixture for RollingBloomDBG with spaced seed k-mers. */
+class RollingBloomDBGSpacedSeedTest : public ::testing::Test
+{
+protected:
+
+ const unsigned m_k;
+ const unsigned m_bloomSize;
+ const unsigned m_numHashes;
+ BloomFilter m_bloom;
+ Graph m_graph;
+ const std::string m_spacedSeed;
+
+ RollingBloomDBGSpacedSeedTest() : m_k(5), m_bloomSize(100000), m_numHashes(1),
+ m_bloom(m_bloomSize, m_numHashes, m_k), m_graph(m_bloom),
+ m_spacedSeed("11011")
+ {
+ MaskedKmer::setLength(m_k);
+ MaskedKmer::setMask(m_spacedSeed);
+
+ /*
+ * Test de Bruijn graph:
+ *
+ * CGACT ACTCT
+ * \ /
+ * GACTC
+ * / \
+ * TGACT ACTCG
+ *
+ * Masked version:
+ *
+ * CG_CT AC_CT
+ * \ /
+ * GA_TC
+ * / \
+ * TG_CT AC_CG
+ *
+ * Note: With respect to the spaced seed "11011",
+ * GACTC is equivalent to its own reverse complement
+ * GAGTC. However, this does not result in
+ * any additional edges in the graph.
+ */
+
+ size_t hashes[MAX_HASHES];
+ RollingHash("CGACT", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("TGACT", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("GACTC", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("ACTCT", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ RollingHash("ACTCG", m_numHashes, m_k).getHashes(hashes);
+ m_bloom.insert(hashes);
+ }
+
+};
+
+TEST_F(RollingBloomDBGSpacedSeedTest, out_edge_iterator)
+{
+ /* TEST: check that "GACTC" has the expected outgoing edges */
+
+ const V GACTC("GACTC", RollingHash("GACTC", m_numHashes, m_k));
+ const V ACTCT("ACTCT", RollingHash("ACTCT", m_numHashes, m_k));
+ const V ACTCG("ACTCG", RollingHash("ACTCG", m_numHashes, m_k));
+
+ unordered_set<V> expectedNeighbours;
+ expectedNeighbours.insert(ACTCT);
+ expectedNeighbours.insert(ACTCG);
+
+ ASSERT_EQ(2u, out_degree(GACTC, m_graph));
+ GraphTraits::out_edge_iterator ei, ei_end;
+ boost::tie(ei, ei_end) = out_edges(GACTC, m_graph);
+ ASSERT_NE(ei_end, ei);
+ unordered_set<V>::iterator neighbour =
+ expectedNeighbours.find(target(*ei, m_graph));
+ EXPECT_NE(expectedNeighbours.end(), neighbour);
+ expectedNeighbours.erase(neighbour);
+ ei++;
+ ASSERT_NE(ei_end, ei);
+ neighbour = expectedNeighbours.find(target(*ei, m_graph));
+ ASSERT_NE(expectedNeighbours.end(), neighbour);
+ ei++;
+ ASSERT_EQ(ei_end, ei);
+}
diff --git a/Unittest/BloomDBG/RollingHashIteratorTest.cpp b/Unittest/BloomDBG/RollingHashIteratorTest.cpp
new file mode 100644
index 0000000..f2272be
--- /dev/null
+++ b/Unittest/BloomDBG/RollingHashIteratorTest.cpp
@@ -0,0 +1,116 @@
+#include "BloomDBG/RollingHashIterator.h"
+
+#include <gtest/gtest.h>
+#include <string>
+
+using namespace std;
+
+TEST(RollingHashIterator, reverseComplement)
+{
+ const unsigned k = 6;
+ const unsigned numHashes = 1;
+ const char* seq = "GCAATGT";
+ const char* rcSeq = "ACATTGC";
+
+ /** hash forward sequence */
+
+ RollingHashIterator it(seq, numHashes, k);
+ size_t kmer1Hash, kmer2Hash;
+ kmer1Hash = (*it)[0];
+ ++it;
+ kmer2Hash = (*it)[0];
+ ++it;
+ ASSERT_EQ(RollingHashIterator::end(), it);
+
+ /** hash reverse complement sequence */
+
+ RollingHashIterator rcIt(rcSeq, numHashes, k);
+ size_t rcKmer1Hash, rcKmer2Hash;
+ rcKmer2Hash = (*rcIt)[0];
+ ++rcIt;
+ rcKmer1Hash = (*rcIt)[0];
+ ++rcIt;
+ ASSERT_EQ(RollingHashIterator::end(), rcIt);
+
+ /** check hash values are the same for forward and reverse complement */
+
+ ASSERT_EQ(kmer1Hash, rcKmer1Hash);
+ ASSERT_EQ(kmer2Hash, rcKmer2Hash);
+}
+
+TEST(RollingHashIterator, badKmers)
+{
+ const unsigned k = 3;
+ const unsigned numHashes = 1;
+
+ /* skip bad k-mers in middle of sequence */
+
+ const char* seq = "AAANAAA";
+ RollingHashIterator it(seq, numHashes, k);
+ ASSERT_EQ(0u, it.pos());
+ ++it;
+ ASSERT_EQ(4u, it.pos());
+ ++it;
+ ASSERT_EQ(RollingHashIterator::end(), it);
+
+ /* all bad k-mers */
+
+ const char* seq2 = "NNNNNNN";
+ RollingHashIterator it2(seq2, numHashes, k);
+ ASSERT_EQ(RollingHashIterator::end(), it2);
+}
+
+TEST(RollingHashIterator, seqShorterThanK)
+{
+ const unsigned k = 5;
+ const unsigned numHashes = 1;
+ const char* seq = "ACGT";
+
+ RollingHashIterator it(seq, numHashes, k);
+ ASSERT_EQ(RollingHashIterator::end(), it);
+}
+
+TEST(RollingHashIterator, emptySeq)
+{
+ const unsigned k = 3;
+ const unsigned numHashes = 1;
+ const char* seq = "";
+
+ RollingHashIterator it(seq, numHashes, k);
+ ASSERT_EQ(RollingHashIterator::end(), it);
+}
+
+TEST(RollingHashIterator, spacedSeed)
+{
+ const unsigned k = 5;
+ const unsigned numHashes = 1;
+ const char* seq = "AGNNGC";
+ const char* rcSeq = "GCNNCT";
+ Kmer::setLength(k);
+ MaskedKmer::setMask("10001");
+
+ /** hash forward sequence */
+
+ RollingHashIterator it(seq, numHashes, k);
+ size_t kmer1Hash, kmer2Hash;
+ kmer1Hash = (*it)[0];
+ ++it;
+ kmer2Hash = (*it)[0];
+ ++it;
+ ASSERT_EQ(RollingHashIterator::end(), it);
+
+ /** hash reverse complement sequence */
+
+ RollingHashIterator rcIt(rcSeq, numHashes, k);
+ size_t rcKmer1Hash, rcKmer2Hash;
+ rcKmer2Hash = (*rcIt)[0];
+ ++rcIt;
+ rcKmer1Hash = (*rcIt)[0];
+ ++rcIt;
+ ASSERT_EQ(RollingHashIterator::end(), rcIt);
+
+ /** check hash values are the same for forward and reverse complement */
+
+ ASSERT_EQ(kmer1Hash, rcKmer1Hash);
+ ASSERT_EQ(kmer2Hash, rcKmer2Hash);
+}
diff --git a/Unittest/BloomDBG/RollingHashTest.cpp b/Unittest/BloomDBG/RollingHashTest.cpp
new file mode 100644
index 0000000..4895213
--- /dev/null
+++ b/Unittest/BloomDBG/RollingHashTest.cpp
@@ -0,0 +1,195 @@
+#include "BloomDBG/RollingHash.h"
+
+#include <gtest/gtest.h>
+#include <string>
+#include <algorithm>
+#include "boost/dynamic_bitset.hpp"
+
+using namespace std;
+using namespace boost;
+
+/** test fixture for RollingHash tests */
+class RollingHashTest : public ::testing::Test
+{
+protected:
+
+ const unsigned m_numHashes;
+ const unsigned m_k;
+ const string m_kmerMask;
+
+ RollingHashTest() : m_numHashes(2), m_k(4)
+ {
+ Kmer::setLength(m_k);
+ }
+};
+
+TEST_F(RollingHashTest, kmerMask)
+{
+ MaskedKmer::setMask("1001");
+ RollingHash kmer1Hash("GCCG", m_numHashes, m_k);
+ RollingHash kmer2Hash("GTTG", m_numHashes, m_k);
+ ASSERT_EQ(kmer1Hash, kmer2Hash);
+}
+
+TEST_F(RollingHashTest, rollRight)
+{
+ MaskedKmer::mask().clear();
+ RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+ RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+ RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+ leftKmerHash.rollRight("GACG", 'T');
+ ASSERT_EQ(middleKmerHash, leftKmerHash);
+ leftKmerHash.rollRight("ACGT", 'C');
+ ASSERT_EQ(rightKmerHash, leftKmerHash);
+}
+
+TEST_F(RollingHashTest, rollRightMasked)
+{
+ MaskedKmer::setMask("1001");
+ RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+ RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+ RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+ leftKmerHash.rollRight("GACG", 'T');
+ ASSERT_EQ(middleKmerHash, leftKmerHash);
+ leftKmerHash.rollRight("ACGT", 'C');
+ ASSERT_EQ(rightKmerHash, leftKmerHash);
+}
+
+TEST_F(RollingHashTest, rollRightMaskedMismatch)
+{
+ MaskedKmer::setMask("1001");
+
+ const char* origSeq = "GACGTC";
+ const char* mutatedSeq = "GACTTC";
+
+ RollingHash left(origSeq, m_numHashes, m_k);
+ RollingHash middle(origSeq + 1, m_numHashes, m_k);
+ RollingHash right(origSeq + 2, m_numHashes, m_k);
+
+ RollingHash mutated(mutatedSeq, m_numHashes, m_k);
+
+ ASSERT_NE(left, mutated);
+ mutated.rollRight(mutatedSeq, 'T');
+ ASSERT_EQ(middle, mutated);
+ mutated.rollRight(mutatedSeq + 1, 'C');
+ ASSERT_EQ(right, mutated);
+}
+
+TEST_F(RollingHashTest, rollLeft)
+{
+ MaskedKmer::mask().clear();
+
+ RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+ RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+ RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+ rightKmerHash.rollLeft('A', "CGTC");
+ ASSERT_EQ(middleKmerHash, rightKmerHash);
+ rightKmerHash.rollLeft('G', "ACGT");
+ ASSERT_EQ(leftKmerHash, rightKmerHash);
+}
+
+TEST_F(RollingHashTest, rollLeftMasked)
+{
+ MaskedKmer::setMask("1001");
+
+ RollingHash leftKmerHash("GACG", m_numHashes, m_k);
+ RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+ RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+ rightKmerHash.rollLeft('A', "CGTC");
+ ASSERT_EQ(middleKmerHash, rightKmerHash);
+ rightKmerHash.rollLeft('G', "ACGT");
+ ASSERT_EQ(leftKmerHash, rightKmerHash);
+}
+
+TEST_F(RollingHashTest, rollLeftMaskedMismatch)
+{
+ MaskedKmer::setMask("1001");
+
+ const char* origSeq = "GACGTC";
+ const char* mutatedSeq = "GAGGTC";
+
+ RollingHash left(origSeq, m_numHashes, m_k);
+ RollingHash middle(origSeq + 1, m_numHashes, m_k);
+ RollingHash right(origSeq + 2, m_numHashes, m_k);
+
+ RollingHash mutated(mutatedSeq + 2, m_numHashes, m_k);
+
+ ASSERT_NE(right, mutated);
+ mutated.rollLeft('A', mutatedSeq + 2);
+ ASSERT_EQ(middle, mutated);
+ mutated.rollLeft('G', mutatedSeq + 1);
+ ASSERT_EQ(left, mutated);
+}
+
+TEST_F(RollingHashTest, reset)
+{
+ MaskedKmer::mask().clear();
+
+ RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+ RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+ middleKmerHash.reset("CGTC");
+ ASSERT_EQ(rightKmerHash, middleKmerHash);
+}
+
+TEST_F(RollingHashTest, resetMasked)
+{
+ MaskedKmer::setMask("1001");
+
+ RollingHash middleKmerHash("ACGT", m_numHashes, m_k);
+ RollingHash rightKmerHash("CGTC", m_numHashes, m_k);
+
+ /*
+ * Note: third base of middleKmerHash is intentionally set to 'G'
+ * instead of 'T'. However, the hash values should
+ * still match the rightKmerHash due to the effect of
+ * the k-mer mask.
+ */
+ middleKmerHash.reset("CGGC");
+ ASSERT_EQ(rightKmerHash, middleKmerHash);
+}
+
+TEST_F(RollingHashTest, setBase)
+{
+ MaskedKmer::mask().clear();
+
+ char kmer1[] = "ACGT";
+ char kmer2[] = "ACCT";
+
+ RollingHash hash1(kmer1, m_numHashes, m_k);
+ RollingHash hash2(kmer2, m_numHashes, m_k);
+
+ ASSERT_NE(hash2, hash1);
+ hash1.setBase(kmer1, 2, 'C');
+ ASSERT_EQ(0, strcmp(kmer1, kmer2));
+ ASSERT_EQ(hash2, hash1);
+}
+
+TEST_F(RollingHashTest, setBaseMasked)
+{
+ MaskedKmer::setMask("1101");
+
+ char kmer1[] = "ACGT";
+ char kmer2[] = "ACCT";
+
+ RollingHash hash1(kmer1, m_numHashes, m_k);
+ RollingHash hash2(kmer2, m_numHashes, m_k);
+
+ /* hashes should agree since mismatch is in masked position */
+ ASSERT_EQ(hash2, hash1);
+ ASSERT_NE(0, strcmp(kmer1, kmer2));
+
+ /* fix mismatch in masked position (hash values shouldn't change) */
+ hash1.setBase(kmer1, 2, 'C');
+ ASSERT_EQ(hash2, hash1);
+ ASSERT_EQ(0, strcmp(kmer1, kmer2));
+
+ /* create mismatch in unmasked position (hash value should now differ) */
+ hash1.setBase(kmer1, 1, 'G');
+ ASSERT_NE(hash2, hash1);
+ ASSERT_NE(0, strcmp(kmer1, kmer2));
+}
diff --git a/Unittest/BloomDBG/SpacedSeedTest.cpp b/Unittest/BloomDBG/SpacedSeedTest.cpp
new file mode 100644
index 0000000..ecd222a
--- /dev/null
+++ b/Unittest/BloomDBG/SpacedSeedTest.cpp
@@ -0,0 +1,26 @@
+#include "BloomDBG/SpacedSeed.h"
+#include <gtest/gtest.h>
+
+using namespace std;
+
+TEST(SpacedSeedTest, qrSeed)
+{
+ /*
+ * Generate a Quadratic Residue (QR) seed. The background theory
+ * for QR seeds is described in:
+ *
+ * Egidi, Lavinia, and Giovanni Manzini. "Multiple seeds
+ * sensitivity using a single seed with threshold." Journal of
+ * bioinformatics and computational biology 13.04 (2015): 1550011.
+ */
+ ASSERT_EQ("10100011101", SpacedSeed::qrSeed(11));
+}
+
+TEST(SpacedSeedTest, qrSeedPair)
+{
+ /*
+ * Generate spaced seed pattern for two mirrored QR seeds with
+ * a gap in between.
+ */
+ ASSERT_EQ("101000111010000000000010111000101", SpacedSeed::qrSeedPair(33,11));
+}
diff --git a/Unittest/Graph/ExtendPathTest.cpp b/Unittest/Graph/ExtendPathTest.cpp
index 63ad7d9..47072d3 100644
--- a/Unittest/Graph/ExtendPathTest.cpp
+++ b/Unittest/Graph/ExtendPathTest.cpp
@@ -62,6 +62,54 @@ TEST(extendPath, lookAhead)
ASSERT_FALSE(lookAhead(0, FORWARD, depth, g2));
}
+TEST(extendPath, depth)
+{
+ /*
+ * 2
+ * /
+ * 0--1
+ * \
+ * 3--4
+ */
+
+ Graph g;
+ add_edge(0, 1, g);
+ add_edge(1, 2, g);
+ add_edge(1, 3, g);
+ add_edge(3, 4, g);
+
+ /* note: depth of starting node is 0 */
+ ASSERT_EQ(3u, depth(0, FORWARD, g));
+ ASSERT_EQ(2u, depth(1, FORWARD, g));
+ ASSERT_EQ(3u, depth(4, REVERSE, g));
+ ASSERT_EQ(1u, depth(1, REVERSE, g));
+}
+
+TEST(extendPath, longestBranch)
+{
+ /*
+ * 2
+ * /
+ * 0--1
+ * \
+ * 3--4
+ * /
+ * 5
+ */
+
+ Graph g;
+ add_edge(0, 1, g);
+ add_edge(1, 2, g);
+ add_edge(1, 3, g);
+ add_edge(3, 4, g);
+ add_edge(5, 3, g);
+
+ ASSERT_EQ(1u, longestBranch(0, FORWARD, g));
+ ASSERT_EQ(3u, longestBranch(1, FORWARD, g));
+ ASSERT_EQ(1u, longestBranch(3, REVERSE, g));
+ ASSERT_EQ(3u, longestBranch(4, REVERSE, g));
+}
+
TEST(extendPath, noExtension)
{
// Graph containing a single edge.
@@ -170,7 +218,8 @@ TEST(extendPath, bidirectional)
TEST(extendPath, withTrimming)
{
- const unsigned trimLen = 1;
+ ExtendPathParams params;
+ params.trimLen = 1;
/*
* 2
@@ -193,8 +242,7 @@ TEST(extendPath, withTrimming)
Path<Vertex> path;
path.push_back(0);
- extendPath(path, FORWARD, g, trimLen);
- ASSERT_EQ(4u, path.size());
+ extendPath(path, FORWARD, g, params);
ASSERT_EQ(expectedPath, path);
/*
@@ -212,17 +260,29 @@ TEST(extendPath, withTrimming)
add_edge(3, 4, g2);
add_edge(3, 5, g2);
- Path<Vertex> expectedPath2;
- expectedPath2.push_back(0);
- expectedPath2.push_back(1);
- expectedPath2.push_back(3);
-
Path<Vertex> path2;
path2.push_back(0);
- extendPath(path2, FORWARD, g2, trimLen);
- EXPECT_EQ(3u, path2.size());
- ASSERT_EQ(expectedPath2, path2);
+ extendPath(path2, FORWARD, g2, params);
+
+ /**
+ * Note: In situations where there are
+ * multiple branches shorter than the trim
+ * length, we choose the longest one. (And
+ * if the branches are of equal length we
+ * choose one arbitrarily.)
+ *
+ * This is the desired behaviour to deal
+ * with coverage gaps in the de Bruijn
+ * graph, which can make a legitimate branch
+ * indistinguishable from short branches
+ * due to read errors / Bloom filter false
+ * positives.
+ */
+ ASSERT_EQ(4u, path2.size());
+ ASSERT_EQ(0u, path2.at(0));
+ ASSERT_EQ(1u, path2.at(1));
+ ASSERT_EQ(3u, path2.at(2));
}
TEST(extendPath, cycles)
@@ -282,11 +342,13 @@ TEST(extendPath, cycles)
Path<Vertex> expectedPath2;
expectedPath2.push_back(0);
expectedPath2.push_back(1);
- expectedPath2.push_back(2);
- expectedPath2.push_back(3);
result = extendPath(path2, FORWARD, g2);
- EXPECT_EQ(EXTENDED_TO_CYCLE, result);
+ /*
+ * note: expected result is EXTENDED_TO_BRANCHING_POINT
+ * because vertex 1 has 2 incoming edges
+ */
+ EXPECT_EQ(EXTENDED_TO_BRANCHING_POINT, result);
EXPECT_EQ(expectedPath2, path2);
/*
@@ -305,13 +367,15 @@ TEST(extendPath, cycles)
path3.push_back(0);
Path<Vertex> expectedPath3;
- expectedPath3.push_back(3);
- expectedPath3.push_back(2);
expectedPath3.push_back(1);
expectedPath3.push_back(0);
result = extendPath(path3, REVERSE, g3);
- EXPECT_EQ(EXTENDED_TO_CYCLE, result);
+ /*
+ * note: expected result is EXTENDED_TO_BRANCHING_POINT
+ * because vertex 1 has 2 incoming edges
+ */
+ EXPECT_EQ(EXTENDED_TO_BRANCHING_POINT, result);
EXPECT_EQ(expectedPath3, path3);
}
diff --git a/Unittest/Makefile.am b/Unittest/Makefile.am
index d0e41d4..da0980f 100644
--- a/Unittest/Makefile.am
+++ b/Unittest/Makefile.am
@@ -1,150 +1,111 @@
-GTEST_LIBS_ = $(top_builddir)/lib/gtest-1.7.0/libgtest_main.a
# -Wno-error is used here because there is no portable way
# to suppress warning: "argument unused during compilation: '-pthread'"
# for clang on OSX.
# See: http://stackoverflow.com/questions/17841140/os-x-clang-pthread
-GTEST_CXXFLAGS_ = $(AM_CXXFLAGS) $(PTHREAD_CFLAGS) -Wno-error
-GTEST_INCLUDES_ = -I$(top_srcdir) -I$(top_srcdir)/lib/gtest-1.7.0/include
-GTEST_LDFLAGS_ = $(PTHREAD_LIBS)
+AM_CXXFLAGS += $(PTHREAD_CFLAGS) -Wno-error
+AM_LDFLAGS = $(PTHREAD_LIBS)
+AM_CPPFLAGS = \
+ -I$(top_srcdir) \
+ -I$(top_srcdir)/lib/gtest-1.7.0/include
+LDADD = $(top_builddir)/lib/gtest-1.7.0/libgtest_main.a
check_PROGRAMS = common_stringutil
common_stringutil_SOURCES = Common/StringUtilTest.cpp
-common_stringutil_CPPFLAGS = $(GTEST_INCLUDES_)
-common_stringutil_LDADD = $(GTEST_LIBS_)
-common_stringutil_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_stringutil_LDFLAGS = $(GTEST_LDFLAGS_)
check_PROGRAMS += common_histogram
common_histogram_SOURCES = Common/HistogramTest.cpp
-common_histogram_CPPFLAGS = $(GTEST_INCLUDES_)
-common_histogram_LDADD = $(GTEST_LIBS_)
-common_histogram_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_histogram_LDFLAGS = $(GTEST_LDFLAGS_)
check_PROGRAMS += common_bitutil
common_bitutil_SOURCES = Common/BitUtilTest.cpp
-common_bitutil_CPPFLAGS = $(GTEST_INCLUDES_)
-common_bitutil_LDADD = $(GTEST_LIBS_)
-common_bitutil_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_bitutil_LDFLAGS = $(GTEST_LDFLAGS_)
check_PROGRAMS += common_kmer
common_kmer_SOURCES = Common/KmerTest.cpp
-common_kmer_CPPFLAGS = $(GTEST_INCLUDES_)
-common_kmer_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_kmer_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_kmer_LDFLAGS = $(GTEST_LDFLAGS_)
+common_kmer_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += common_sequence
common_sequence_SOURCES = Common/Sequence.cc
-common_sequence_CPPFLAGS = $(GTEST_INCLUDES_)
-common_sequence_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_sequence_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_sequence_LDFLAGS = $(GTEST_LDFLAGS_)
+common_sequence_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += common_KmerIterator
common_KmerIterator_SOURCES = Common/KmerIteratorTest.cpp
-common_KmerIterator_CPPFLAGS = $(GTEST_INCLUDES_)
-common_KmerIterator_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_KmerIterator_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_KmerIterator_LDFLAGS = $(GTEST_LDFLAGS_)
+common_KmerIterator_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += common_sam
common_sam_SOURCES = Common/SAM.cc
-common_sam_CPPFLAGS = $(GTEST_INCLUDES_)
-common_sam_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-common_sam_CXXFLAGS = $(GTEST_CXXFLAGS_)
-common_sam_LDFLAGS = $(GTEST_LDFLAGS_)
+common_sam_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += BloomFilter
BloomFilter_SOURCES = Konnector/BloomFilter.cc
-BloomFilter_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-BloomFilter_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-BloomFilter_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-BloomFilter_LDFLAGS = $(GTEST_LDFLAGS_)
+BloomFilter_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+BloomFilter_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
+BloomFilter_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
check_PROGRAMS += Konnector_DBGBloom
Konnector_DBGBloom_SOURCES = Konnector/DBGBloomTest.cpp
-Konnector_DBGBloom_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-Konnector_DBGBloom_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-Konnector_DBGBloom_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-Konnector_DBGBloom_LDFLAGS = $(GTEST_LDFLAGS_)
+Konnector_DBGBloom_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+Konnector_DBGBloom_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
+Konnector_DBGBloom_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
check_PROGRAMS += Konnector_DBGBloomAlgorithms
Konnector_DBGBloomAlgorithms_SOURCES = Konnector/DBGBloomAlgorithmsTest.cpp
-Konnector_DBGBloomAlgorithms_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-Konnector_DBGBloomAlgorithms_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-Konnector_DBGBloomAlgorithms_LDADD = \
- $(top_builddir)/Common/libcommon.a \
- $(GTEST_LIBS_)
-Konnector_DBGBloomAlgorithms_LDFLAGS = $(GTEST_LDFLAGS_)
+Konnector_DBGBloomAlgorithms_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+Konnector_DBGBloomAlgorithms_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
+Konnector_DBGBloomAlgorithms_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += graph_ConstrainedBFSVisitor
graph_ConstrainedBFSVisitor_SOURCES = Graph/ConstrainedBFSVisitorTest.cpp
-graph_ConstrainedBFSVisitor_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_ConstrainedBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_ConstrainedBFSVisitor_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_ConstrainedBFSVisitor_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_ConstrainedBFSVisitor_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_ConstrainedBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += graph_BidirectionalBFS
graph_BidirectionalBFS_SOURCES = Graph/BidirectionalBFSTest.cpp
-graph_BidirectionalBFS_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_BidirectionalBFS_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_BidirectionalBFS_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_BidirectionalBFS_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_BidirectionalBFS_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_BidirectionalBFS_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += graph_AllPathsSearch
graph_AllPathsSearch_SOURCES = Graph/AllPathsSearchTest.cpp
-graph_AllPathsSearch_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_AllPathsSearch_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_AllPathsSearch_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_AllPathsSearch_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_AllPathsSearch_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_AllPathsSearch_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += graph_HashGraph
graph_HashGraph_SOURCES = Graph/HashGraphTest.cpp
-graph_HashGraph_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_HashGraph_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_HashGraph_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_HashGraph_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_HashGraph_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_HashGraph_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += graph_ConstrainedBidiBFSVisitor
graph_ConstrainedBidiBFSVisitor_SOURCES = \
Graph/ConstrainedBidiBFSVisitorTest.cpp
-graph_ConstrainedBidiBFSVisitor_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_ConstrainedBidiBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_ConstrainedBidiBFSVisitor_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_ConstrainedBidiBFSVisitor_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_ConstrainedBidiBFSVisitor_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_ConstrainedBidiBFSVisitor_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += graph_ExtendPath
graph_ExtendPath_SOURCES = Graph/ExtendPathTest.cpp
-graph_ExtendPath_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
-graph_ExtendPath_LDADD = $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-graph_ExtendPath_CXXFLAGS = $(GTEST_CXXFLAGS_)
-graph_ExtendPath_LDFLAGS = $(GTEST_LDFLAGS_)
+graph_ExtendPath_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+graph_ExtendPath_LDADD = $(top_builddir)/Common/libcommon.a $(LDADD)
check_PROGRAMS += Konnector_konnector
Konnector_konnector_SOURCES = \
Konnector/konnectorTest.cpp
-Konnector_konnector_CPPFLAGS = $(GTEST_INCLUDES_) -I$(top_srcdir)/Common
+Konnector_konnector_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
Konnector_konnector_LDADD = \
$(top_builddir)/Align/libalign.a \
- $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-Konnector_konnector_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-Konnector_konnector_LDFLAGS = $(GTEST_LDFLAGS_)
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+Konnector_konnector_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
check_PROGRAMS += DBG_LoadAlgorithm
DBG_LoadAlgorithm_SOURCES = \
DBG/LoadAlgorithmTest.cpp
DBG_LoadAlgorithm_CPPFLAGS = \
- $(GTEST_INCLUDES_) \
+ $(AM_CPPFLAGS) \
-I$(top_srcdir)/DataLayer \
-I$(top_srcdir)/Common
DBG_LoadAlgorithm_LDADD = \
$(top_builddir)/Assembly/libassembly.a \
$(top_builddir)/DataLayer/libdatalayer.a \
- $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-DBG_LoadAlgorithm_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-DBG_LoadAlgorithm_LDFLAGS = $(GTEST_LDFLAGS_)
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+DBG_LoadAlgorithm_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
if PAIRED_DBG
@@ -152,59 +113,103 @@ check_PROGRAMS += PairedDBG_LoadAlgorithm
PairedDBG_LoadAlgorithm_SOURCES = \
PairedDBG/LoadAlgorithmTest.cpp
PairedDBG_LoadAlgorithm_CPPFLAGS = \
- $(GTEST_INCLUDES_) \
+ $(AM_CPPFLAGS) \
-I$(top_srcdir)/DataLayer \
-I$(top_srcdir)/Common
PairedDBG_LoadAlgorithm_LDADD = \
$(top_builddir)/PairedDBG/libpaireddbg.a \
$(top_builddir)/Assembly/libassembly.a \
$(top_builddir)/DataLayer/libdatalayer.a \
- $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-PairedDBG_LoadAlgorithm_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-PairedDBG_LoadAlgorithm_LDFLAGS = $(GTEST_LDFLAGS_)
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+PairedDBG_LoadAlgorithm_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
check_PROGRAMS += PairedDBG_KmerPair
PairedDBG_KmerPair_SOURCES = \
PairedDBG/KmerPairTest.cc
PairedDBG_KmerPair_CPPFLAGS = \
- $(GTEST_INCLUDES_) \
+ $(AM_CPPFLAGS) \
-I$(top_srcdir)/DataLayer \
-I$(top_srcdir)/Common
PairedDBG_KmerPair_LDADD = \
$(top_builddir)/PairedDBG/libpaireddbg.a \
$(top_builddir)/DataLayer/libdatalayer.a \
- $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-PairedDBG_KmerPair_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-PairedDBG_KmerPair_LDFLAGS = $(GTEST_LDFLAGS_)
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+PairedDBG_KmerPair_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
check_PROGRAMS += PairedDBG_Dinuc
PairedDBG_Dinuc_SOURCES = \
PairedDBG/DinucTest.cc
PairedDBG_Dinuc_CPPFLAGS = \
- $(GTEST_INCLUDES_) \
+ $(AM_CPPFLAGS) \
-I$(top_srcdir)/DataLayer \
-I$(top_srcdir)/Common
PairedDBG_Dinuc_LDADD = \
$(top_builddir)/PairedDBG/libpaireddbg.a \
$(top_builddir)/DataLayer/libdatalayer.a \
- $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
PairedDBG_Dinuc_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
-PairedDBG_Dinuc_LDFLAGS = $(GTEST_LDFLAGS_)
check_PROGRAMS += PairedDBG_BranchRecord
PairedDBG_BranchRecord_SOURCES = \
PairedDBG/BranchRecordTest.cpp
PairedDBG_BranchRecord_CPPFLAGS = \
- $(GTEST_INCLUDES_) \
+ $(AM_CPPFLAGS) \
-I$(top_srcdir)/DataLayer \
-I$(top_srcdir)/Common
PairedDBG_BranchRecord_LDADD = \
$(top_builddir)/PairedDBG/libpaireddbg.a \
$(top_builddir)/DataLayer/libdatalayer.a \
- $(top_builddir)/Common/libcommon.a $(GTEST_LIBS_)
-PairedDBG_BranchRecord_CXXFLAGS = $(GTEST_CXXFLAGS_) $(OPENMP_CXXFLAGS)
-PairedDBG_BranchRecord_LDFLAGS = $(GTEST_LDFLAGS_)
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+PairedDBG_BranchRecord_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
endif # PAIRED_DBG
+check_PROGRAMS += BloomDBG_BloomDBG
+BloomDBG_BloomDBG_SOURCES = BloomDBG/BloomDBGTest.cpp
+BloomDBG_BloomDBG_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/Common
+BloomDBG_BloomDBG_CXXFLAGS = $(AM_CXXFLAGS) $(OPENMP_CXXFLAGS)
+BloomDBG_BloomDBG_LDADD = \
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+
+check_PROGRAMS += BloomDBG_RollingHash
+BloomDBG_RollingHash_SOURCES = BloomDBG/RollingHashTest.cpp
+BloomDBG_RollingHash_LDADD = \
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+
+check_PROGRAMS += BloomDBG_RollingHashIterator
+BloomDBG_RollingHashIterator_SOURCES = BloomDBG/RollingHashIteratorTest.cpp
+BloomDBG_RollingHashIterator_LDADD = \
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+
+check_PROGRAMS += BloomDBG_HashAgnosticCascadingBloom
+BloomDBG_HashAgnosticCascadingBloom_SOURCES = \
+ BloomDBG/HashAgnosticCascadingBloomTest.cpp
+BloomDBG_HashAgnosticCascadingBloom_CXXFLAGS = $(AM_CXXFLAGS) \
+ $(OPENMP_CXXFLAGS)
+
+check_PROGRAMS += BloomDBG_RollingBloomDBG
+BloomDBG_RollingBloomDBG_SOURCES = BloomDBG/RollingBloomDBGTest.cpp
+BloomDBG_RollingBloomDBG_CXXFLAGS = $(AM_CXXFLAGS) \
+ $(OPENMP_CXXFLAGS)
+BloomDBG_RollingBloomDBG_LDADD = \
+ $(top_builddir)/DataLayer/libdatalayer.a \
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+
+check_PROGRAMS += BloomDBG_MaskedKmer
+BloomDBG_MaskedKmer_SOURCES = BloomDBG/MaskedKmerTest.cpp
+BloomDBG_MaskedKmer_LDADD = \
+ $(top_builddir)/Common/libcommon.a \
+ $(LDADD)
+
+check_PROGRAMS += BloomDBG_SpacedSeed
+BloomDBG_SpacedSeed_SOURCES = BloomDBG/SpacedSeedTest.cpp
+
TESTS = $(check_PROGRAMS)
diff --git a/bin/abyss-adjtodot.pl b/bin/abyss-adjtodot.pl
index 416eafe..2736fa6 100755
--- a/bin/abyss-adjtodot.pl
+++ b/bin/abyss-adjtodot.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Convert an ABySS adjacency file to GraphViz dot format.
# Written by Shaun Jackman <sjackman at bcgsc.ca>.
use strict;
diff --git a/bin/abyss-cstont b/bin/abyss-cstont
index 270e02a..33137f8 100755
--- a/bin/abyss-cstont
+++ b/bin/abyss-cstont
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Convert colour-space FASTA sequences to nucleotide FASTA sequences.
# Written by Shaun Jackman <sjackman at bcgsc.ca>.
# Usage: cstofasta data.csfa >data.fa
diff --git a/bin/abyss-dida b/bin/abyss-dida
index 593cfec..31cbc7d 100755
--- a/bin/abyss-dida
+++ b/bin/abyss-dida
@@ -71,7 +71,7 @@ fi
# Add file arguments to dida command. Convert all input file paths
# to absolute, since we change to a temp dir below
-query=($(readlink -f "$@"))
+query=($(echo "$@" | xargs -n1 readlink -f))
target=${query[${#query[@]}-1]}
unset query[${#query[@]}-1]
diff --git a/bin/abyss-fac.pl b/bin/abyss-fac.pl
index 828caa1..4728760 100755
--- a/bin/abyss-fac.pl
+++ b/bin/abyss-fac.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# abyss-fac (FASTA count)
# Calculate assembly contiguity statistics, such as N50.
# Written by Shaun Jackman <sjackman at bcgsc.ca>.
diff --git a/bin/abyss-fatoagp b/bin/abyss-fatoagp
index a5297b9..65353b2 100755
--- a/bin/abyss-fatoagp
+++ b/bin/abyss-fatoagp
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Convert a FASTA file of scaffolds to a FASTA file of contigs and an
# AGP file.
# Written by Shaun Jackman <sjackman at bcgsc.ca>.
@@ -7,9 +7,12 @@ use strict;
use Getopt::Std qw'getopts';
my %opt;
-getopts 'f:s:', \%opt;
+getopts 'f:s:S:', \%opt;
my $opt_fasta = $opt{'f'};
-my $opt_min_len = defined $opt{'s'} ? $opt{'s'} : 200;
+# scaffolds shorter than this length will be excluded
+my $opt_min_scaf_len = defined $opt{'s'} ? $opt{'s'} : 200;
+# scaftigs shorter than this length will be masked with "N"s
+my $opt_min_ctg_len = defined $opt{'S'} ? $opt{'S'} : 50;
open FASTA, ">$opt_fasta"
or die "error: `$opt_fasta': $!\n"
@@ -24,9 +27,21 @@ while (<>) {
my $scafseq = <>;
chomp $scafseq;
my $scaflen = $scafseq =~ tr/ACGTacgt//;
- next if $scaflen < $opt_min_len;
+ next if $scaflen < $opt_min_scaf_len;
+ # mask scaftigs shorter than length threshold with "N"s
my @ctgseqs = split /([Nn]+)/, $scafseq;
+ foreach my $ctgseq (@ctgseqs) {
+ next if $ctgseq =~ /^[nN]/;
+ if (length($ctgseq) < $opt_min_ctg_len) {
+ $ctgseq = "N" x length($ctgseq);
+ }
+ }
+ # rejoin and split to merge adjacent stretches of "N"s
+ $scafseq = join '', @ctgseqs;
+ next unless $scafseq =~ /[^nN]/;
+ @ctgseqs = split /([Nn]+)/, $scafseq;
+
my $i = 0;
my $x = 0;
for my $ctgseq (@ctgseqs) {
diff --git a/bin/abyss-joindist b/bin/abyss-joindist
index ddb03be..860b55d 100755
--- a/bin/abyss-joindist
+++ b/bin/abyss-joindist
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
# Join multiple ABySS distance estimate files.
# Written by Shaun Jackman <sjackman at bcgsc.ca>.
use strict;
diff --git a/bin/abyss-pe b/bin/abyss-pe
index 60436c6..48cdd2f 100755
--- a/bin/abyss-pe
+++ b/bin/abyss-pe
@@ -101,7 +101,7 @@ species ?= ""
endif
# Programs
-MARKDOWN=multimarkdown
+MARKDOWN=pandoc
map=$(foreach a,$(2),$(call $(1),$(a)))
deref=$($1)
@@ -129,6 +129,18 @@ graph?=dot
# g is private. Use graph instead.
override g:=$(graph)
+# Number of threads
+ifdef PE_HOSTFILE
+hostname?=$(shell hostname -f)
+j?=$(shell awk '$$1 == "$(hostname)" {print $$2}' $(PE_HOSTFILE))
+endif
+ifeq ($j,)
+j:=$(np)
+endif
+ifeq ($j,)
+j:=2
+endif
+
# ABYSS parameters
q ?= 3
abyssopt += -k$k -q$q
@@ -157,31 +169,42 @@ endif
ifdef ss
SS=--SS
endif
-abyssopt += $v $(dbopt) $(SS) --coverage-hist=coverage.hist -s $*-bubbles.fa
+abyssopt += $v
-# Number of threads
-ifdef PE_HOSTFILE
-hostname?=$(shell hostname -f)
-j?=$(shell awk '$$1 == "$(hostname)" {print $$2}' $(PE_HOSTFILE))
+# additional params for Bloom filter assembly (`abyss-bloom-dbg`)
+ifdef B
+abyssopt += -b$B
+ifdef H
+abyssopt += -H$H
endif
-ifeq ($j,)
-j:=$(np)
+ifdef j
+abyssopt += -j$j
endif
-ifeq ($j,)
-j:=2
+ifdef kc
+abyssopt += --kc=$(kc)
+endif
+ifdef x
+abyssopt += -s$x
+endif
+else
+abyssopt += $(dbopt) $(SS) --coverage-hist=coverage.hist -s $*-bubbles.fa
endif
# AdjList parameters
m?=50
alopt += $v $(dbopt) $(SS) -k$k -m$m
+ifndef B
ifdef K
alopt += -K$K
endif
+endif
# filtergraph parameters
+ifndef B
ifdef K
fgopt += --assemble --shim-max-degree=2
endif
+endif
ifdef xtip
fgopt += -t$(shell echo $k*2 |bc)
endif
@@ -237,8 +260,8 @@ fmopt=$v $(dbopt) -l$($*_l) $(FIXMATE_OPTIONS)
# DistanceEst parameters
DistanceEst?=DistanceEst$(ssq_t)
-l?=$k
-s?=200
+l?=40
+s?=1000
n?=10
libs=$(pe) $(mp)
$(foreach i,$(libs),$(eval $i_l?=$l))
@@ -246,6 +269,7 @@ $(foreach i,$(libs),$(eval $i_s?=$s))
$(foreach i,$(libs),$(eval $i_n?=$n))
deopt=$v $(dbopt) -j$j -k$k -l$($*_l) -s$($*_s) -n$($*_n) $($*_de) \
$(DISTANCEEST_OPTIONS)
+scaffold_deopt=--dot --mean $(deopt)
# SimpleGraph parameters
sgopt += $(dbopt)
@@ -270,9 +294,17 @@ pcopt += -p$p
mcopt += $v $(dbopt) -k$k
# Scaffold parameters
-S?=$s
+S?=1000-10000
N?=$n
scopt += $v $(dbopt) $(SS) -k$k
+ifdef G
+scopt += -G$G
+endif
+
+# abyss-fac parameters
+ifdef G
+override facopt = -G$G
+endif
# BWA-SW parameters
bwaswopt=-t$j
@@ -305,23 +337,21 @@ error::
# Help and version messages
help:
- @printf '\
-Usage: abyss-pe [OPTION]... [PARAMETER=VALUE]... [COMMAND]...\n\
-Assemble reads into contigs and scaffolds. ABySS is a de novo\n\
-sequence assembler intended for short paired-end reads and large\n\
-genomes. See the abyss-pe man page for documentation of assembly\n\
-parameters and commands. abyss-pe is a Makefile script, and so\n\
-options of `make` may also be used with abyss-pe. See the `make`\n\
-man page for documentation.\n\
-\n\
-Report bugs to <abyss-users at bcgsc.ca>.\n'
+ @echo 'Usage: abyss-pe [OPTION]... [PARAMETER=VALUE]... [COMMAND]...'
+ @echo 'Assemble reads into contigs and scaffolds. ABySS is a de novo'
+ @echo 'sequence assembler intended for short paired-end reads and large'
+ @echo 'genomes. See the abyss-pe man page for documentation of assembly'
+ @echo 'parameters and commands. abyss-pe is a Makefile script, and so'
+ @echo 'options of `make` may also be used with abyss-pe. See the `make`'
+ @echo 'man page for documentation.'
+ @echo
+ @echo 'Report bugs to https://github.com/bcgsc/abyss/issues or abyss-users at bcgsc.ca.'
version:
- @printf '\
-abyss-pe (ABySS) 1.9.0\n\
-Written by Shaun Jackman and Anthony Raymond.\n\
-\n\
-Copyright 2012 Canada'\''s Michael Smith Genome Science Centre\n'
+ @echo "abyss-pe (ABySS) 2.0.1"
+ @echo "Written by Shaun Jackman and Anthony Raymond."
+ @echo
+ @echo "Copyright 2012 Canada's Michael Smith Genome Science Centre"
versions: version
@echo PATH=$(PATH)
@@ -453,7 +483,11 @@ startDb:
> db.txt
endif
-ifdef K
+ifdef B
+%-1.fa:
+ abyss-bloom-dbg $(abyssopt) $(ABYSS_OPTIONS) $(in) $(se) > $@
+else ifdef K
+
ifdef np
%-1.fa:
$(mpirun) -np $(np) abyss-paired-dbg-mpi $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa $(in) $(se)
@@ -462,15 +496,13 @@ else
abyss-paired-dbg $(abyssopt) $(ABYSS_OPTIONS) -o $*-1.fa -g $*-1.$g $(in) $(se)
endif
-else
-ifdef np
+else ifdef np
%-1.fa:
$(mpirun) -np $(np) ABYSS-P $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
else
%-1.fa:
ABYSS $(abyssopt) $(ABYSS_OPTIONS) -o $@ $(in) $(se)
endif
-endif
# Find overlapping contigs
@@ -560,7 +592,7 @@ ifndef cs
%-5.path %-5.fa %-5.$g: %-3.fa %-4.fa %-4.$g %-4.path3
cat $(wordlist 1, 2, $^) \
- |PathConsensus $v --$g -k$k $(pcopt) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
+ |PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -o $*-5.path -s $*-5.fa -g $*-5.$g - $(wordlist 3, 4, $^)
%-6.fa: %-3.fa %-4.fa %-5.fa %-5.$g %-5.path
cat $(wordlist 1, 3, $^) |MergeContigs $(mcopt) -o $@ - $(wordlist 4, 5, $^)
@@ -607,17 +639,17 @@ endif
%-6.dist.dot: %-6.sam.gz %-6.hist
gunzip -c $< \
- |$(DistanceEst) --dot $(deopt) -o $@ $*-6.hist
+ |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
%-6.dist.dot: %-6.bam %-6.hist
samtools view -h $< \
- |$(DistanceEst) --dot $(deopt) -o $@ $*-6.hist
+ |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
%-6.dist.dot: $(name)-6.fa
$(align) $(mapopt) $(strip $($*)) $< \
|$(fixmate) $(fmopt) -h $*-6.hist \
|sort -snk3 -k4 \
- |$(DistanceEst) --dot $(deopt) -o $@ $*-6.hist
+ |$(DistanceEst) $(scaffold_deopt) -o $@ $*-6.hist
# Scaffold
@@ -625,7 +657,7 @@ endif
abyss-scaffold $(scopt) -s$S -n$N -g $@.dot $(SCAFFOLD_OPTIONS) $^ >$@
%-7.path %-7.$g %-7.fa: %-6.fa %-6.$g %-6.path
- PathConsensus $v --$g -k$k $(pcopt) -s $*-7.fa -g $*-7.$g -o $*-7.path $^
+ PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-7.fa -g $*-7.$g -o $*-7.path $^
%-8.fa: %-6.fa %-7.fa %-7.$g %-7.path
cat $(wordlist 1, 2, $^) \
@@ -674,7 +706,7 @@ sealer_ks?=-k90 -k80 -k70 -k60 -k50 -k40 -k30
abyss-scaffold $(scopt) -s$S -n1 -g $@.$g $(SCAFFOLD_OPTIONS) $^ >$@
%-9.path %-9.$g %-9.fa: %-8.fa %-8.$g %-8.path
- PathConsensus $v --$g -k$k $(pcopt) -s $*-9.fa -g $*-9.$g -o $*-9.path $^
+ PathConsensus $v --$g -k$k $(pcopt) $(PATHCONSENSUS_OPTIONS) -s $*-9.fa -g $*-9.$g -o $*-9.path $^
%-10.fa: %-8.fa %-9.fa %-9.$g %-9.path
cat $(wordlist 1, 2, $^) \
@@ -750,7 +782,7 @@ ifneq ($(long),)
$(name)-stats.tab: %-stats.tab: %-long-scaffs.fa
endif
$(name)-stats.tab:
- abyss-fac $(FAC_OPTIONS) $^ |tee $@
+ abyss-fac $(facopt) $(FAC_OPTIONS) $^ |tee $@
%.csv: %.tab
tr '\t' , <$< >$@
@@ -778,7 +810,7 @@ $(name)-stats.tab:
# Report ABySS configuration variable(s) and value(s) currently set.
-override varList := a b c d e E j k l m n N p q s S t v cs pi \
+override varList := a b c d e E G j k l m n N p q s S t v cs pi \
np pe lib mp se SS hostname xtip \
ssq ssq_ti libs path name in mpirun \
aligner long ref fixmate DistanceEst \
@@ -792,12 +824,11 @@ override varList := a b c d e E j k l m n N p q s S t v cs pi \
MARKDOWN
env:
- @echo -e "\
- List of ABySS configuration variables currently set:\n\n\
- [environment], if variable was inherited from the environment.\n\
- [command line], if variable was defined on the command line.\n\
- [file], if variable was defined in (this) makefile.\n\
- [override], if variable was defined with an override directive in (this) makefile.\n"
+ @echo 'List of ABySS configuration variables currently set:'
+ @echo '[environment], if variable was inherited from the environment.'
+ @echo '[command line], if variable was defined on the command line.'
+ @echo '[file], if variable was defined in (this) makefile.'
+ @echo '[override], if variable was defined with an override directive in (this) makefile.'
@$(foreach var,$(varList),\
echo -e $(var)" = "$($(var))"\t["$(origin $(var))"]" | grep -v "undefined";)
diff --git a/bin/abyss-samtoafg b/bin/abyss-samtoafg
index 48f972f..893dfe8 100755
--- a/bin/abyss-samtoafg
+++ b/bin/abyss-samtoafg
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
use strict;
use Getopt::Long;
use Pod::Usage;
diff --git a/configure.ac b/configure.ac
index a3af488..884852b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
AC_PREREQ(2.62)
-AC_INIT(ABySS, 1.9.0, abyss-users at bcgsc.ca, abyss,
+AC_INIT(ABySS, 2.0.1, abyss-users at bcgsc.ca, abyss,
http://www.bcgsc.ca/platform/bioinfo/software/abyss)
m4_include(m4/m4_ax_pthread.m4)
AM_INIT_AUTOMAKE(1.9.6 foreign subdir-objects)
@@ -104,12 +104,16 @@ AC_ARG_WITH(sqlite, AS_HELP_STRING([--with-sqlite=PATH],
[specify prefix directory for the installed sqlite library]))
if test "$with_sqlite" -a "$with_sqlite" != "no" -a -d "$with_sqlite"; then
sqlite_cppflags="-I$with_sqlite/include"
- sqlite_ldflags="-L$with_sqlite/lib -lsqlite3"
+ if test -d "$with_sqlite/lib64"; then
+ sqlite_ldflags="-L$with_sqlite/lib64 -lsqlite3"
+ else
+ sqlite_ldflags="-L$with_sqlite/lib -lsqlite3"
+ fi
fi
# SparseHash
AC_ARG_WITH(sparsehash, AS_HELP_STRING([--with-sparsehash=PATH],
- [specify prefix directory for the installed spasehash library]))
+ [specify prefix directory for the installed sparsehash library]))
if test "$with_sparsehash" -a "$with_sparsehash" != "no" -a -d "$with_sparsehash" ; then
sparsehash_cppflags="-isystem$with_sparsehash/include"
sparsehash_ldflags="-L$with_sparsehash/lib"
@@ -122,10 +126,15 @@ AC_DEFINE_UNQUOTED(FMBITS, $enable_fm,
[Width of bits of the FM-index in bits])
AC_ARG_ENABLE(maxk, AS_HELP_STRING([--enable-maxk=N],
- [set the maximum k-mer length (default is 96)]),
- [], [enable_maxk=96])
+ [set the maximum k-mer length (default is 128)]),
+ [], [enable_maxk=128])
AC_DEFINE_UNQUOTED(MAX_KMER, [$enable_maxk], [maximum k-mer length])
+AC_ARG_ENABLE(max-hashes, AS_HELP_STRING([--enable-max-hashes],
+ [set the maximum number of Bloom filter hash functions (default is 32)]),
+ [], [enable_max_hashes=32])
+AC_DEFINE_UNQUOTED(MAX_HASHES, [$enable_max_hashes], [maximum Bloom filter hash functions])
+
# Find the absolute path to the source.
my_abs_srcdir=$(cd $srcdir; pwd)
@@ -220,6 +229,9 @@ fi
if (test "$ac_cv_header_sqlite3_h" = "yes" -a "$ac_cv_lib_sqlite3_main" = "yes"); then
AC_DEFINE(_SQL, 1, [Define to 1 if you have sqlite lib/header])
fi
+AM_CONDITIONAL(HAVE_SQLITE3,
+ [test "$ac_cv_header_sqlite3_h" = "yes" -a "$ac_cv_lib_sqlite3_main" = "yes"],
+ [Define to 1 if you have sqlite lib/header])
AC_SUBST(SQLITE_LIBS, "$LIBS")
LIBS=$libs
@@ -284,18 +296,21 @@ AC_CONFIG_FILES([
Unittest/Makefile
LogKmerCount/Makefile
Bloom/Makefile
+ BloomDBG/Makefile
DataBase/Makefile
+ lib/bloomfilter/Makefile
+ lib/rolling-hash/Makefile
])
if test "$with_sparsehash" != "no" -a "$ac_cv_header_google_sparse_hash_map" != "yes"; then
AC_MSG_ERROR([ABySS should be compiled with Google sparsehash to
reduce memory usage. It may be downloaded here:
- http://code.google.com/p/google-sparsehash
+ https://code.google.com/p/sparsehash/
If you do not wish to use sparsehash, specify --without-sparsehash.])
fi
-if test $ac_cv_header_pthread_h != yes -o $ac_cv_lib_pthread_pthread_create != yes; then
+if test x"$have_pthread" != x"yes"; then
AC_MSG_WARN([Warning: Running the unit tests with 'make check' has been disabled
because pthread.h and/or libpthread could not be found.])
fi
diff --git a/doc/ABYSS.1 b/doc/ABYSS.1
index 6b90240..14a9f69 100644
--- a/doc/ABYSS.1
+++ b/doc/ABYSS.1
@@ -1,4 +1,4 @@
-.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 1.9.0" "User Commands"
+.TH ABYSS "1" "2015-May" "ABYSS (ABySS) 2.0.1" "User Commands"
.SH NAME
ABYSS \- assemble short reads into contigs
.SH SYNOPSIS
diff --git a/doc/abyss-pe.1 b/doc/abyss-pe.1
index 349ee05..fb94fe7 100644
--- a/doc/abyss-pe.1
+++ b/doc/abyss-pe.1
@@ -1,4 +1,4 @@
-.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 1.9.0" "User Commands"
+.TH abyss-pe "1" "2015-May" "abyss-pe (ABySS) 2.0.1" "User Commands"
.SH NAME
abyss-pe - assemble reads into contigs
.SH SYNOPSIS
@@ -49,7 +49,10 @@ files containing single-end reads
maximum number of branches of a bubble [2]
.TP
.B b
-maximum length of a bubble (bp) [10000]
+maximum length of a bubble (bp) [""]
+.br
+abyss-pe has two bubble popping stages. The default limits are 3*k bp
+for ABYSS and 10000 bp for PopBubbles.
.TP
.B c
minimum mean k-mer coverage of a unitig [sqrt(median)]
@@ -58,10 +61,10 @@ minimum mean k-mer coverage of a unitig [sqrt(median)]
allowable error of a distance estimate (bp) [6]
.TP
.B e
-minimum erosion k-mer coverage [sqrt(median)]
+minimum erosion k-mer coverage [round(sqrt(median))]
.TP
.B E
-minimum erosion k-mer coverage per strand [1]
+minimum erosion k-mer coverage per strand [1 if sqrt(median) > 2 else 0]
.TP
.B j
number of threads [2]
@@ -73,7 +76,7 @@ size of a k-mer (when K is not set) or the span of a k-mer pair (when K is set)
size of a single k-mer in a k-mer pair (bp)
.TP
.B l
-minimum alignment length of a read (bp) [k]
+minimum alignment length of a read (bp) [40]
.TP
.B m
minimum overlap of two unitigs (bp) [30]
@@ -98,13 +101,13 @@ minimum base quality [0]
Mask all bases of reads whose quality is less than Q as `N'.
.TP
.B s
-minimum unitig size required for building contigs (bp) [200]
+minimum unitig size required for building contigs (bp) [1000]
.br
The seed length should be at least twice the value of k. If more
sequence is assembled than the expected genome size, try increasing s.
.TP
.B S
-minimum contig size required for building scaffolds (bp) [s]
+minimum contig size required for building scaffolds (bp) [1000-10000]
.TP
.B SS
SS=--SS to assemble in strand-specific mode
@@ -114,7 +117,7 @@ Assumes that the first read in a read pair is reveresed WRT the
transcripts sequenced.
.TP
.B t
-minimum tip size (bp) [2k]
+maximum length of blunt contigs to trim [k]
.TP
.B v
v=-v to enable verbose logging
diff --git a/doc/abyss-tofastq.1 b/doc/abyss-tofastq.1
index e997080..4250191 100644
--- a/doc/abyss-tofastq.1
+++ b/doc/abyss-tofastq.1
@@ -1,4 +1,4 @@
-.TH abyss-tofastq "1" "2015-May" "ABySS 1.9.0" "User Commands"
+.TH abyss-tofastq "1" "2015-May" "ABySS 2.0.1" "User Commands"
.SH NAME
abyss-tofastq \- convert various file formats to FASTQ format
.br
diff --git a/doc/flowchart.graffle b/doc/flowchart.graffle
index c00b837..a695117 100644
--- a/doc/flowchart.graffle
+++ b/doc/flowchart.graffle
@@ -5198,7 +5198,7 @@
{\colortbl;\red255\green255\blue255;}
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc
-\f0\b\fs28 \cf0 ABySS paired-end pipeline version 1.9.0}</string>
+\f0\b\fs28 \cf0 ABySS paired-end pipeline version 2.0.1}</string>
<key>VerticalPad</key>
<integer>0</integer>
</dict>
diff --git a/lib/bloomfilter/BloomFilter.hpp b/lib/bloomfilter/BloomFilter.hpp
new file mode 100644
index 0000000..4b60eb7
--- /dev/null
+++ b/lib/bloomfilter/BloomFilter.hpp
@@ -0,0 +1,446 @@
+/*
+ *
+ * BloomFilter.hpp
+ *
+ * Created on: Aug 10, 2012
+ * Author: cjustin
+ */
+
+#ifndef BLOOMFILTER_H_
+#define BLOOMFILTER_H_
+#include <string>
+#include <vector>
+#include <stdint.h>
+#include <math.h>
+#include <fstream>
+#include <iostream>
+#include <sys/stat.h>
+#include <cstring>
+#include <cassert>
+#include <cstdlib>
+#include <stdio.h>
+#include <cstring>
+#include "lib/rolling-hash/rolling.h"
+
+using namespace std;
+
+static const uint8_t bitsPerChar = 0x08;
+static const unsigned char bitMask[0x08] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20,
+ 0x40, 0x80 };
+
+inline unsigned popCnt(unsigned char x) {
+ return ((0x876543210
+ >> (((0x4332322132212110 >> ((x & 0xF) << 2)) & 0xF) << 2))
+ >> ((0x4332322132212110 >> (((x & 0xF0) >> 2)) & 0xF) << 2)) & 0xf;
+}
+
+class BloomFilter {
+public:
+
+ struct FileHeader {
+ char magic[8];
+ uint32_t hlen;
+ uint64_t size;
+ uint32_t nhash;
+ uint32_t kmer;
+ double dFPR;
+ uint64_t nEntry;
+ uint64_t tEntry;
+ };
+
+ /*
+ * Default constructor.
+ */
+ BloomFilter() :
+ m_filter(0), m_size(0), m_sizeInBytes(0), m_hashNum(0), m_kmerSize(
+ 0), m_dFPR(0), m_nEntry(0), m_tEntry(0) {
+ }
+
+ /* De novo filter constructor.
+ *
+ * preconditions:
+ * filterSize must be a multiple of 64
+ *
+ * kmerSize refers to the number of bases the kmer has
+ */
+ BloomFilter(size_t filterSize, unsigned hashNum, unsigned kmerSize) :
+ m_size(filterSize), m_hashNum(hashNum), m_kmerSize(kmerSize), m_dFPR(
+ 0), m_nEntry(0), m_tEntry(0) {
+ initSize(m_size);
+ memset(m_filter, 0, m_sizeInBytes);
+ }
+
+ /* De novo filter constructor.
+ * Allocates a filter size based on the number of expected elements and FPR
+ *
+ * If hashNum is set to 0, an optimal value is computed based on the FPR
+ */
+ BloomFilter(size_t expectedElemNum, double fpr, unsigned hashNum,
+ unsigned kmerSize) :
+ m_size(0), m_hashNum(hashNum), m_kmerSize(kmerSize), m_dFPR(fpr), m_nEntry(
+ 0), m_tEntry(0) {
+ if (m_hashNum == 0) {
+ m_hashNum = calcOptiHashNum(m_dFPR);
+ }
+ if (m_size == 0) {
+ m_size = calcOptimalSize(expectedElemNum, m_dFPR);
+ }
+ initSize(m_size);
+ memset(m_filter, 0, m_sizeInBytes);
+ }
+
+ BloomFilter(const string &filterFilePath) {
+ FILE *file = fopen(filterFilePath.c_str(), "rb");
+ if (file == NULL) {
+ cerr << "file \"" << filterFilePath << "\" could not be read."
+ << endl;
+ exit(1);
+ }
+
+ loadHeader(file);
+
+ long int lCurPos = ftell(file);
+ fseek(file, 0, 2);
+ size_t fileSize = ftell(file) - sizeof(struct FileHeader);
+ fseek(file, lCurPos, 0);
+ if (fileSize != m_sizeInBytes) {
+ cerr << "Error: " << filterFilePath
+ << " does not match size given by its information file. Size: "
+ << fileSize << " vs " << m_sizeInBytes << " bytes." << endl;
+ exit(1);
+ }
+
+ size_t countRead = fread(m_filter, fileSize, 1, file);
+ if (countRead != 1 && fclose(file) != 0) {
+ cerr << "file \"" << filterFilePath << "\" could not be read."
+ << endl;
+ exit(1);
+ }
+ }
+
+ void loadHeader(FILE *file) {
+
+ FileHeader header;
+ if (fread(&header, sizeof(struct FileHeader), 1, file) == 1) {
+ cerr << "Loading header..." << endl;
+ } else {
+ cerr << "Failed to header" << endl;
+ }
+ char magic[9];
+ strncpy(magic, header.magic, 8);
+ magic[8] = '\0';
+
+// cerr << "Loading header... magic: " <<
+// magic << " hlen: " <<
+// header.hlen << " size: " <<
+// header.size << " nhash: " <<
+// header.nhash << " kmer: " <<
+// header.kmer << " dFPR: " <<
+// header.dFPR << " aFPR: " <<
+// header.aFPR << " rFPR: " <<
+// header.rFPR << " nEntry: " <<
+// header.nEntry << " tEntry: " <<
+// header.tEntry << endl;
+
+ m_size = header.size;
+ initSize(m_size);
+ m_hashNum = header.nhash;
+ m_kmerSize = header.kmer;
+ }
+
+ /*
+ * Accepts a list of precomputed hash values. Faster than rehashing each time.
+ */
+ void insert(vector<size_t> const &precomputed) {
+
+ //iterates through hashed values adding it to the filter
+ for (size_t i = 0; i < m_hashNum; ++i) {
+ size_t normalizedValue = precomputed.at(i) % m_size;
+ __sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
+ bitMask[normalizedValue % bitsPerChar]);
+ }
+ }
+
+ /*
+ * Accepts a list of precomputed hash values. Faster than rehashing each time.
+ */
+ void insert(const size_t precomputed[]) {
+
+ //iterates through hashed values adding it to the filter
+ for (size_t i = 0; i < m_hashNum; ++i) {
+ size_t normalizedValue = precomputed[i] % m_size;
+ __sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
+ bitMask[normalizedValue % bitsPerChar]);
+ }
+ }
+
+ void insert(const char* kmer) {
+ uint64_t hVal = getChval(kmer, m_kmerSize);
+ for (unsigned i = 0; i < m_hashNum; i++) {
+ size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
+ __sync_or_and_fetch(&m_filter[normalizedValue / bitsPerChar],
+ bitMask[normalizedValue % bitsPerChar]);
+ }
+ }
+
+ /*
+ * Returns if already inserted
+ */
+ bool insertAndCheck(const char* kmer) {
+ uint64_t hVal = getChval(kmer, m_kmerSize);
+ bool found = true;
+ for (unsigned i = 0; i < m_hashNum; i++) {
+ size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
+ found &= __sync_or_and_fetch(
+ &m_filter[normalizedValue / bitsPerChar],
+ bitMask[normalizedValue % bitsPerChar]);
+ }
+ return found;
+ }
+
+ /*
+ * Accepts a list of precomputed hash values. Faster than rehashing each time.
+ * Returns if already inserted
+ */
+ bool insertAndCheck(vector<size_t> const &precomputed) {
+ //iterates through hashed values adding it to the filter
+ bool found = true;
+ for (size_t i = 0; i < m_hashNum; ++i) {
+ size_t normalizedValue = precomputed.at(i) % m_size;
+ found &= __sync_or_and_fetch(
+ &m_filter[normalizedValue / bitsPerChar],
+ bitMask[normalizedValue % bitsPerChar]);
+ }
+ return found;
+ }
+
+ /*
+ * Accepts a list of precomputed hash values. Faster than rehashing each time.
+ */
+ bool contains(vector<size_t> const &precomputed) const {
+ for (size_t i = 0; i < m_hashNum; ++i) {
+ size_t normalizedValue = precomputed.at(i) % m_size;
+ unsigned char bit = bitMask[normalizedValue % bitsPerChar];
+ if ((m_filter[normalizedValue / bitsPerChar] & bit) != bit) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /*
+ * Accepts a list of precomputed hash values. Faster than rehashing each time.
+ */
+ bool contains(const size_t precomputed[]) const {
+ for (size_t i = 0; i < m_hashNum; ++i) {
+ size_t normalizedValue = precomputed[i] % m_size;
+ unsigned char bit = bitMask[normalizedValue % bitsPerChar];
+ if ((m_filter[normalizedValue / bitsPerChar] & bit) != bit) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /*
+ * Single pass filtering, computes hash values on the fly
+ */
+ bool contains(const char* kmer) const {
+ uint64_t hVal = getChval(kmer, m_kmerSize);
+ for (unsigned i = 0; i < m_hashNum; i++) {
+ size_t normalizedValue = (rol(varSeed, i) ^ hVal) % m_size;
+ unsigned char bit = bitMask[normalizedValue % bitsPerChar];
+ if ((m_filter[normalizedValue / bitsPerChar] & bit) == 0)
+ return false;
+ }
+ return true;
+ }
+
+ void writeHeader(ofstream &out) const {
+ FileHeader header;
+ strncpy(header.magic, "BlOOMFXX", 8);
+ char magic[9];
+ strncpy(magic, header.magic, 8);
+ magic[8] = '\0';
+
+ header.hlen = sizeof(struct FileHeader);
+ header.size = m_size;
+ header.nhash = m_hashNum;
+ header.kmer = m_kmerSize;
+ header.dFPR = m_dFPR;
+ header.nEntry = m_nEntry;
+ header.tEntry = m_tEntry;
+
+// cerr << "Writing header... magic: "
+// << magic << " hlen: "
+// << header.hlen << " size: "
+// << header.size << " nhash: "
+// << header.nhash << " kmer: "
+// << header.kmer << " dFPR: "
+// << header.dFPR << " aFPR: "
+// << header.aFPR << " rFPR: "
+// << header.rFPR << " nEntry: "
+// << header.nEntry << " tEntry: "
+// << header.tEntry << endl;
+
+ out.write(reinterpret_cast<char*>(&header), sizeof(struct FileHeader));
+ }
+
+ /*
+ * Stores the filter as a binary file to the path specified
+ * Stores uncompressed because the random data tends to
+ * compress poorly anyway
+ */
+ void storeFilter(string const &filterFilePath) const {
+ ofstream myFile(filterFilePath.c_str(), ios::out | ios::binary);
+
+ cerr << "Storing filter. Filter is " << m_sizeInBytes << "bytes."
+ << endl;
+
+ assert(myFile);
+ writeHeader(myFile);
+
+ //write out each block
+ myFile.write(reinterpret_cast<char*>(m_filter), m_sizeInBytes);
+
+ myFile.close();
+ assert(myFile);
+ }
+
+ size_t getPop() const {
+ size_t i, popBF = 0;
+#pragma omp parallel for reduction(+:popBF)
+ for (i = 0; i < (m_size + 7) / 8; i++)
+ popBF = popBF + popCnt(m_filter[i]);
+ return popBF;
+ }
+
+ unsigned getHashNum() const {
+ return m_hashNum;
+ }
+
+ unsigned getKmerSize() const {
+ return m_kmerSize;
+ }
+
+// void setdFPR(double value) {
+// m_dFPR = value;
+// }
+
+ /*
+ * Calculates that False positive rate that a redundant entry is actually
+ * a unique entry
+ */
+ double getRedudancyFPR() {
+ assert(m_nEntry > 0);
+ double total = log(calcFPR_numInserted(1));
+ for (size_t i = 2; i < m_nEntry; ++i) {
+ total = log(exp(total) + calcFPR_numInserted(i));
+ }
+ return exp(total) / m_nEntry;
+ }
+
+ /*
+ * Return FPR based on popcount
+ */
+ double getFPR() const {
+ return pow(double(getPop())/double(m_size), m_hashNum);
+ }
+
+ /*
+ * Return FPR based on number of inserted elements
+ */
+ double getFPR_numEle() const {
+ assert(m_nEntry > 0);
+ return calcFPR_numInserted(m_nEntry);
+ }
+
+ uint64_t getnEntry() {
+ return m_nEntry;
+ }
+
+ uint64_t gettEntry() {
+ return m_tEntry;
+ }
+
+ void setnEntry(uint64_t value) {
+ m_nEntry = value;
+ }
+
+ void settEntry(uint64_t value) {
+ m_tEntry = value;
+ }
+
+ size_t getFilterSize() const {
+ return m_size;
+ }
+
+ ~BloomFilter() {
+ delete[] m_filter;
+ }
+private:
+ BloomFilter(const BloomFilter& that); //to prevent copy construction
+
+ /*
+ * Checks filter size and initializes filter
+ */
+ void initSize(size_t size) {
+ if (size % 8 != 0) {
+ cerr << "ERROR: Filter Size \"" << size
+ << "\" is not a multiple of 8." << endl;
+ exit(1);
+ }
+ m_sizeInBytes = size / bitsPerChar;
+ m_filter = new unsigned char[m_sizeInBytes];
+ }
+
+ /*
+ * Only returns multiples of 64 for filter building purposes
+ * Is an estimated size using approximations of FPR formula
+ * given the number of hash functions
+ */
+ size_t calcOptimalSize(size_t entries, double fpr) const {
+ size_t non64ApproxVal = size_t(
+ -double(entries) * double(m_hashNum)
+ / log(1.0 - pow(fpr, double(1 / double(m_hashNum)))));
+
+ return non64ApproxVal + (64 - non64ApproxVal % 64);
+ }
+
+ /*
+ * Calculates the optimal number of hash function to use
+ * Calculation assumes optimal ratio of bytes per entry given a fpr
+ */
+ static unsigned calcOptiHashNum(double fpr) {
+ return unsigned(-log(fpr) / log(2));
+ }
+
+ /*
+ * Calculate FPR based on hash functions, size and number of entries
+ * see http://en.wikipedia.org/wiki/Bloom_filter
+ */
+ double calcFPR_numInserted(size_t numEntr) const {
+ return pow(
+ 1.0
+ - pow(1.0 - 1.0 / double(m_size),
+ double(numEntr) * m_hashNum), double(m_hashNum));
+ }
+
+ /*
+ * Calculates the optimal FPR to use based on hash functions
+ */
+ double calcFPR_hashNum(unsigned hashFunctNum) const {
+ return pow(2, -hashFunctNum);
+ }
+
+ uint8_t* m_filter;
+ size_t m_size;
+ size_t m_sizeInBytes;
+ unsigned m_hashNum;
+ unsigned m_kmerSize;
+ double m_dFPR;
+ uint64_t m_nEntry;
+ uint64_t m_tEntry;
+};
+
+#endif /* BLOOMFILTER_H_ */
diff --git a/lib/bloomfilter/Makefile.am b/lib/bloomfilter/Makefile.am
new file mode 100644
index 0000000..fb10c08
--- /dev/null
+++ b/lib/bloomfilter/Makefile.am
@@ -0,0 +1 @@
+EXTRA_DIST = README.md
diff --git a/lib/bloomfilter/README.md b/lib/bloomfilter/README.md
new file mode 100644
index 0000000..651a381
--- /dev/null
+++ b/lib/bloomfilter/README.md
@@ -0,0 +1,4 @@
+These files come from:
+
+* https://github.com/bcgsc/bloomfilter
+* commit f1232c2
diff --git a/lib/rolling-hash/Makefile.am b/lib/rolling-hash/Makefile.am
new file mode 100644
index 0000000..fb10c08
--- /dev/null
+++ b/lib/rolling-hash/Makefile.am
@@ -0,0 +1 @@
+EXTRA_DIST = README.md
diff --git a/lib/rolling-hash/README.md b/lib/rolling-hash/README.md
new file mode 100644
index 0000000..b76df57
--- /dev/null
+++ b/lib/rolling-hash/README.md
@@ -0,0 +1,2 @@
+* source repo: https://github.com/bcgsc/ntHash
+* git commit: 9f107de
diff --git a/lib/rolling-hash/rolling.h b/lib/rolling-hash/rolling.h
new file mode 100644
index 0000000..1e27d6a
--- /dev/null
+++ b/lib/rolling-hash/rolling.h
@@ -0,0 +1,316 @@
+#ifndef ROLLING_HASH_H
+#define ROLLING_HASH_H
+
+#include <stdint.h>
+
+// offset for the complement base in the random seeds table
+const int cpOff = -20;
+
+// shift for generating multiple hash values
+const int varShift = 27;
+
+// seed for generating multiple hash values
+const uint64_t varSeed = 10427061540882326010ul;
+
+// 64-bit random seed table corresponding to bases and their complements
+static const uint64_t seedTab[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, // 0..7
+ 0, 0, 0, 0, 0, 0, 0, 0, // 8..15
+ 0, 0, 0, 0, 0, 0, 0, 0, // 16..23
+ 0, 0, 0, 0, 0, 0, 0, 0, // 24..31
+ 0, 0, 0, 0, 0, 0, 0, 0, // 32..39
+ 0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 40..47
+ 0, 0, 0, 3572411708064410444ul, 0, 0, 0, 0, // 48..55
+ 0, 0, 0, 0, 0, 0, 0, 0, // 56..63
+ 4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 64..71
+ 0, 0, 0, 0, 0, 2978368046464386134ul, 0, 2319985823310095140ul, // 72..79
+ 0, 0, 0, 3572411708064410444ul, 2978368046464386134ul, 0, 0, 0, // 80..87
+ 0, 0, 0, 0, 0, 0, 0, 0, // 88..95
+ 4362857412768957556ul, 4362857412768957556ul, 0, 3572411708064410444ul, 0, 0, 0, 2319985823310095140ul, // 96..103
+ 0, 0, 0, 0, 0, 0, 0, 0, // 104..111
+ 0, 0, 0, 0, 2978368046464386134ul, 0, 0, 0, // 112..119
+ 0, 0, 0, 0, 0, 0, 0, 0, // 120..127
+ 0, 0, 0, 0, 0, 0, 0, 0, // 128..135
+ 0, 0, 0, 0, 0, 0, 0, 0, // 136..143
+ 0, 0, 0, 0, 0, 0, 0, 0, // 144..151
+ 0, 0, 0, 0, 0, 0, 0, 0, // 152..159
+ 0, 0, 0, 0, 0, 0, 0, 0, // 160..167
+ 0, 0, 0, 0, 0, 0, 0, 0, // 168..175
+ 0, 0, 0, 0, 0, 0, 0, 0, // 176..183
+ 0, 0, 0, 0, 0, 0, 0, 0, // 184..191
+ 0, 0, 0, 0, 0, 0, 0, 0, // 192..199
+ 0, 0, 0, 0, 0, 0, 0, 0, // 200..207
+ 0, 0, 0, 0, 0, 0, 0, 0, // 208..215
+ 0, 0, 0, 0, 0, 0, 0, 0, // 216..223
+ 0, 0, 0, 0, 0, 0, 0, 0, // 224..231
+ 0, 0, 0, 0, 0, 0, 0, 0, // 232..239
+ 0, 0, 0, 0, 0, 0, 0, 0, // 240..247
+ 0, 0, 0, 0, 0, 0, 0, 0 // 248..255
+};
+
+// rotate "v" to the left by "s" positions
+inline uint64_t rol(const uint64_t v, const int s) {
+ return (v << s) | (v >> (64 - s));
+}
+
+// rotate "v" to the right by "s" positions
+inline uint64_t ror(const uint64_t v, const int s) {
+ return (v >> s) | (v << (64 - s));
+}
+
+// forward-strand hash value of the base kmer, i.e. fhval(kmer_0)
+inline uint64_t getFhval(const char * kmerSeq, const unsigned k) {
+ uint64_t hVal=0;
+ for(unsigned i=0; i<k; i++)
+ hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
+ return hVal;
+}
+
+// reverse-strand hash value of the base kmer, i.e. rhval(kmer_0)
+inline uint64_t getRhval(const char * kmerSeq, const unsigned k) {
+ uint64_t hVal=0;
+ for(unsigned i=0; i<k; i++)
+ hVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
+ return hVal;
+}
+
+// canonical hash value of the base kmer, i.e. chval(kmer_0)
+inline uint64_t getChval(const char * kmerSeq, const unsigned k) {
+ uint64_t fhVal = getFhval(kmerSeq, k);
+ uint64_t rhVal = getRhval(kmerSeq, k);
+ return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// initialize forward-strand hash value of the first kmer, i.e. fhval(kmer_0)
+inline uint64_t initHashes(const char * kmerSeq, const unsigned k) {
+ return getFhval(kmerSeq, k);
+}
+
+// initialize canonical hash value of the first kmer, i.e. chval(kmer_0)
+inline uint64_t initHashes(const char * kmerSeq, const unsigned k, uint64_t& fhVal, uint64_t& rhVal) {
+ fhVal = getFhval(kmerSeq, k);
+ rhVal = getRhval(kmerSeq, k);
+ return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// recursive forward-strand hash value for next k-mer
+inline uint64_t rollHashesRight(const uint64_t fhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
+ return(rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn]);
+}
+
+// recursive canonical hash value for next k-mer
+inline uint64_t rollHashesRight(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charOut, const unsigned char charIn, const unsigned k) {
+ fhVal = rol(fhVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
+ rhVal = ror(rhVal, 1) ^ ror(seedTab[charOut+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
+ return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// recursive forward-strand hash value for prev k-mer
+inline uint64_t rollHashesLeft(const uint64_t fhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
+ return(ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1));
+}
+
+// recursive canonical hash value for prev k-mer
+inline uint64_t rollHashesLeft(uint64_t& fhVal, uint64_t& rhVal, const unsigned char charIn, const unsigned char charOut, const unsigned k) {
+ fhVal = ror(fhVal, 1) ^ ror(seedTab[charOut], 1) ^ rol(seedTab[charIn], k-1);
+ rhVal = rol(rhVal, 1) ^ rol(seedTab[charOut+cpOff], k) ^ seedTab[charIn+cpOff];
+ return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+// change a single base and update forward-strand hash value accordingly
+inline uint64_t setBase(uint64_t fhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
+{
+ fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+ kmerSeq[pos] = base;
+ fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+ return fhVal;
+}
+
+// change a single base and update hash values accordingly
+inline uint64_t setBase(uint64_t& fhVal, uint64_t& rhVal, char* kmerSeq, unsigned pos, char base, unsigned k)
+{
+ fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+ rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
+ kmerSeq[pos] = base;
+ fhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]], k-1-pos);
+ rhVal ^= rol(seedTab[(unsigned char)kmerSeq[pos]+cpOff], pos);
+ return (rhVal<fhVal)? rhVal : fhVal;
+}
+
+/**
+ * Compute multiple pseudo-independent hash values from a seed hash value.
+ *
+ * @param hashes array for storing computed hash values
+ * @param seedVal seed value for multi-hash calculation
+ * @param numHashes number of hash values to compute
+ * @param k k-mer size
+ */
+inline void multiHash(uint64_t hashes[], uint64_t seedVal, unsigned numHashes, unsigned k)
+{
+ for (unsigned i = 0; i < numHashes; i++) {
+ hashes[i] = seedVal * (i ^ k * varSeed);
+ hashes[i] ^= hashes[i] >> varShift;
+ }
+}
+
+// spaced-seed hash values
+
+/**
+ * Calculate forward-strand spaced seed hash value of the base kmer, i.e. fhval(kmer_0)
+ *
+ * @param kVal set to forward-strand hash value for unmasked k-mer
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq k-mer to be hashed
+ * @param k k-mer size
+ * @return hash value for masked forward-strand k-mer
+ */
+inline uint64_t getFhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
+ kVal=0;
+ uint64_t sVal=0;
+ for(unsigned i=0; i<k; i++) {
+ kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
+ if(seedSeq[i]=='1')
+ sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]], k-1-i);
+ }
+ return sVal;
+}
+
+/**
+ * Calculate reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
+ *
+ * @param kVal set to reverse-strand hash value for unmasked k-mer
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq k-mer to be hashed
+ * @param k k-mer size
+ * @return hash for masked reverse-strand k-mer
+ */
+// reverse-strand spaced seed hash value of the base kmer, i.e. rhval(kmer_0)
+inline uint64_t getRhval(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned k) {
+ kVal=0;
+ uint64_t sVal=0;
+ for(unsigned i=0; i<k; i++) {
+ kVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
+ if(seedSeq[i]=='1')
+ sVal ^= rol(seedTab[(unsigned char)kmerSeq[i]+cpOff], i);
+ }
+ return sVal;
+}
+
+/**
+ * Recursive forward-strand spaced seed hash value for next k-mer
+ *
+ * @param kVal hash value for current k-mer unmasked and in forward orientation
+ * @param seedSeq bitmask indicating "don't care" positions for hashing
+ * @param kmerSeq sequence for *current* k-mer (not the k-mer we are rolling into)
+ * @param charIn new base we are rolling in from the right
+ * @param k k-mer size
+ * @return hash for masked k-mer in forward orientation
+ */
+inline uint64_t rollHashesRight(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+ const unsigned charOut = kmerSeq[0];
+ kVal = rol(kVal, 1) ^ rol(seedTab[charOut], k) ^ seedTab[charIn];
+ uint64_t sVal=kVal;
+ for(unsigned i=1; i<k-1; i++) {
+ if(seedSeq[i]!='1')
+ sVal ^= rol(seedTab[(unsigned char)kmerSeq[i+1]], k-1-i);
+ }
+ return sVal;
+}
+
+/**
+ * Recursively compute the forward-strand spaced seed hash of the previous
+ * k-mer (one base to the left).
+ *
+ * @param kVal in: unmasked forward-strand hash of the current k-mer;
+ *        out: unmasked forward-strand hash of the previous k-mer
+ * @param seedSeq spaced seed bitmask; '1' marks positions included in the
+ *        masked hash
+ * @param kmerSeq sequence of the *current* k-mer (not the one rolled into)
+ * @param charIn new base entering from the left
+ * @param k k-mer size
+ * @return masked forward-strand hash of the previous k-mer
+ */
+inline uint64_t rollHashesLeft(uint64_t &kVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+	const unsigned outBase = kmerSeq[k-1];
+	/* roll the unmasked hash backwards: drop outBase, shift, mix in charIn */
+	kVal = ror(kVal, 1) ^ ror(seedTab[outBase], 1) ^ rol(seedTab[charIn], k-1);
+	/* XOR out the "don't care" positions of the new k-mer; note that
+	 * position i of the new k-mer is kmerSeq[i-1] */
+	uint64_t maskedVal = kVal;
+	for (unsigned pos = 1; pos < k-1; ++pos)
+		if (seedSeq[pos] != '1')
+			maskedVal ^= rol(seedTab[(unsigned char)kmerSeq[pos-1]], k-1-pos);
+	return maskedVal;
+}
+
+/**
+ * Recursively compute the canonical spaced seed hash of the next k-mer
+ * (one base to the right).
+ *
+ * @param fkVal in: unmasked forward-strand hash of the current k-mer;
+ *        out: unmasked forward-strand hash of the next k-mer
+ * @param rkVal in: unmasked reverse-complement hash of the current k-mer;
+ *        out: unmasked reverse-complement hash of the next k-mer
+ * @param seedSeq spaced seed bitmask; '1' marks positions included in the
+ *        masked hash
+ * @param kmerSeq sequence of the *current* k-mer (not the one rolled into)
+ * @param charIn new base entering from the right
+ * @param k k-mer size
+ * @return canonical (minimum of forward/reverse) masked hash of the next k-mer
+ */
+inline uint64_t rollHashesRight(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+	const unsigned outBase = kmerSeq[0];
+	fkVal = rol(fkVal, 1) ^ rol(seedTab[outBase], k) ^ seedTab[charIn];
+	rkVal = ror(rkVal, 1) ^ ror(seedTab[outBase+cpOff], 1) ^ rol(seedTab[charIn+cpOff], k-1);
+	/* XOR out the "don't care" positions of the new k-mer on both strands;
+	 * position i of the new k-mer is kmerSeq[i+1] */
+	uint64_t fMasked = fkVal;
+	uint64_t rMasked = rkVal;
+	for (unsigned pos = 1; pos < k-1; ++pos) {
+		if (seedSeq[pos] != '1') {
+			const unsigned char b = kmerSeq[pos+1];
+			fMasked ^= rol(seedTab[b], k-1-pos);
+			rMasked ^= rol(seedTab[b+cpOff], pos);
+		}
+	}
+	return fMasked < rMasked ? fMasked : rMasked;
+}
+
+/**
+ * Recursively compute the canonical spaced seed hash of the previous k-mer
+ * (one base to the left).
+ *
+ * @param fkVal in: unmasked forward-strand hash of the current k-mer;
+ *        out: unmasked forward-strand hash of the previous k-mer
+ * @param rkVal in: unmasked reverse-complement hash of the current k-mer;
+ *        out: unmasked reverse-complement hash of the previous k-mer
+ * @param seedSeq spaced seed bitmask; '1' marks positions included in the
+ *        masked hash
+ * @param kmerSeq sequence of the *current* k-mer (not the one rolled into)
+ * @param charIn new base entering from the left
+ * @param k k-mer size
+ * @return canonical (minimum of forward/reverse) masked hash of the previous k-mer
+ */
+inline uint64_t rollHashesLeft(uint64_t &fkVal, uint64_t &rkVal, const char * seedSeq, const char * kmerSeq, const unsigned char charIn, const unsigned k) {
+	const unsigned outBase = kmerSeq[k-1];
+	fkVal = ror(fkVal, 1) ^ ror(seedTab[outBase], 1) ^ rol(seedTab[charIn], k-1);
+	rkVal = rol(rkVal, 1) ^ rol(seedTab[outBase+cpOff], k) ^ seedTab[charIn+cpOff];
+	/* XOR out the "don't care" positions of the new k-mer on both strands;
+	 * position i of the new k-mer is kmerSeq[i-1] */
+	uint64_t fMasked = fkVal;
+	uint64_t rMasked = rkVal;
+	for (unsigned pos = 1; pos < k-1; ++pos) {
+		if (seedSeq[pos] != '1') {
+			const unsigned char b = kmerSeq[pos-1];
+			fMasked ^= rol(seedTab[b], k-1-pos);
+			rMasked ^= rol(seedTab[b+cpOff], pos);
+		}
+	}
+	return fMasked < rMasked ? fMasked : rMasked;
+}
+
+/**
+ * Change a single base of the k-mer and recompute the spaced seed hash values.
+ *
+ * @param fkVal in/out: unmasked forward-strand hash, updated for the new base
+ * @param rkVal in/out: unmasked reverse-complement hash, updated for the new base
+ * @param seedSeq spaced seed bitmask; '1' marks positions included in the
+ *        masked hash
+ * @param kmerSeq current k-mer sequence (modified in place by the delegate)
+ * @param pos position of the base to change
+ * @param base new base value
+ * @param k k-mer size
+ * @return updated canonical (minimum of forward/reverse) masked hash
+ */
+inline uint64_t setBase(uint64_t& fkVal, uint64_t& rkVal, const char * seedSeq, char * kmerSeq, unsigned pos, char base, unsigned k)
+{
+	/* delegate to the seed-less overload to update the unmasked hashes
+	 * (and the k-mer itself) */
+	setBase(fkVal, rkVal, kmerSeq, pos, base, k);
+	/* then XOR out every "don't care" position on both strands */
+	uint64_t fMasked = fkVal;
+	uint64_t rMasked = rkVal;
+	for (unsigned i = 0; i < k; ++i) {
+		if (seedSeq[i] != '1') {
+			const unsigned char b = kmerSeq[i];
+			fMasked ^= rol(seedTab[b], k-1-i);
+			rMasked ^= rol(seedTab[b+cpOff], i);
+		}
+	}
+	return fMasked < rMasked ? fMasked : rMasked;
+}
+
+#endif
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/abyss.git
More information about the debian-med-commit
mailing list