[med-svn] [sga] 01/05: Imported Upstream version 0.10.14
Andreas Tille
tille at debian.org
Mon Jan 11 12:36:19 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch debian
in repository sga.
commit 592d36b4c8e8c3d7a8245ebbec1e036e8d567656
Author: Andreas Tille <tille at debian.org>
Date: Thu Dec 31 13:18:33 2015 +0100
Imported Upstream version 0.10.14
---
README.md | 2 +-
src/.gitignore | 9 +
src/Algorithm/ErrorCorrectProcess.cpp | 20 +-
src/Algorithm/ErrorCorrectProcess.h | 3 +
src/Algorithm/KmerOverlaps.cpp | 35 +-
src/Algorithm/KmerOverlaps.h | 1 +
src/Algorithm/QCProcess.cpp | 24 +-
src/Algorithm/QCProcess.h | 2 +
src/GraphDiff/DeBruijnHaplotypeBuilder.cpp | 145 ++++-
src/GraphDiff/DeBruijnHaplotypeBuilder.h | 8 +
src/GraphDiff/DindelRealignWindow.cpp | 293 ++++------
src/GraphDiff/DindelRealignWindow.h | 19 +-
src/GraphDiff/DindelUtil.cpp | 175 +++---
src/GraphDiff/DindelUtil.h | 7 -
src/GraphDiff/GraphCompare.cpp | 54 +-
src/GraphDiff/GraphCompare.h | 4 -
src/GraphDiff/HapgenUtil.cpp | 257 +++++++--
src/GraphDiff/HapgenUtil.h | 59 +-
src/SGA/Makefile.am | 16 +-
src/SGA/SGACommon.h | 3 +-
src/SGA/correct.cpp | 43 +-
src/SGA/filter.cpp | 16 +-
src/SGA/fm-merge.cpp | 4 +-
src/SGA/gen-ssa.cpp | 24 +-
src/SGA/graph-concordance.cpp | 487 ++++++++++++++++
src/SGA/graph-concordance.h | 23 +
src/SGA/graph-diff.cpp | 90 ++-
src/SGA/index.cpp | 42 +-
src/SGA/kmer-count.cpp | 291 ++++++++++
src/SGA/kmer-count.h | 8 +
src/SGA/{overlap.cpp => overlap-long.cpp} | 260 +++++----
src/SGA/overlap-long.h | 26 +
src/SGA/overlap.cpp | 19 +-
src/SGA/preprocess.cpp | 40 +-
src/SGA/preqc.cpp | 70 ++-
src/SGA/rmdup.cpp | 2 +-
src/SGA/sga.cpp | 53 +-
src/SGA/somatic-variant-filters.cpp | 864 +++++++++++++++++++++++++++++
src/SGA/somatic-variant-filters.h | 23 +
src/SQG/ASQG.cpp | 11 +
src/SQG/ASQG.h | 7 +
src/SuffixTools/BWTIndexSet.h | 3 +-
src/SuffixTools/SparseGapArray.h | 2 +-
src/Thirdparty/multiple_alignment.cpp | 82 +++
src/Thirdparty/multiple_alignment.h | 7 +
src/Thirdparty/overlapper.cpp | 299 ++++++----
src/Thirdparty/overlapper.h | 33 +-
src/Util/Makefile.am | 1 +
src/Util/SGAStats.h | 2 +-
src/Util/SeqReader.cpp | 8 +-
src/Util/SeqReader.h | 2 +-
src/Util/VCFUtil.cpp | 44 +-
src/Util/VCFUtil.h | 10 +-
src/Util/VariantIndex.cpp | 96 ++++
src/Util/VariantIndex.h | 59 ++
src/bin/sga-align | 5 +-
src/bin/sga-astat.py | 8 +-
src/bin/sga-preqc-report.py | 39 +-
src/bin/sga-variant-filters.pl | 115 ----
src/configure.ac | 2 +-
60 files changed, 3422 insertions(+), 934 deletions(-)
diff --git a/README.md b/README.md
index 9105041..b3887b5 100644
--- a/README.md
+++ b/README.md
@@ -8,4 +8,4 @@ For installation and usage instructions see [src/README](src/#readme)
For running examples see src/examples and the [sga wiki](https://github.com/jts/sga/wiki)
-For questions or support contact jared.simpson --at-- sanger.ac.uk
+For questions or support contact jared.simpson --at-- oicr.on.ca
diff --git a/src/.gitignore b/src/.gitignore
index 5197b27..7a74d70 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -12,3 +12,12 @@ build*
*.o
*.a
Makefile
+
+config.h
+config.log
+config.status
+stamp-h1
+compile
+.deps
+
+SGA/sga
\ No newline at end of file
diff --git a/src/Algorithm/ErrorCorrectProcess.cpp b/src/Algorithm/ErrorCorrectProcess.cpp
index 07fff59..ffcc99d 100644
--- a/src/Algorithm/ErrorCorrectProcess.cpp
+++ b/src/Algorithm/ErrorCorrectProcess.cpp
@@ -15,6 +15,7 @@
#include "StringThreader.h"
//#define KMER_TESTING 1
+//#define OVERLAPCORRECTION_VERBOSE 1
//
//
@@ -179,12 +180,19 @@ ErrorCorrectResult ErrorCorrectProcess::overlapCorrectionNew(const SequenceWorkI
m_params.indices);
multiple_alignment.filterByCount(m_params.conflictCutoff);
-
+#ifdef OVERLAPCORRECTION_VERBOSE
+ printf("---> MAF after conflict\n");
+ multiple_alignment.print(200);
+ printf("---> PILEUP\n");
+ multiple_alignment.printPileup();
+ printf("---> DONE\n\n");
+#endif
+
bool last_round = (round == num_rounds - 1);
if(last_round)
- consensus = multiple_alignment.calculateBaseConsensus(10000, 0);
+ consensus = multiple_alignment.calculateBaseConsensusMinCoverage(m_params.base_threshold, 0, m_params.min_count_max_base);
else
- current_sequence = multiple_alignment.calculateBaseConsensus(10000, 0);
+ current_sequence = multiple_alignment.calculateBaseConsensusMinCoverage(m_params.base_threshold, 0, m_params.min_count_max_base);
if(m_params.printOverlaps)
multiple_alignment.print(200);
@@ -326,13 +334,13 @@ ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWorkItem& w
int threshold = CorrectionThresholds::Instance().getRequiredSupport(phred);
int left_k_idx = (i + 1 >= m_params.kmerLength ? i + 1 - m_params.kmerLength : 0);
- corrected = attemptKmerCorrection(i, left_k_idx, std::max(countVector[left_k_idx], threshold), readSequence);
+ corrected = attemptKmerCorrection(i, left_k_idx, std::max(countVector[left_k_idx] + m_params.countOffset, threshold), readSequence);
if(corrected)
break;
// base was not corrected, try using the rightmost covering kmer
size_t right_k_idx = std::min(i, n - m_params.kmerLength);
- corrected = attemptKmerCorrection(i, right_k_idx, std::max(countVector[right_k_idx], threshold), readSequence);
+ corrected = attemptKmerCorrection(i, right_k_idx, std::max(countVector[right_k_idx] + m_params.countOffset, threshold), readSequence);
if(corrected)
break;
}
@@ -386,7 +394,7 @@ bool ErrorCorrectProcess::attemptKmerCorrection(size_t i, size_t k_idx, size_t m
#if KMER_TESTING
printf("%c %zu\n", currBase, count);
#endif
- if(count > bestCount && count >= minCount)
+ if(count >= minCount)
{
// Multiple corrections exist, do not correct
if(bestBase != '$')
diff --git a/src/Algorithm/ErrorCorrectProcess.h b/src/Algorithm/ErrorCorrectProcess.h
index 7d6aa24..9043ed7 100644
--- a/src/Algorithm/ErrorCorrectProcess.h
+++ b/src/Algorithm/ErrorCorrectProcess.h
@@ -47,6 +47,8 @@ struct ErrorCorrectParameters
// Overlap-based corrector params
int minOverlap;
+ int min_count_max_base;
+ int base_threshold;
int numOverlapRounds;
double minIdentity;
int conflictCutoff;
@@ -55,6 +57,7 @@ struct ErrorCorrectParameters
// k-mer based corrector params
int numKmerRounds;
int kmerLength;
+ int countOffset;
// output options
bool printOverlaps;
diff --git a/src/Algorithm/KmerOverlaps.cpp b/src/Algorithm/KmerOverlaps.cpp
index 9a01960..0a541f4 100644
--- a/src/Algorithm/KmerOverlaps.cpp
+++ b/src/Algorithm/KmerOverlaps.cpp
@@ -14,6 +14,8 @@
#include "Timer.h"
#include "Verbosity.h"
+//#define OVERLAPCORRECTION_VERBOSE 1
+
//
MultipleAlignment KmerOverlaps::buildMultipleAlignment(const std::string& query,
size_t k,
@@ -22,11 +24,29 @@ MultipleAlignment KmerOverlaps::buildMultipleAlignment(const std::string& query,
int bandwidth,
const BWTIndexSet& indices)
{
+#ifdef OVERLAPCORRECTION_VERBOSE
+ printf("---> buildMultipleAlignment for query: %s\n",query.c_str());
+ printf("---> parameters: min_overlap %d minID %f k %d\n",min_overlap, min_identity, k);
+#endif
+
SequenceOverlapPairVector overlap_vector = retrieveMatches(query, k, min_overlap, min_identity, bandwidth, indices);
MultipleAlignment multiple_alignment;
multiple_alignment.addBaseSequence("query", query, "");
- for(size_t i = 0; i < overlap_vector.size(); ++i)
+ for(size_t i = 0; i < overlap_vector.size(); ++i) {
multiple_alignment.addOverlap("null", overlap_vector[i].sequence[1], "", overlap_vector[i].overlap);
+#ifdef OVERLAPCORRECTION_VERBOSE
+ printf("---> \tadd to maf: sequence %s \n", overlap_vector[i].sequence[1].c_str());
+#endif
+ }
+
+#ifdef OVERLAPCORRECTION_VERBOSE
+ printf("---> MAF\n");
+ multiple_alignment.print(200);
+ printf("---> PILEUP\n");
+ multiple_alignment.printPileup();
+ printf("---> DONE\n\//n");
+#endif
+
return multiple_alignment;
}
@@ -82,6 +102,8 @@ SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query
int64_t max_interval_size = 200;
SequenceOverlapPairVector overlap_vector;
+ if(query.size() < k)
+ return overlap_vector;
// Use the FM-index to look up intervals for each kmer of the read. Each index
// in the interval is stored individually in the KmerMatchMap. We then
@@ -162,7 +184,14 @@ SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query
// Use the overlaps that meet the thresholds to build a multiple alignment
for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
{
- std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
+ // If a read table is available in the index, use it to get the match sequence
+ // Otherwise get it from the BWT, which is slower
+ std::string match_sequence;
+ if(indices.pReadTable != NULL)
+ match_sequence = indices.pReadTable->getRead(iter->index).seq.toString();
+ else
+ match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
+
if(iter->is_reverse)
match_sequence = reverseComplement(match_sequence);
@@ -197,6 +226,7 @@ SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query
SequenceOverlapPair op;
op.sequence[0] = query;
op.sequence[1] = match_sequence;
+ op.match_idx = iter->index;
op.overlap = overlap;
op.is_reversed = iter->is_reverse;
overlap_vector.push_back(op);
@@ -436,6 +466,7 @@ void _approximateSeededMatch(const std::string& in_query,
SequenceOverlapPair op;
op.sequence[0] = in_query;
op.sequence[1] = match_sequence;
+ op.match_idx = start_index_of_read;
op.overlap = overlap;
op.is_reversed = do_reverse;
out_vector.push_back(op);
diff --git a/src/Algorithm/KmerOverlaps.h b/src/Algorithm/KmerOverlaps.h
index 63dd9af..5085091 100644
--- a/src/Algorithm/KmerOverlaps.h
+++ b/src/Algorithm/KmerOverlaps.h
@@ -18,6 +18,7 @@
struct SequenceOverlapPair
{
std::string sequence[2];
+ int match_idx;
bool is_reversed;
SequenceOverlap overlap;
diff --git a/src/Algorithm/QCProcess.cpp b/src/Algorithm/QCProcess.cpp
index f526998..c079fb0 100644
--- a/src/Algorithm/QCProcess.cpp
+++ b/src/Algorithm/QCProcess.cpp
@@ -13,19 +13,14 @@
//
struct KmerWindow
{
- int64_t getCount() const
+ int64_t getCount(bool bothStrand) const
{
- int64_t count = 0;
- if(fwdIntervals.interval[0].isValid())
- {
- count += fwdIntervals.interval[0].size();
- }
-
- if(rcIntervals.interval[0].isValid())
- {
- count += rcIntervals.interval[0].size();
- }
- return count;
+ int64_t fwdCount = (fwdIntervals.interval[0].isValid()) ? fwdIntervals.interval[0].size() : 0;
+ int64_t rcCount = (rcIntervals.interval[0].isValid()) ? rcIntervals.interval[0].size() : 0;
+ if (bothStrand)
+ return std::min(fwdCount,rcCount);
+ else
+ return fwdCount + rcCount;
}
int start;
@@ -144,7 +139,7 @@ bool QCProcess::performKmerCheck(const SequenceWorkItem& workItem)
if(window.rcIntervals.interval[1].isValid())
BWTAlgorithms::updateBothR(window.rcIntervals, cb, m_params.pBWT);
- int64_t count = window.getCount();
+ int64_t count = window.getCount(m_params.kmerBothStrand);
if(count <= threshold)
{
// The extended kmer didn't meet the threshold
@@ -186,7 +181,7 @@ bool QCProcess::performKmerCheck(const SequenceWorkItem& workItem)
}
// Check the count of this interval
- int64_t count = window.getCount();
+ int64_t count = window.getCount(m_params.kmerBothStrand);
if(count <= threshold)
{
@@ -235,6 +230,7 @@ DuplicateCheckResult QCProcess::performDuplicateCheck(const SequenceWorkItem& wo
// Calculate the canonical index for this string - the lowest
// value in the two lexicographic index
+ assert(fwdIntervals.interval[0].isValid() || rcIntervals.interval[0].isValid());
int64_t fi = fwdIntervals.interval[0].isValid() ? fwdIntervals.interval[0].lower : std::numeric_limits<int64_t>::max();
int64_t ri = rcIntervals.interval[0].isValid() ? rcIntervals.interval[0].lower : std::numeric_limits<int64_t>::max();
int64_t canonicalIdx = std::min(fi, ri);
diff --git a/src/Algorithm/QCProcess.h b/src/Algorithm/QCProcess.h
index 50d7ba9..ecb87f8 100644
--- a/src/Algorithm/QCProcess.h
+++ b/src/Algorithm/QCProcess.h
@@ -26,6 +26,7 @@ struct QCParameters
{
checkDuplicates = true;
checkKmer = true;
+ kmerBothStrand = false;
checkHPRuns = true;
checkDegenerate = true;
verbose = 0;
@@ -53,6 +54,7 @@ struct QCParameters
// Control parameters
bool checkDuplicates;
bool checkKmer;
+ bool kmerBothStrand;
bool checkHPRuns;
bool checkDegenerate;
bool substringOnly;
diff --git a/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp b/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp
index 10262fb..19f5934 100644
--- a/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp
+++ b/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp
@@ -44,13 +44,12 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
PROFILE_FUNC("GraphCompare::buildVariantStringGraph")
assert(!m_startingKmer.empty());
- std::map<std::string, int> kmerCountMap;
-
// We search until we find the first common vertex in each direction
- size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2;
+ //size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2;
size_t MAX_ITERATIONS = 2000;
size_t MAX_SIMULTANEOUS_BRANCHES = 40;
size_t MAX_TOTAL_BRANCHES = 50;
+ int MAX_DISTANCE = 100;
// Tracking stats
size_t max_simul_branches_used = 0;
@@ -84,9 +83,11 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
// Calculate de Bruijn extensions for this node
std::string vertStr = curr.pVertex->getSeq().toString();
- AlphaCount64 extensionCounts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction);
+ AlphaCount64 extensionCounts =
+ BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction);
- std::string extensionsUsed;
+ // Count valid extensions
+ std::string extensions;
for(size_t i = 0; i < DNA_ALPHABET::size; ++i)
{
char b = DNA_ALPHABET::getBase(i);
@@ -95,9 +96,17 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
if(!acceptExt)
continue;
- extensionsUsed.push_back(b);
+ extensions.push_back(b);
+ }
+
+ // Do not allow join vertices to branch
+ if(curr.pVertex->getColor() == GC_RED && extensions.size() > 1)
+ continue;
+
+ for(size_t i = 0; i < extensions.size(); ++i)
+ {
+ char b = extensions[i];
std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, b, curr.direction);
- kmerCountMap[newStr] = count;
// Create the new vertex and edge in the graph
// Skip if the vertex already exists
@@ -112,28 +121,33 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
// Add edges
VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, curr.pVertex, pVertex, curr.direction);
- // Check if this sequence is present in the FM-index of the target
+ // Check if this sequence is present in the reference genome
// If so, it is the join point of the de Bruijn graph and we extend no further.
- size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex);
+ size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.referenceIndex);
- if(targetCount >= MIN_TARGET_COUNT)
+ if(targetCount == 1)
{
if(curr.direction == ED_SENSE)
sense_join_vector.push_back(pVertex);
else
antisense_join_vector.push_back(pVertex);
+ pVertex->setColor(GC_RED);
}
- else
- {
- // Add the vertex to the extension queue
- queue.push(BuilderExtensionNode(pVertex, curr.direction));
- }
+
+ // Add the vertex to the extension queue
+ BuilderExtensionNode nextNode(pVertex, curr.direction, curr.distance + 1);
+ if(nextNode.distance < MAX_DISTANCE)
+ queue.push(nextNode);
}
// Update the total number of times we branches the search
- if(!extensionsUsed.empty())
- total_branches += extensionsUsed.size() - 1;
+ if(!extensions.empty())
+ total_branches += extensions.size() - 1;
}
+ pGraph->setColors(GC_BLACK);
+
+ cullRedundantJoinVertices(antisense_join_vector, pGraph, !ED_ANTISENSE);
+ cullRedundantJoinVertices(sense_join_vector, pGraph, !ED_SENSE);
// If the graph construction was successful, walk the graph
// between the endpoints to make a string
@@ -157,3 +171,100 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
delete pGraph;
return HBRC_OK;
}
+
+void DeBruijnHaplotypeBuilder::cullRedundantJoinVertices(std::vector<Vertex*>& join_vertices, StringGraph* pGraph, EdgeDir direction)
+{
+ pGraph->checkColors(GC_BLACK);
+ // Remove join nodes that lie on the unipath of each join vertex
+ for(size_t i = 0; i < join_vertices.size(); ++i)
+ {
+ std::vector<Vertex*> unipath(1, join_vertices[i]);
+
+ // Skip vertices already marked
+ if(unipath.back()->getColor() == GC_RED)
+ continue;
+
+ // Extend unipath
+ while(1)
+ {
+ EdgePtrVec edges = unipath.back()->getEdges(direction);
+ if(edges.size() != 1)
+ break;
+ unipath.push_back(edges[0]->getEnd());
+ }
+
+ // Mark the non-terminal vertices of the unipath
+ for(size_t j = 1; j < unipath.size(); ++j)
+ unipath[j]->setColor(GC_RED);
+ }
+
+ std::vector<Vertex*> trimmed;
+ for(size_t i = 0; i < join_vertices.size(); ++i)
+ {
+ if(join_vertices[i]->getColor() != GC_RED)
+ trimmed.push_back(join_vertices[i]);
+ join_vertices[i]->setColor(GC_BLACK);
+ }
+
+ pGraph->setColors(GC_BLACK);
+ join_vertices.swap(trimmed);
+}
+
+Vertex* DeBruijnHaplotypeBuilder::extendUnambiguously(Vertex* pStart, StringGraph* pGraph,
+ EdgeDir direction, size_t max_distance, size_t min_target_count)
+{
+ size_t distance = 0;
+ Vertex* pCurrent = pStart;
+
+ while(distance < max_distance)
+ {
+ std::string vertStr = pCurrent->getSeq().toString();
+ AlphaCount64 extensionCounts =
+ BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, direction);
+
+ // Count valid extensions
+ char ext_base = '\0';
+ size_t ext_count = 0;
+ size_t n_ext = 0;
+ for(size_t i = 0; i < DNA_ALPHABET::size; ++i)
+ {
+ char b = DNA_ALPHABET::getBase(i);
+ size_t count = extensionCounts.get(b);
+ if(count >= m_parameters.minDBGCount)
+ {
+ ext_base = b;
+ ext_count = count;
+ n_ext++;
+ }
+ }
+
+ // Stop if no extension or ambiguous
+ if(n_ext != 1)
+ return pCurrent;
+
+ std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, ext_base, direction);
+
+ // Stop if the vertex is already in the graph
+ if(pGraph->getVertex(newStr) != NULL)
+ return pCurrent;
+
+ // Stop if the vertex is not reprsented in the base sequence
+ size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex);
+ if(targetCount < min_target_count)
+ return pCurrent;
+
+ // Create the new vertex and edge in the graph
+ // Allocate the new vertex and add it to the graph
+ Vertex* pNew = new(pGraph->getVertexAllocator()) Vertex(newStr, newStr);
+ pNew->setColor(GC_BLACK);
+ pGraph->addVertex(pNew);
+
+ // Add edges
+ VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, pCurrent, pNew, direction);
+
+ pCurrent = pNew;
+ distance++;
+ }
+
+ return pCurrent;
+}
diff --git a/src/GraphDiff/DeBruijnHaplotypeBuilder.h b/src/GraphDiff/DeBruijnHaplotypeBuilder.h
index 46341fc..e7790b0 100644
--- a/src/GraphDiff/DeBruijnHaplotypeBuilder.h
+++ b/src/GraphDiff/DeBruijnHaplotypeBuilder.h
@@ -39,8 +39,16 @@ class DeBruijnHaplotypeBuilder
// Returns true if the graph was successfully built between the two sequences
HaplotypeBuilderReturnCode run(StringVector& out_haplotypes);
+ // Extend the graph along the unipath that pStart lies on
+ Vertex* extendUnambiguously(Vertex* pStart, StringGraph* pGraph,
+ EdgeDir dir, size_t max_distance, size_t min_target_count);
+
+
private:
+ // Remove join vertices that lie on the unipath to some other join vertex
+ void cullRedundantJoinVertices(std::vector<Vertex*>& join_vertices, StringGraph* pGraph, EdgeDir direction);
+
//
// Data
//
diff --git a/src/GraphDiff/DindelRealignWindow.cpp b/src/GraphDiff/DindelRealignWindow.cpp
index a2e3e22..f3259fb 100644
--- a/src/GraphDiff/DindelRealignWindow.cpp
+++ b/src/GraphDiff/DindelRealignWindow.cpp
@@ -40,7 +40,6 @@
const int DINDEL_DEBUG=0;
const int QUIET=1;
-const int ADDSNPS=0;
const int ALWAYS_REALIGN=1;
const int DEBUG_CALLINDEL=0;
const int REPOSITION_INDEL_WINDOW=1000;
@@ -148,17 +147,18 @@ SequenceOverlap localAlignmentToSequenceOverlap(const LocalAlignmentResult& loca
return overlap;
}
-SequenceOverlap computeHaplotypeAlignment(const std::string& base_sequence, const std::string& haplotype)
+SequenceOverlap computeHaplotypeAlignment(size_t k, const std::string& base_sequence, const std::string& haplotype)
{
- return HapgenUtil::alignHaplotypeToReference(base_sequence, haplotype);
+ return HapgenUtil::alignHaplotypeToReference(k, base_sequence, haplotype);
}
void addHaplotypeToMultipleAlignment(MultipleAlignment* pMA,
const std::string& base_sequence,
const std::string& incoming_name,
- const std::string& incoming_sequence)
+ const std::string& incoming_sequence,
+ size_t k)
{
- SequenceOverlap overlap = computeHaplotypeAlignment(base_sequence, incoming_sequence);
+ SequenceOverlap overlap = computeHaplotypeAlignment(k, base_sequence, incoming_sequence);
pMA->addOverlap(incoming_name, incoming_sequence, "", overlap);
}
@@ -371,8 +371,10 @@ void DindelVariant::write(std::ostream & out) const
void DindelHaplotype::alignHaplotype()
{
- // semi-globally align haplotypes to the first haplotype (arbitrary)
+ // align haplotypes to the reference
const std::string& rootSequence = m_refMapping.refSeq;
+
+ // Change haplotype to the reference strand
std::string alignSeq;
if(m_refMapping.isRC)
alignSeq = reverseComplement(m_seq);
@@ -385,7 +387,7 @@ void DindelHaplotype::alignHaplotype()
m_pMA = new MultipleAlignment();
m_pMA->addBaseSequence(ref_ss.str(), rootSequence, "");
- SequenceOverlap hap_overlap = computeHaplotypeAlignment(rootSequence, alignSeq);
+ SequenceOverlap hap_overlap = computeHaplotypeAlignment(m_assembly_k, rootSequence, alignSeq);
m_pMA->addOverlap("haplotype", alignSeq, "", hap_overlap);
m_firstAlignedBase = hap_overlap.match[1].start;
@@ -393,7 +395,6 @@ void DindelHaplotype::alignHaplotype()
{
std::cout << "DindelHaplotype::alignHaplotype:\n";
m_pMA->print(500);
- //std::cout << "DindelHaplotype::DindelHaplotype globalAlignmentCigar " << alignSeq << " vs root: " << _ma.expandedCigar << std::endl;
}
}
@@ -410,7 +411,6 @@ void DindelHaplotype::extractVariants()
m_refPos = std::vector<int>(m_seq.size(), 0);
m_isReference = false;
-
// Compute a mapping from a haplotype base to its aligned position on the reference sequence
int first_aligned_column = m_pMA->getFirstAlignedBaseColumn(refRow, varRow);
int last_aligned_column = m_pMA->getLastAlignedBaseColumn(refRow, varRow);
@@ -663,13 +663,15 @@ void DindelHaplotype::extractVariants()
}
}
-DindelHaplotype::DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping)
+DindelHaplotype::DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping, size_t assembly_k)
{
// initialize
m_pMA = NULL;
m_seq = haplotypeSequence;
m_refMapping = refMapping;
m_firstAlignedBase = 0;
+ m_assembly_k = assembly_k;
+
#ifdef DEBUG_NEW
std::cout << "refMapping " << m_refMapping.refName << " "
<< m_refMapping.refStart
@@ -707,6 +709,7 @@ void DindelHaplotype::copy(const DindelHaplotype & haplotype, int copyOptions)
m_refMapping = haplotype.m_refMapping;
m_isReference = haplotype.m_isReference;
m_firstAlignedBase = haplotype.m_firstAlignedBase;
+ m_assembly_k = haplotype.m_assembly_k;
if(haplotype.m_pMA != NULL)
m_pMA = new MultipleAlignment(*haplotype.m_pMA);
@@ -824,7 +827,9 @@ int DindelHaplotype::getClosestDistance(const DindelVariant& variant, int hapPos
std::cout << " haplotype variants: "; for (it = m_variant_to_pos.begin();it != m_variant_to_pos.end();it++) std::cout << " " << it->first; std::cout << std::endl;
std::cout << " h2 variants: "; for (size_t x=0;x<m_variants.size();x++) std::cout << " " << m_variants[x].getID(); std::cout << std::endl;
return -1;
- } else {
+ }
+ else
+ {
int s = it->second.first;
int e = it->second.second;
@@ -839,6 +844,7 @@ int DindelHaplotype::getClosestDistance(const DindelVariant& variant, int hapPos
hapPos2 = t;
*/
}
+
// overlaps with variant?
if (hapPosStartRead<=e && hapPosEndRead>=s)
{
@@ -918,7 +924,9 @@ int DindelHaplotype::getHapBase(int refPos) const
*/
-DindelMultiHaplotype::DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings, const std::string & haplotypeSequence)
+DindelMultiHaplotype::DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings,
+ const std::string & haplotypeSequence,
+ size_t assembly_k)
{
#ifdef DEBUG_NEW
std::cout << "There are " << referenceMappings.size() << " reference mappings.\n";
@@ -926,7 +934,7 @@ DindelMultiHaplotype::DindelMultiHaplotype(const std::vector< DindelReferenceMap
m_referenceMappings = referenceMappings;
for (size_t i = 0; i < m_referenceMappings.size(); ++i)
- m_haplotypes.push_back(DindelHaplotype(haplotypeSequence, m_referenceMappings[i]));
+ m_haplotypes.push_back(DindelHaplotype(haplotypeSequence, m_referenceMappings[i], assembly_k));
m_seq = haplotypeSequence;
@@ -1062,17 +1070,18 @@ void DindelSequenceHash::makeHash(const std::string & sequence)
*
*/
DindelWindow::DindelWindow(const std::vector<std::string> & haplotypeSequences,
- const std::vector<DindelReferenceMapping> & referenceMappings)
+ const std::vector<DindelReferenceMapping> &referenceMappings,
+ size_t assembly_k)
{
m_pHaplotype_ma = NULL;
m_referenceMappings = referenceMappings;
// filter out duplicate haplotype sequences.
- initHaplotypes(haplotypeSequences, referenceMappings);
+ initHaplotypes(haplotypeSequences, referenceMappings, assembly_k);
// do multiple alignment of all haplotypes to each other.
- doMultipleHaplotypeAlignment();
+ doMultipleHaplotypeAlignment(assembly_k);
}
DindelWindow::DindelWindow(const DindelWindow & window)
@@ -1083,7 +1092,6 @@ DindelWindow::DindelWindow(const DindelWindow & window)
void DindelWindow::copy(const DindelWindow & window)
{
m_haplotypes = window.m_haplotypes;
- m_hashAltHaps = window.m_hashAltHaps;
m_candHapAlgorithm = window.m_candHapAlgorithm;
m_referenceMappings = window.m_referenceMappings;
@@ -1095,24 +1103,25 @@ void DindelWindow::copy(const DindelWindow & window)
}
void DindelWindow::initHaplotypes(const std::vector<std::string> & haplotypeSequences,
- const std::vector<DindelReferenceMapping> & referenceMappings)
+ const std::vector<DindelReferenceMapping> & referenceMappings,
+ size_t assembly_k)
{
for(size_t i = 0; i < haplotypeSequences.size(); ++i)
{
- if(m_hashAltHaps.find(haplotypeSequences[i]) == m_hashAltHaps.end())
- {
- if (DINDEL_DEBUG_3)
- std::cout << "DindelWindow::MultiHaplotype " << m_haplotypes.size() << "\n";
- m_haplotypes.push_back(DindelMultiHaplotype(referenceMappings, haplotypeSequences[i]));
- }
+ if (DINDEL_DEBUG_3)
+ std::cout << "DindelWindow::MultiHaplotype " << m_haplotypes.size() << "\n";
+ m_haplotypes.push_back(DindelMultiHaplotype(referenceMappings, haplotypeSequences[i], assembly_k));
}
}
-void DindelWindow::doMultipleHaplotypeAlignment()
+void DindelWindow::doMultipleHaplotypeAlignment(size_t assembly_k)
{
- // globally align haplotypes to the first haplotype (arbitrary)
+ // align haplotypes to the first haplotype, which is the reference sequence
std::vector<MAlignData> maVector;
+ // ensure that the first haplotype is the reference
+ assert(m_referenceMappings.front().refSeq.size() == m_haplotypes.front().getSequence().size());
+
assert(m_haplotypes.size() >= 1);
m_pHaplotype_ma = new MultipleAlignment;
@@ -1123,14 +1132,13 @@ void DindelWindow::doMultipleHaplotypeAlignment()
{
std::stringstream ss;
ss << "haplotype-" << h;
- addHaplotypeToMultipleAlignment(m_pHaplotype_ma, rootSequence, ss.str(), m_haplotypes[h].getSequence());
+ addHaplotypeToMultipleAlignment(m_pHaplotype_ma, rootSequence, ss.str(), m_haplotypes[h].getSequence(), assembly_k);
}
if (DINDEL_DEBUG_3)
{
std::cout << "DindelWindow::doMultipleHaplotypeAlignment:\n";
m_pHaplotype_ma->print(500);
-
}
}
@@ -1276,22 +1284,9 @@ void DindelRealignWindowResult::Inference::outputAsVCF(const DindelVariant & var
hps.insert(var.getHPLen());
int hp = *hps.rbegin();
- // calculate the dust score around a 100bp window centered at the start of the variant
- // 64-bp is the window size used in the SDUST paper
- int dw_size = 64;
- int dw_start = var.getPos() - dw_size/2;
- dw_start = std::max(0, dw_start); // clamp to 0
-
- int dw_end = var.getPos() + dw_size/2;
- const DNAString& chromosome = pRefTable->getRead(var.getChrom()).seq;
- dw_end = std::min(dw_end, (int)chromosome.length());
-
- double dust_score = 0.0f;
- if(dw_end - dw_start == dw_size)
- {
- std::string ref_dust_window = chromosome.substr(dw_start, dw_end - dw_start);
- dust_score = calculateDustScore(ref_dust_window);
- }
+ double dust_score = HapgenUtil::calculateDustScoreAtPosition(var.getChrom(),
+ var.getPos(),
+ pRefTable);
// Apply filters
std::string filter="NoCall";
@@ -1577,21 +1572,22 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
const int DRHA=0;
- if (DINDEL_DEBUG &&DRHA) std::cout << std::endl << "====> START 1 DindelRealignWindow::computeReadHaplotypeAlignment " << read.getID() << std::endl;
+ if (DINDEL_DEBUG && DRHA)
+ std::cout << std::endl << "====> START 1 DindelRealignWindow::computeReadHaplotypeAlignment " << read.getID() << std::endl;
// scenarios
/* Capital R's are matches, r's are mismatches
- 1. The seeds are too long to detect the possible gapped alignment of read to haplotype
- HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
- rrrrrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
- which should be
- HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
- RRR rrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
+ 1. The seeds are too long to detect the possible gapped alignment of read to haplotype
+ HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ rrrrrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
+ which should be
+ HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ RRR rrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
- 2. Deletion in read
- HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
- RRRRRRR RRRRRRRRRRRRRRRRRRRRRRRR
- */
+ 2. Deletion in read
+ HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+ RRRRRRR RRRRRRRRRRRRRRRRRRRRRRRR
+ */
// start and end are inclusive
// start may be <0, and end may be > haplotype length
@@ -1604,20 +1600,21 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
if (DINDEL_DEBUG && DRHA)
{
- std::cout << std::endl;
- std::cout << "HAPLOTYPE: " << haplotype.getSequence() << " isREF " << haplotype.isReference() << std::endl;
- std::string pad;
- std::string rseq = read.getSequence();
- if (start>=0)
- {
- pad = std::string(start, ' ');
- }
- else {
- rseq = read.getSequence().substr(-start, read.length());
- }
-
- std::cout << "READ: " << pad << rseq << std::endl;
- std::cout << "DLEN: " << dlen << " RLEN: " << rlen << " hlen: " << hlen << " start: " << start << " end: " << end << " read_index: " << readIndex << " " << read.getID() << std::endl;
+ std::cout << std::endl;
+ std::cout << "HAPLOTYPE: " << haplotype.getSequence() << " isREF " << haplotype.isReference() << std::endl;
+ std::string pad;
+ std::string rseq = read.getSequence();
+ if (start>=0)
+ {
+ pad = std::string(start, ' ');
+ }
+ else
+ {
+ rseq = read.getSequence().substr(-start, read.length());
+ }
+
+ std::cout << "READ: " << pad << rseq << std::endl;
+ std::cout << "DLEN: " << dlen << " RLEN: " << rlen << " hlen: " << hlen << " start: " << start << " end: " << end << " read_index: " << readIndex << " " << read.getID() << std::endl;
}
int nmm=-1; // number of mismatches between read and candidate haplotype
@@ -1632,57 +1629,62 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
double ll=0.0;
int b=0;
- double all_match = 0.0;
- nmm=0;
- if (!rcReadSeq) {
- for (int x=start;x<=end;x++,b++)
+ double all_match = 0.0;
+ nmm=0;
+ if (!rcReadSeq)
+ {
+ for (int x=start;x<=end;x++,b++)
{
- bool match = true;
- if (x>=0 && x<=hlen-1)
- {
- match = (read.getBase(b) == haplotype.getSequence()[x])?true:false;
- }
- if (match) ll += lpCorrect[b]; else {
- if (DINDEL_DEBUG) std::cout << " mm " << b << " lpError: " << lpError[b] << std::endl;
- ll += lpError[b]; nmm++;
- if (countMismatch && lpCorrect[b]>minLpCorrect) mismatches.push_back(Mismatch(x, read.getBase(b)));
+ bool match = true;
+ if (x>=0 && x<=hlen-1)
+ {
+ match = (read.getBase(b) == haplotype.getSequence()[x])?true:false;
+ }
+ if (match)
+ ll += lpCorrect[b];
+ else
+ {
+ if (DINDEL_DEBUG)
+ std::cout << " mm " << b << " lpError: " << lpError[b] << std::endl;
+ ll += lpError[b];
+ nmm++;
+ if (countMismatch && lpCorrect[b]>minLpCorrect)
+ mismatches.push_back(Mismatch(x, read.getBase(b)));
+ }
+ all_match += lpCorrect[b];
+ }
+ }
+ else
+ {
+ for (int x=start;x<=end;x++,b++)
+ {
+ bool match = true;
+ if (x>=0 && x<=hlen-1)
+ {
+ match = (_complement(read.getBase(rlen-1-b)) == haplotype.getSequence()[x])?true:false;
}
- all_match += lpCorrect[b];
- }
- } else {
- for (int x=start;x<=end;x++,b++)
- {
- bool match = true;
- if (x>=0 && x<=hlen-1)
- {
- match = (_complement(read.getBase(rlen-1-b)) == haplotype.getSequence()[x])?true:false;
- }
- if (match) ll += lpCorrect[rlen-1-b]; else {
- ll += lpError[rlen-1-b]; nmm++;
- if (countMismatch && lpCorrect[rlen-b-1]>minLpCorrect) mismatches.push_back(Mismatch(x, _complement(read.getBase(rlen-1-b))));
+
+ if (match)
+ {
+ ll += lpCorrect[rlen-1-b];
+ }
+ else
+ {
+ ll += lpError[rlen-1-b];
+ nmm++;
+ if (countMismatch && lpCorrect[rlen-b-1]>minLpCorrect)
+ mismatches.push_back(Mismatch(x, _complement(read.getBase(rlen-1-b))));
}
- }
- }
-
- if (ADDSNPS && nmm<=realignParameters.addSNPMaxMismatches) {
- for (std::list<Mismatch>::const_iterator iter = mismatches.begin(); iter != mismatches.end(); iter++)
- {
- // FIXME Need to add wrt haplotype multiple alignment column
- assert(1==0);
- /*
- int refPos = haplotype.getRefBase(iter->first);
- if (refPos>0)
- {
- m_posToCandidateSNP.addSNP(readIndex, refPos, haplotype.getSequence()[iter->first], iter->second); // pos, ref, alt resp.
- }
- */
}
}
- logLik = ll;
- isUngapped = true;
- }
- else { assert(1==0); }
+ logLik = ll;
+ isUngapped = true;
+ }
+ else
+ {
+ assert(1==0);
+ }
assert(!std::isnan(logLik));
assert(logLik<0.0);
@@ -1694,61 +1696,13 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
if (DINDEL_DEBUG && DRHA)
{
- std::cout << "LOGLIK: " << logLik << std::endl;
+ std::cout << "LOGLIK: " << logLik << std::endl;
}
- if (DINDEL_DEBUG &&DRHA) std::cout << std::endl << "====> END DindelRealignWindow::computeReadHaplotypeAlignment" << std::endl;
+ if (DINDEL_DEBUG && DRHA)
+ std::cout << std::endl << "====> END DindelRealignWindow::computeReadHaplotypeAlignment" << std::endl;
return ReadHaplotypeAlignment(logLik, nmm, isUngapped);
}
-void DindelRealignWindow::addSNPsToHaplotypes()
-{
- typedef std::map<size_t, std::vector<DindelVariant> > FreqCandidates;
- FreqCandidates freqCandidates;
- throw std::string("IMPLEMENT ME");
-
- for (MaPosToCandidateSNP::const_iterator iter = m_maPosToCandidateSNP.begin(); iter != m_maPosToCandidateSNP.end(); iter++)
- {
- for (size_t baseIdx=0;baseIdx<4;baseIdx++)
- {
- if (iter->second.baseToReads[baseIdx].size()>=realignParameters.addSNPMinCount)
- {
- // FIXME
- /*
- std::string ref; ref+=iter->second.m_refBase;
- std::string alt; alt+=iter->second.getBase(baseIdx);
- DindelVariant snp(m_dindelWindow.getChrom(), ref, alt, iter->first);
- snp.setPriorProb(realignParameters.variantPriors.getDefaultProbVariant(snp.getType()));
- freqCandidates[iter->second.baseToReads[baseIdx].size()].push_back(snp);
- if(!QUIET) std::cout << "DindelRealignWindow::addSNPsToHaplotypes adding SNP " << m_dindelWindow.getChrom() << " " << ref << " " << alt << " " << iter->first << std::endl;
-
- */
- }
- }
- }
-
- // add snps until maximum is reached.
- int numAdded=0;
- for (FreqCandidates::const_reverse_iterator iter=freqCandidates.rbegin();iter!=freqCandidates.rend();iter++)
- {
- bool stop=false;
- for(size_t x=0;x<iter->second.size();x++)
- {
- m_dindelWindow.addVariant(iter->second[x], true);
- numAdded++;
- if (numAdded == realignParameters.addSNPMaxSNPs)
- {
- stop =true;
- break;
- }
-
- }
- if (stop) break;
- }
-
-}
-
-
-
void DindelRealignWindow::algorithm_hmm(VCFCollection& out,
DindelReadReferenceAlignmentVector* pOutAlignments,
DindelRealignWindowResult * pThisResult,
@@ -1764,15 +1718,6 @@ void DindelRealignWindow::algorithm_hmm(VCFCollection& out,
// process the user-defined haplotypes
computeReadHaplotypeAlignmentsUsingHMM(0, numHaps-1);
- // now add SNPs identified by the high quality ungapped alignments
- if (ADDSNPS && realignParameters.addSNPMaxSNPs>0)
- {
- addSNPsToHaplotypes();
-
- // compute the likelihoods for the additional haplotypes
- computeReadHaplotypeAlignmentsUsingHMM(numHaps, m_dindelWindow.getHaplotypes().size()-1);
- }
-
DindelRealignWindowResult & result = *pThisResult;
// estimate haplotype frequencies using user specified max mapping thresholds
diff --git a/src/GraphDiff/DindelRealignWindow.h b/src/GraphDiff/DindelRealignWindow.h
index 6c276a0..01b7fde 100644
--- a/src/GraphDiff/DindelRealignWindow.h
+++ b/src/GraphDiff/DindelRealignWindow.h
@@ -397,7 +397,7 @@ class DindelHaplotype
public:
// Constructors
- DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping);
+ DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping, size_t assembly_k);
DindelHaplotype(const DindelHaplotype & haplotype, int copyOptions);
DindelHaplotype(const DindelHaplotype & haplotype);
~DindelHaplotype();
@@ -457,6 +457,7 @@ class DindelHaplotype
std::vector<int> m_hplen; // gives the homopolymer run length for any base in the haplotype sequence going left and right.
std::vector<int> m_refPos; // position of haplotype base on reference.
int m_firstAlignedBase;
+ size_t m_assembly_k;
// list of VCF4 style variants contained in haplotype
std::vector<DindelVariant> m_variants;
@@ -475,7 +476,7 @@ class DindelHaplotype
class DindelMultiHaplotype
{
public:
- DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings, const std::string & haplotypeSequence );
+ DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings, const std::string & haplotypeSequence, size_t assembly_k );
const std::string & getSequence() const { return m_seq; }
const std::vector<DindelVariant> & getVariants(int refIdx) const { return m_haplotypes[refIdx].getVariants(); }
int getNumReferenceMappings() const { return int(m_referenceMappings.size()); }
@@ -512,7 +513,10 @@ class DindelWindow
// Create window from a set of haplotypes and a reference sequence.
// Uses MultipleAlignment to annotate the variations in the haplotypes with respect to the reference sequence.
- DindelWindow(const std::vector<std::string> & haplotypeSequences, const std::vector<DindelReferenceMapping> & referenceMappings);
+ DindelWindow(const std::vector<std::string> & haplotypeSequences,
+ const std::vector<DindelReferenceMapping>& referenceMappings,
+ size_t assembly_k);
+
DindelWindow(const DindelWindow & window);
~DindelWindow();
@@ -523,8 +527,11 @@ class DindelWindow
private:
//
- void initHaplotypes(const std::vector<std::string> & haplotypeSequences, const std::vector<DindelReferenceMapping> & referenceMappings);
- void doMultipleHaplotypeAlignment();
+ void initHaplotypes(const std::vector<std::string>& haplotypeSequences,
+ const std::vector<DindelReferenceMapping>& referenceMappings,
+ size_t assembly_k);
+
+ void doMultipleHaplotypeAlignment(size_t assembly_k);
void copy(const DindelWindow & window);
// Data
@@ -534,7 +541,6 @@ class DindelWindow
// candidate haplotypes
std::vector<DindelMultiHaplotype> m_haplotypes;
- std::map<std::string, int> m_hashAltHaps;
std::vector< DindelReferenceMapping > m_referenceMappings;
MultipleAlignment *m_pHaplotype_ma;
@@ -946,7 +952,6 @@ class DindelRealignWindow
// Functions
void filterHaplotypes();
- void addSNPsToHaplotypes();
void processHaplotypes(size_t firstHapIdx, size_t lastHapIdx);
// convert seed positions into a likelihood given the candidate haplotype
diff --git a/src/GraphDiff/DindelUtil.cpp b/src/GraphDiff/DindelUtil.cpp
index 9d41c0e..51cc548 100644
--- a/src/GraphDiff/DindelUtil.cpp
+++ b/src/GraphDiff/DindelUtil.cpp
@@ -46,6 +46,7 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
{
HapgenAlignmentVector thisCandidateAlignments;
HapgenUtil::alignHaplotypeToReferenceKmer(align_kmer,
+ parameters.kmer,
inHaplotypes[i],
parameters.referenceIndex,
parameters.pRefTable,
@@ -60,7 +61,15 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
if(Verbosity::Instance().getPrintLevel() > 3)
printf("runDindel -- %zu candidate alignments found\n", candidateAlignments.size());
- size_t MAX_ALIGNMENTS = 10;
+ // Check if this group of haplotypes is a SNP-only event
+ bool has_complex_variant = false;
+ for(size_t i = 0; i < candidateAlignments.size(); ++i)
+ {
+ if(candidateAlignments[i].numEvents > 0 && !candidateAlignments[i].isSNP)
+ has_complex_variant = true;
+ }
+
+ size_t MAX_ALIGNMENTS = 1;
if(candidateAlignments.size() > MAX_ALIGNMENTS)
return DRC_AMBIGUOUS_ALIGNMENT;
@@ -69,11 +78,17 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
int FLANKING_SIZE = 0;
if (parameters.dindelRealignParameters.realignMatePairs)
FLANKING_SIZE = 1000;
+
+ // all haplotypes, assembled and reference with the extra flanking sequence added
StringVector flankingHaplotypes;
+
+ // only the reference sequences, not the assembled haplotypes
+ StringVector referenceFlankingHaplotypes;
- // This vector contains the internal portion of the haplotypes, without the flanking sequence
+ // The internal portion of the haplotypes, without the flanking sequence.
// It is used to extract reads
StringVector candidateHaplotypes;
+
for(size_t i = 0; i < candidateAlignments.size(); ++i)
{
HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i],
@@ -81,10 +96,30 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
FLANKING_SIZE,
inHaplotypes,
flankingHaplotypes,
- candidateHaplotypes);
+ candidateHaplotypes,
+ referenceFlankingHaplotypes);
}
+
+ // There should be at least two flanking haplotypes, one that was assembled
+ // and one reference sequence found by alignment
+ if(flankingHaplotypes.size() < 2)
+ return DRC_NO_ALIGNMENT;
+
+ // Ensure the reference haplotype is a non-empty string
+ if(flankingHaplotypes[0].size() == 0)
+ return DRC_NO_ALIGNMENT;
+
+ // JS 26/08/14
+ // We now require the assembled haplotype to match uniquely to the reference
+ // and we require that the starting/ending k-mers of the haplotypes exactly
+ // match the reference. Enforce these checks now.
+ assert(referenceFlankingHaplotypes.size() == 1);
+
+ if(!HapgenUtil::checkHaplotypeKmersMatchReference(parameters.kmer, referenceFlankingHaplotypes.front(), inHaplotypes))
+ return DRC_AMBIGUOUS_ALIGNMENT;
+
if(Verbosity::Instance().getPrintLevel() > 3)
printf("runDindel -- made %zu flanking haplotypes\n", candidateHaplotypes.size());
@@ -149,17 +184,6 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
if (total_reads == 0)
return DRC_UNDER_DEPTH;
- // Generate the input haplotypes for dindel
- // We need at least 2 haplotypes (one is the reference)
- size_t totFlankingHaplotypes = flankingHaplotypes.size();
-
- if(totFlankingHaplotypes < 2)
- return DRC_NO_ALIGNMENT;
-
- // Ensure the reference haplotype is a non-empty string
- if(flankingHaplotypes[0].size() == 0)
- return DRC_NO_ALIGNMENT;
-
// Make Dindel referenceMappings
StringVector dindelHaplotypes;
std::set<DindelReferenceMapping> refMappings;
@@ -203,16 +227,27 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
it->referenceAlignmentScore = 1000;
// make flankingHaplotypes unique
- std::set< std::string > setFlanking(flankingHaplotypes.begin(), flankingHaplotypes.end());
-
- for(std::set< std::string >::const_iterator it = setFlanking.begin(); it != setFlanking.end(); it++)
+ // JS 27/08/14: After changing the alignment code to require
+ // k-mer matches between the reference and haplotype, the order
+ // that the haplotypes are sent to dindel matters. The following
+ // code was changed so that the unique reference haplotype is presented
+ // to dindel first.
+ std::set<std::string> setFlanking;
+
+ for(size_t fi = 0; fi < flankingHaplotypes.size(); ++fi)
{
- dindelHaplotypes.push_back(*it);
- //dindelRefMappings[i] = std::vector<DindelReferenceMapping>(refMappings.begin(),refMappings.end());
+ std::string& curr_haplotype = flankingHaplotypes[fi];
+ if(setFlanking.find(curr_haplotype) == setFlanking.end())
+ {
+ dindelHaplotypes.push_back(curr_haplotype);
+ setFlanking.insert(curr_haplotype);
+ }
}
std::vector<DindelReferenceMapping> dRefMappings(refMappings.begin(),refMappings.end());
- DindelWindow dWindow(dindelHaplotypes, dRefMappings);
+
+ // Initialize DindelWindow
+ DindelWindow dWindow(dindelHaplotypes, dRefMappings, parameters.kmer);
//
// Run Dindel
@@ -271,6 +306,16 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
pPreviousResult = pThisResult;
}
+ // Add additional tags to each VCF record
+ double average_hap_length = 0.0f;
+ for(size_t i = 0; i < variant_haplotypes.size(); ++i)
+ average_hap_length += variant_haplotypes[i].size();
+ average_hap_length /= variant_haplotypes.size();
+
+ for(size_t i = 0; i <= 1; ++i)
+ for(size_t j = 0; j < vcfCollections[i].records.size(); ++j)
+ vcfCollections[i].records[j].addComment("AvgHapLen", average_hap_length);
+
// Copy raw VCFRecords to output
for(size_t i = 0; i <= 1; ++i)
{
@@ -369,96 +414,6 @@ void DindelUtil::doMultipleReadHaplotypeAlignment(const std::vector<DindelRead>
}
}
-// Compute the best alignment of the haplotype collection to the reference
-DindelReturnCode DindelUtil::computeBestAlignment(const StringVector& inHaplotypes,
- const SeqItemVector& variantMates,
- const SeqItemVector& variantRCMates,
- const GraphCompareParameters& parameters,
- HapgenAlignment& bestAlignment)
-{
- size_t MAX_DEPTH = 2000;
- if(variantMates.size() + variantRCMates.size() > MAX_DEPTH)
- return DRC_OVER_DEPTH;
-
- //
- // Align the haplotypes to the reference genome to generate candidate alignments
- //
- HapgenAlignmentVector candidateAlignments;
- for(size_t i = 0; i < inHaplotypes.size(); ++i)
- HapgenUtil::alignHaplotypeToReferenceBWASW(inHaplotypes[i], parameters.referenceIndex, candidateAlignments);
-
- // Remove duplicate or bad alignment pairs
- HapgenUtil::coalesceAlignments(candidateAlignments);
-
- if(candidateAlignments.empty())
- return DRC_NO_ALIGNMENT;
-
- //
- // Score each candidate alignment against the mates of all the variant reads
- //
- int bestCandidate = -1;
- double bestAverageScoreFrac = 0.0f;
- double secondBest = 0.0f;
- for(size_t i = 0; i < candidateAlignments.size(); ++i)
- {
- // Compute the average score of the reads' mates to the flanking sequence
- StringVector referenceFlanking;
- StringVector referenceHaplotypes;
- HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i], parameters.pRefTable,
- 1000, inHaplotypes, referenceFlanking, referenceHaplotypes);
-
- // If valid flanking haplotypes could not be made, skip this alignment
- if(referenceFlanking.empty())
- continue;
-
- // Realign the mates
- LocalAlignmentResultVector localAlignments = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantMates);
- LocalAlignmentResultVector localAlignmentsRC = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantRCMates);
-
- // Merge alignments
- localAlignments.insert(localAlignments.end(), localAlignmentsRC.begin(), localAlignmentsRC.end());
-
- double sum = 0.0f;
- double count = 0.0f;
-
- for(size_t j = 0; j < localAlignments.size(); ++j)
- {
- double max_score = localAlignments[j].queryEndIndex - localAlignments[j].queryStartIndex + 1;
- double frac = (double)localAlignments[j].score / max_score;
- //printf("Score: %d frac: %lf\n", localAlignments[j].score, frac);
- sum += frac;
- count += 1;
- }
-
- double score = sum / count;
- if(score > bestAverageScoreFrac)
- {
- secondBest = bestAverageScoreFrac;
- bestAverageScoreFrac = score;
- bestCandidate = i;
- }
- else if(score > secondBest)
- {
- secondBest = score;
- }
-
- //printf("Alignment %zu mate-score: %lf\n", i, score);
- }
-
- if(bestCandidate == -1)
- return DRC_NO_ALIGNMENT;
-
- /*
- if(bestAverageScoreFrac < 0.9f)
- return DRC_POOR_ALIGNMENT;
-
- if(bestAverageScoreFrac - secondBest < 0.05f)
- return DRC_AMBIGUOUS_ALIGNMENT;
- */
- bestAlignment = candidateAlignments[bestCandidate];
- return DRC_OK;
-}
-
// Initialize an array of code counts
void DindelUtil::initializeCodeCounts(int counts[DRC_NUM_CODES])
{
diff --git a/src/GraphDiff/DindelUtil.h b/src/GraphDiff/DindelUtil.h
index f1ca353..d90f344 100644
--- a/src/GraphDiff/DindelUtil.h
+++ b/src/GraphDiff/DindelUtil.h
@@ -51,13 +51,6 @@ namespace DindelUtil
void doMultipleReadHaplotypeAlignment(const std::vector<DindelRead> & dReads,
const StringVector & haplotypes);
- // Compute the best alignment
- DindelReturnCode computeBestAlignment(const StringVector& inHaplotypes,
- const SeqItemVector& variantMates,
- const SeqItemVector& variantRCMates,
- const GraphCompareParameters& parameters,
- HapgenAlignment& bestAlignment);
-
// Convert a read-haplotype alignment to a reference alignment in BAM format
void readHaplotypeAlignmentToBAM(const std::string& readID,
const std::string& readSequence,
diff --git a/src/GraphDiff/GraphCompare.cpp b/src/GraphDiff/GraphCompare.cpp
index ef5bccf..ea6edfb 100644
--- a/src/GraphDiff/GraphCompare.cpp
+++ b/src/GraphDiff/GraphCompare.cpp
@@ -114,7 +114,6 @@ GraphCompareResult GraphCompare::process(const SequenceWorkItem& item) const
GraphCompareResult result;
SeqRecord currRead = item.read;
std::string w = item.read.seq.toString();
-
int len = w.size();
int num_kmers = len - m_parameters.kmer + 1;
@@ -339,8 +338,8 @@ void GraphCompare::qcVariantHaplotypes(bool bReferenceMode, StringVector& varian
for(size_t j = 0; j < n; j++)
num_uncovered_bases += covered_bases.test(j) ? 0 : 1;
- if(max_d < k / 2)
- haplotype = "";
+ //if(max_d < k / 2)
+ // haplotype = "";
if(Verbosity::Instance().getPrintLevel() > 2)
{
@@ -360,8 +359,8 @@ void GraphCompare::qcVariantHaplotypes(bool bReferenceMode, StringVector& varian
continue;
// Calculate the largest k such that the haplotype is walk through a de Bruijn graph of this k
- size_t max_variant_k = calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.variantIndex);
- size_t max_base_k = calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.baseIndex);
+ size_t max_variant_k = HapgenUtil::calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.variantIndex);
+ size_t max_base_k = HapgenUtil::calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.baseIndex);
if(Verbosity::Instance().getPrintLevel() > 2)
printf("HaplotypeQC hap[%zu] MVK: %zu MBK: %zu\n", i, max_variant_k, max_base_k);
@@ -444,36 +443,6 @@ void GraphCompare::buildParallelBaseHaplotypes(const StringVector& variant_haplo
}
//
-size_t GraphCompare::calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices) const
-{
- size_t min_k = 15;
- for(size_t k = 99; k >= min_k; --k)
- {
- if(sequence.size() < k)
- continue;
- bool covered = true;
- size_t nk = sequence.size() - k + 1;
- for(size_t i = 0; i < nk; ++i)
- {
- std::string kmer = sequence.substr(i, k);
- int c = BWTAlgorithms::countSequenceOccurrences(kmer, indices);
-
- if(c < min_depth)
- {
- covered = false;
- break;
- }
- }
-
- if(covered)
- return k;
- }
-
- return 0;
-}
-
-
-//
size_t GraphCompare::calculateHaplotypeBranches(const std::string& sequence,
size_t k,
size_t min_branch_depth,
@@ -505,21 +474,6 @@ size_t GraphCompare::calculateHaplotypeBranches(const std::string& sequence,
return num_branches;
}
-//
-IntVector GraphCompare::makeCountProfile(const std::string& str, size_t k, const BWT* pBWT, int max)
-{
- IntVector out;
- if(str.size() < k)
- return out;
-
- for(size_t i = 0; i < str.size() - k + 1; ++i)
- {
- int count = BWTAlgorithms::countSequenceOccurrences(str.substr(i, k), pBWT);
- out.push_back(count > max ? max : count);
- }
- return out;
-}
-
// test kmers from a file
void GraphCompare::testKmersFromFile(const std::string& kmerFilename)
{
diff --git a/src/GraphDiff/GraphCompare.h b/src/GraphDiff/GraphCompare.h
index 8f9b09a..ccab6b3 100644
--- a/src/GraphDiff/GraphCompare.h
+++ b/src/GraphDiff/GraphCompare.h
@@ -173,9 +173,6 @@ class GraphCompare
// Mark all the kmers in str as being visited
void markVariantSequenceKmers(const std::string& str) const;
- // Calculate the largest k such that every k-mer in the sequence is present at least min_depth times in the BWT
- size_t calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices) const;
-
// Calculate the number of high coverage branches off a haplotype path through the de Bruijn graph
size_t calculateHaplotypeBranches(const std::string& sequence, size_t k, size_t min_branch_depth, const BWTIndexSet& indices);
@@ -184,7 +181,6 @@ class GraphCompare
// Debug functions
bool transformVariantString(const std::string& inStr, std::string& outStr);
- IntVector makeCountProfile(const std::string& str, size_t k, const BWT* pBWT, int max);
void showMappingLocations(const std::string& str);
//
diff --git a/src/GraphDiff/HapgenUtil.cpp b/src/GraphDiff/HapgenUtil.cpp
index 874d93b..3ad7064 100644
--- a/src/GraphDiff/HapgenUtil.cpp
+++ b/src/GraphDiff/HapgenUtil.cpp
@@ -16,37 +16,6 @@
#include "Verbosity.h"
#include "overlapper.h"
-// Align the haplotype to the reference genome represented by the BWT/SSA pair
-void HapgenUtil::alignHaplotypeToReferenceBWASW(const std::string& haplotype,
- const BWTIndexSet& referenceIndex,
- HapgenAlignmentVector& outAlignments)
-{
- PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceBWASW")
- LRAlignment::LRParams params;
-
- params.zBest = 20;
-
- for(size_t i = 0; i <= 1; ++i)
- {
- LRAlignment::LRHitVector hits;
- std::string query = (i == 0) ? haplotype : reverseComplement(haplotype);
- LRAlignment::bwaswAlignment(query, referenceIndex.pBWT, referenceIndex.pSSA, params, hits);
-
- // Convert the hits into alignments
- for(size_t j = 0; j < hits.size(); ++j)
- {
- int q_alignment_length = hits[j].q_end - hits[j].q_start;
-
- // Skip non-complete alignments
- if((int)haplotype.length() == q_alignment_length)
- {
- HapgenAlignment aln(hits[j].targetID, hits[j].t_start, hits[j].length, hits[j].G, i == 1);
- outAlignments.push_back(aln);
- }
- }
- }
-}
-
struct CandidateKmerAlignment
{
size_t query_index;
@@ -61,8 +30,14 @@ struct CandidateKmerAlignment
};
typedef std::vector<CandidateKmerAlignment> CandidateVector;
+bool isValidFlankingKmerPair(size_t kidx_0, size_t kidx_1)
+{
+ return kidx_0 != std::string::npos && kidx_1 != std::string::npos && kidx_1 > kidx_0;
+}
+
// Align the haplotype to the reference genome represented by the BWT/SSA pair
-void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
+void HapgenUtil::alignHaplotypeToReferenceKmer(size_t align_k,
+ size_t assemble_k,
const std::string& haplotype,
const BWTIndexSet& referenceIndex,
const ReadTable* pReferenceTable,
@@ -71,7 +46,7 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
int64_t max_interval_size = 4;
- if(haplotype.size() < k)
+ if(haplotype.size() < align_k)
return;
std::vector<int> event_count_vector;
@@ -87,10 +62,10 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
// Find shared kmers between the haplotype and the reference
CandidateVector candidates;
- size_t nqk = query.size() - k + 1;
+ size_t nqk = query.size() - align_k + 1;
for(size_t j = 0; j < nqk; ++j)
{
- std::string kmer = query.substr(j, k);
+ std::string kmer = query.substr(j, align_k);
// Find the interval of this kmer in the reference
BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
@@ -121,7 +96,7 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
for(size_t j = 0; j < candidates.size(); ++j)
{
// Extract window around reference
- size_t window_size = 200;
+ size_t window_size = 500;
int ref_start = candidates[j].target_extrapolated_start - window_size;
int ref_end = candidates[j].target_extrapolated_end + window_size;
const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id);
@@ -134,8 +109,16 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);
+ // Ensure that this alignment location is valid by checking for the presence of both unique flanking kmers
+ size_t kidx_0, kidx_1;
+ getFlankingKmerMatch(assemble_k, ref_substring, query, kidx_0, kidx_1);
+
+ if(!isValidFlankingKmerPair(kidx_0, kidx_1))
+ continue;
+
// Align haplotype to the reference
- SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query);
+ SequenceOverlap overlap = alignHaplotypeToReference(assemble_k, ref_substring, query);
+
if(overlap.score < 0 || !overlap.isValid())
continue;
@@ -144,6 +127,7 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
int alignment_length = alignment_end - alignment_start + 1;
// Crude count of the number of distinct variation events
+ bool has_indel = false;
int num_events = overlap.edit_distance;
std::stringstream c_parser(overlap.cigar);
int len;
@@ -154,28 +138,40 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
// Only count one event per insertion/deletion
if(t == 'D' || t == 'I')
+ {
num_events -= (len - 1);
+ has_indel = true;
+ }
}
// Skip poor alignments
- double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f);
- if(mismatch_rate > 0.05f || overlap.total_columns < 50)
+ double event_rate = (double)num_events / overlap.total_columns;
+ if(event_rate > 0.05f || overlap.total_columns < 50)
{
if(Verbosity::Instance().getPrintLevel() > 4)
{
printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n",
- 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
+ 1.0f - event_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
}
continue;
}
- HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse);
+ bool is_snp = !has_indel && overlap.edit_distance == 1;
+
+ HapgenAlignment aln(candidates[j].target_sequence_id,
+ alignment_start,
+ alignment_length,
+ overlap.score,
+ num_events,
+ is_reverse,
+ is_snp);
+
tmp_alignments.push_back(aln);
event_count_vector.push_back(num_events);
if(Verbosity::Instance().getPrintLevel() > 4)
{
printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n",
- 1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
+ 1.0f - event_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
}
// Record the best edit distance
if(num_events < min_events)
@@ -198,13 +194,85 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
}
}
-SequenceOverlap HapgenUtil::alignHaplotypeToReference(const std::string& reference,
+// Get the index of the first and last k-mers of the haplotype
+// in the reference sequence
+//
+// ----------==========-----------------============------ reference
+// ==========-----------------=========== haplotype
+// ^ start_idx ^ end_idx
+// These will be std::string::npos if the kmers are not on the reference
+void HapgenUtil::getFlankingKmerMatch(size_t k,
+ const std::string& reference,
+ const std::string& haplotype,
+ size_t& start_idx,
+ size_t& end_idx)
+{
+ // Trim reference string so that it has flanking k-mer matches with the haplotype
+ std::string start_kmer = haplotype.substr(0, k);
+ std::string end_kmer = haplotype.substr(haplotype.size() - k, k);
+
+ start_idx = reference.find(start_kmer);
+ end_idx = reference.find(end_kmer);
+}
+
+//
+SequenceOverlap HapgenUtil::alignHaplotypeToReference(size_t k,
+ const std::string& reference,
const std::string& haplotype)
{
- SequenceOverlap overlap = Overlapper::computeOverlapAffine(reference, haplotype);
+
+ // Get position of flanking k-mer matches
+ size_t kidx_0, kidx_1;
+ getFlankingKmerMatch(k, reference, haplotype, kidx_0, kidx_1);
+
+ if(Verbosity::Instance().getPrintLevel() > 5)
+ {
+ printf("R: %s\n", reference.c_str());
+ printf("H: %s\n", haplotype.c_str());
+ printf("K: %zu\n", k);
+ printf("KI0: %zu KI1: %zu\n", kidx_0, kidx_1);
+ }
+
+ // This function assumes that the flanking k-mers are in both strings
+ assert(isValidFlankingKmerPair(kidx_0, kidx_1));
+
+ // trim reference string so we can perform global alignment
+ std::string reference_substring = reference.substr(kidx_0, kidx_1 - kidx_0 + k);
+
+ // set up scoring scheme
+ OverlapperParams params = affine_default_params;
+ params.type = ALT_GLOBAL;
+ params.gap_ext_penalty = 0;
+
+ //
+ SequenceOverlap overlap = Overlapper::computeAlignmentAffine(reference_substring, haplotype, params);
+
+ // Offset alignment location by the location of the first kmer in the reference haplotype
+ overlap.match[0].start += kidx_0;
+ overlap.match[0].end += kidx_0;
+
+ if(Verbosity::Instance().getPrintLevel() > 5)
+ {
+ overlap.printAlignment(reference, haplotype);
+ }
return overlap;
}
+// Returns true if the start/end kmer for all haplotypes occur in the reference string
+bool HapgenUtil::checkHaplotypeKmersMatchReference(size_t k,
+ const std::string& reference,
+ const StringVector& inHaplotypes)
+{
+ for(size_t i = 0; i < inHaplotypes.size(); ++i)
+ {
+ size_t kidx_0, kidx_1;
+ getFlankingKmerMatch(k, reference, inHaplotypes[i], kidx_0, kidx_1);
+
+ if(!isValidFlankingKmerPair(kidx_0, kidx_1))
+ return false;
+ }
+ return true;
+}
// Coalesce a set of alignments into distinct locations
void HapgenUtil::coalesceAlignments(HapgenAlignmentVector& alignments)
@@ -225,7 +293,7 @@ void HapgenUtil::coalesceAlignments(HapgenAlignmentVector& alignments)
outAlignments.push_back(alignments[alignments.size()-1]);
// Kees: start from back because alignments are sorted in order of increasing score
- for(size_t i = alignments.size()-1; i-- > 0;)
+ for(size_t i = alignments.size() - 1; i-- > 0;)
{
// Check this alignment against the last alignment added to the output set
HapgenAlignment& prevAlign = outAlignments.back();
@@ -305,7 +373,8 @@ bool HapgenUtil::makeFlankingHaplotypes(const HapgenAlignment& aln,
int flanking,
const StringVector& inHaplotypes,
StringVector& outFlankingHaplotypes,
- StringVector& outHaplotypes)
+ StringVector& outHaplotypes,
+ StringVector& outFlankingReferenceHaplotypes)
{
std::string upstream;
std::string referenceHaplotype;
@@ -329,6 +398,7 @@ bool HapgenUtil::makeFlankingHaplotypes(const HapgenAlignment& aln,
std::string referenceFlanking = upstream + referenceHaplotype + downstream;
outFlankingHaplotypes.push_back(referenceFlanking);
outHaplotypes.push_back(referenceHaplotype);
+ outFlankingReferenceHaplotypes.push_back(referenceFlanking);
// Check that all sequences match the reference haplotype properly
/*
@@ -535,6 +605,101 @@ LocalAlignmentResultVector HapgenUtil::alignReadsLocally(const std::string& targ
return results;
}
+//
+double HapgenUtil::calculateDustScoreAtPosition(const std::string& name,
+ int position,
+ const ReadTable* pRefTable,
+ int window_size)
+{
+ int dw_start = position - window_size / 2;
+ dw_start = std::max(0, dw_start); // clamp to 0
+
+ int dw_end = position + window_size / 2;
+ const DNAString& chromosome = pRefTable->getRead(name).seq;
+ dw_end = std::min(dw_end, (int)chromosome.length());
+
+ double dust_score = 0.0f;
+ if(dw_end - dw_start == window_size)
+ {
+ std::string ref_dust_window = chromosome.substr(dw_start, dw_end - dw_start);
+ dust_score = calculateDustScore(ref_dust_window);
+ }
+ return dust_score;
+}
+
+//
+size_t HapgenUtil::getMaximumOneEdit(const std::string& str, const BWTIndexSet& indices)
+{
+ size_t max = 0;
+ std::string t = str;
+
+ //
+ for(size_t i = 0; i < str.size(); ++i)
+ {
+ char tmp = t[i];
+ for(int bi = 0; bi < DNA_ALPHABET::size; ++bi)
+ {
+ char b = DNA_ALPHABET::getBase(bi);
+ if(b != tmp)
+ {
+ t[i] = b;
+ size_t count = BWTAlgorithms::countSequenceOccurrences(t, indices);
+ if(count > max)
+ max = count;
+ }
+ }
+ t[i] = tmp;
+ }
+
+ printf("MaxOneEdit %zu\n", max);
+ return max;
+}
+
+// Calculate the largest k such that every k-mer in the sequence is present at least min_depth times in the BWT
+size_t HapgenUtil::calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices)
+{
+ size_t min_k = 15;
+ for(size_t k = 99; k >= min_k; --k)
+ {
+ if(sequence.size() < k)
+ continue;
+ bool covered = true;
+ size_t nk = sequence.size() - k + 1;
+ for(size_t i = 0; i < nk; ++i)
+ {
+ std::string kmer = sequence.substr(i, k);
+ int c = BWTAlgorithms::countSequenceOccurrences(kmer, indices);
+
+ if(c < min_depth)
+ {
+ covered = false;
+ break;
+ }
+ }
+
+ if(covered)
+ return k;
+ }
+
+ return 0;
+}
+
+//
+std::vector<int> HapgenUtil::makeCountProfile(const std::string& str, size_t k, int max, const BWTIndexSet& indices)
+{
+ std::vector<int> out;
+ if(str.size() < k)
+ return out;
+
+ for(size_t i = 0; i < str.size() - k + 1; ++i)
+ {
+ int count = BWTAlgorithms::countSequenceOccurrences(str.substr(i, k), indices);
+ out.push_back(count > max ? max : count);
+ }
+ return out;
+}
+
+
// Print an alignment to a reference
void HapgenUtil::printAlignment(const std::string& query, const HapgenAlignment& aln, const ReadTable* pRefTable)
{
diff --git a/src/GraphDiff/HapgenUtil.h b/src/GraphDiff/HapgenUtil.h
index cd0a373..78822aa 100644
--- a/src/GraphDiff/HapgenUtil.h
+++ b/src/GraphDiff/HapgenUtil.h
@@ -20,8 +20,14 @@
struct HapgenAlignment
{
//
- HapgenAlignment() : referenceID(-1), position(-1), length(0), score(-1), isRC(false) {}
- HapgenAlignment(const int& id, int p, int l, int s, bool rc) : referenceID(id), position(p), length(l), score(s), isRC(rc) {}
+ HapgenAlignment() : referenceID(-1), position(-1), length(0), score(-1), numEvents(0), isRC(false), isSNP(false) {}
+ HapgenAlignment(const int& id, int p, int l, int s, int ne, bool rc, bool snp) : referenceID(id),
+ position(p),
+ length(l),
+ score(s),
+ numEvents(ne),
+ isRC(rc),
+ isSNP(snp) {}
// Sort
// Kees: added sorting on alignment score
@@ -51,27 +57,25 @@ struct HapgenAlignment
int position;
int length; // alignment length
int score;
+ int numEvents;
bool isRC;
+ bool isSNP;
};
typedef std::vector<HapgenAlignment> HapgenAlignmentVector;
//
namespace HapgenUtil
{
-
-
- // Align the haplotype to the reference genome represented by the BWT/SSA pair
- void alignHaplotypeToReferenceBWASW(const std::string& haplotype,
- const BWTIndexSet& referenceIndex,
- HapgenAlignmentVector& outAlignments);
-
//
- void alignHaplotypeToReferenceKmer(size_t k,
+ void alignHaplotypeToReferenceKmer(size_t align_k,
+ size_t assemble_k,
const std::string& haplotype,
const BWTIndexSet& referenceIndex,
const ReadTable* pReferenceTable,
HapgenAlignmentVector& outAlignments);
+
+
// Coalesce a set of alignments into distinct locations
void coalesceAlignments(HapgenAlignmentVector& alignments);
@@ -96,7 +100,8 @@ namespace HapgenUtil
int flanking,
const StringVector& inHaplotypes,
StringVector& outFlankingHaplotypes,
- StringVector& outHaplotypes);
+ StringVector& outHaplotypes,
+ StringVector& outFlankingReferenceHaplotypes);
// Extract reads from an FM-index that have a k-mer match to any given haplotypes
@@ -121,11 +126,26 @@ namespace HapgenUtil
SeqRecordVector* pOutReads,
SeqRecordVector* pOutMates);
+ // Get the index of the first and last k-mers of the haplotype
+ // in the reference sequence
+ void getFlankingKmerMatch(size_t k,
+ const std::string& reference,
+ const std::string& haplotype,
+ size_t& start_idx,
+ size_t& end_idx);
// Perform an alignment between a haplotype sequence and a reference sequence
- SequenceOverlap alignHaplotypeToReference(const std::string& reference,
+ // This function assumes the sequences share the first/last k bases
+ SequenceOverlap alignHaplotypeToReference(size_t k,
+ const std::string& reference,
const std::string& haplotype);
+ // Returns true if the start/end kmer for all haplotypes occur in the reference string
+ bool checkHaplotypeKmersMatchReference(size_t k,
+ const std::string& reference,
+ const StringVector& inHaplotypes);
+
+
// Compute the best local alignment for each read in the array to the given sequence
LocalAlignmentResultVector alignReadsLocally(const std::string& target, const SeqItemVector& reads);
@@ -133,6 +153,21 @@ namespace HapgenUtil
// of the passed in sequence
bool checkAlignmentsAreConsistent(const std::string& refString, const StringVector& queries);
+ // Calculate the dust score around a reference position
+ double calculateDustScoreAtPosition(const std::string& name,
+ int position,
+ const ReadTable* pRefTable,
+ int window_size = 64);
+
+ // Return the count of the most frequent string within one-edit distance of the given string
+ size_t getMaximumOneEdit(const std::string& str, const BWTIndexSet& indices);
+
+ // Calculate the largest k such that every k-mer in the sequence is present at least min_depth times in the BWT
+ size_t calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices);
+
+ // Count the number of times each k-mer of str is in the BWT
+ std::vector<int> makeCountProfile(const std::string& str, size_t k, int max, const BWTIndexSet& indices);
+
// Print an alignment to a reference
void printAlignment(const std::string& query, const HapgenAlignment& aln, const ReadTable* pRefTable);
diff --git a/src/SGA/Makefile.am b/src/SGA/Makefile.am
index 0871a96..4931ef5 100644
--- a/src/SGA/Makefile.am
+++ b/src/SGA/Makefile.am
@@ -15,9 +15,9 @@ sga_CPPFLAGS = \
sga_LDADD = \
$(top_builddir)/Scaffold/libscaffold.a \
$(top_builddir)/GraphDiff/libgraphdiff.a \
- $(top_builddir)/StringGraph/libstringgraph.a \
+ $(top_builddir)/StringGraph/libstringgraph.a \
$(top_builddir)/Concurrency/libconcurrency.a \
- $(top_builddir)/Algorithm/libalgorithm.a \
+ $(top_builddir)/Algorithm/libalgorithm.a \
$(top_builddir)/SuffixTools/libsuffixtools.a \
$(top_builddir)/Bigraph/libbigraph.a \
$(top_builddir)/Util/libutil.a \
@@ -32,15 +32,16 @@ sga_SOURCES = sga.cpp \
assemble.cpp assemble.h \
correct.cpp correct.h \
oview.cpp oview.h \
- preprocess.cpp preprocess.h \
+ preprocess.cpp preprocess.h \
rmdup.cpp rmdup.h \
merge.cpp merge.h \
- subgraph.cpp subgraph.h \
- scaffold.cpp scaffold.h \
- scaffold2fasta.cpp scaffold2fasta.h \
+ subgraph.cpp subgraph.h \
+ scaffold.cpp scaffold.h \
+ scaffold2fasta.cpp scaffold2fasta.h \
connect.cpp connect.h \
walk.cpp walk.h \
filter.cpp filter.h \
+ kmer-count.cpp kmer-count.h \
stats.cpp stats.h \
fm-merge.cpp fm-merge.h \
gmap.h gmap.cpp \
@@ -49,10 +50,13 @@ sga_SOURCES = sga.cpp \
gen-ssa.h gen-ssa.cpp \
bwt2fa.h bwt2fa.cpp \
graph-diff.h graph-diff.cpp \
+ graph-concordance.h graph-concordance.cpp \
gapfill.h gapfill.cpp \
preqc.h preqc.cpp \
+ overlap-long.h overlap-long.cpp \
variant-detectability.h variant-detectability.cpp \
rewrite-evidence-bam.h rewrite-evidence-bam.cpp \
+ somatic-variant-filters-bam.h somatic-variant-filters.cpp \
haplotype-filter.h haplotype-filter.cpp \
OverlapCommon.h OverlapCommon.cpp \
SGACommon.h
diff --git a/src/SGA/SGACommon.h b/src/SGA/SGACommon.h
index 9cbdd05..29b0158 100644
--- a/src/SGA/SGACommon.h
+++ b/src/SGA/SGACommon.h
@@ -30,5 +30,6 @@
// Default values
#define DEFAULT_MIN_OVERLAP 45
#define DEFAULT_EXTRACT_LEN 100
-
+#define DEFAULT_MIN_COUNT_MAX_BASE 4
+#define DEFAULT_COUNT_OFFSET 1
#endif
diff --git a/src/SGA/correct.cpp b/src/SGA/correct.cpp
index 1958cea..8e15eb4 100644
--- a/src/SGA/correct.cpp
+++ b/src/SGA/correct.cpp
@@ -30,6 +30,8 @@
// Functions
int learnKmerParameters(const BWT* pBWT);
+//#define OVERLAPCORRECTION_VERBOSE 1
+
//
// Getopt
//
@@ -58,10 +60,15 @@ static const char *CORRECT_USAGE_MESSAGE =
" -k, --kmer-size=N The length of the kmer to use. (default: 31)\n"
" -x, --kmer-threshold=N Attempt to correct kmers that are seen less than N times. (default: 3)\n"
" -i, --kmer-rounds=N Perform N rounds of k-mer correction, correcting up to N bases (default: 10)\n"
+" -O, --count-offset=N When correcting a kmer, require the count of the new kmer is at least +N higher than the count of the old kmer. (default: 1)\n"
" --learn Attempt to learn the k-mer correction threshold (experimental). Overrides -x parameter.\n"
"\nOverlap correction parameters:\n"
" -e, --error-rate the maximum error rate allowed between two sequences to consider them overlapped (default: 0.04)\n"
" -m, --min-overlap=LEN minimum overlap required between two reads (default: 45)\n"
+" -M, --min-count-max-base=INT minimum count of the base that has the highest count in overlap correction.\n"
+" The base of the read is only corrected if the maximum base has at least this count.\n"
+" Should avoid mis-corrections in low coverage regions (default: 4)\n"
+" -X, --base-threshold=N Attempt to correct bases in a read that are seen less than N times (default: 2)\n"
" -c, --conflict=INT use INT as the threshold to detect a conflicted base in the multi-overlap (default: 5)\n"
" -l, --seed-length=LEN force the seed length to be LEN. By default, the seed length in the overlap step\n"
" is calculated to guarantee all overlaps with --error-rate differences are found.\n"
@@ -92,12 +99,15 @@ namespace opt
static double errorRate = 0.04;
static unsigned int minOverlap = DEFAULT_MIN_OVERLAP;
+ static unsigned int min_count_max_base = DEFAULT_MIN_COUNT_MAX_BASE;
+ static unsigned int countOffset = DEFAULT_COUNT_OFFSET;
static int seedLength = 0;
static int seedStride = 0;
static int conflictCutoff = 5;
static int branchCutoff = -1;
static int kmerLength = 31;
+ static int base_threshold = 2;
static int kmerThreshold = 3;
static int numKmerRounds = 10;
static bool bLearnKmerParams = false;
@@ -106,7 +116,7 @@ namespace opt
static ErrorCorrectAlgorithm algorithm = ECA_KMER;
}
-static const char* shortopts = "p:m:d:e:t:l:s:o:r:b:a:c:k:x:i:v";
+static const char* shortopts = "p:m:M:O:d:e:t:l:s:o:r:b:a:c:k:x:X:i:v";
enum { OPT_HELP = 1, OPT_VERSION, OPT_METRICS, OPT_DISCARD, OPT_LEARN };
@@ -114,6 +124,8 @@ static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
{ "threads", required_argument, NULL, 't' },
{ "min-overlap", required_argument, NULL, 'm' },
+ { "min-count-max-base", required_argument, NULL, 'M' },
+ { "count-offset", required_argument, NULL, 'O' },
{ "rounds", required_argument, NULL, 'r' },
{ "outfile", required_argument, NULL, 'o' },
{ "prefix", required_argument, NULL, 'p' },
@@ -126,6 +138,7 @@ static const struct option longopts[] = {
{ "branch-cutoff", required_argument, NULL, 'b' },
{ "kmer-size", required_argument, NULL, 'k' },
{ "kmer-threshold",required_argument, NULL, 'x' },
+ { "base-threshold",required_argument, NULL, 'X' },
{ "kmer-rounds", required_argument, NULL, 'i' },
{ "learn", no_argument, NULL, OPT_LEARN },
{ "discard", no_argument, NULL, OPT_DISCARD },
@@ -181,6 +194,9 @@ int correctMain(int argc, char** argv)
ecParams.algorithm = opt::algorithm;
ecParams.minOverlap = opt::minOverlap;
+ ecParams.min_count_max_base = opt::min_count_max_base;
+ ecParams.countOffset = opt::countOffset;
+ ecParams.base_threshold = opt::base_threshold;
ecParams.numOverlapRounds = opt::numOverlapRounds;
ecParams.minIdentity = 1.0f - opt::errorRate;
ecParams.conflictCutoff = opt::conflictCutoff;
@@ -189,6 +205,10 @@ int correctMain(int argc, char** argv)
ecParams.kmerLength = opt::kmerLength;
ecParams.printOverlaps = opt::verbose > 0;
+ printf("ecParams.min_count_max_base = %d\n",ecParams.min_count_max_base);
+ printf("ecParams.base_threshold = %d\n",ecParams.base_threshold);
+ printf("ecParams.countOffset = %d\n",ecParams.countOffset);
+
// Setup post-processor
bool bCollectMetrics = !opt::metricsFile.empty();
ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics);
@@ -313,6 +333,8 @@ void parseCorrectOptions(int argc, char** argv)
switch (c)
{
case 'm': arg >> opt::minOverlap; break;
+ case 'M': arg >> opt::min_count_max_base; break;
+ case 'O': arg >> opt::countOffset; break;
case 'p': arg >> opt::prefix; break;
case 'o': arg >> opt::outFile; break;
case 'e': arg >> opt::errorRate; break;
@@ -325,6 +347,7 @@ void parseCorrectOptions(int argc, char** argv)
case 'c': arg >> opt::conflictCutoff; break;
case 'k': arg >> opt::kmerLength; break;
case 'x': arg >> opt::kmerThreshold; break;
+ case 'X': arg >> opt::base_threshold; break;
case '?': die = true; break;
case 'v': opt::verbose++; break;
case 'b': arg >> opt::branchCutoff; break;
@@ -382,6 +405,24 @@ void parseCorrectOptions(int argc, char** argv)
die = true;
}
+ if(opt::base_threshold <= 0)
+ {
+ std::cerr << SUBPROGRAM ": invalid base threshold: " << opt::base_threshold << ", must be greater than zero\n";
+ die = true;
+ }
+
+ if(opt::min_count_max_base <= 0)
+ {
+ std::cerr << SUBPROGRAM ": invalid min_count_max_base: " << opt::min_count_max_base << ", must be greater than zero\n";
+ die = true;
+ }
+
+ if(opt::countOffset <= 0)
+ {
+ std::cerr << SUBPROGRAM ": invalid countOffset: " << opt::countOffset << ", must be greater than zero. Otherwise, a kmer could be corrected to a kmer with the same count.\n";
+ die = true;
+ }
+
// Determine the correction algorithm to use
if(!algo_str.empty())
{
diff --git a/src/SGA/filter.cpp b/src/SGA/filter.cpp
index 9451134..f28614a 100644
--- a/src/SGA/filter.cpp
+++ b/src/SGA/filter.cpp
@@ -61,6 +61,7 @@ static const char *FILTER_USAGE_MESSAGE =
" --no-duplicate-check turn off duplicate removal\n"
" --substring-only when removing duplicates, only remove substring sequences, not full-length matches\n"
" --no-kmer-check turn off the kmer check\n"
+" --kmer-both-strand mimimum kmer coverage is required for both strand\n"
" --homopolymer-check check reads for hompolymer run length sequencing errors\n"
" --low-complexity-check filter out low complexity reads\n"
"\nK-mer filter options:\n"
@@ -84,6 +85,7 @@ namespace opt
static bool dupCheck = true;
static bool substringOnly = false;
static bool kmerCheck = true;
+ static bool kmerBothStrand = false;
static bool hpCheck = false;
static bool lowComplexityCheck = false;
@@ -93,7 +95,7 @@ namespace opt
static const char* shortopts = "p:d:t:o:k:x:v";
-enum { OPT_HELP = 1, OPT_VERSION, OPT_SUBSTRING_ONLY, OPT_NO_RMDUP, OPT_NO_KMER, OPT_CHECK_HPRUNS, OPT_CHECK_COMPLEXITY };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_SUBSTRING_ONLY, OPT_NO_RMDUP, OPT_NO_KMER, OPT_KMER_BOTH_STRAND, OPT_CHECK_HPRUNS, OPT_CHECK_COMPLEXITY };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -107,6 +109,7 @@ static const struct option longopts[] = {
{ "version", no_argument, NULL, OPT_VERSION },
{ "no-duplicate-check", no_argument, NULL, OPT_NO_RMDUP },
{ "no-kmer-check", no_argument, NULL, OPT_NO_KMER },
+ { "kmer-both-strand", no_argument, NULL, OPT_KMER_BOTH_STRAND },
{ "homopolymer-check", no_argument, NULL, OPT_CHECK_HPRUNS },
{ "low-complexity-check", no_argument, NULL, OPT_CHECK_COMPLEXITY },
{ "substring-only", no_argument, NULL, OPT_SUBSTRING_ONLY },
@@ -145,6 +148,7 @@ int filterMain(int argc, char** argv)
params.checkDuplicates = opt::dupCheck;
params.substringOnly = opt::substringOnly;
params.checkKmer = opt::kmerCheck;
+ params.kmerBothStrand = opt::kmerBothStrand;
params.checkHPRuns = opt::hpCheck;
params.checkDegenerate = opt::lowComplexityCheck;
@@ -191,7 +195,7 @@ int filterMain(int argc, char** argv)
delete pSharedBV;
// Rebuild the FM-index without the discarded reads
- std::string out_prefix = stripFilename(opt::outFile);
+ std::string out_prefix = stripExtension(opt::outFile);
removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads);
removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads);
@@ -223,6 +227,7 @@ void parseFilterOptions(int argc, char** argv)
case 'x': arg >> opt::kmerThreshold; break;
case OPT_NO_RMDUP: opt::dupCheck = false; break;
case OPT_NO_KMER: opt::kmerCheck = false; break;
+ case OPT_KMER_BOTH_STRAND: opt::kmerBothStrand = true; break;
case OPT_CHECK_HPRUNS: opt::hpCheck = true; break;
case OPT_CHECK_COMPLEXITY: opt::lowComplexityCheck = true; break;
case OPT_SUBSTRING_ONLY: opt::substringOnly = true; break;
@@ -277,7 +282,7 @@ void parseFilterOptions(int argc, char** argv)
if(opt::prefix.empty())
{
- opt::prefix = stripFilename(opt::readsFile);
+ opt::prefix = stripExtension(opt::readsFile);
}
if(opt::outFile.empty())
@@ -287,6 +292,9 @@ void parseFilterOptions(int argc, char** argv)
}
else
{
- opt::discardFile = stripFilename(opt::outFile) + ".discard.fa";
+ if(isGzip(opt::outFile))
+ opt::discardFile = stripExtension(opt::outFile) + ".discard.fa" + GZIP_EXT;
+ else
+ opt::discardFile = stripExtension(opt::outFile) + ".discard.fa";
}
}
diff --git a/src/SGA/fm-merge.cpp b/src/SGA/fm-merge.cpp
index dd06119..649d9cf 100644
--- a/src/SGA/fm-merge.cpp
+++ b/src/SGA/fm-merge.cpp
@@ -39,7 +39,7 @@ SUBPROGRAM " Version " PACKAGE_VERSION "\n"
static const char *FMMERGE_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... READSFILE\n"
"Merge unambiguously sequences from the READSFILE using the FM-index.\n"
-"This program requires rmdup to be run before it.\n"
+"This program requires filter to be run before it and rmdup to be run after.\n"
"\n"
" --help display this help and exit\n"
" -v, --verbose display verbose output\n"
@@ -209,7 +209,7 @@ void parseFMMergeOptions(int argc, char** argv)
if(opt::prefix.empty())
{
- opt::prefix = stripFilename(opt::readsFile);
+ opt::prefix = stripExtension(opt::readsFile);
}
if(opt::outFile.empty())
diff --git a/src/SGA/gen-ssa.cpp b/src/SGA/gen-ssa.cpp
index 9bc916a..2125cda 100644
--- a/src/SGA/gen-ssa.cpp
+++ b/src/SGA/gen-ssa.cpp
@@ -39,6 +39,7 @@ static const char *GENSSA_USAGE_MESSAGE =
" --help display this help and exit\n"
" -t, --threads=NUM use NUM threads to construct the index (default: 1)\n"
" -c, --check validate that the suffix array/bwt is correct\n"
+" -s, --sai-only only build the lexicographic index\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
namespace opt
@@ -48,17 +49,19 @@ namespace opt
static std::string prefix;
static int numThreads = 1;
static int sampleRate = 32;
+ static bool saiOnly = false;
static bool validate = false;
}
-static const char* shortopts = "t:cv";
+static const char* shortopts = "t:cvs";
-enum { OPT_HELP = 1, OPT_VERSION };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_SAI_ONLY };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
{ "check", no_argument, NULL, 'c' },
{ "threads", required_argument, NULL, 't' },
+ { "sai-only", no_argument, NULL, OPT_SAI_ONLY },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ NULL, 0, NULL, 0 }
@@ -70,19 +73,27 @@ int genSSAMain(int argc, char** argv)
parseGenSSAOptions(argc, argv);
BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
- ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID);
pBWT->printInfo();
SampledSuffixArray* pSSA = new SampledSuffixArray();
- pSSA->build(pBWT, pRIT, opt::sampleRate);
+ if(opt::saiOnly)
+ {
+ pSSA->buildLexicoIndex(pBWT, opt::numThreads);
+ pSSA->writeLexicoIndex(opt::prefix + SAI_EXT);
+ }
+ else
+ {
+ ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID);
+ pSSA->build(pBWT, pRIT, opt::sampleRate);
+ pSSA->writeSSA(opt::prefix + SSA_EXT);
+ delete pRIT;
+ }
pSSA->printInfo();
- pSSA->writeSSA(opt::prefix + SSA_EXT);
if(opt::validate)
pSSA->validate(opt::readsFile, pBWT);
delete pBWT;
- delete pRIT;
delete pSSA;
return 0;
@@ -103,6 +114,7 @@ void parseGenSSAOptions(int argc, char** argv)
case 'c': opt::validate = true; break;
case 't': arg >> opt::numThreads; break;
case 'v': opt::verbose++; break;
+ case OPT_SAI_ONLY: opt::saiOnly = true; break;
case OPT_HELP:
std::cout << GENSSA_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
diff --git a/src/SGA/graph-concordance.cpp b/src/SGA/graph-concordance.cpp
new file mode 100644
index 0000000..9809b57
--- /dev/null
+++ b/src/SGA/graph-concordance.cpp
@@ -0,0 +1,487 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// graph-concordance - check variants in a VCF file
+// against an assembly graph
+//
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iterator>
+#include "Util.h"
+#include "graph-concordance.h"
+#include "BWTAlgorithms.h"
+#include "BWTIndexSet.h"
+#include "SGACommon.h"
+#include "Timer.h"
+#include "SequenceProcessFramework.h"
+#include "HapgenUtil.h"
+#include "SGAStats.h"
+#include "HashMap.h"
+#include "DindelRealignWindow.h"
+#include "VariantIndex.h"
+#include "HapgenUtil.h"
+
+//
+// Getopt
+//
+#define SUBPROGRAM "graph-concordance"
+static const char *GRAPH_CONCORDANCE_VERSION_MESSAGE =
+SUBPROGRAM " Version " PACKAGE_VERSION "\n"
+"Written by Jared Simpson.\n"
+"\n"
+"Copyright 2013 Wellcome Trust Sanger Institute\n";
+
+static const char *GRAPH_CONCORDANCE_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... -r variant.fastq -b base.fastq VCF_FILE\n"
+"Count read support for variants in a vcf file\n"
+"\n"
+" --help display this help and exit\n"
+" -v, --verbose display verbose output\n"
+" --reference=STR load the reference genome from FILE\n"
+" -t, --threads=NUM use NUM threads to compute the overlaps (default: 1)\n"
+" -g, --germline=FILE load germline variants from FILE\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+static const char* PROGRAM_IDENT =
+PACKAGE_NAME "::" SUBPROGRAM;
+
+namespace opt
+{
+ static unsigned int verbose;
+ static int k = 51;
+ static std::string variantFile;
+ static std::string baseFile;
+ static std::string haplotypeFile;
+ static std::string vcfFile;
+ static std::string referenceFile;
+ static std::string germlineFile;
+}
+
+static const char* shortopts = "b:r:o:k:t:g:v";
+
+enum { OPT_HELP = 1, OPT_VERSION, OPT_REFERENCE, OPT_HAPLOID };
+
+static const struct option longopts[] = {
+ { "verbose", no_argument, NULL, 'v' },
+ { "threads", required_argument, NULL, 't' },
+ { "outfile", required_argument, NULL, 'o' },
+ { "reads", required_argument, NULL, 'r' },
+ { "base", required_argument, NULL, 'b' },
+ { "variant", required_argument, NULL, 'r' },
+ { "germline", required_argument, NULL, 'g' },
+ { "reference", required_argument, NULL, OPT_REFERENCE },
+ { "haploid", no_argument, NULL, OPT_HAPLOID },
+ { "help", no_argument, NULL, OPT_HELP },
+ { "version", no_argument, NULL, OPT_VERSION },
+ { NULL, 0, NULL, 0 }
+};
+
+std::string applyVariant(const std::string& in, int pos,
+ const std::string& ref, const std::string& alt,
+ std::vector<int>& coordinate_map,
+ bool update_map, int variant_tag = -1)
+{
+ assert(variant_tag < 0);
+ std::string out = in;
+
+ // Search the coordinate map for the index of the ref string to be substituted
+ size_t idx = 0;
+ while(idx < coordinate_map.size())
+ {
+ if(coordinate_map[idx] == pos)
+ break;
+ idx += 1;
+ }
+
+ // If we could not find the position in the coordinate map
+ // then the input variant is for a reference base that has been removed
+ // already. Skip it.
+ if(idx == coordinate_map.size())
+ return "";
+
+ /*
+ std::cout << "In: " << in << "-\n";
+ std::cout << "Current map: \n";
+ for(size_t i = 0; i < coordinate_map.size(); ++i)
+ std::cout << i << " " << coordinate_map[i] << "\n";
+ std::cout << "\n";
+ std::cout << "Replace idx: " << idx << "\n";
+ std::cout << "Replace coordinate: " << coordinate_map[idx] << "\n";
+ std::cout << "ref: " << ref << "\n";
+ std::cout << "Alt: " << alt << "\n";
+ */
+
+ // Ensure that the reference string at the variant matches the expected
+ if(out.substr(idx, ref.length()) != ref)
+ {
+ std::cerr << "Warning: REF does not match reference\n";
+ return "";
+ }
+
+ // Replace the reference sequence with the alternative
+ out.replace(idx, ref.length(), alt);
+
+ if(update_map)
+ {
+ // shift coordinate map to account for indels
+
+ // Make an iterator to the first position that is replaced
+ std::vector<int>::iterator fi = coordinate_map.begin() + idx;
+
+ // Iterator which is one past the last position to remove
+ std::vector<int>::iterator li = fi + ref.size();
+
+ // erase returns an iterator pointing to the position
+ // following the last position removed, this is where
+ // we want to insert
+ std::vector<int>::iterator ii = coordinate_map.erase(fi, li);
+
+ // Insert new elements for the incoming bases
+ // These elements can be tagged with an integer
+ // by the calling function to indicate the sample
+ // the new bases came from
+ coordinate_map.insert(ii, alt.size(), variant_tag);
+
+ /*
+ std::cout << "Updated map: \n";
+ for(size_t i = 0; i < coordinate_map.size(); ++i)
+ std::cout << coordinate_map[i] << " ";
+ std::cout << "\n";
+ */
+ assert(coordinate_map.size() == out.size());
+ }
+ return out;
+}
+
+// Extract somatic k-mers from the haplotype accordining to the tagged
+// entries in the coordinate map
+StringVector extractSomaticKmers(const std::string& haplotype,
+ const std::vector<int> coordinate_map,
+ int somatic_tag)
+{
+ StringVector kmers;
+
+ assert(haplotype.size() == coordinate_map.size());
+
+ // Get the index in the vector of the first somatic variant
+ int first_idx = 0;
+ while(coordinate_map[first_idx] != somatic_tag &&
+ first_idx != (int)coordinate_map.size()) {
+ first_idx++;
+ }
+
+ // no somatic variants?
+ if(first_idx == (int)coordinate_map.size())
+ return kmers;
+
+ // Get the index of the last somatic variant
+ int last_idx = first_idx;
+ while(coordinate_map[last_idx] == somatic_tag)
+ last_idx++;
+ last_idx -= 1;
+
+ // Start k - 1 bases before the first index
+ first_idx -= (opt::k - 1);
+ if(first_idx < 0)
+ first_idx = 0;
+
+ for(size_t i = first_idx; i <= (size_t)last_idx; ++i)
+ {
+ std::string t = haplotype.substr(i, opt::k);
+ if(t.size() == (size_t)opt::k)
+ kmers.push_back(t);
+ }
+
+ return kmers;
+}
+
+std::string reconstructHaplotype(const VCFRecord& somatic_record,
+ const VariantRecordVector nearby_germline,
+ const ReadTable& refTable,
+ const BWTIndexSet& variantIndex,
+ size_t flanking_size)
+{
+ // Extract the reference haplotype
+ int eventLength = somatic_record.varStr.length();
+ int zeroBasedPos = somatic_record.refPosition - 1;
+ int start = zeroBasedPos - flanking_size - 1;
+ int end = zeroBasedPos + eventLength + 2 * flanking_size;
+ const SeqItem& chr = refTable.getRead(somatic_record.refName);
+ if(end > chr.seq.length())
+ return "";
+
+ std::string reference_haplotype = chr.seq.substr(start, end - start);
+
+ // Set up a vector mapping positions on the current haplotype
+ // to original reference coordinates. This allows use to apply
+ // the germline variants to the haplotype when the coordinates
+ // shift due to indels
+ std::vector<int> coordinate_map(reference_haplotype.size());
+ for(int i = 0; i < (int)reference_haplotype.size(); ++i)
+ coordinate_map[i] = i + start;
+
+ // Do not attempt haplotype reconstruction if there are too many variants
+ if(nearby_germline.size() > 4)
+ return "";
+
+ int SOMATIC_TAG = -2;
+ int GERMLINE_TAG = -3;
+
+ // Iterate over all possible combinations of germline variants
+ size_t total_haplotypes = 1 << nearby_germline.size(); // 2 ^ n
+
+ for(size_t haplotype_id = 0; haplotype_id < total_haplotypes; haplotype_id++)
+ {
+ std::string haplotype = reference_haplotype;
+ std::vector<int> haplotype_coordinates = coordinate_map;
+
+ // Iterate over the germline variants and check if the i-th bit
+ // is set in the haplotype id. If so, apply the variant to the haplotype
+ for(size_t var_id = 0; var_id < nearby_germline.size(); ++var_id)
+ {
+ if( (haplotype_id >> var_id) & 1)
+ {
+ std::string new_haplotype =
+ applyVariant(haplotype,
+ nearby_germline[var_id].position - 1,
+ nearby_germline[var_id].ref_sequence,
+ nearby_germline[var_id].alt_sequence,
+ haplotype_coordinates,
+ true,
+ GERMLINE_TAG);
+ haplotype = new_haplotype;
+
+ if(haplotype.empty())
+ break;
+ }
+ }
+
+ if(haplotype.empty())
+ continue;
+
+ // Apply the somatic variant to this haplotype
+ std::string somatic_haplotype =
+ applyVariant(haplotype,
+ somatic_record.refPosition - 1,
+ somatic_record.refStr,
+ somatic_record.varStr,
+ haplotype_coordinates,
+ true,
+ SOMATIC_TAG);
+
+ if(somatic_haplotype.empty())
+ continue;
+
+ StringVector kmers = extractSomaticKmers(somatic_haplotype, haplotype_coordinates, SOMATIC_TAG);
+
+ // Count the number of somatic kmers that are present in the variant reads
+ size_t read_kmers = 0;
+ for(size_t ki = 0; ki < kmers.size(); ++ki)
+ {
+ if(BWTAlgorithms::countSequenceOccurrences(kmers[ki], variantIndex) > 0)
+ read_kmers += 1;
+ }
+
+ // Check if all somatic kmers are present in the reads
+ if(read_kmers == kmers.size())
+ return somatic_haplotype;
+ }
+
+ return "";
+}
+
+//
+// Main
+//
+int graphConcordanceMain(int argc, char** argv)
+{
+ parseGraphConcordanceOptions(argc, argv);
+
+ Timer* pTimer = new Timer(PROGRAM_IDENT);
+
+ // Load the reference
+ ReadTable refTable(opt::referenceFile, SRF_NO_VALIDATION);
+ refTable.indexReadsByID();
+
+ // Index the germline variants so we can make germline haplotypes
+ std::cerr << "Loading germline variant index..." << std::flush;
+ VariantIndex germlineIndex(opt::germlineFile, refTable);
+ std::cerr << "done\n";
+
+ // Load FM-index of the reads
+ BWTIndexSet variantIndex;
+ std::cerr << "Loading variant read index... " << std::flush;
+ std::string variantPrefix = stripGzippedExtension(opt::variantFile);
+ variantIndex.pBWT = new BWT(variantPrefix + BWT_EXT, 256);
+ variantIndex.pCache = new BWTIntervalCache(11, variantIndex.pBWT);
+ std::cerr << "done\n";
+
+ //
+ BWTIndexSet baseIndex;
+ std::cerr << "Loading base read index index... " << std::flush;
+ std::string basePrefix = stripGzippedExtension(opt::baseFile);
+ baseIndex.pBWT = new BWT(basePrefix + BWT_EXT, 256);
+ baseIndex.pCache = new BWTIntervalCache(11, baseIndex.pBWT);
+ std::cerr << "done\n";
+
+ std::ifstream input(opt::vcfFile.c_str());
+ std::string line;
+ while(getline(input, line))
+ {
+ if(line.empty())
+ continue;
+
+ if(line[0] == '#')
+ {
+ std::cout << line << "\n";
+ continue;
+ }
+
+ // parse record
+ VCFRecord record(line);
+ if(record.isMultiAllelic())
+ {
+ std::cerr << "Error: multi-allelic VCF found, please run vcfbreakmulti\n";
+ exit(EXIT_FAILURE);
+ }
+
+ // Grab nearby variants
+ size_t flanking_size = opt::k;
+ VariantRecordVector nearby_vector = germlineIndex.getNearVariants(record.refName,
+ record.refPosition,
+ flanking_size);
+
+ if(opt::verbose > 0)
+ {
+ std::cerr << "\n\n Variant: " << record << "\n";
+ std::cerr << "Nearby: " << nearby_vector.size() << "\n";
+ }
+
+ std::string classification = "UNKNOWN";
+
+ std::string haplotype;
+ if(nearby_vector.size() <= 4)
+ haplotype = reconstructHaplotype(record, nearby_vector, refTable, variantIndex, flanking_size);
+ else
+ classification = "TOO_COMPLEX";
+
+ size_t max_unique_variant_kmers = 0;
+ if(!haplotype.empty())
+ {
+ IntVector v_cp = HapgenUtil::makeCountProfile(haplotype, opt::k, 9, variantIndex);
+ IntVector b_cp = HapgenUtil::makeCountProfile(haplotype, opt::k, 9, baseIndex);
+
+
+ size_t unique_variant_kmers = 0;
+ for(size_t i = 0; i < v_cp.size(); ++i)
+ {
+ if(v_cp[i] > 0 && b_cp[i] == 0)
+ unique_variant_kmers++;
+ }
+
+ if(unique_variant_kmers > max_unique_variant_kmers)
+ max_unique_variant_kmers = unique_variant_kmers;
+
+ if(opt::verbose > 0)
+ {
+ std::cerr << "k-mer profile somatic (variant): ";
+ for(size_t j = 0; j < v_cp.size(); ++j)
+ std::cerr << v_cp[j];
+ std::cerr << "\n";
+
+ std::cerr << "k-mer profile somatic (base): ";
+ for(size_t j = 0; j < b_cp.size(); ++j)
+ std::cerr << b_cp[j];
+ std::cerr << "\n";
+ fprintf(stderr, "unique variant kmers: %zu\n\n", unique_variant_kmers);
+ fprintf(stderr, "max unique variant kmers: %zu\n\n", max_unique_variant_kmers);
+ }
+ }
+ else
+ {
+ classification = "CANNOT_RECONSTRUCT";
+ }
+
+ if(max_unique_variant_kmers > 5)
+ classification = "SOMATIC";
+ else
+ classification = "GERMLINE";
+
+ // write out the record
+ record.addComment("MaxUniqueVariantKmers", (int)max_unique_variant_kmers);
+ record.addComment("KmerClassification", classification);
+ std::cout << record << "\n";
+ }
+
+ // Cleanup
+ delete variantIndex.pBWT;
+ delete variantIndex.pCache;
+ delete baseIndex.pBWT;
+ delete baseIndex.pCache;
+ delete pTimer;
+
+ return 0;
+}
+
+//
+// Handle command line arguments
+//
+void parseGraphConcordanceOptions(int argc, char** argv)
+{
+ std::string algo_str;
+ bool die = false;
+ for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
+ {
+ std::istringstream arg(optarg != NULL ? optarg : "");
+ switch (c)
+ {
+ case 'b': arg >> opt::baseFile; break;
+ case 'r': arg >> opt::variantFile; break;
+ case 'g': arg >> opt::germlineFile; break;
+ case OPT_REFERENCE: arg >> opt::referenceFile; break;
+ case '?': die = true; break;
+ case 'v': opt::verbose++; break;
+ case OPT_HELP:
+ std::cout << GRAPH_CONCORDANCE_USAGE_MESSAGE;
+ exit(EXIT_SUCCESS);
+ case OPT_VERSION:
+ std::cout << GRAPH_CONCORDANCE_VERSION_MESSAGE;
+ exit(EXIT_SUCCESS);
+ }
+ }
+
+ if (argc - optind < 1)
+ {
+ std::cerr << SUBPROGRAM ": missing arguments\n";
+ die = true;
+ }
+ else if (argc - optind > 1)
+ {
+ std::cerr << SUBPROGRAM ": too many arguments\n";
+ die = true;
+ }
+
+ if(opt::referenceFile.empty())
+ {
+ std::cerr << SUBPROGRAM ": a reference file must be given.\n";
+ die = true;
+ }
+
+ if(opt::germlineFile.empty())
+ {
+ std::cerr << SUBPROGRAM ": a germline variants file must be given.\n";
+ die = true;
+ }
+
+ opt::vcfFile = argv[optind++];
+
+ if (die)
+ {
+ std::cout << "\n" << GRAPH_CONCORDANCE_USAGE_MESSAGE;
+ exit(EXIT_FAILURE);
+ }
+}
diff --git a/src/SGA/graph-concordance.h b/src/SGA/graph-concordance.h
new file mode 100644
index 0000000..a0954cc
--- /dev/null
+++ b/src/SGA/graph-concordance.h
@@ -0,0 +1,23 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// vcf-read-count - count the number of reads
+// supporting variant haplotypes
+//
+#ifndef VCF_READ_COUNT_H
+#define VCF_READ_COUNT_H
+#include <getopt.h>
+#include "config.h"
+
+// functions
+
+//
+int graphConcordanceMain(int argc, char** argv);
+
+// options
+void parseGraphConcordanceOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SGA/graph-diff.cpp b/src/SGA/graph-diff.cpp
index 693ce76..103f847 100644
--- a/src/SGA/graph-diff.cpp
+++ b/src/SGA/graph-diff.cpp
@@ -31,6 +31,7 @@
void runVCFTester(GraphCompareParameters& parameters);
void runGraphDiff(GraphCompareParameters& parameters);
void runDebug(GraphCompareParameters& parameters);
+void runInteractive(GraphCompareParameters& parameters);
void preloadBloomFilter(const ReadTable* pReadTable, size_t k, BloomFilter* pBloomFilter);
// Defines to clarify awful template function calls
@@ -76,8 +77,7 @@ static const char *GRAPH_DIFF_USAGE_MESSAGE =
"Algorithm options:\n"
" -k, --kmer=K use K-mers to discover variants\n"
" -x, --min-discovery-count=T require a variant k-mer to be seen at least T times\n"
-" --debruijn use the de Bruijn graph assembly algorithm (default: string graph)\n"
-" --paired-debruijn use the de Bruijn graph assembly algorithm with paired-end constraints(default: string graph)\n"
+" -a, --algorithm=STR select the assembly algorithm to use from: debruijn, string\n"
" -m, --min-overlap=N require at least N bp overlap when assembling using a string graph\n"
" --min-dbg-count=T only use k-mers seen T times when assembling using a de Bruijn graph\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
@@ -99,11 +99,11 @@ namespace opt
static int minDBGCount = 2;
// Calling modes
- static bool deBruijnMode = false;
- static bool deBruijnPairedMode = false;
+ static GraphCompareAlgorithm algorithm = GCA_DEBRUIJN_GRAPH;
static bool lowCoverage = false;
static bool referenceMode = false;
static bool useQualityScores = false;
+ static bool interactiveMode = false;
// I/O
static std::string outPrefix = "sgavariants";
@@ -115,7 +115,7 @@ namespace opt
static std::string inputVCFFile;
}
-static const char* shortopts = "b:r:o:k:d:t:x:y:p:m:v";
+static const char* shortopts = "b:r:o:k:d:t:x:y:p:m:a:v";
enum { OPT_HELP = 1,
OPT_VERSION,
@@ -124,12 +124,11 @@ enum { OPT_HELP = 1,
OPT_DEBUG,
OPT_MIN_DBG_COUNT,
OPT_INDEX,
- OPT_DEBRUIJN,
- OPT_PAIRED_DEBRUIJN,
OPT_LOWCOVERAGE,
OPT_QUALSCORES,
OPT_BLOOM_GENOME,
- OPT_PRECACHE_REFERENCE };
+ OPT_PRECACHE_REFERENCE,
+ OPT_INTERACTIVE };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -141,10 +140,10 @@ static const struct option longopts[] = {
{ "sample-rate", required_argument, NULL, 'd' },
{ "prefix", required_argument, NULL, 'p' },
{ "min-overlap", required_argument, NULL, 'm' },
- { "debruijn", no_argument, NULL, OPT_DEBRUIJN },
- { "paired-debruijn", no_argument, NULL, OPT_PAIRED_DEBRUIJN },
+ { "algorithm", required_argument, NULL, 'a' },
{ "low-coverage", no_argument, NULL, OPT_LOWCOVERAGE },
{ "use-quality-scores", no_argument, NULL, OPT_QUALSCORES},
+ { "interactive", no_argument, NULL, OPT_INTERACTIVE},
{ "index", required_argument, NULL, OPT_INDEX },
{ "min-dbg-count", required_argument, NULL, OPT_MIN_DBG_COUNT },
{ "debug", required_argument, NULL, OPT_DEBUG },
@@ -248,14 +247,7 @@ int graphDiffMain(int argc, char** argv)
sharedParameters.referenceIndex = referenceIndex;
sharedParameters.pRefTable = &refTable;
sharedParameters.bReferenceMode = opt::referenceMode;
-
- if(opt::deBruijnPairedMode)
- sharedParameters.algorithm = GCA_PAIRED_DEBRUIJN_GRAPH;
- else if(opt::deBruijnMode)
- sharedParameters.algorithm = GCA_DEBRUIJN_GRAPH;
- else
- sharedParameters.algorithm = GCA_STRING_GRAPH;
-
+ sharedParameters.algorithm = opt::algorithm;
sharedParameters.kmer = opt::kmer;
sharedParameters.minDiscoveryCount = opt::minDiscoveryCount;
sharedParameters.minDBGCount = opt::minDBGCount;
@@ -278,6 +270,10 @@ int graphDiffMain(int argc, char** argv)
{
runDebug(sharedParameters);
}
+ else if(opt::interactiveMode)
+ {
+ runInteractive(sharedParameters);
+ }
else
{
// If a VCF file was provided, just test the variants in the file without any discovery
@@ -445,11 +441,54 @@ void runDebug(GraphCompareParameters& parameters)
graphCompare.testKmersFromFile(opt::debugFile);
}
+// Run interactively, parse a sequence from cin, process it then wait
+void runInteractive(GraphCompareParameters& parameters)
+{
+ // Initialize a bloom filter
+ size_t expected_bits = 0;
+ if(opt::bloomGenomeSize == -1)
+ {
+ // Calculate the expected bits using the number of bases in the reference
+ for(size_t i = 0; i < parameters.pRefTable->getCount(); ++i)
+ expected_bits += parameters.pRefTable->getReadLength(i);
+ }
+ else
+ {
+ expected_bits = opt::bloomGenomeSize;
+ }
+
+ size_t occupancy_factor = 20;
+ size_t bloom_size = occupancy_factor * expected_bits;
+
+ BloomFilter* pBloomFilter = new BloomFilter(bloom_size, 5);
+ parameters.pBloomFilter = pBloomFilter;
+ preloadBloomFilter(parameters.pRefTable, parameters.kmer, pBloomFilter);
+
+ GraphCompare graphCompare(parameters);
+ while(1)
+ {
+ std::string sequence;
+ std::cout << "Input a sequence: ";
+ std::cin >> sequence;
+
+ if(sequence == "done")
+ break;
+ SequenceWorkItem item;
+ item.read.seq = sequence;
+ item.read.id = "input";
+
+ std::cout << "processing: " << sequence << "\n";
+
+ graphCompare.process(item);
+ }
+}
+
//
// Handle command line arguments
//
void parseGraphDiffOptions(int argc, char** argv)
{
+ std::string algorithmString;
bool die = false;
for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
{
@@ -464,11 +503,10 @@ void parseGraphDiffOptions(int argc, char** argv)
case 'd': arg >> opt::sampleRate; break;
case 'p': arg >> opt::outPrefix; break;
case 'm': arg >> opt::minOverlap; break;
+ case 'a': arg >> algorithmString; break;
case '?': die = true; break;
case 'v': opt::verbose++; break;
case OPT_REFERENCE: arg >> opt::referenceFile; break;
- case OPT_DEBRUIJN: opt::deBruijnMode = true; break;
- case OPT_PAIRED_DEBRUIJN: opt::deBruijnPairedMode = true; break;
case OPT_LOWCOVERAGE: opt::lowCoverage = true; break;
case OPT_MIN_DBG_COUNT: arg >> opt::minDBGCount; break;
case OPT_BLOOM_GENOME: arg >> opt::bloomGenomeSize; break;
@@ -477,6 +515,7 @@ void parseGraphDiffOptions(int argc, char** argv)
case OPT_TESTVCF: arg >> opt::inputVCFFile; break;
case OPT_INDEX: arg >> opt::indexPrefix; break;
case OPT_QUALSCORES: opt::useQualityScores = true; break;
+ case OPT_INTERACTIVE: opt::interactiveMode = true; break;
case OPT_HELP:
std::cout << GRAPH_DIFF_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
@@ -508,6 +547,17 @@ void parseGraphDiffOptions(int argc, char** argv)
if(opt::baseFile.empty())
opt::referenceMode = true;
+ if(algorithmString == "paired") {
+ opt::algorithm = GCA_PAIRED_DEBRUIJN_GRAPH;
+ } else if(algorithmString == "debruijn") {
+ opt::algorithm = GCA_DEBRUIJN_GRAPH;
+ } else if(algorithmString == "string") {
+ opt::algorithm = GCA_STRING_GRAPH;
+ } else {
+ std::cerr << "Error: unrecognized algorithm string: " << algorithmString << "\n";
+ die = true;
+ }
+
if(opt::referenceFile.empty())
{
std::cerr << SUBPROGRAM ": error, a --reference file must be provided\n";
diff --git a/src/SGA/index.cpp b/src/SGA/index.cpp
index 4224c8a..bc9b9a3 100644
--- a/src/SGA/index.cpp
+++ b/src/SGA/index.cpp
@@ -51,6 +51,7 @@ static const char *INDEX_USAGE_MESSAGE =
" --no-reverse suppress construction of the reverse BWT. Use this option when building the index\n"
" for reads that will be error corrected using the k-mer corrector, which only needs the forward index\n"
" --no-forward suppress construction of the forward BWT. Use this option when building the forward and reverse index separately\n"
+" --no-sai suppress construction of the SAI file. This option only applies to -a ropebwt\n"
" -g, --gap-array=N use N bits of storage for each element of the gap array. Acceptable values are 4,8,16 or 32. Lower\n"
" values can substantially reduce the amount of memory required at the cost of less predictable memory usage.\n"
" When this value is set to 32, the memory requirement is essentially deterministic and requires ~5N bytes where\n"
@@ -69,13 +70,14 @@ namespace opt
static bool bDiskAlgo = false;
static bool bBuildReverse = true;
static bool bBuildForward = true;
+ static bool bBuildSAI = true;
static bool validate;
static int gapArrayStorage = 4;
}
static const char* shortopts = "p:a:m:t:d:g:cv";
-enum { OPT_HELP = 1, OPT_VERSION, OPT_NO_REVERSE,OPT_NO_FWD };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_NO_REVERSE, OPT_NO_FWD, OPT_NO_SAI };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -87,6 +89,7 @@ static const struct option longopts[] = {
{ "algorithm", required_argument, NULL, 'a' },
{ "no-reverse", no_argument, NULL, OPT_NO_REVERSE },
{ "no-forward", no_argument, NULL, OPT_NO_FWD },
+ { "no-sai", no_argument, NULL, OPT_NO_SAI },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ NULL, 0, NULL, 0 }
@@ -154,13 +157,16 @@ void indexInMemoryRopebwt()
std::string bwt_filename = opt::prefix + BWT_EXT;
std::string sai_filename = opt::prefix + SAI_EXT;
BWTCA::runRopebwt(opt::readsFile, bwt_filename, use_threads, false);
- std::cout << "\t done bwt construction, generating .sai file\n";
- BWT* pBWT = new BWT(bwt_filename);
- SampledSuffixArray ssa;
- ssa.buildLexicoIndex(pBWT, opt::numThreads);
- ssa.writeLexicoIndex(sai_filename);
- delete pBWT;
+ if(opt::bBuildSAI)
+ {
+ std::cout << "\t done bwt construction, generating .sai file\n";
+ BWT* pBWT = new BWT(bwt_filename);
+ SampledSuffixArray ssa;
+ ssa.buildLexicoIndex(pBWT, opt::numThreads);
+ ssa.writeLexicoIndex(sai_filename);
+ delete pBWT;
+ }
}
if(opt::bBuildReverse)
@@ -169,12 +175,15 @@ void indexInMemoryRopebwt()
std::string rsai_filename = opt::prefix + RSAI_EXT;
BWTCA::runRopebwt(opt::readsFile, rbwt_filename, use_threads, true);
- std::cout << "\t done rbwt construction, generating .rsai file\n";
- BWT* pRBWT = new BWT(rbwt_filename);
- SampledSuffixArray ssa;
- ssa.buildLexicoIndex(pRBWT, opt::numThreads);
- ssa.writeLexicoIndex(rsai_filename);
- delete pRBWT;
+ if(opt::bBuildSAI)
+ {
+ std::cout << "\t done rbwt construction, generating .rsai file\n";
+ BWT* pRBWT = new BWT(rbwt_filename);
+ SampledSuffixArray ssa;
+ ssa.buildLexicoIndex(pRBWT, opt::numThreads);
+ ssa.writeLexicoIndex(rsai_filename);
+ delete pRBWT;
+ }
}
}
@@ -279,6 +288,7 @@ void parseIndexOptions(int argc, char** argv)
case 'v': opt::verbose++; break;
case OPT_NO_REVERSE: opt::bBuildReverse = false; break;
case OPT_NO_FWD: opt::bBuildForward = false; break;
+ case OPT_NO_SAI: opt::bBuildSAI = false; break;
case OPT_HELP:
std::cout << INDEX_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
@@ -321,6 +331,12 @@ void parseIndexOptions(int argc, char** argv)
die = true;
}
+ if(opt::algorithm == "ropebwt" && opt::bDiskAlgo)
+ {
+ std::cerr << SUBPROGRAM ": the options -a ropebwt and -d are not compatible, please only use one.\n";
+ die = true;
+ }
+
if (die)
{
std::cout << "\n" << INDEX_USAGE_MESSAGE;
diff --git a/src/SGA/kmer-count.cpp b/src/SGA/kmer-count.cpp
new file mode 100644
index 0000000..b6de14d
--- /dev/null
+++ b/src/SGA/kmer-count.cpp
@@ -0,0 +1,291 @@
+//-----------------------------------------------
+// Copyright 2009 Wellcome Trust Sanger Institute
+// Written by Jared Simpson/James Gurtowski (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// Kmer-count - Counting kmers in BWT or Sequence Files
+//
+
+#include <kmer-count.h>
+#include <iostream>
+#include <stack>
+#include <memory>
+#include <getopt.h>
+#include <BWT.h>
+#include <BWTInterval.h>
+#include <BWTAlgorithms.h>
+
+//
+// Getopt
+//
+
+#define SUBPROGRAM "kmer-count"
+static const char *KMERCOUNT_VERSION_MESSAGE = SUBPROGRAM " Version " PACKAGE_VERSION "\n";
+static const char *KMERCOUNT_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] src.{bwt,fa,fq} [test1.bwt] [test2.bwt]\n"
+"Generate a table of the k-mers in src.{bwt,fa,fq}, and optionaly count the number of time they appears in testX.bwt.\n"
+"Output on stdout the canonical kmers and their counts on forward and reverse strand if input is .bwt\n"
+"If src is a sequence file output forward and reverse counts for each kmer in the file\n"
+"\n"
+" --help display this help and exit\n"
+" --version display program version\n"
+" -k, --kmer-size=N The length of the kmer to use. (default: 27)\n"
+" -d, --sample-rate=N use occurrence array sample rate of N in the FM-index. Higher values use significantly\n"
+" less memory at the cost of higher runtime. This value must be a power of 2 (default: 128)\n"
+" -c, --cache-length=N Cache Length for bwt lookups (default: 10)\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+
+
+namespace opt {
+ static std::string inputSequenceFile;
+ static std::vector<std::string> bwtFiles;
+ static int sampleRate = BWT::DEFAULT_SAMPLE_RATE_SMALL;
+ static unsigned int kmerLength = 27;
+ static int intervalCacheLength = 10;
+}
+
+static const char* shortopts = "d:k:c:x:";
+enum { OPT_HELP = 1, OPT_VERSION };
+static const struct option longopts[] = {
+ { "sample-rate", required_argument, NULL, 'd' },
+ { "kmer-size", required_argument, NULL, 'k' },
+ { "cache-length", required_argument, NULL, 'c' },
+ { "help", no_argument, NULL, OPT_HELP },
+ { "version", no_argument, NULL, OPT_VERSION },
+ { NULL, 0, NULL, 0 }
+};
+
+
+void parseKmerCountOptions(int argc, char** argv) {
+
+ for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
+ {
+ std::istringstream arg(optarg != NULL ? optarg : "");
+ switch (c)
+ {
+ case 'd': arg >> opt::sampleRate; break;
+ case 'k': arg >> opt::kmerLength; break;
+ case 'c': arg >> opt::intervalCacheLength; break;
+ case OPT_HELP:
+ std::cout << KMERCOUNT_USAGE_MESSAGE;
+ exit(EXIT_SUCCESS);
+ case OPT_VERSION:
+ std::cout << KMERCOUNT_VERSION_MESSAGE;
+ exit(EXIT_SUCCESS);
+ }
+ }
+
+ if(opt::kmerLength <= 0 || opt::kmerLength % 2 == 0)
+ {
+ std::cerr << SUBPROGRAM ": invalid kmer length: " << opt::kmerLength << ", must be greater than zero and odd\n";
+ std::cout << "\n" << KMERCOUNT_USAGE_MESSAGE;
+ exit(EXIT_FAILURE);
+ }
+
+ if(optind >= argc)
+ {
+ std::cerr << SUBPROGRAM ": missing input bwt/sequence file" << std::endl;
+ std::cout << "\n" << KMERCOUNT_USAGE_MESSAGE;
+ exit(EXIT_FAILURE);
+ }
+
+ std::string inFile(argv[optind]);
+
+ if(inFile.substr(inFile.find_last_of(".")) != ".bwt")
+ {
+ opt::inputSequenceFile = inFile;
+ ++optind;
+ }
+
+ for(;optind<argc;++optind)
+ {
+ opt::bwtFiles.push_back(argv[optind]);
+ }
+
+ if (opt::inputSequenceFile.empty() && opt::bwtFiles.size() < 1)
+ {
+ std::cerr << SUBPROGRAM ": missing arguments\n";
+ std::cout << "\n" << KMERCOUNT_USAGE_MESSAGE;
+ exit(EXIT_FAILURE);
+ }
+}
+
+
+
+//
+// BWT Traversal algorithm
+//
+
+// Stack structure used in the depth first search of kmers
+struct stack_elt_t
+{
+ size_t str_sz;
+ char bp;
+ BWTInterval range;
+ stack_elt_t(size_t strsz,char bp):str_sz(strsz),bp(bp){}
+ stack_elt_t(size_t strsz,char bp,const BWTInterval& range):str_sz(strsz),bp(bp),range(range){}
+};
+
+
+
+// extract all canonical kmers of a bwt by performing a backward depth-first-search
+void traverse_kmer(const std::vector<BWTIndexSet> &indicies)
+{
+ std::stack< stack_elt_t > stack;
+ std::string str; // string storing the current path
+
+ // Intitialize the search with root elements
+ for(size_t i = 0; i < DNA_ALPHABET_SIZE; ++i)
+ {
+ stack_elt_t e(str.size(),ALPHABET[i]);
+ BWTAlgorithms::initInterval(e.range,e.bp,indicies[0].pBWT);
+ if (e.range.isValid()) stack.push(e);
+ }
+
+ std::vector<BWTIndexSet>::const_iterator idx_it;
+
+ // Perform the kmer search
+ while(!stack.empty())
+ {
+ // pop an element from the stack, and update the string path accordingly
+ stack_elt_t top = stack.top();
+ stack.pop();
+ str.resize(top.str_sz);
+ str.push_back(top.bp);
+ if (str.length()>= opt::kmerLength) {
+ // we found a kmer, retreive the number of occurence
+ std::string seq(reverse(str));
+ int64_t seq_count = top.range.size();
+
+ // look for the count of its reverse complement
+ std::string seq_rc(complement(str));
+ BWTInterval range_rc = BWTAlgorithms::findInterval(indicies[0].pBWT,seq_rc);
+ int64_t seq_rc_count = range_rc.isValid()?range_rc.size():0;
+
+ // print the current kmer if canonical
+ if (seq<seq_rc) {
+ std::cout << seq << '\t' << seq_count << '\t' << seq_rc_count;
+
+ for(idx_it = indicies.begin() + 1;
+ idx_it != indicies.end(); ++idx_it)
+ {
+ std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq, *idx_it);
+ std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq_rc, *idx_it);
+ }
+ std::cout << std::endl;
+
+ } else if (seq_rc_count<=0) {
+ // the current kmer is not canonical, but the reverse complement doesn't exists
+ // so print it now as it will never be traversed by the searching algorithm
+ std::cout << seq_rc << '\t' << seq_rc_count << '\t' << seq_count;
+ for(idx_it = indicies.begin() + 1;
+ idx_it != indicies.end(); ++idx_it)
+ {
+ std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq_rc, *idx_it);
+ std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq, *idx_it);
+ }
+ std::cout << std::endl;
+ }
+ } else
+ {
+ // not yet a suffix of size k, push next candidates
+ for(size_t i = 0; i < DNA_ALPHABET_SIZE; ++i)
+ {
+ stack_elt_t e(str.size(),ALPHABET[i],top.range);
+ BWTAlgorithms::updateInterval(e.range,e.bp,indicies[0].pBWT);
+ if (e.range.isValid()) stack.push(e);
+ }
+ }
+ }
+}
+
+
+
+
+void kmers_from_file(const std::string &inputFile,
+ const std::vector<BWTIndexSet> bwtIndicies)
+{
+
+ //Init sequence reader
+ SeqReader reader(inputFile, SRF_NO_VALIDATION);
+ SeqRecord record;
+ std::string kmer, kmer_rc;
+ std::cerr << "Processing " << inputFile << std::endl;
+
+ size_t kmer_idx;
+ std::vector<BWTIndexSet>::const_iterator indexset_it;
+
+ //read the sequences from the file and split into kmers
+ while(reader.get(record))
+ {
+ for(kmer_idx=0; kmer_idx < record.seq.length()-opt::kmerLength+1; ++kmer_idx)
+ {
+ kmer = record.seq.substr(kmer_idx, opt::kmerLength);
+ kmer_rc = reverseComplement(kmer);
+
+ std::cout << record.id << "\t" << kmer_idx << "\t" << kmer;
+
+ //print out kmer count for all the bwts
+ for(indexset_it = bwtIndicies.begin();
+ indexset_it != bwtIndicies.end();
+ ++indexset_it)
+ {
+ std::cout << '\t' << BWTAlgorithms::countSequenceOccurrencesSingleStrand(kmer, *indexset_it);
+ std::cout << '\t' << BWTAlgorithms::countSequenceOccurrencesSingleStrand(kmer_rc, *indexset_it);
+ }
+
+ std::cout << std::endl;
+
+ }
+ }
+}
+
+//
+// Main
+//
+
+int kmerCountMain(int argc, char** argv)
+{
+ // parse command line arguments
+ parseKmerCountOptions(argc,argv);
+
+ // allocate BWT objects
+ std::vector<BWTIndexSet> bwtIndicies;
+ BWTIndexSet tmpIdx;
+
+ for(std::vector<std::string>::iterator it = opt::bwtFiles.begin();
+ it != opt::bwtFiles.end(); ++it)
+ {
+ std::cerr << "Loading " << *it << std::endl;
+
+ tmpIdx.pBWT = new BWT(*it, opt::sampleRate);
+ tmpIdx.pCache = new BWTIntervalCache(opt::intervalCacheLength, tmpIdx.pBWT);
+
+ bwtIndicies.push_back(tmpIdx);
+ }
+
+ if( opt::inputSequenceFile.empty() )
+ {
+ // run kmer search
+ traverse_kmer(bwtIndicies);
+ }else
+ {
+ kmers_from_file(opt::inputSequenceFile, bwtIndicies);
+ }
+
+ // clean memory
+ std::vector<BWTIndexSet>::const_iterator indexset_it;
+ for(indexset_it = bwtIndicies.begin();
+ indexset_it != bwtIndicies.end();
+ ++indexset_it)
+ {
+ delete (*indexset_it).pBWT;
+ delete (*indexset_it).pCache;
+ }
+ return 0;
+}
+
+
+
diff --git a/src/SGA/kmer-count.h b/src/SGA/kmer-count.h
new file mode 100644
index 0000000..e44ba29
--- /dev/null
+++ b/src/SGA/kmer-count.h
@@ -0,0 +1,8 @@
+#ifndef KMERCOUNT_H
+#define KMERCOUNT_H
+#include "config.h"
+
+int kmerCountMain(int argc, char** argv);
+void parseKmerCountOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SGA/overlap.cpp b/src/SGA/overlap-long.cpp
similarity index 65%
copy from src/SGA/overlap.cpp
copy to src/SGA/overlap-long.cpp
index 0e65452..81c9652 100644
--- a/src/SGA/overlap.cpp
+++ b/src/SGA/overlap-long.cpp
@@ -1,17 +1,17 @@
-//-----------------------------------------------
-// Copyright 2009 Wellcome Trust Sanger Institute
-// Written by Jared Simpson (js18 at sanger.ac.uk)
+//-----------------------------------------------------
+// Copyright 2013 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
// Released under the GPL
-//-----------------------------------------------
+//-----------------------------------------------------
//
-// overlap - compute pairwise overlaps between reads
+// overlap-long - compute overlaps for long reads
//
#include <iostream>
#include <fstream>
#include <sstream>
#include <iterator>
#include "Util.h"
-#include "overlap.h"
+#include "overlap-long.h"
#include "SuffixArray.h"
#include "BWT.h"
#include "SGACommon.h"
@@ -23,13 +23,7 @@
#include "SequenceProcessFramework.h"
#include "OverlapProcess.h"
#include "ReadInfoTable.h"
-
-//
-enum OutputType
-{
- OT_ASQG,
- OT_RAW
-};
+#include "KmerOverlaps.h"
// Functions
size_t computeHitsSerial(const std::string& prefix, const std::string& readsFile,
@@ -47,16 +41,16 @@ void convertHitsToASQG(const std::string& indexPrefix, const StringVector& hitsF
//
// Getopt
//
-#define SUBPROGRAM "overlap"
-static const char *OVERLAP_VERSION_MESSAGE =
+#define SUBPROGRAM "overlap-long"
+static const char *OVERLAP_LONG_VERSION_MESSAGE =
SUBPROGRAM " Version " PACKAGE_VERSION "\n"
"Written by Jared Simpson.\n"
"\n"
-"Copyright 2009 Wellcome Trust Sanger Institute\n";
+"Copyright 2013 Ontario Institute for Cancer Research\n";
-static const char *OVERLAP_USAGE_MESSAGE =
+static const char *OVERLAP_LONG_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... READSFILE\n"
-"Compute pairwise overlap between all the sequences in READS\n"
+"Compute pairwise overlap between all the long reads in READS\n"
"\n"
" --help display this help and exit\n"
" -v, --verbose display verbose output\n"
@@ -64,9 +58,6 @@ static const char *OVERLAP_USAGE_MESSAGE =
" -e, --error-rate the maximum error rate allowed to consider two sequences aligned (default: exact matches only)\n"
" -m, --min-overlap=LEN minimum overlap required between two reads (default: 45)\n"
" -f, --target-file=FILE perform the overlap queries against the reads in FILE\n"
-" -x, --exhaustive output all overlaps, including transitive edges\n"
-" --exact force the use of the exact-mode irreducible block algorithm. This is faster\n"
-" but requires that no substrings are present in the input set.\n"
" -l, --seed-length=LEN force the seed length to be LEN. By default, the seed length in the overlap step\n"
" is calculated to guarantee all overlaps with --error-rate differences are found.\n"
" This option removes the guarantee but will be (much) faster. As SGA can tolerate some\n"
@@ -84,14 +75,13 @@ namespace opt
{
static unsigned int verbose;
static int numThreads = 1;
- static OutputType outputType = OT_ASQG;
static std::string readsFile;
static std::string targetFile;
static std::string outFile;
static double errorRate = 0.0f;
static unsigned int minOverlap = DEFAULT_MIN_OVERLAP;
- static int seedLength = 0;
+ static int seedLength = 100;
static int seedStride = 0;
static int sampleRate = BWT::DEFAULT_SAMPLE_RATE_SMALL;
static bool bIrreducibleOnly = true;
@@ -120,15 +110,42 @@ static const struct option longopts[] = {
{ NULL, 0, NULL, 0 }
};
+// Write an ascii pictogram of an overlap
+// O = overhang of length block_size (region of no overlap)
+// P = perfect match of length block_size
+// I = matched block with mismatches
+std::string ascii_overlap(const std::string& s0, const std::string& s1,
+ SequenceOverlap& ovr, int block_size = 50)
+{
+ std::string out;
+
+ int pre_overhang = ovr.match[0].start / block_size;
+ int post_overhang = (ovr.length[0] - ovr.match[0].end - 1) / block_size;
+
+ std::string p0;
+ std::string p1;
+
+ ovr.makePaddedMatches(s0, s1, &p0, &p1);
+
+ out.append(pre_overhang, '-');
+
+ for(size_t i = 0; i < p0.size(); i += block_size)
+ {
+ if(p0.substr(i, block_size) == p1.substr(i, block_size))
+ out.append(1, '=');
+ else
+ out.append(1, 'x');
+ }
+ out.append(post_overhang, '-');
+ return out;
+}
+
//
// Main
//
-int overlapMain(int argc, char** argv)
+int overlapLongMain(int argc, char** argv)
{
- parseOverlapOptions(argc, argv);
-
- // Prepare the output ASQG file
- assert(opt::outputType == OT_ASQG);
+ parseOverlapLongOptions(argc, argv);
// Open output file
std::ostream* pASQGWriter = createWriter(opt::outFile);
@@ -138,13 +155,9 @@ int overlapMain(int argc, char** argv)
headerRecord.setOverlapTag(opt::minOverlap);
headerRecord.setErrorRateTag(opt::errorRate);
headerRecord.setInputFileTag(opt::readsFile);
- headerRecord.setContainmentTag(true); // containments are always present
- headerRecord.setTransitiveTag(!opt::bIrreducibleOnly);
+ headerRecord.setTransitiveTag(true);
headerRecord.write(*pASQGWriter);
- // Compute the overlap hits
- StringVector hitsFilenames;
-
// Determine which index files to use. If a target file was provided,
// use the index of the target reads
std::string indexPrefix;
@@ -154,46 +167,102 @@ int overlapMain(int argc, char** argv)
indexPrefix = stripFilename(opt::readsFile);
BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
- BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate);
- OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,
- opt::errorRate, opt::seedLength,
- opt::seedStride, opt::bIrreducibleOnly);
-
- pOverlapper->setExactModeOverlap(opt::errorRate <= 0.0001);
- pOverlapper->setExactModeIrreducible(opt::errorRate <= 0.0001);
-
+ SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI);
+
Timer* pTimer = new Timer(PROGRAM_IDENT);
pBWT->printInfo();
- // Make a prefix for the temporary hits files
- std::string outPrefix;
- outPrefix = stripFilename(opt::readsFile);
- if(!opt::targetFile.empty())
+ // Read the sequence file and write vertex records for each
+ // Also store the read names in a vector of strings
+ ReadTable reads;
+
+ SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION);
+ SeqRecord record;
+ while(pReader->get(record))
{
- outPrefix.append(1, '.');
- outPrefix.append(stripFilename(opt::targetFile));
- }
+ reads.addRead(record.toSeqItem());
+ ASQG::VertexRecord vr(record.id, record.seq.toString());
+ vr.write(*pASQGWriter);
- if(opt::numThreads <= 1)
- {
- printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT);
- computeHitsSerial(outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
- }
- else
- {
- printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
- computeHitsParallel(opt::numThreads, outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
+ if(reads.getCount() % 100000 == 0)
+ printf("Read %zu sequences\n", reads.getCount());
}
- // Get the number of strings in the BWT, this is used to pre-allocated the read table
- delete pOverlapper;
- delete pBWT;
- delete pRBWT;
+ delete pReader;
+ pReader = NULL;
- // Parse the hits files and write the overlaps to the ASQG file
- convertHitsToASQG(indexPrefix, hitsFilenames, pASQGWriter);
+ BWTIndexSet index;
+ index.pBWT = pBWT;
+ index.pSSA = pSSA;
+ index.pReadTable = &reads;
+
+ // Make a prefix for the temporary hits files
+ size_t n_reads = reads.getCount();
+
+ omp_set_num_threads(opt::numThreads);
+
+#pragma omp parallel for
+ for(size_t read_idx = 0; read_idx < n_reads; ++read_idx)
+ {
+ const SeqItem& curr_read = reads.getRead(read_idx);
+
+ printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length());
+ SequenceOverlapPairVector sopv =
+ KmerOverlaps::retrieveMatches(curr_read.seq.toString(),
+ opt::seedLength,
+ opt::minOverlap,
+ 1 - opt::errorRate,
+ 100,
+ index);
+
+ printf("Found %zu matches\n", sopv.size());
+ for(size_t i = 0; i < sopv.size(); ++i)
+ {
+ std::string match_id = reads.getRead(sopv[i].match_idx).id;
+
+ // We only want to output each edge once so skip this overlap
+ // if the matched read has a lexicographically lower ID
+ if(curr_read.id > match_id)
+ continue;
+
+ std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50);
+ printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(),
+ sopv[i].overlap.match[0].start,
+ sopv[i].overlap.match[0].end,
+ match_id.c_str(),
+ sopv[i].overlap.getOverlapLength(),
+ sopv[i].overlap.getPercentIdentity(),
+ sopv[i].overlap.cigar.c_str());
+
+ // Convert to ASQG
+ SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]);
+ SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]);
+
+ // KmerOverlaps returns the coordinates of the overlap after flipping the reads
+ // to ensure the strand matches. The ASQG file wants the coordinate of the original
+ // sequencing strand. Flip here if necessary
+ if(sopv[i].is_reversed)
+ sc2.flip();
+
+ // Convert the SequenceOverlap the ASQG's overlap format
+ Overlap ovr(curr_read.id, sc1, match_id, sc2, sopv[i].is_reversed, -1);
+
+ ASQG::EdgeRecord er(ovr);
+ er.setCigarTag(sopv[i].overlap.cigar);
+ er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity());
+
+#pragma omp critical
+ {
+ er.write(*pASQGWriter);
+ }
+ }
+ }
// Cleanup
+ delete pReader;
+ delete pBWT;
+ delete pSSA;
+
delete pASQGWriter;
delete pTimer;
if(opt::numThreads > 1)
@@ -202,6 +271,7 @@ int overlapMain(int argc, char** argv)
return 0;
}
+/*
// Compute the hits for each read in the input file without threading
// Return the number of reads processed
size_t computeHitsSerial(const std::string& prefix, const std::string& readsFile,
@@ -256,66 +326,12 @@ size_t computeHitsParallel(int numThreads, const std::string& prefix, const std:
delete processorVector[i];
return numProcessed;
}
-
-//
-void convertHitsToASQG(const std::string& indexPrefix, const StringVector& hitsFilenames, std::ostream* pASQGWriter)
-{
- // Load the suffix array index and the reverse suffix array index
- // Note these are not the full suffix arrays
- SuffixArray* pFwdSAI = new SuffixArray(indexPrefix + SAI_EXT);
- SuffixArray* pRevSAI = new SuffixArray(indexPrefix + RSAI_EXT);
-
- // Load the ReadInfoTable for the queries to look up the ID and lengths of the hits
- ReadInfoTable* pQueryRIT = new ReadInfoTable(opt::readsFile);
-
- // If the target file is not the query file, load its ReadInfoTable
- ReadInfoTable* pTargetRIT;
- if(!opt::targetFile.empty() && opt::targetFile != opt::readsFile)
- pTargetRIT = new ReadInfoTable(opt::targetFile);
- else
- pTargetRIT = pQueryRIT;
-
- bool bIsSelfCompare = pTargetRIT == pQueryRIT;
-
- // Convert the hits to overlaps and write them to the asqg file as initial edges
- for(StringVector::const_iterator iter = hitsFilenames.begin(); iter != hitsFilenames.end(); ++iter)
- {
- printf("[%s] parsing file %s\n", PROGRAM_IDENT, iter->c_str());
- std::istream* pReader = createReader(*iter);
-
- // Read each hit sequentially, converting it to an overlap
- std::string line;
- while(getline(*pReader, line))
- {
- size_t readIdx;
- size_t totalEntries;
- bool isSubstring;
- OverlapVector ov;
- OverlapCommon::parseHitsString(line, pQueryRIT, pTargetRIT, pFwdSAI, pRevSAI, bIsSelfCompare, readIdx, totalEntries, ov, isSubstring);
- for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter)
- {
- ASQG::EdgeRecord edgeRecord(*iter);
- edgeRecord.write(*pASQGWriter);
- }
- }
- delete pReader;
-
- // delete the hits file
- unlink(iter->c_str());
- }
-
- // Deallocate data
- if(pTargetRIT != pQueryRIT)
- delete pTargetRIT;
- delete pFwdSAI;
- delete pRevSAI;
- delete pQueryRIT;
-}
+*/
//
// Handle command line arguments
//
-void parseOverlapOptions(int argc, char** argv)
+void parseOverlapLongOptions(int argc, char** argv)
{
bool die = false;
for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
@@ -336,10 +352,10 @@ void parseOverlapOptions(int argc, char** argv)
case '?': die = true; break;
case 'v': opt::verbose++; break;
case OPT_HELP:
- std::cout << OVERLAP_USAGE_MESSAGE;
+ std::cout << OVERLAP_LONG_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
case OPT_VERSION:
- std::cout << OVERLAP_VERSION_MESSAGE;
+ std::cout << OVERLAP_LONG_VERSION_MESSAGE;
exit(EXIT_SUCCESS);
}
}
@@ -369,7 +385,7 @@ void parseOverlapOptions(int argc, char** argv)
if (die)
{
- std::cout << "\n" << OVERLAP_USAGE_MESSAGE;
+ std::cout << "\n" << OVERLAP_LONG_USAGE_MESSAGE;
exit(EXIT_FAILURE);
}
diff --git a/src/SGA/overlap-long.h b/src/SGA/overlap-long.h
new file mode 100644
index 0000000..9bdf838
--- /dev/null
+++ b/src/SGA/overlap-long.h
@@ -0,0 +1,26 @@
+//-----------------------------------------------------
+// Copyright 2013 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+// Released under the GPL
+//-----------------------------------------------------
+//
+// overlap-long - compute overlaps for long reads
+//
+#ifndef OVERLAP_LONG_H
+#define OVERLAP_LONG_H
+#include <getopt.h>
+#include "config.h"
+#include "BWT.h"
+#include "Match.h"
+#include "BWTAlgorithms.h"
+#include "OverlapAlgorithm.h"
+
+// functions
+
+//
+int overlapLongMain(int argc, char** argv);
+
+// options
+void parseOverlapLongOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SGA/overlap.cpp b/src/SGA/overlap.cpp
index 0e65452..f1cb50c 100644
--- a/src/SGA/overlap.cpp
+++ b/src/SGA/overlap.cpp
@@ -63,6 +63,7 @@ static const char *OVERLAP_USAGE_MESSAGE =
" -t, --threads=NUM use NUM worker threads to compute the overlaps (default: no threading)\n"
" -e, --error-rate the maximum error rate allowed to consider two sequences aligned (default: exact matches only)\n"
" -m, --min-overlap=LEN minimum overlap required between two reads (default: 45)\n"
+" -p, --prefix=PREFIX use PREFIX for the names of the index files (default: prefix of the input file)\n"
" -f, --target-file=FILE perform the overlap queries against the reads in FILE\n"
" -x, --exhaustive output all overlaps, including transitive edges\n"
" --exact force the use of the exact-mode irreducible block algorithm. This is faster\n"
@@ -88,7 +89,8 @@ namespace opt
static std::string readsFile;
static std::string targetFile;
static std::string outFile;
-
+ static std::string prefix;
+
static double errorRate = 0.0f;
static unsigned int minOverlap = DEFAULT_MIN_OVERLAP;
static int seedLength = 0;
@@ -98,7 +100,7 @@ namespace opt
static bool bExactIrreducible = false;
}
-static const char* shortopts = "m:d:e:t:l:s:o:f:vix";
+static const char* shortopts = "m:d:e:t:l:s:o:f:p:vix";
enum { OPT_HELP = 1, OPT_VERSION, OPT_EXACT };
@@ -148,11 +150,15 @@ int overlapMain(int argc, char** argv)
// Determine which index files to use. If a target file was provided,
// use the index of the target reads
std::string indexPrefix;
- if(!opt::targetFile.empty())
- indexPrefix = stripFilename(opt::targetFile);
+ if(!opt::prefix.empty())
+ indexPrefix = opt::prefix;
else
- indexPrefix = stripFilename(opt::readsFile);
-
+ {
+ if(!opt::targetFile.empty())
+ indexPrefix = stripExtension(opt::targetFile);
+ else
+ indexPrefix = stripExtension(opt::readsFile);
+ }
BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate);
OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT,
@@ -325,6 +331,7 @@ void parseOverlapOptions(int argc, char** argv)
{
case 'm': arg >> opt::minOverlap; break;
case 'o': arg >> opt::outFile; break;
+ case 'p': arg >> opt::prefix; break;
case 'e': arg >> opt::errorRate; break;
case 't': arg >> opt::numThreads; break;
case 'l': arg >> opt::seedLength; break;
diff --git a/src/SGA/preprocess.cpp b/src/SGA/preprocess.cpp
index 503e171..93c0282 100644
--- a/src/SGA/preprocess.cpp
+++ b/src/SGA/preprocess.cpp
@@ -36,6 +36,7 @@ static const char *PREPROCESS_USAGE_MESSAGE =
"\n"
" --help display this help and exit\n"
" -v, --verbose display verbose output\n"
+" --seed set random seed\n"
"\nInput/Output options:\n"
" -o, --out=FILE write the reads to FILE (default: stdout)\n"
" -p, --pe-mode=INT 0 - do not treat reads as paired (default)\n"
@@ -45,6 +46,7 @@ static const char *PREPROCESS_USAGE_MESSAGE =
" --pe-orphans=FILE if one half of a read pair fails filtering, write the passed half to FILE\n"
"\nConversions/Filtering:\n"
" --phred64 convert quality values from phred-64 to phred-33.\n"
+" --discard-quality do not output quality scores\n"
" -q, --quality-trim=INT perform Heng Li's BWA quality trim algorithm. \n"
" Reads are trimmed according to the formula:\n"
" argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT\n"
@@ -80,6 +82,7 @@ enum QualityScaling
namespace opt
{
static unsigned int verbose;
+ static unsigned int seed = 0;
static std::string outFile;
static unsigned int qualityTrim = 0;
static unsigned int hardClip = 0;
@@ -89,6 +92,7 @@ namespace opt
static double sampleFreq = 1.0f;
static bool bDiscardAmbiguous = true;
+ static bool bDiscardQuality = false;
static QualityScaling qualityScale = QS_SANGER;
static bool bFilterGC = false;
@@ -106,8 +110,11 @@ namespace opt
static const char* shortopts = "o:q:m:h:p:r:c:s:f:vi";
-enum { OPT_HELP = 1, OPT_VERSION, OPT_PERMUTE, OPT_QSCALE, OPT_MINGC, OPT_MAXGC,
- OPT_DUST, OPT_DUST_THRESHOLD, OPT_SUFFIX, OPT_PHRED64, OPT_OUTPUTORPHANS, OPT_DISABLE_PRIMER };
+enum { OPT_HELP = 1, OPT_SEED, OPT_VERSION, OPT_PERMUTE,
+ OPT_QSCALE, OPT_MINGC, OPT_MAXGC,
+ OPT_DUST, OPT_DUST_THRESHOLD, OPT_SUFFIX,
+ OPT_PHRED64, OPT_OUTPUTORPHANS, OPT_DISABLE_PRIMER,
+ OPT_DISCARD_QUALITY };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -130,7 +137,9 @@ static const struct option longopts[] = {
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ "permute-ambiguous", no_argument, NULL, OPT_PERMUTE },
+ { "discard-quality", no_argument, NULL, OPT_DISCARD_QUALITY },
{ "no-primer-check", no_argument, NULL, OPT_DISABLE_PRIMER },
+ { "seed", required_argument, NULL, OPT_SEED },
{ NULL, 0, NULL, 0 }
};
@@ -150,6 +159,12 @@ int preprocessMain(int argc, char** argv)
Timer* pTimer = new Timer("sga preprocess");
parsePreprocessOptions(argc, argv);
+ // set random seed
+ if (opt::seed == 0)
+ {
+ opt::seed = time(NULL);
+ }
+
std::cerr << "Parameters:\n";
std::cerr << "QualTrim: " << opt::qualityTrim << "\n";
@@ -169,6 +184,8 @@ int preprocessMain(int argc, char** argv)
std::cerr << "Orphan file: " << (opt::orphanFile.empty() ? "none" : opt::orphanFile) << "\n";
if(opt::bDiscardAmbiguous)
std::cerr << "Discarding sequences with ambiguous bases\n";
+ if(opt::bDiscardQuality)
+ std::cerr << "Discarding quality scores\n";
if(opt::bDustFilter)
std::cerr << "Dust threshold: " << opt::dustThreshold << "\n";
if(!opt::suffix.empty())
@@ -179,9 +196,10 @@ int preprocessMain(int argc, char** argv)
std::cerr << "Adapter sequence fwd: " << opt::adapterF << "\n";
std::cerr << "Adapter sequence rev: " << opt::adapterR << "\n";
}
+ std::cerr << "Seed: " << opt::seed << "\n";
// Seed the RNG
- srand(time(NULL));
+ srand(opt::seed);
std::ostream* pWriter;
if(opt::outFile.empty())
@@ -244,6 +262,13 @@ int preprocessMain(int argc, char** argv)
// Read from separate files
std::string filename1 = argv[optind++];
std::string filename2 = argv[optind++];
+
+ if(filename1 == "-" || filename2 == "-")
+ {
+ std::cerr << "Reading from stdin is not supported in --pe-mode 1\n";
+ std::cerr << "Maybe you meant --pe-mode 2 (interleaved pairs?)\n";
+ exit(EXIT_FAILURE);
+ }
pReader1 = new SeqReader(filename1, SRF_NO_VALIDATION);
pReader2 = new SeqReader(filename2, SRF_NO_VALIDATION);
@@ -321,7 +346,6 @@ int preprocessMain(int argc, char** argv)
}
delete pReader1;
pReader1 = NULL;
-
}
}
@@ -500,7 +524,11 @@ bool processRead(SeqRecord& record)
}
record.seq = seqStr;
- record.qual = qualStr;
+
+ if(opt::bDiscardQuality)
+ record.qual.clear();
+ else
+ record.qual = qualStr;
if(record.seq.length() == 0 || record.seq.length() < opt::minLength)
return false;
@@ -611,6 +639,8 @@ void parsePreprocessOptions(int argc, char** argv)
case OPT_PERMUTE: opt::bDiscardAmbiguous = false; break;
case OPT_DUST: opt::bDustFilter = true; break;
case OPT_DISABLE_PRIMER: opt::bDisablePrimerCheck = true; break;
+ case OPT_DISCARD_QUALITY: opt::bDiscardQuality = true; break;
+ case OPT_SEED: arg >> opt::seed; break;
case OPT_HELP:
std::cout << PREPROCESS_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
diff --git a/src/SGA/preqc.cpp b/src/SGA/preqc.cpp
index ec8c006..4c39e4a 100644
--- a/src/SGA/preqc.cpp
+++ b/src/SGA/preqc.cpp
@@ -158,6 +158,7 @@ static const char *PREQC_USAGE_MESSAGE =
" --help display this help and exit\n"
" -v, --verbose display verbose output\n"
" -t, --threads=NUM use NUM threads (default: 1)\n"
+" --simple only compute the metrics that do not need the FM-index\n"
" --max-contig-length=N stop contig extension at N bp (default: 50000)\n"
" --reference=FILE use the reference FILE to calculate GC plot\n"
" --diploid-reference-mode generate metrics assuming that the input data\n"
@@ -181,11 +182,12 @@ namespace opt
static std::string referenceFile;
static int diploidReferenceMode = 0;
static bool forceEM = false;
+ static bool simple = false;
}
static const char* shortopts = "p:d:t:o:k:n:b:v";
-enum { OPT_HELP = 1, OPT_VERSION, OPT_REFERENCE, OPT_MAX_CONTIG, OPT_DIPLOID, OPT_FORCE_EM };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_REFERENCE, OPT_MAX_CONTIG, OPT_DIPLOID, OPT_FORCE_EM, OPT_SIMPLE };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
@@ -197,6 +199,7 @@ static const struct option longopts[] = {
{ "branch-cutoff", required_argument, NULL, 'b' },
{ "max-contig-length", required_argument, NULL, OPT_MAX_CONTIG },
{ "reference", required_argument, NULL, OPT_REFERENCE },
+ { "simple", no_argument, NULL, OPT_SIMPLE },
{ "force-EM", no_argument, NULL, OPT_FORCE_EM },
{ "diploid-reference-mode", no_argument, NULL, OPT_DIPLOID },
{ "help", no_argument, NULL, OPT_HELP },
@@ -689,6 +692,9 @@ void generate_position_of_first_error(JSONWriter* pWriter, const BWTIndexSet& in
for(size_t i = 0; i < n_samples; ++i)
{
std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);
+ if(s.length() < k)
+ continue;
+
size_t nk = s.size() - k + 1;
size_t first_kmer_count =
BWTAlgorithms::countSequenceOccurrences(s.substr(0, k), index_set.pBWT);
@@ -760,6 +766,9 @@ void generate_errors_per_base(JSONWriter* pWriter, const BWTIndexSet& index_set)
for(int i = 0; i < n_samples; ++i)
{
std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);
+ if(s.length() < k)
+ continue;
+
KmerOverlaps::retrieveMatches(s, k, min_overlap, max_error_rate, 2, index_set);
//KmerOverlaps::approximateMatch(s, min_overlap, max_error_rate, 2, 200, index_set);
@@ -1963,43 +1972,51 @@ int preQCMain(int argc, char** argv)
parsePreQCOptions(argc, argv);
Timer* pTimer = new Timer(PROGRAM_IDENT);
- fprintf(stderr, "Loading FM-index of %s\n", opt::readsFile.c_str());
- BWTIndexSet index_set;
- index_set.pBWT = new BWT(opt::prefix + BWT_EXT);
- index_set.pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);
- index_set.pCache = new BWTIntervalCache(10, index_set.pBWT);
-
rapidjson::FileStream f(stdout);
JSONWriter writer(f);
// Top-level document
writer.StartObject();
- if(!opt::diploidReferenceMode)
- {
- GenomeEstimates estimates = generate_genome_size(&writer, index_set);
- generate_branch_classification(&writer, estimates, index_set);
- generate_de_bruijn_simulation(&writer, estimates, index_set);
- generate_gc_distribution(&writer, index_set);
- generate_quality_stats(&writer, opt::readsFile);
- generate_pe_fragment_sizes(&writer, index_set);
- generate_kmer_coverage(&writer, index_set);
- generate_position_of_first_error(&writer, index_set);
- generate_errors_per_base(&writer, index_set);
- generate_unipath_length_data(&writer, index_set);
- generate_duplication_rate(&writer, index_set);
- }
- else
+ // In simple mode we only compute metrics that do not need the FM-index
+ generate_quality_stats(&writer, opt::readsFile);
+
+ // Load the FM-index and compute the rest of the metrics if requested
+ if(!opt::simple)
{
- generate_reference_branch_classification(&writer, index_set);
+ BWTIndexSet index_set;
+ fprintf(stderr, "Loading FM-index of %s\n", opt::readsFile.c_str());
+ index_set.pBWT = new BWT(opt::prefix + BWT_EXT);
+ index_set.pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);
+ index_set.pCache = new BWTIntervalCache(10, index_set.pBWT);
+
+ if(!opt::diploidReferenceMode)
+ {
+ GenomeEstimates estimates = generate_genome_size(&writer, index_set);
+
+ generate_branch_classification(&writer, estimates, index_set);
+ generate_de_bruijn_simulation(&writer, estimates, index_set);
+ generate_gc_distribution(&writer, index_set);
+ generate_pe_fragment_sizes(&writer, index_set);
+ generate_kmer_coverage(&writer, index_set);
+ generate_position_of_first_error(&writer, index_set);
+ generate_errors_per_base(&writer, index_set);
+ generate_unipath_length_data(&writer, index_set);
+ generate_duplication_rate(&writer, index_set);
+ }
+ else
+ {
+ generate_reference_branch_classification(&writer, index_set);
+ }
+
+ delete index_set.pBWT;
+ delete index_set.pSSA;
+ delete index_set.pCache;
}
// End document
writer.EndObject();
- delete index_set.pBWT;
- delete index_set.pSSA;
- delete index_set.pCache;
delete pTimer;
return 0;
}
@@ -2079,6 +2096,7 @@ void parsePreQCOptions(int argc, char** argv)
case 't': arg >> opt::numThreads; break;
case '?': die = true; break;
case 'v': opt::verbose++; break;
+ case OPT_SIMPLE: opt::simple = true; break;
case OPT_MAX_CONTIG: arg >> opt::maxContigLength; break;
case OPT_DIPLOID: opt::diploidReferenceMode = true; break;
case OPT_REFERENCE: arg >> opt::referenceFile; break;
diff --git a/src/SGA/rmdup.cpp b/src/SGA/rmdup.cpp
index cc7727d..425ddb2 100644
--- a/src/SGA/rmdup.cpp
+++ b/src/SGA/rmdup.cpp
@@ -120,7 +120,7 @@ void rmdup()
delete pRBWT;
delete pTimer;
- std::string out_prefix = stripFilename(opt::outFile);
+ std::string out_prefix = stripExtension(opt::outFile);
std::string dupsFile = parseDupHits(hitsFilenames, out_prefix);
// Rebuild the indices without the duplicated sequences
diff --git a/src/SGA/sga.cpp b/src/SGA/sga.cpp
index d932233..1d5f174 100644
--- a/src/SGA/sga.cpp
+++ b/src/SGA/sga.cpp
@@ -10,6 +10,7 @@
#include <iostream>
#include "index.h"
#include "overlap.h"
+#include "overlap-long.h"
#include "assemble.h"
#include "oview.h"
#include "rmdup.h"
@@ -35,6 +36,9 @@
#include "rewrite-evidence-bam.h"
#include "preqc.h"
#include "haplotype-filter.h"
+#include "graph-concordance.h"
+#include "somatic-variant-filters.h"
+#include "kmer-count.h"
#define PROGRAM_BIN "sga"
#define AUTHOR "Jared Simpson"
@@ -51,31 +55,34 @@ static const char *SGA_USAGE_MESSAGE =
"Contact: " AUTHOR " [" PACKAGE_BUGREPORT "]\n"
"Usage: " PROGRAM_BIN " <command> [options]\n\n"
"Commands:\n"
-" preprocess filter and quality-trim reads\n"
-" index build the BWT and FM-index for a set of reads\n"
-" merge merge multiple BWT/FM-index files into a single index\n"
-" bwt2fa transform a bwt back into a set of sequences\n"
-" correct correct sequencing errors in a set of reads\n"
-" fm-merge merge unambiguously overlapped sequences using the FM-index\n"
-" overlap compute overlaps between reads\n"
-" assemble generate contigs from an assembly graph\n"
-" oview view overlap alignments\n"
-" subgraph extract a subgraph from a graph\n"
-" filter remove reads from a data set\n"
-" rmdup duplicate read removal\n"
-" gen-ssa generate a sampled suffix array for the given set of reads\n"
-" scaffold generate ordered sets of contigs using distance estimates\n"
-" scaffold2fasta convert the output of the scaffold subprogram into a fasta file\n"
-" gapfill fill intra-scaffold gaps\n"
+" preprocess filter and quality-trim reads\n"
+" index build the BWT and FM-index for a set of reads\n"
+" merge merge multiple BWT/FM-index files into a single index\n"
+" bwt2fa transform a bwt back into a set of sequences\n"
+" correct correct sequencing errors in a set of reads\n"
+" fm-merge merge unambiguously overlapped sequences using the FM-index\n"
+" overlap compute overlaps between reads\n"
+" assemble generate contigs from an assembly graph\n"
+" oview view overlap alignments\n"
+" subgraph extract a subgraph from a graph\n"
+" filter remove reads from a data set\n"
+" rmdup duplicate read removal\n"
+" gen-ssa generate a sampled suffix array for the given set of reads\n"
+" scaffold generate ordered sets of contigs using distance estimates\n"
+" scaffold2fasta convert the output of the scaffold subprogram into a fasta file\n"
+" gapfill fill intra-scaffold gaps\n"
"\n\nVariant Calling Commands:\n"
-" graph-diff compare reads to find sequence variants\n"
-" rewrite-evidence-bam fill in sequence and quality information for a variant evidence BAM\n"
-" haplotype-filter filter out low-quality haplotypes\n"
+" graph-diff compare reads to find sequence variants\n"
+" graph-concordance check called variants for representation in the assembly graph\n"
+" rewrite-evidence-bam fill in sequence and quality information for a variant evidence BAM\n"
+" haplotype-filter filter out low-quality haplotypes\n"
+" somatic-variant-filters filter out low-quality variants\n"
"\n\nExperimental commands:\n"
" preqc perform pre-assembly quality checks on a set of reads\n"
" stats print summary statistics about a read set\n"
" filterBAM filter out contaminating mate-pair data in a BAM file\n"
" cluster find clusters of reads belonging to the same connected component in an assembly graph\n"
+" kmer-count extract all kmers from a BWT file\n"
//" connect resolve the complete sequence of a paired-end fragment\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
@@ -116,6 +123,8 @@ int main(int argc, char** argv)
FMMergeMain(argc - 1, argv + 1);
else if(command == "overlap")
overlapMain(argc - 1, argv + 1);
+ else if(command == "overlap-long")
+ overlapLongMain(argc - 1, argv + 1);
else if(command == "correct")
correctMain(argc - 1, argv + 1);
else if(command == "assemble")
@@ -154,6 +163,12 @@ int main(int argc, char** argv)
preQCMain(argc - 1, argv + 1);
else if(command == "haplotype-filter")
haplotypeFilterMain(argc - 1, argv + 1);
+ else if(command == "graph-concordance")
+ graphConcordanceMain(argc - 1, argv + 1);
+ else if(command == "somatic-variant-filters")
+ somaticVariantFiltersMain(argc - 1, argv + 1);
+ else if(command == "kmer-count")
+ kmerCountMain(argc - 1, argv + 1);
else
{
std::cerr << "Unrecognized command: " << command << "\n";
diff --git a/src/SGA/somatic-variant-filters.cpp b/src/SGA/somatic-variant-filters.cpp
new file mode 100644
index 0000000..3fca50b
--- /dev/null
+++ b/src/SGA/somatic-variant-filters.cpp
@@ -0,0 +1,864 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// variant-filtering - apply various filters
+// to a somatic VCF file
+//
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iterator>
+#include "Util.h"
+#include "somatic-variant-filters.h"
+#include "BWTAlgorithms.h"
+#include "BWTIndexSet.h"
+#include "SGACommon.h"
+#include "Timer.h"
+#include "SequenceProcessFramework.h"
+#include "HapgenUtil.h"
+#include "SGAStats.h"
+#include "HashMap.h"
+#include "DindelRealignWindow.h"
+#include "VariantIndex.h"
+#include "HapgenUtil.h"
+#include "MultiAlignment.h"
+#include "api/BamReader.h"
+
+// Types
+typedef HashMap<std::string, std::string> StringStringHash;
+
+//
+// Getopt
+//
+#define SUBPROGRAM "somatic-variant-filters"
+static const char *SOMATIC_VARIANT_FILTERS_VERSION_MESSAGE =
+SUBPROGRAM " Version " PACKAGE_VERSION "\n"
+"Written by Jared Simpson.\n"
+"\n"
+"Copyright 2014 Ontario Institute for Cancer Research\n";
+
+static const char *SOMATIC_VARIANT_FILTERS_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... VCF_FILE\n"
+"Count read support for variants in a vcf file\n"
+"\n"
+" --help display this help and exit\n"
+" -v, --verbose display verbose output\n"
+" -t, --threads=NUM use NUM threads to compute the overlaps (default: 1)\n"
+" --reference=STR load the reference genome from FILE\n"
+" --tumor-bam=STR load the aligned tumor reads from FILE\n"
+" --normal-bam=STR load the aligned normal reads from FILE\n"
+" --annotate-only only annotate with INFO tags, rather than filter\n"
+"\n"
+"Filtering cutoffs:\n"
+" --min-af=FLOAT minimum allele frequency (AF tag)\n"
+" --min-var-dp=INT minimum variant depth (VarDP)\n"
+" --max-strand-bias=FLOAT maximum strand bias (SB)\n"
+" --max-hp-length=INT maximum homopolymer context length (HPlen)\n"
+" --max-dust=FLOAT maximum dust scores (Dust)\n"
+" --max-normal-reads=INT maximum normal reads showing variant\n"
+" --min-normal-depth=INT minimum normal depth at the locus\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+static const char* PROGRAM_IDENT =
+PACKAGE_NAME "::" SUBPROGRAM;
+
+namespace opt
+{
+ static unsigned int verbose;
+ static std::string vcfFile;
+ static std::string referenceFile;
+ static std::string tumorBamFile;
+ static std::string normalBamFile;
+
+ static int numThreads = 1;
+ static double minAF = 0.0f;
+ static double errorBound = 0.02;
+ static int minVarDP = 0;
+ static double maxStrandBias = 2.0f;
+ static int maxHPLen = 7;
+ static double maxDust = 2.0f;
+ static size_t maxNormalReads = 1;
+ static size_t minNormalDepth = 5;
+ static int minMedianQuality = 15;
+ static size_t maximumAlignments = 500;
+ static double minHaplotypeLength = 125.0f;
+ static bool annotateOnly = false;
+}
+
+static const char* shortopts = "t:v";
+
+enum { OPT_HELP = 1,
+ OPT_VERSION,
+ OPT_REFERENCE,
+ OPT_TUMOR_BAM,
+ OPT_NORMAL_BAM,
+ OPT_ANNOTATE,
+ OPT_MIN_AF,
+ OPT_MIN_VARDP,
+ OPT_MAX_SB,
+ OPT_MAX_HP,
+ OPT_MAX_DUST,
+ OPT_MAX_NORMAL_READS,
+ OPT_MIN_NORMAL_DEPTH };
+
+static const struct option longopts[] = {
+ { "verbose", no_argument, NULL, 'v' },
+ { "annotate-only", no_argument, NULL, OPT_ANNOTATE },
+ { "threads", required_argument, NULL, 't' },
+ { "reference", required_argument, NULL, OPT_REFERENCE },
+ { "tumor-bam", required_argument, NULL, OPT_TUMOR_BAM },
+ { "normal-bam", required_argument, NULL, OPT_NORMAL_BAM },
+ { "min-af", required_argument, NULL, OPT_MIN_AF },
+ { "min-var-dp", required_argument, NULL, OPT_MIN_VARDP },
+ { "max-strand-bias", required_argument, NULL, OPT_MAX_SB },
+ { "max-hp-length", required_argument, NULL, OPT_MAX_HP },
+ { "max-dust", required_argument, NULL, OPT_MAX_DUST },
+ { "max-normal-reads", required_argument, NULL, OPT_MAX_NORMAL_READS },
+ { "min-normal-depth", required_argument, NULL, OPT_MIN_NORMAL_DEPTH },
+ { "help", required_argument, NULL, OPT_HELP },
+ { "version", required_argument, NULL, OPT_VERSION },
+ { NULL, 0, NULL, 0 }
+};
+
+// Calculate the median value of the vector
+template<typename T>
+double median(const std::vector<T> v)
+{
+ std::vector<T> c = v;
+ std::sort(c.begin(), c.end());
+ double r = 0.0f;
+ int m = c.size() / 2;
+ if(c.size() % 2 == 1)
+ r = c[m];
+ else
+ r = (c[m - 1] + c[m]) / 2.0f;
+ return r;
+}
+
+struct CoverageStats
+{
+ CoverageStats() : n_total_reads(0), n_evidence_reads(0), too_many_alignments(false) {}
+
+ // functions
+ double calculateVAF() const
+ {
+ return n_total_reads > 0 ?
+ n_evidence_reads / (double)n_total_reads : 0.0f;
+ }
+
+ //
+ size_t n_total_reads;
+ size_t n_evidence_reads;
+ double median_mapping_quality;
+ std::vector<int> snv_evidence_quals;
+ bool too_many_alignments;
+};
+
+// This struct holds segments
+// of a read that match a variant
+// The read is broken into (up to) 3 pieces
+// -the segment that aligns to the reference before the variant
+// -the segment that aligns to the variant region
+// -the segment that aligns after the variant
+struct VariantReadSegments
+{
+ std::string preSegment;
+ std::string variantSegment;
+ std::string postSegment;
+
+ std::string preQual;
+ std::string variantQual;
+ std::string postQual;
+};
+
+VariantReadSegments splitReadAtVariant(const BamTools::BamAlignment& alignment,
+ const VCFRecord& record)
+{
+ // Set up a vector of aligned reference positions
+ // for every base of the read
+ std::vector<int> reference_positions(alignment.QueryBases.size(), -1);
+ size_t read_idx = 0;
+ size_t ref_idx = alignment.Position;
+
+ std::stringstream cigar;
+ //
+ for(size_t ci = 0; ci < alignment.CigarData.size(); ++ci)
+ {
+ BamTools::CigarOp op = alignment.CigarData[ci];
+ switch(op.Type) {
+ case 'M':
+ for(size_t l = 0; l < op.Length; ++l)
+ reference_positions[read_idx++] = ref_idx++;
+ break;
+ case 'D':
+ case 'N':
+ ref_idx += op.Length;
+ break;
+ case 'I':
+ case 'S':
+ read_idx += op.Length;
+ break;
+ }
+
+ cigar << op.Length << op.Type;
+ }
+
+ // Find the last aligned position before the variant and the first position after
+ int rightmost_before_variant = -1;
+ int leftmost_after_variant = std::numeric_limits<int>::max();
+
+ int variant_start = record.refPosition - 1; // VCF is 1-based, bamtools is 0-based
+ int variant_end = variant_start + record.refStr.size();
+
+ for(int i = 0; i < (int)reference_positions.size(); ++i) {
+ if(reference_positions[i] != -1 && reference_positions[i] < variant_start && i > rightmost_before_variant)
+ rightmost_before_variant = i;
+ if(reference_positions[i] != -1 && reference_positions[i] >= variant_end && i < leftmost_after_variant)
+ leftmost_after_variant = i;
+ }
+
+ VariantReadSegments ret;
+ ret.preSegment = alignment.QueryBases.substr(0, rightmost_before_variant + 1);
+ ret.preQual = alignment.Qualities.substr(0, rightmost_before_variant + 1);
+
+ ret.variantSegment = alignment.QueryBases.substr(rightmost_before_variant + 1, leftmost_after_variant - rightmost_before_variant - 1);
+ ret.variantQual = alignment.Qualities.substr(rightmost_before_variant + 1, leftmost_after_variant - rightmost_before_variant - 1);
+
+ if(leftmost_after_variant < (int)alignment.QueryBases.size())
+ {
+ ret.postSegment = alignment.QueryBases.substr(leftmost_after_variant);
+ ret.postQual = alignment.Qualities.substr(leftmost_after_variant);
+ }
+
+ return ret;
+}
+
+//
+//
+//
+CoverageStats getVariantCoverage(BamTools::BamReader* pReader, const VCFRecord& record, const ReadTable* refTable)
+{
+ CoverageStats stats;
+
+ static const int flankingSize = 100;
+ static const double minPercentIdentity = 95.0f;
+
+ bool is_snv = record.refStr.size() == 1 && record.varStr.size() == 1;
+
+ // Grab the reference haplotype
+ int zeroBasedPos = record.refPosition - 1;
+ int start = zeroBasedPos - flankingSize - 1;
+ if(start < 0)
+ start = 0;
+
+ int end = zeroBasedPos + record.refStr.length() + 2 * flankingSize;
+ const SeqItem& chr = refTable->getRead(record.refName);
+ if(end > (int)chr.seq.length())
+ end = (int)chr.seq.length();
+
+ std::string reference_haplotype = chr.seq.substr(start, end - start);
+ int translatedPos = zeroBasedPos - start;
+
+ std::string variant_haplotype = reference_haplotype;
+
+ // Ensure that the reference string at the variant matches the expected
+ if(variant_haplotype.substr(translatedPos, record.refStr.length()) != record.refStr)
+ {
+ std::cerr << "Warning, reference base does not match VCF record.\n";
+ std::cerr << record << "\n";
+ }
+ variant_haplotype.replace(translatedPos, record.refStr.length(), record.varStr);
+
+ // Grab all reads in reference region
+ int refID = pReader->GetReferenceID(record.refName);
+ if(refID < 0)
+ return stats;
+
+ int refStart = record.refPosition;
+ int refEnd = record.refPosition;
+ pReader->SetRegion(refID, refStart, refID, refEnd);
+ BamTools::BamAlignment aln;
+
+ std::vector<double> mapping_quality;
+ std::vector<BamTools::BamAlignment> alignments;
+ while(pReader->GetNextAlignment(aln)) {
+ if(aln.MapQuality > 0)
+ alignments.push_back(aln);
+ mapping_quality.push_back(aln.MapQuality);
+
+ if(alignments.size() > opt::maximumAlignments)
+ {
+ stats.too_many_alignments = true;
+ return stats;
+ }
+ }
+
+ if(!mapping_quality.empty())
+ stats.median_mapping_quality = median(mapping_quality);
+ else
+ stats.median_mapping_quality = 60;
+
+ // Shuffle and take the first N alignments only
+ std::random_shuffle(alignments.begin(), alignments.end());
+
+#if HAVE_OPENMP
+ #pragma omp parallel for
+#endif
+ for(size_t i = 0; i < alignments.size(); ++i) {
+ BamTools::BamAlignment alignment = alignments[i];
+
+ VariantReadSegments segments = splitReadAtVariant(alignment, record);
+
+ if(opt::verbose > 1)
+ {
+ fprintf(stderr, "var: %zu %s -> %s\n", record.refPosition, record.refStr.c_str(), record.varStr.c_str());
+ fprintf(stderr, "pos: %d\n", alignment.Position);
+ fprintf(stderr, "strand: %s\n", alignment.IsReverseStrand() ? "-" : "+");
+ fprintf(stderr, "read: %s\n", alignment.QueryBases.c_str());
+ fprintf(stderr, "qual: %s\n", alignment.Qualities.c_str());
+ fprintf(stderr, "alnb: %s\n", alignment.AlignedBases.c_str());
+
+ fprintf(stderr, "Pre: %s\n", segments.preSegment.c_str());
+ fprintf(stderr, "Var: %s\n", segments.variantSegment.c_str());
+ fprintf(stderr, "Pos: %s\n", segments.postSegment.c_str());
+
+ fprintf(stderr, "PreQual: %s\n", segments.preQual.c_str());
+ fprintf(stderr, "VarQual: %s\n", segments.variantQual.c_str());
+ fprintf(stderr, "PosQual: %s\n", segments.postQual.c_str());
+ }
+
+ bool aligned_at_variant = segments.variantSegment.size() > 0 &&
+ (segments.preSegment.size() > 0 || segments.postSegment.size() > 0);
+
+ if(!aligned_at_variant)
+ continue;
+
+ bool is_evidence_read = false;
+ if(segments.variantSegment != record.refStr)
+ {
+ if(segments.variantSegment == record.varStr)
+ {
+ // Evidence read via the current alignment
+ is_evidence_read = true;
+ }
+ else
+ {
+ // Check for evidence via realignment to the variant haplotype
+ SequenceOverlap ref_overlap =
+ Overlapper::computeAlignmentAffine(alignment.QueryBases, reference_haplotype);
+ SequenceOverlap var_overlap =
+ Overlapper::computeAlignmentAffine(alignment.QueryBases, variant_haplotype);
+
+ bool quality_alignment = (ref_overlap.getPercentIdentity() >= minPercentIdentity ||
+ var_overlap.getPercentIdentity() >= minPercentIdentity);
+
+ is_evidence_read = quality_alignment && var_overlap.score > ref_overlap.score;
+ }
+ }
+
+ #pragma omp critical
+ {
+ stats.n_total_reads += 1;
+ if(is_evidence_read)
+ {
+ stats.n_evidence_reads += 1;
+ if(is_snv && segments.variantQual.size() == 1)
+ {
+ char qb = segments.variantQual[0];
+ int q = Quality::char2phred(qb);
+ stats.snv_evidence_quals.push_back(q);
+ }
+ }
+ }
+ }
+
+ return stats;
+}
+
+void makeTagHash(const VCFRecord& record, StringStringHash& tagHash)
+{
+ for(size_t i = 0; i < record.comments.size(); i++)
+ {
+ StringVector kv = split(record.comments[i], '=');
+ assert(kv.size() == 2 || kv.size() == 1);
+ if(kv.size() == 2)
+ tagHash[kv[0]] = kv[1];
+ else
+ tagHash[kv[0]] = "true";
+ }
+}
+
+
+template<typename T>
+bool getTagValue(StringStringHash& tagHash, const std::string& key, T& out)
+{
+ StringStringHash::iterator iter = tagHash.find(key);
+ if(iter == tagHash.end())
+ return false; // tag not available
+ std::stringstream parser(iter->second);
+ parser >> out;
+ return true;
+}
+
+std::string makeVariantKey(const VCFRecord& record)
+{
+ std::stringstream ss;
+ ss << record.refName << ":" << record.refPosition << ":"
+ << record.refStr << ":" << record.varStr;
+ return ss.str();
+}
+
+// Get coordinates of the reference haplotype for this
+// variant, plus some flanking sequence
+void getReferenceInterval(const VCFRecord& record,
+ int flankingSize,
+ int& zeroBasedPos,
+ int& flankStart,
+ int& flankEnd)
+{
+ zeroBasedPos = record.refPosition - 1;
+ flankStart = zeroBasedPos - flankingSize - 1;
+ flankEnd = zeroBasedPos + record.refStr.length() + 2 * flankingSize;
+}
+
+// Modify the reference coordinates to avoid going over the ends
+void clampReferenceInterval(const SeqItem& chr, int& start, int&end)
+{
+ if(start < 0)
+ start = 0;
+
+ if(end > (int)chr.seq.length())
+ end = (int)chr.seq.length();
+}
+
+// Calculate Homopolymer lengths at the variant position
+int calculateHomopolymerLength(const VCFRecord& record, const ReadTable* refTable)
+{
+ static const int flankingSize = 100;
+
+ // Get coordinates of reference haplotype
+ int zeroBasedPos, start, end;
+ getReferenceInterval(record, flankingSize, zeroBasedPos, start, end);
+ const SeqItem& chr = refTable->getRead(record.refName);
+
+ clampReferenceInterval(chr, start, end);
+ std::string reference_haplotype = chr.seq.substr(start, end - start);
+
+ // we use the MA class for the homopolymer counting code
+ MAlignDataVector mav;
+ MultiAlignment ma(reference_haplotype, mav);
+
+ // Count run lengths of nucleotides
+ std::vector<size_t> run_lengths;
+
+ // Count runs
+ char curr_char = reference_haplotype[0];
+ size_t curr_run = 1;
+ for(size_t i = 1; i < reference_haplotype.size(); ++i)
+ {
+ if(reference_haplotype[i] != curr_char)
+ {
+ run_lengths.push_back(curr_run);
+ curr_run = 1;
+ curr_char = reference_haplotype[i];
+ }
+ else
+ {
+ curr_run++;
+ }
+ }
+ run_lengths.push_back(curr_run); // last run
+
+ // set up a map from the reference base to the run length its in
+ std::vector<size_t> index_to_run_lengths(reference_haplotype.size());
+ size_t j = 0;
+ for(size_t i = 0; i < run_lengths.size(); ++i)
+ {
+ size_t stop = j + run_lengths[i];
+ while(j < stop)
+ {
+ index_to_run_lengths[j] = i;
+ j++;
+ }
+ }
+ assert(j == reference_haplotype.size());
+
+ // Calculate the maximum homopolymer run as the max over the variant reference region
+ size_t eventStart = zeroBasedPos - start;
+ size_t eventEnd = eventStart + record.refStr.size();
+ size_t maxhp = 0;
+ for(size_t i = eventStart; i <= eventEnd; ++i)
+ {
+ size_t idx = index_to_run_lengths[i];
+ if(run_lengths[idx] > maxhp)
+ maxhp = run_lengths[idx];
+ }
+ return maxhp;
+}
+
+struct RepeatCounts
+{
+ std::string unit;
+ size_t numRefUnits;
+};
+
+RepeatCounts getRepeatCounts(const VCFRecord& record, const ReadTable* refTable)
+{
+ // Get coordinates of reference haplotype
+ int zeroBasedPos, start, end;
+ getReferenceInterval(record, 100, zeroBasedPos, start, end);
+ const SeqItem& chr = refTable->getRead(record.refName);
+
+ clampReferenceInterval(chr, start, end);
+ std::string reference_haplotype = chr.seq.substr(start, end - start);
+
+ // VCF encodes indels as TAC -> T
+ // Calculate the inserted deleted (or substituted) sequence
+ // without including the matching reference base
+ std::string repeat_unit;
+ if(record.varStr[0] == record.refStr[0])
+ {
+ // indel
+ if(record.varStr.size() > record.refStr.size())
+ repeat_unit = record.varStr.substr(1);
+ else
+ repeat_unit = record.refStr.substr(1);
+ }
+ else
+ {
+ repeat_unit = record.varStr;
+ }
+
+ size_t repeat_len = repeat_unit.size();
+ size_t event_start = zeroBasedPos - start;
+ size_t event_end = event_start + record.refStr.size();
+
+ // Get the first position of the repeat unit near the variant
+ size_t first_pos = reference_haplotype.find(repeat_unit, event_start);
+
+ size_t num_units = 0;
+
+ if(first_pos <= event_end)
+ {
+ // Count forwards
+ int p = (int)first_pos;
+ while(p < (int)reference_haplotype.size() && reference_haplotype.compare(p, repeat_len, repeat_unit) == 0)
+ p += repeat_len;
+ num_units = (p - first_pos) / repeat_len;
+
+ // Count backwards
+ p = first_pos - repeat_len;
+ while(p >= 0 && reference_haplotype.compare(p, repeat_len, repeat_unit) == 0)
+ p -= repeat_len;
+ num_units += ((first_pos - p) / repeat_len) - 1;
+ }
+
+ RepeatCounts out;
+ out.unit = repeat_unit;
+ out.numRefUnits = num_units;
+ return out;
+}
+
+// Get the 3-mer preceding and following the variant
+void getVariantContext(const VCFRecord& record, const ReadTable* refTable,
+ std::string& prefix, std::string& suffix)
+{
+ static const int flankingSize = 100;
+
+ // Grab the reference haplotype
+ int zeroBasedPos = record.refPosition - 1;
+ int start = zeroBasedPos - flankingSize - 1;
+ if(start < 0)
+ start = 0;
+
+ int end = zeroBasedPos + record.refStr.length() + 2 * flankingSize;
+ const SeqItem& chr = refTable->getRead(record.refName);
+ if(end > (int)chr.seq.length())
+ end = (int)chr.seq.length();
+
+ std::string reference_haplotype = chr.seq.substr(start, end - start);
+
+ size_t eventStart = zeroBasedPos - start;
+ size_t eventEnd = eventStart + record.refStr.size();
+ static const size_t k = 3;
+ prefix = reference_haplotype.substr(eventStart - k, k);
+ suffix = reference_haplotype.substr(eventEnd, k);
+}
+
+//
+// Main
+//
+int somaticVariantFiltersMain(int argc, char** argv)
+{
+ parseSomaticVariantFiltersOptions(argc, argv);
+
+ Timer* pTimer = new Timer(PROGRAM_IDENT);
+
+#if HAVE_OPENMP
+ omp_set_num_threads(opt::numThreads);
+#endif
+ // Load Reference
+ ReadTable refTable(opt::referenceFile, SRF_NO_VALIDATION);
+ refTable.indexReadsByID();
+
+ // Load BAMs
+ BamTools::BamReader* pTumorBamReader = new BamTools::BamReader;
+ pTumorBamReader->Open(opt::tumorBamFile);
+ pTumorBamReader->LocateIndex();
+ assert(pTumorBamReader->HasIndex());
+
+ BamTools::BamReader* pNormalBamReader = new BamTools::BamReader;
+ pNormalBamReader->Open(opt::normalBamFile);
+ pNormalBamReader->LocateIndex();
+ assert(pNormalBamReader->HasIndex());
+
+ // Track duplicated variants
+ HashSet<std::string> duplicateHash;
+
+ std::istream* pInput = createReader(opt::vcfFile.c_str());
+ std::string line;
+
+ while(getline(*pInput, line))
+ {
+ if(line.empty())
+ continue;
+
+ if(line[0] == '#')
+ {
+ std::cout << line << "\n";
+ continue;
+ }
+
+ // parse record
+ VCFRecord record(line);
+ if(record.isMultiAllelic())
+ {
+ std::cerr << "Error: multi-allelic VCF found, please run vcfbreakmulti\n";
+ exit(EXIT_FAILURE);
+ }
+
+ // Check if we've seen this variant already
+ std::string key = makeVariantKey(record);
+ if(duplicateHash.find(key) != duplicateHash.end())
+ continue;
+ else
+ duplicateHash.insert(key);
+
+ if(opt::verbose > 0)
+ {
+ std::stringstream ss;
+ ss << "Variant: " << record << "\n";
+ fprintf(stderr, "===============================================\n%s", ss.str().c_str());
+ }
+
+ StringStringHash tagHash;
+ makeTagHash(record, tagHash);
+
+ StringVector fail_reasons;
+
+ int hplen = 0;
+ if(!getTagValue(tagHash, "HPLen", hplen))
+ hplen = calculateHomopolymerLength(record, &refTable);
+
+ if(hplen > opt::maxHPLen)
+ fail_reasons.push_back("Homopolymer");
+
+ double dust = 0.0f;
+ if(!getTagValue(tagHash, "Dust", dust))
+ dust = HapgenUtil::calculateDustScoreAtPosition(record.refName,
+ record.refPosition,
+ &refTable);
+
+ if(dust > opt::maxDust)
+ fail_reasons.push_back("LowComplexity");
+
+ double af;
+ if(getTagValue(tagHash, "AF", af) && af < opt::minAF)
+ fail_reasons.push_back("LowAlleleFrequency");
+
+ int varDP;
+ if(getTagValue(tagHash, "VarDP", varDP) && varDP < opt::minVarDP)
+ fail_reasons.push_back("LowVarDP");
+
+ double avgHapLen;
+ if(getTagValue(tagHash, "AvgHapLen", avgHapLen) && avgHapLen < opt::minHaplotypeLength)
+ fail_reasons.push_back("ShortHaplotype");
+
+ double strandBias;
+ if(getTagValue(tagHash, "SB", strandBias) && strandBias >= opt::maxStrandBias)
+ fail_reasons.push_back("StrandBias");
+
+ // Count the number of copies of the inserted/deleted sequence in the reference
+ RepeatCounts repeatCounts = getRepeatCounts(record, &refTable);
+
+ // Realignment-based stats
+ CoverageStats tumor_stats = getVariantCoverage(pTumorBamReader, record, &refTable);
+ CoverageStats normal_stats = getVariantCoverage(pNormalBamReader, record, &refTable);
+
+ if(opt::verbose > 0)
+ {
+ fprintf(stderr, "Tumor: [%zu %zu]\n", tumor_stats.n_total_reads, tumor_stats.n_evidence_reads);
+ fprintf(stderr, "Normal: [%zu %zu]\n", normal_stats.n_total_reads, normal_stats.n_evidence_reads);
+ }
+
+ if(!tumor_stats.too_many_alignments && !normal_stats.too_many_alignments)
+ {
+ // Check that there is not evidence for the variant in the normal sample
+ if(normal_stats.n_evidence_reads > opt::maxNormalReads)
+ fail_reasons.push_back("NormalEvidence");
+
+ // If the normal is poorly covered in this region, we may call a germline variant as a variant
+ if(normal_stats.n_total_reads < opt::minNormalDepth)
+ fail_reasons.push_back("LowNormalDepth");
+
+ if(!tumor_stats.snv_evidence_quals.empty())
+ {
+ // Check that the base scores of SNVs are reasonably high
+ double median_quality = median(tumor_stats.snv_evidence_quals);
+ if(median_quality < opt::minMedianQuality)
+ fail_reasons.push_back("LowQuality");
+
+ // For very deep regions, errors can be mistaken for SNVs
+ // Check if the variant frequency is very low. This check is not performed
+ // for indels and MNPs as they might not be aligned to this region.
+ double vf_estimate = tumor_stats.n_evidence_reads / (double)tumor_stats.n_total_reads;
+ if(vf_estimate < opt::errorBound)
+ fail_reasons.push_back("PossibleError");
+ }
+
+ // Check that the mapping quality of reads in the region is reasonable
+ if(tumor_stats.median_mapping_quality < opt::minMedianQuality)
+ fail_reasons.push_back("LowMappingQuality");
+ }
+ else
+ {
+ // If the depth in the region is excessively high, the statistical models may break down
+ fail_reasons.push_back("DepthLimitReached");
+ }
+
+ if(!fail_reasons.empty() && !opt::annotateOnly)
+ {
+ if(record.passStr != "PASS" && record.passStr != ".")
+ fail_reasons.insert(fail_reasons.begin(), record.passStr);
+
+ std::stringstream strss;
+ std::copy(fail_reasons.begin(), fail_reasons.end(), std::ostream_iterator<std::string>(strss, ";"));
+ record.passStr = strss.str();
+ record.passStr.erase(record.passStr.size() - 1); // erase trailing ;
+ }
+
+ // Add INFO tags indicating allele coverage
+ if(opt::annotateOnly)
+ {
+ double tumor_vf = tumor_stats.calculateVAF();
+ double normal_vf = normal_stats.calculateVAF();
+
+ std::string prefix;
+ std::string suffix;
+ getVariantContext(record, &refTable, prefix, suffix);
+
+ record.addComment("TumorVAF", tumor_vf);
+ record.addComment("NormalVAF", normal_vf);
+ record.addComment("TumorVarDepth", (int)tumor_stats.n_evidence_reads);
+ record.addComment("TumorTotalDepth", (int)tumor_stats.n_total_reads);
+ record.addComment("NormalVarDepth", (int)normal_stats.n_evidence_reads);
+ record.addComment("NormalTotalDepth", (int)normal_stats.n_total_reads);
+ record.addComment("5pContext", prefix);
+ record.addComment("3pContext", suffix);
+ record.addComment("RepeatUnit", repeatCounts.unit);
+ record.addComment("RepeatRefCount", (int)repeatCounts.numRefUnits);
+ }
+
+ std::cout << record << "\n";
+ }
+
+ // Cleanup
+ delete pInput;
+ delete pTumorBamReader;
+ delete pNormalBamReader;
+ delete pTimer;
+
+ return 0;
+}
+
+//
+// Handle command line arguments
+//
+void parseSomaticVariantFiltersOptions(int argc, char** argv)
+{
+ std::string algo_str;
+ bool die = false;
+ for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
+ {
+ std::istringstream arg(optarg != NULL ? optarg : "");
+ switch (c)
+ {
+ case OPT_REFERENCE: arg >> opt::referenceFile; break;
+ case OPT_TUMOR_BAM: arg >> opt::tumorBamFile; break;
+ case OPT_NORMAL_BAM: arg >> opt::normalBamFile; break;
+ case OPT_MIN_AF: arg >> opt::minAF; break;
+ case OPT_MIN_VARDP: arg >> opt::minVarDP; break;
+ case OPT_MAX_SB: arg >> opt::maxStrandBias; break;
+ case OPT_MAX_HP: arg >> opt::maxHPLen; break;
+ case OPT_MAX_DUST: arg >> opt::maxDust; break;
+ case OPT_MAX_NORMAL_READS: arg >> opt::maxNormalReads; break;
+ case OPT_MIN_NORMAL_DEPTH: arg >> opt::minNormalDepth; break;
+ case OPT_ANNOTATE: opt::annotateOnly = true; break;
+ case 't': arg >> opt::numThreads; break;
+ case '?': die = true; break;
+ case 'v': opt::verbose++; break;
+ case OPT_HELP:
+ std::cout << SOMATIC_VARIANT_FILTERS_USAGE_MESSAGE;
+ exit(EXIT_SUCCESS);
+ case OPT_VERSION:
+ std::cout << SOMATIC_VARIANT_FILTERS_VERSION_MESSAGE;
+ exit(EXIT_SUCCESS);
+ }
+ }
+
+ if (argc - optind < 1)
+ {
+ std::cerr << SUBPROGRAM ": missing arguments\n";
+ die = true;
+ }
+ else if (argc - optind > 1)
+ {
+ std::cerr << SUBPROGRAM ": too many arguments\n";
+ die = true;
+ }
+
+ if(opt::numThreads <= 0)
+ {
+ std::cerr << SUBPROGRAM ": invalid number of threads: " << opt::numThreads << "\n";
+ die = true;
+ }
+
+ if(opt::tumorBamFile.empty())
+ {
+ std::cerr << SUBPROGRAM ": a --tumor-bam must be provided\n";
+ die = true;
+ }
+
+ if(opt::normalBamFile.empty())
+ {
+ std::cerr << SUBPROGRAM ": a --normal-bam must be provided\n";
+ die = true;
+ }
+
+ if(opt::referenceFile.empty())
+ {
+ std::cerr << SUBPROGRAM ": a --reference must be provided\n";
+ die = true;
+ }
+
+ opt::vcfFile = argv[optind++];
+
+ if (die)
+ {
+ std::cout << "\n" << SOMATIC_VARIANT_FILTERS_USAGE_MESSAGE;
+ exit(EXIT_FAILURE);
+ }
+}
diff --git a/src/SGA/somatic-variant-filters.h b/src/SGA/somatic-variant-filters.h
new file mode 100644
index 0000000..95f225e
--- /dev/null
+++ b/src/SGA/somatic-variant-filters.h
@@ -0,0 +1,23 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// variant-filtering - apply various filters
+// to a somatic VCF file
+//
+#ifndef SOMATIC_VARIANT_FILTERS_H
+#define SOMATIC_VARIANT_FILTERS_H
+#include <getopt.h>
+#include "config.h"
+
+// functions
+
+//
+int somaticVariantFiltersMain(int argc, char** argv);
+
+// options
+void parseSomaticVariantFiltersOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SQG/ASQG.cpp b/src/SQG/ASQG.cpp
index c4a3911..d73558f 100644
--- a/src/SQG/ASQG.cpp
+++ b/src/SQG/ASQG.cpp
@@ -33,6 +33,10 @@ static char TRANSITIVE_TAG[] = "TE"; // 1 if the graph has transitive edges
// Vertex tags
static char SUBSTRING_TAG[] = "SS";
+// Edge tags
+static char CIGAR_TAG[] = "CG";
+static char PERCENT_IDENTITY_TAG[] = "PI";
+
//
// Header Record
//
@@ -251,6 +255,13 @@ void EdgeRecord::write(std::ostream& out)
std::stringstream ss;
ss << m_overlap;
fields.push_back(ss.str());
+
+ if(m_cigarTag.isInitialized())
+ fields.push_back(m_cigarTag.toTagString(CIGAR_TAG));
+
+ if(m_percentIdentityTag.isInitialized())
+ fields.push_back(m_percentIdentityTag.toTagString(PERCENT_IDENTITY_TAG));
+
writeFields(out, fields);
}
diff --git a/src/SQG/ASQG.h b/src/SQG/ASQG.h
index c385114..3f9ddd5 100644
--- a/src/SQG/ASQG.h
+++ b/src/SQG/ASQG.h
@@ -92,12 +92,19 @@ namespace ASQG
const Overlap& getOverlap() const { return m_overlap; }
+ void setCigarTag(const std::string& cigar) { m_cigarTag.set(cigar); }
+ void setPercentIdentityTag(const float f) { m_percentIdentityTag.set(f); }
+
void write(std::ostream& out);
void parse(const std::string& record);
private:
Overlap m_overlap;
+
+ // Optional tags
+ SQG::FloatTag m_percentIdentityTag;
+ SQG::StringTag m_cigarTag;
};
// Parsing functions
diff --git a/src/SuffixTools/BWTIndexSet.h b/src/SuffixTools/BWTIndexSet.h
index 5b55527..1aa05b2 100644
--- a/src/SuffixTools/BWTIndexSet.h
+++ b/src/SuffixTools/BWTIndexSet.h
@@ -24,7 +24,7 @@
struct BWTIndexSet
{
// Constructor
- BWTIndexSet() : pBWT(NULL), pRBWT(NULL), pCache(NULL), pSSA(NULL), pPopIdx(NULL), pQualityTable(NULL) {}
+ BWTIndexSet() : pBWT(NULL), pRBWT(NULL), pCache(NULL), pSSA(NULL), pPopIdx(NULL), pQualityTable(NULL), pReadTable(NULL) {}
// Data
const BWT* pBWT;
@@ -33,6 +33,7 @@ struct BWTIndexSet
const SampledSuffixArray* pSSA;
const PopulationIndex* pPopIdx;
const QualityTable* pQualityTable;
+ const ReadTable* pReadTable;
};
#endif
diff --git a/src/SuffixTools/SparseGapArray.h b/src/SuffixTools/SparseGapArray.h
index deebfcc..1262fc3 100644
--- a/src/SuffixTools/SparseGapArray.h
+++ b/src/SuffixTools/SparseGapArray.h
@@ -320,7 +320,7 @@ class SparseGapArray : public GapArray
//
void initOverflow(size_t i, OverflowStorage c)
{
- m_overflow.insert(std::make_pair<size_t, OverflowStorage>(i, c));
+ m_overflow.insert(std::make_pair(i, c));
}
//
diff --git a/src/Thirdparty/multiple_alignment.cpp b/src/Thirdparty/multiple_alignment.cpp
index a33848f..6421cb6 100644
--- a/src/Thirdparty/multiple_alignment.cpp
+++ b/src/Thirdparty/multiple_alignment.cpp
@@ -512,6 +512,88 @@ std::string MultipleAlignment::calculateCigarBetweenRows(size_t row0, size_t row
return Overlapper::compactCigar(cigar);
}
+// new function based on the standard SGA function calculateBaseConsensus()
+// we modified the function for more accurate overlap correction results
+// 1. We only attempt to correct a base in the read, if it is seen less than base_threshold times (default 2 --> we only correct the base in the read if no other read has the same base).
+// 2. We do not correct a position in the read if there is >1 possibility. Possibility is defined as the number of different bases with at least a count of min_count_max_base.
+// 3. We only correct the base in the read if the base with the maximum count occurs at least min_count_max_base times.
+std::string MultipleAlignment::calculateBaseConsensusMinCoverage(int base_threshold, int min_trim_coverage, int min_count_max_base)
+{
+ assert(!m_sequences.empty());
+ std::string consensus_sequence;
+ MultipleAlignmentElement& base_element = m_sequences.front();
+ size_t start_column = base_element.getStartColumn();
+ size_t end_column = base_element.getEndColumn();
+
+ // This index records the last base in the consensus that had coverage greater than
+ // min_trim_coverage. After the consensus calculation the read is trimmed back to this position
+ int last_good_base = -1;
+
+ for(size_t c = start_column; c <= end_column; ++c) {
+ std::vector<int> counts = getColumnBaseCounts(c);
+
+ char max_symbol = '\0';
+ int max_count = -1;
+ int total_depth = 0;
+ size_t noPossibleCorrections = 0;
+
+#ifdef MA_DEBUG_CONSENSUS
+ printf("%zu\t", c);
+#endif
+ for(size_t a = 0; a < m_alphabet_size; ++a) {
+ char symbol = m_alphabet[a];
+ total_depth += counts[a];
+
+ if(symbol != 'N' && counts[a] >= min_count_max_base) {
+ noPossibleCorrections ++;
+ }
+ if(symbol != 'N' && counts[a] > max_count) {
+ max_symbol = symbol;
+ max_count = counts[a];
+ }
+#ifdef MA_DEBUG_CONSENSUS
+ printf("%c:%d ", symbol, counts[a]);
+#endif
+ }
+
+ char base_symbol = base_element.getColumnSymbol(c);
+ int base_count = counts[symbol2index(base_symbol)];
+
+ // Choose a consensus base for this column. Only change a base
+ // if has been seen less than base_threshold times and the max
+ // base in the column has been seen more times than the base symbol
+ char consensus_symbol;
+ // Here we require the 3 conditions listed above
+ if(max_count > base_count && base_count < base_threshold && max_count >= min_count_max_base && noPossibleCorrections == 1)
+ consensus_symbol = max_symbol;
+ else
+ consensus_symbol = base_symbol;
+
+ // Output a symbol to the consensus. Skip padding symbols and leading
+ // bases that are less than the minimum required depth to avoid trimming
+ if(consensus_symbol != '-' &&
+ (!consensus_sequence.empty() || total_depth >= min_trim_coverage))
+ consensus_sequence.push_back(consensus_symbol);
+
+ // Record the position of the last good base
+ if(total_depth >= min_trim_coverage) {
+ int consensus_index = consensus_sequence.size() - 1;
+ if(consensus_index > last_good_base)
+ last_good_base = consensus_index;
+ }
+#ifdef MA_DEBUG_CONSENSUS
+ printf("CALL: %c\n", consensus_symbol);
+#endif
+ }
+
+ if(last_good_base != -1)
+ consensus_sequence.erase(last_good_base + 1);
+ else
+ consensus_sequence.clear();
+
+ return consensus_sequence;
+}
+
std::string MultipleAlignment::calculateBaseConsensus(int min_call_coverage, int min_trim_coverage)
{
assert(!m_sequences.empty());
diff --git a/src/Thirdparty/multiple_alignment.h b/src/Thirdparty/multiple_alignment.h
index 175ac52..a5b5c87 100644
--- a/src/Thirdparty/multiple_alignment.h
+++ b/src/Thirdparty/multiple_alignment.h
@@ -204,6 +204,13 @@ class MultipleAlignment
// min_trim_coverage depth at the ends of the base sequence.
std::string calculateBaseConsensus(int min_call_coverage, int min_trim_coverage);
+ // Calculate a new consensus sequence for the base sequence of the multiple alignment
+ // A base call is changed only if it has been seen in less than min_call_coverage sequences
+ // Leading/trailing bases are trimmed from the consensus sequence if there is less than
+ // min_trim_coverage depth at the ends of the base sequence.
+ // Also require a minimum count for the consensus base (min_count_base_overlap+1) and only one frequent base
+ std::string calculateBaseConsensusMinCoverage(int min_call_coverage, int min_trim_coverage, int min_count_base_overlap);
+
// Calculate consensus sequence of the base element that maximizes the likelihood of the multiple alignment
void calculateBaseConsensusLikelihood(std::string* consensus_sequence, std::string* consensus_quality);
diff --git a/src/Thirdparty/overlapper.cpp b/src/Thirdparty/overlapper.cpp
index 1ab6076..497f24b 100644
--- a/src/Thirdparty/overlapper.cpp
+++ b/src/Thirdparty/overlapper.cpp
@@ -31,19 +31,18 @@
#include <sstream>
#include <limits>
#include <stdio.h>
+#include <inttypes.h>
-OverlapperParams default_params = { 2, -6, -3 };
-OverlapperParams ungapped_params = { 2, -10000, -3 };
-
-
-
+//
+OverlapperParams default_params = { 2, -6, -3, -2, ALT_OVERLAP };
+OverlapperParams ungapped_params = { 2, -10000, -3, -2, ALT_OVERLAP };
+OverlapperParams affine_default_params = { 2, -5, -3, -2, ALT_OVERLAP, true, true, true, true, true };
//
#define max3(x,y,z) std::max(std::max(x,y), z)
//#define DEBUG_OVERLAPPER 1
//#define DEBUG_EXTEND 1
-
//
SequenceInterval::SequenceInterval() : start(0), end(-1)
{
@@ -170,7 +169,10 @@ int SequenceOverlap::calculateTotalColumns() const
//
void SequenceOverlap::printAlignment(const std::string& s1, const std::string& s2) const
{
- assert(isValid());
+ if(!isValid()) {
+ std::cerr << "Alignment is not valid\n";
+ return;
+ }
std::string out_1;
std::string out_2;
@@ -199,7 +201,7 @@ void SequenceOverlap::printAlignment(const std::string& s1, const std::string& s
char code;
while(cigar_parser >> length >> code) {
assert(length > 0);
- if(code == 'M') {
+ if(code == 'M' || code == 'X' || code == '=') {
out_1.append(s1.substr(current_1, length));
out_2.append(s2.substr(current_2, length));
current_1 += length;
@@ -627,7 +629,12 @@ SequenceOverlap Overlapper::extendMatch(const std::string& s1, const std::string
return output;
}
-// The score for this cell coming from a match, deletion and insertion
+static const uint8_t FROM_DIAG = 0;
+static const uint8_t FROM_LEFT = 1;
+static const uint8_t FROM_UP = 2;
+static const uint8_t FROM_INVALID = 3;
+
+// The score for this cell coming from a match, deletion and insertion and its direction
struct AffineCell
{
AffineCell() : G(0), I(-std::numeric_limits<int>::max()), D(-std::numeric_limits<int>::max()) {}
@@ -636,164 +643,244 @@ struct AffineCell
int G;
int I;
int D;
+
+ uint8_t Gt;
+ uint8_t It;
+ uint8_t Dt;
};
typedef std::vector<AffineCell> AffineCells;
typedef std::vector<AffineCells> AffineMatrix;
-SequenceOverlap Overlapper::computeOverlapAffine(const std::string& s1, const std::string& s2, const OverlapperParams params)
+SequenceOverlap Overlapper::computeAlignmentAffine(const std::string& s1, const std::string& s2, const OverlapperParams params)
{
// Exit with invalid intervals if either string is zero length
SequenceOverlap output;
if(s1.empty() || s2.empty()) {
- std::cerr << "Overlapper::computeOverlap error: empty input sequence\n";
+ std::cerr << "Overlapper::computeAlignmentAffine error: empty input sequence\n";
exit(EXIT_FAILURE);
}
+ bool gap_s1_start;
+ bool gap_s1_end;
+ bool gap_s2_start;
+ bool gap_s2_end;
+
+ // Set the bools for the explicit alignment types
+ if (params.type == ALT_GLOBAL)
+ {
+ gap_s1_start = false;
+ gap_s1_end = false;
+ gap_s2_start = false;
+ gap_s2_end = false;
+ }
+ else if (params.type == ALT_OVERLAP)
+ {
+ gap_s1_start = true;
+ gap_s1_end = true;
+ gap_s2_start = true;
+ gap_s2_end = true;
+ }
+ else if (params.type == ALT_CONTAINMENT)
+ {
+ gap_s1_start = true;
+ gap_s1_end = true;
+ gap_s2_start = false;
+ gap_s2_end = false;
+ }
+ else if (params.type == ALT_CUSTOM)
+ {
+ gap_s1_start = params.gap_s1_start;
+ gap_s1_end = params.gap_s1_end;
+ gap_s2_start = params.gap_s2_start;
+ gap_s2_end = params.gap_s2_end;
+ }
+ else
+ {
+ // Unknown alignment type
+ abort();
+ }
+
// Initialize the scoring matrix
size_t num_columns = s1.size() + 1;
size_t num_rows = s2.size() + 1;
-
- int gap_open = 5;
- int gap_ext = 2;
-
+ int gap_open = -params.gap_penalty;
+ int gap_ext = -params.gap_ext_penalty;
+
AffineMatrix score_matrix;
score_matrix.resize(num_columns);
+
for(size_t i = 0; i < score_matrix.size(); ++i)
score_matrix[i].resize(num_rows);
+
+ // Initialze first row and column
+ // Penalties in first row iff gap_s1_start==false
+ int c = (gap_s1_start == false ? 1 : 0);
+ for(size_t i = 1; i < num_columns; ++i) {
+ int v = -(gap_open + i * gap_ext) * c;
+ score_matrix[i][0].D = v;
+ score_matrix[i][0].Dt = (i == 1? FROM_DIAG : FROM_LEFT);
+ score_matrix[i][0].G = v;
+ score_matrix[i][0].Gt = FROM_LEFT;
+ }
+
+ // Penalties in first column iff gap_s2_start==false
+ c = (gap_s2_start == false ? 1 : 0);
+ for(size_t j = 1; j < num_rows; ++j) {
+ int v = -(gap_open + j * gap_ext) * c;
+ score_matrix[0][j].I = v;
+ score_matrix[0][j].It = (j == 1? FROM_DIAG : FROM_UP);
+ score_matrix[0][j].G = v;
+ score_matrix[0][j].Gt = FROM_UP;
+ }
// Calculate scores
for(size_t i = 1; i < num_columns; ++i) {
for(size_t j = 1; j < num_rows; ++j) {
-
+
// Calculate the score for entry (i,j)
int idx_1 = i - 1;
int idx_2 = j - 1;
-
int diagonal = score_matrix[i-1][j-1].G + (s1[idx_1] == s2[idx_2] ? params.match_score : params.mismatch_penalty);
-
- // When computing the score starting from the left/right cells, we have to determine
- // whether to extend an existing gap or start a new one.
+
AffineCell& curr = score_matrix[i][j];
-
AffineCell& up = score_matrix[i][j-1];
- if(up.I > up.G - gap_open)
- curr.I = up.I - gap_ext;
- else
- curr.I = up.G - (gap_open + gap_ext);
-
AffineCell& left = score_matrix[i-1][j];
- if(left.D > left.G - gap_open)
- curr.D = left.D - gap_ext;
- else
- curr.D = left.G - (gap_open + gap_ext);
+
+ // When computing the score starting from the left/right cells, we have to determine
+ // whether to extend an existing gap or start a new one.
+ // In the last column, insertion costs are controlled by gap_s2_end
+ int ins_open = (i < num_columns - 1 or not gap_s2_end? gap_open : 0);
+ int ins_ext = (i < num_columns - 1 or not gap_s2_end? gap_ext : 0);
+
+ // In the last row, deletion costs are controlled by gap_s1_end
+ int del_open = (j < num_rows - 1 or not gap_s1_end? gap_open : 0);
+ int del_ext = (j < num_rows - 1 or not gap_s1_end? gap_ext : 0);
+
+ if(up.I > up.G - ins_open) {
+ curr.I = up.I - ins_ext;
+ curr.It = FROM_UP;
+ } else {
+ curr.I = up.G - (ins_open + ins_ext);
+ curr.It = FROM_DIAG;
+ }
+ if(left.D > left.G - del_open) {
+ curr.D = left.D - del_ext;
+ curr.Dt = FROM_LEFT;
+ } else {
+ curr.D = left.G - (del_open + del_ext);
+ curr.Dt = FROM_DIAG;
+ }
curr.G = max3(curr.D, curr.I, diagonal);
+ if(curr.G == curr.I)
+ curr.Gt = FROM_UP;
+ else if(curr.G == curr.D)
+ curr.Gt = FROM_LEFT;
+ else
+ curr.Gt = FROM_DIAG;
}
}
-
- // The location of the highest scoring match in the
- // last row or last column is the maximum scoring overlap
- // for the pair of strings. We start the backtracking from
- // that cell
- int max_row_value = std::numeric_limits<int>::min();
- int max_column_value = std::numeric_limits<int>::min();
- size_t max_row_index = 0;
- size_t max_column_index = 0;
-
- // Check every column of the last row
- // The first column is skipped to avoid empty alignments
- for(size_t i = 1; i < num_columns; ++i) {
- int v = score_matrix[i][num_rows - 1].G;
- if(v > max_row_value) {
- max_row_value = v;
- max_row_index = i;
+
+ // With the new scores, the max score is always in the bottom right cell
+ size_t i = num_columns - 1;
+ size_t j = num_rows - 1;
+
+ output.score = score_matrix[i][j].G;
+ uint8_t direction = score_matrix[i][j].Gt;
+
+ // However, the alignment might contain free end gaps which we now remove
+ if (gap_s2_end)
+ {
+ while (j >= 1 and direction == FROM_UP)
+ {
+ direction = score_matrix[i][j].It;
+ --j;
+ if (direction == FROM_DIAG)
+ direction = score_matrix[i][j].Gt;
}
}
- // Check every row of the last column
- for(size_t j = 1; j < num_rows; ++j) {
- int v = score_matrix[num_columns - 1][j].G;
- if(v > max_column_value) {
- max_column_value = v;
- max_column_index = j;
+ if (gap_s1_end and j == num_rows - 1)
+ {
+ while (i >= 1 and direction == FROM_LEFT)
+ {
+ direction = score_matrix[i][j].Dt;
+ --i;
+ if (direction == FROM_DIAG)
+ direction = score_matrix[i][j].Gt;
}
}
- // Compute the location at which to start the backtrack
- size_t i;
- size_t j;
-
- if(max_column_value > max_row_value) {
- i = num_columns - 1;
- j = max_column_index;
- output.score = max_column_value;
- } else {
- i = max_row_index;
- j = num_rows - 1;
- output.score = max_row_value;
- }
-
// Set the alignment endpoints to be the index of the last aligned base
output.match[0].end = i - 1;
output.match[1].end = j - 1;
output.length[0] = s1.length();
output.length[1] = s2.length();
+
#ifdef DEBUG_OVERLAPPER
printf("Endpoints selected: (%d %d) with score %d\n", output.match[0].end, output.match[1].end, output.score);
#endif
-
+
output.edit_distance = 0;
output.total_columns = 0;
-
std::string cigar;
- while(i > 0 && j > 0) {
- // Compute the possible previous locations of the path
- int idx_1 = i - 1;
- int idx_2 = j - 1;
-
- bool is_match = s1[idx_1] == s2[idx_2];
- int diagonal = score_matrix[i - 1][j - 1].G + (is_match ? params.match_score : params.mismatch_penalty);
- int up1 = score_matrix[i][j-1].G - (gap_open + gap_ext);
- int up2 = score_matrix[i][j-1].I - gap_ext;
-
- int left1 = score_matrix[i-1][j].G - (gap_open + gap_ext);
- int left2 = score_matrix[i-1][j].D - gap_ext;
-
- int curr = score_matrix[i][j].G;
-
- // If there are multiple possible paths to this cell
- // we break ties in order of insertion,deletion,match
- // this helps left-justify matches for homopolymer runs
- // of unequal lengths
- if(curr == up1 || curr == up2) {
+
+ // We stop when we hit an edge along which gaps are free
+ while (not (i == 0 and j == 0) // absolute stop, regardless of free gaps
+ and not (i == 0 and gap_s2_start) // stop at left edge if s2 start gaps are free
+ and not (j == 0 and gap_s1_start)) // stop at top edge if s1 start gaps are free
+ {
+ if (direction == FROM_UP)
+ {
cigar.push_back('I');
- j -= 1;
- output.edit_distance += 1;
- } else if(curr == left1 || curr == left2) {
+ ++output.edit_distance;
+ direction = score_matrix[i][j].It;
+ --j;
+
+ if (direction == FROM_DIAG)
+ direction = score_matrix[i][j].Gt;
+ }
+ else if (direction == FROM_LEFT)
+ {
cigar.push_back('D');
- i -= 1;
- output.edit_distance += 1;
- } else {
- assert(curr == diagonal);
- if(!is_match)
- output.edit_distance += 1;
- cigar.push_back('M');
- i -= 1;
- j -= 1;
+ ++output.edit_distance;
+ direction = score_matrix[i][j].Dt;
+ --i;
+
+ if (direction == FROM_DIAG)
+ direction = score_matrix[i][j].Gt;
}
-
- output.total_columns += 1;
+ else
+ {
+ assert(direction == FROM_DIAG);
+ assert(i > 0);
+ assert(j > 0);
+
+ bool is_match = s1[i - 1] == s2[j - 1];
+
+ if (is_match)
+ {
+ cigar.push_back(params.use_m_ops? 'M' : '=');
+ }
+ else
+ {
+ cigar.push_back(params.use_m_ops? 'M' : 'X');
+ ++output.edit_distance;
+ }
+ --i;
+ --j;
+ direction = score_matrix[i][j].Gt;
+ }
+ ++output.total_columns;
}
-
+
// Set the alignment startpoints
output.match[0].start = i;
output.match[1].start = j;
-
// Compact the expanded cigar string into the canonical run length encoding
// The backtracking produces a cigar string in reversed order, flip it
std::reverse(cigar.begin(), cigar.end());
- assert(!cigar.empty());
output.cigar = compactCigar(cigar);
return output;
}
diff --git a/src/Thirdparty/overlapper.h b/src/Thirdparty/overlapper.h
index 1834ef9..1cd6a59 100644
--- a/src/Thirdparty/overlapper.h
+++ b/src/Thirdparty/overlapper.h
@@ -127,16 +127,40 @@ struct SequenceOverlap
};
+enum AffineAlignmentType
+{
+ ALT_OVERLAP,
+ ALT_GLOBAL,
+ ALT_CONTAINMENT,
+ ALT_CUSTOM
+};
+
struct OverlapperParams
{
int match_score;
int gap_penalty;
int mismatch_penalty;
+ int gap_ext_penalty;
+ AffineAlignmentType type;
+ // The following 4 bools indicate whether it is free to align the respective positions to gaps.
+ // The existing alignment types are a subset of the 16 possible settings, as follows:
+ // ALT_GLOBAL: all false (no free gaps)
+ // ALT_OVERLAP: all true
+ // ALT_CONTAINMENT: s1 gaps are free, but not s2
+ bool gap_s1_start;
+ bool gap_s1_end;
+ bool gap_s2_start;
+ bool gap_s2_end;
+ // The following controls the cigar op used for matches and mismatches
+ // if true: use 'M' for both matches and mismatches (default)
+ // if false: use '=' for matches, 'X' for mismatches
+ bool use_m_ops;
};
// Global variables
-extern OverlapperParams default_params; // { 2, -5, -3 };
-extern OverlapperParams ungapped_params; // { 2, -10000, -3 };
+extern OverlapperParams default_params; // { 2, -6, -3, -2 };
+extern OverlapperParams ungapped_params; // { 2, -10000, -3, -2 };
+extern OverlapperParams affine_default_params; // { 2, -5, -3, -2 };
//
namespace Overlapper
@@ -153,7 +177,10 @@ SequenceOverlap computeOverlap(const std::string& s1, const std::string& s2, con
SequenceOverlap extendMatch(const std::string& s1, const std::string& s2, int start_1, int start_2, int bandwidth);
// Perform an alignment using affine gap penalties
-SequenceOverlap computeOverlapAffine(const std::string& s1, const std::string& s2, const OverlapperParams params = default_params);
+// This can perform either "overlap" type alignments (suffix to prefix)
+// or global type alignments (end-to-end).
+SequenceOverlap computeAlignmentAffine(const std::string& s1, const std::string& s2, const OverlapperParams params = affine_default_params);
+SequenceOverlap computeAlignmentAffine2(const std::string& s1, const std::string& s2, const OverlapperParams params);
// Compact an expanded CIGAR string into a regular cigar string
std::string compactCigar(const std::string& ecigar);
diff --git a/src/Util/Makefile.am b/src/Util/Makefile.am
index f382b25..5126c89 100644
--- a/src/Util/Makefile.am
+++ b/src/Util/Makefile.am
@@ -31,6 +31,7 @@ libutil_a_SOURCES = \
VCFUtil.h VCFUtil.cpp \
QualityTable.h QualityTable.cpp \
BloomFilter.h BloomFilter.cpp \
+ VariantIndex.h VariantIndex.cpp \
Verbosity.h \
Timer.h \
EncodedString.h \
diff --git a/src/Util/SGAStats.h b/src/Util/SGAStats.h
index 55d3744..1937513 100644
--- a/src/Util/SGAStats.h
+++ b/src/Util/SGAStats.h
@@ -9,7 +9,7 @@
//
#ifndef SGASTATS_H
-#define STATS_H
+#define SGASTATS_H
namespace SGAStats
{
diff --git a/src/Util/SeqReader.cpp b/src/Util/SeqReader.cpp
index 0dcaeac..804244d 100644
--- a/src/Util/SeqReader.cpp
+++ b/src/Util/SeqReader.cpp
@@ -13,12 +13,16 @@
SeqReader::SeqReader(std::string filename, uint32_t flags) : m_flags(flags)
{
- m_pHandle = createReader(filename);
+ if(filename == "-")
+ m_pHandle = &std::cin;
+ else
+ m_pHandle = createReader(filename);
}
SeqReader::~SeqReader()
{
- delete m_pHandle;
+ if(m_pHandle != &std::cin)
+ delete m_pHandle;
}
// Extract an element from the file
diff --git a/src/Util/SeqReader.h b/src/Util/SeqReader.h
index ed8c38d..e2547e9 100644
--- a/src/Util/SeqReader.h
+++ b/src/Util/SeqReader.h
@@ -1,7 +1,7 @@
//-----------------------------------------------
// Copyright 2009 Wellcome Trust Sanger Institute
// Written by Jared Simpson (js18 at sanger.ac.uk)
-// Released under the GPL license
+// Released under the GPL
//-----------------------------------------------
//
// SeqReader - Reads fasta or fastq sequence files
diff --git a/src/Util/VCFUtil.cpp b/src/Util/VCFUtil.cpp
index 163b480..ad02450 100644
--- a/src/Util/VCFUtil.cpp
+++ b/src/Util/VCFUtil.cpp
@@ -10,6 +10,7 @@
#include <iostream>
#include <stdlib.h>
#include <iterator>
+#include <sstream>
#include "VCFUtil.h"
#include "StdAlnTools.h"
#include "MultiAlignment.h"
@@ -23,6 +24,47 @@ VCFRecord::VCFRecord()
quality = 20.0f;
}
+// parse from a line
+VCFRecord::VCFRecord(const std::string& line)
+{
+ std::stringstream parser(line);
+
+ parser >> refName;
+ parser >> refPosition;
+ parser >> id;
+ parser >> refStr;
+ parser >> varStr;
+
+ // Non-conformant VCF might represent unknown qualities with '.'
+ std::string quality_str;
+ parser >> quality_str;
+ if(quality_str == ".")
+ {
+ quality = 255;
+ }
+ else
+ {
+ std::stringstream qparser(quality_str);
+ qparser >> quality;
+ }
+
+ parser >> passStr;
+
+ std::string tmp;
+ parser >> tmp;
+
+ // Split the comments on the semicolon delimiter
+ comments = split(tmp, ';');
+
+ parser >> formatStr;
+ if(!formatStr.empty())
+ {
+ // parse genotypes
+ while(parser >> tmp)
+ sampleStr.push_back(tmp);
+ }
+}
+
void VCFRecord::addComment(const std::string& key, const std::string& value)
{
std::string out = key;
@@ -67,7 +109,7 @@ void VCFRecord::printVerbose() const
}
// Write the VCF record to the stream
-std::ostream& operator<<(std::ostream& o, VCFRecord& record)
+std::ostream& operator<<(std::ostream& o, const VCFRecord& record)
{
o.precision(5);
o.setf(std::ios::fixed,std::ios::floatfield);
diff --git a/src/Util/VCFUtil.h b/src/Util/VCFUtil.h
index dcdba7b..85eed7b 100644
--- a/src/Util/VCFUtil.h
+++ b/src/Util/VCFUtil.h
@@ -39,7 +39,13 @@ enum VCFReturnCode
struct VCFRecord
{
// functions
+
+ // initialize an empty record
VCFRecord();
+
+ // parse a record from a tab-delimited string
+ VCFRecord(const std::string& line);
+
void addComment(const std::string& key, const std::string& value);
void addComment(const std::string& key, const int& value);
void addComment(const std::string& key, const double& value);
@@ -49,7 +55,7 @@ struct VCFRecord
void printVerbose() const;
VCFClassification classify() const;
- friend std::ostream& operator<<(std::ostream& o, VCFRecord& record);
+ friend std::ostream& operator<<(std::ostream& o, const VCFRecord& record);
static bool sort(const VCFRecord& a, const VCFRecord& b)
{
@@ -59,6 +65,8 @@ struct VCFRecord
return a.refPosition < b.refPosition;
}
+ bool isMultiAllelic() const { return varStr.find(",") != std::string::npos; }
+
// data
std::string refName;
size_t refPosition;
diff --git a/src/Util/VariantIndex.cpp b/src/Util/VariantIndex.cpp
new file mode 100644
index 0000000..09b9c71
--- /dev/null
+++ b/src/Util/VariantIndex.cpp
@@ -0,0 +1,96 @@
+//-----------------------------------------------------
+// Copyright 2014 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+// Released under the GPL
+//-----------------------------------------------------
+//
+// Data structure for performing proximity queries
+// against a set of variants
+//
+#include <iostream>
+#include <algorithm>
+#include <assert.h>
+#include "VariantIndex.h"
+
+VariantIndex::VariantIndex(const std::string& filename, const ReadTable& refTable)
+{
+ std::ifstream input(filename.c_str());
+ std::string line;
+
+ // Parse the VCF file
+ while(getline(input, line))
+ {
+ if(line.empty())
+ continue;
+
+ if(line[0] == '#')
+ continue;
+
+ VCFRecord record(line);
+
+ // Do not allow multi-allelic records
+ if(record.isMultiAllelic())
+ {
+ std::cerr << "Error: multi-allelic Variant found, please run vcfbreakmulti\n";
+ exit(EXIT_FAILURE);
+ }
+
+ // Convert to a minimal representation of the change
+ VariantRecord mvf = { record.refName, record.refStr, record.varStr, record.refPosition };
+ m_records.push_back(mvf);
+ }
+
+ buildIndex(refTable);
+}
+
+void VariantIndex::buildIndex(const ReadTable& refTable)
+{
+ for(size_t i = 0; i < refTable.getCount(); ++i)
+ {
+ const SeqItem& seq_record = refTable.getRead(i);
+ std::string chromosome = seq_record.id;
+ size_t length = seq_record.seq.length();
+ int buckets = (length / BUCKET_SIZE) + 1;
+
+ m_map[chromosome].resize(buckets);
+ }
+
+ for(size_t i = 0; i < m_records.size(); ++i)
+ {
+ VariantIndexMap::iterator iter = m_map.find(m_records[i].reference);
+ assert(iter != m_map.end());
+
+ int bucket_id = m_records[i].position / BUCKET_SIZE;
+ assert(bucket_id < (int)iter->second.size());
+ IntVector& bucket = iter->second[bucket_id];
+ bucket.push_back(i);
+ }
+}
+
+VariantRecordVector VariantIndex::getNearVariants(const std::string& reference,
+ int position,
+ int distance) const
+{
+ VariantRecordVector out;
+
+ VariantIndexMap::const_iterator iter = m_map.find(reference);
+ assert(iter != m_map.end());
+ const IntVectorVector& chr_buckets = iter->second;
+
+ int lower_index = std::max(position - distance, 0) / BUCKET_SIZE;
+ int upper_index = std::min((position + distance) / BUCKET_SIZE, (int)chr_buckets.size() - 1);
+
+ for(int bi = lower_index; bi <= upper_index; ++bi)
+ {
+ const IntVector& bucket = chr_buckets[bi];
+ for(size_t vi = 0; vi < bucket.size(); ++vi)
+ {
+ const VariantRecord& record = m_records[bucket[vi]];
+ if(abs(record.position - position) < distance)
+ out.push_back(record);
+ }
+ }
+
+ return out;
+}
+
diff --git a/src/Util/VariantIndex.h b/src/Util/VariantIndex.h
new file mode 100644
index 0000000..0de6b01
--- /dev/null
+++ b/src/Util/VariantIndex.h
@@ -0,0 +1,59 @@
+//-----------------------------------------------------
+// Copyright 2014 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+// Released under the GPL
+//-----------------------------------------------------
+//
+// Data structure for performing proximity queries
+// against a set of variants
+//
+#ifndef VARIANTINDEX_H
+#define VARIANTINDEX_H
+#include "Util.h"
+#include "SeqReader.h"
+#include "VCFUtil.h"
+#include "ReadTable.h"
+#include <map>
+
+// To save space we store a vcf-like record
+// with only the required fields
+struct VariantRecord
+{
+ std::string reference;
+ std::string ref_sequence;
+ std::string alt_sequence;
+ size_t position;
+};
+
+typedef std::vector<VariantRecord> VariantRecordVector;
+typedef std::vector<int> IntVector;
+typedef std::vector<IntVector> IntVectorVector;
+
+// map from chromosome to a vector of buckets, one every 10kbp
+typedef std::map<std::string, IntVectorVector> VariantIndexMap;
+
+class VariantIndex
+{
+ public:
+ //
+ VariantIndex(const std::string& filename, const ReadTable& refTable);
+
+ // Return a vector of the variants that are close to the input variant
+ VariantRecordVector getNearVariants(const std::string& reference,
+ int position,
+ int distance = 30) const;
+
+ private:
+
+ //
+ void buildIndex(const ReadTable& refTable);
+
+ //
+ VariantRecordVector m_records;
+ VariantIndexMap m_map;
+
+ //
+ static const int BUCKET_SIZE = 10000;
+};
+
+#endif
diff --git a/src/bin/sga-align b/src/bin/sga-align
index 323a2ba..d8fcd3d 100755
--- a/src/bin/sga-align
+++ b/src/bin/sga-align
@@ -28,7 +28,10 @@ def runBWAIndex(inFile):
def runBWAAln(inFiles, contigsFile, outFile):
assert len(inFiles) == 2
-
+ if inFiles[0] == inFiles[1]:
+ print 'Error: the two reads files are the same'
+ sys.exit(1)
+
contigsBasename = getBasename(contigsFile)
tempSAI = map(lambda x : getBasename(x) + "." + contigsBasename + ".bwsai", inFiles)
diff --git a/src/bin/sga-astat.py b/src/bin/sga-astat.py
index aa2b807..819b6a2 100755
--- a/src/bin/sga-astat.py
+++ b/src/bin/sga-astat.py
@@ -75,9 +75,9 @@ contigData = list()
# Read the contig names and lengths from the bam
bamFile = pysam.Samfile(bamFilename, "rb")
-for (name, len) in zip(bamFile.references, bamFile.lengths):
- t = name, len, 0
- contigData.append(ContigData(name, len))
+for (name, length) in zip(bamFile.references, bamFile.lengths):
+ t = name, length, 0
+ contigData.append(ContigData(name, length))
#print 'Name: ' + name
#print 'Length: ' + str(len)
@@ -126,7 +126,7 @@ bootstrapLen = 0;
bootstrapReads = 0;
if genomeSize == 0:
- for i in range(0, numContigsForInitialEstimate):
+ for i in range(0, min(numContigsForInitialEstimate, len(contigData))):
cd = contigData[i]
bootstrapLen += cd.nlen
bootstrapReads += cd.n
diff --git a/src/bin/sga-preqc-report.py b/src/bin/sga-preqc-report.py
index 87758a6..4d45472 100755
--- a/src/bin/sga-preqc-report.py
+++ b/src/bin/sga-preqc-report.py
@@ -2,6 +2,8 @@
"""Generate a readable report from preqc output.
"""
+from __future__ import print_function, division
+
import sys, os.path
import matplotlib as MPL
import argparse
@@ -73,17 +75,17 @@ def main(argv=None):
# misc: listing available plots
if( args.list_plots ):
plot_funcs, plotsample_funcs = _get_available_plot_functions()
- print "Available plots:"
- print '\n'.join(['\t'+k for k in sorted(plot_funcs)])
- print "Available per-sample plots:"
- print '\n'.join(['\t'+k for k in sorted(plotsample_funcs)])
+ print("Available plots:")
+ print('\n'.join(['\t'+k for k in sorted(plot_funcs)]))
+ print("Available per-sample plots:")
+ print('\n'.join(['\t'+k for k in sorted(plotsample_funcs)]))
sys.exit(0)
# main mode
output_pfx = args.output
preqc_files = args.preqc_file
- print 'output_pfx: ', output_pfx
- print 'preqc_files:'
- print '\t' + '\n\t'.join(preqc_files)
+ print('output_pfx: ', output_pfx)
+ print('preqc_files:')
+ print('\t' + '\n\t'.join(preqc_files))
# load data
data = load_preqc_datafiles(preqc_files)
# make the report
@@ -670,7 +672,7 @@ def plotsample_gc_distribution(ax, d):
ax.set_title(d['name'] + ' GC Bias')
ax.set_xlabel("GC %")
ax.set_ylabel("k-mer coverage")
- return True
+ return True
else:
# no data, draw an empty plot
return _finish_plot(ax, [], 0, 'No GC data for %s' % d['name'])
@@ -683,15 +685,21 @@ def load_preqc_datafiles(preqc_files):
also assign colors and markers"""
# load the data
data = []
+ deserial_fail = []
for f in preqc_files:
f = os.path.abspath(f)
if( os.path.getsize(f) <= 0 ):
- print "Warning: empty file '%s' ... skipping"%f
+ print("Warning: empty file '%s' ... skipping"%f)
continue
if( f in [d['file'] for d in data] ):
- print "Warning: duplicate file '%s' ... skipping"%f
+ print("Warning: duplicate file '%s' ... skipping"%f)
+ continue
+ try:
+ deserial = json.load(open(f, 'r'))
+ except ValueError:
+ deserial_fail.append(f)
continue
- data.append(json.load(open(f, 'r')))
+ data.append(deserial)
data[-1]['file'] = f
# create unique names for each entry
names = unique_names_from_filenames([d['file'] for d in data], splitext=True)
@@ -701,6 +709,8 @@ def load_preqc_datafiles(preqc_files):
data[i]['name'] = names[i]
data[i]['plot_color'] = plot_colors[i%len(plot_colors)]
data[i]['plot_marker'] = PLOT_MARKERS[i%len(PLOT_MARKERS)]
+ for failed in deserial_fail:
+ print("Warning: failed to de-serialize file:", failed)
return data
@@ -754,7 +764,7 @@ def make_report_with_subplots(output_pfx, data, save_png=False, pylab_show=False
plot_legend(subplots[2][0], data)
# Coverage plots
plot_kmer_distribution(subplots[2][2], data, use_markers=False, legend_loc=None)
- #plot_random_walk(subplots[2][4], data, use_markers=False, legend_loc=None) #@jts deprecated
+ #plot_random_walk(subplots[2][4], data, use_markers=False, legend_loc=None) #@jts deprecated
# use the rest of the subplots for gc
gc_subplots = []
@@ -821,7 +831,8 @@ def make_report_plot_per_page(plots, output_pfx, data, save_png=False, pylab_sho
func = plotsample_funcs[plotname]
data_list = data
else: # unrecogized, skip
- print >>sys.stderr, "Warning: plot name '"+plotname+"' not recognized ... skipping"
+ print("Warning: plot name '"+plotname+"' not recognized ... skipping",
+ file=sys.stderr)
continue
# loop over each sample if needed, else d == data (data_list=[data])
@@ -831,7 +842,7 @@ def make_report_plot_per_page(plots, output_pfx, data, save_png=False, pylab_sho
if( len(data_list) > 1 ):
figname += '_'+d['name'].replace('/','_').replace('\\','_')
# make the fig
- print 'plotting:',figname
+ print('plotting:',figname)
fig, subplots = _create_fig_and_subplots(figname, 1, 1)
# plot
rv = func(subplots[0], d)
diff --git a/src/bin/sga-variant-filters.pl b/src/bin/sga-variant-filters.pl
deleted file mode 100755
index 6f8f964..0000000
--- a/src/bin/sga-variant-filters.pl
+++ /dev/null
@@ -1,115 +0,0 @@
-#! /usr/bin/perl
-# Perform filtering of SGA variant calls
-
-use strict;
-use Getopt::Long;
-use File::Basename;
-
-my $dbsnp_path = ""; # Filter variants against dbSNP at the given directory
-my $sga_file = "";
-my $dust_cutoff = 2.0;
-my $strand_cutoff = 2.0;
-my $hplen_cutoff = 7;
-
-GetOptions("dbsnp=s" => \$dbsnp_path,
- "sga=s" => \$sga_file);
-
-die("The --sga option is mandatory") if($sga_file eq "");
-my %non_dbsnp_sites;
-my $using_dbsnp = 0;
-if($dbsnp_path ne "") {
- $using_dbsnp = 1;
- load_nondbsnp_sites($sga_file, $dbsnp_path);
-}
-
-filter_annotations($sga_file);
-
-sub filter_annotations
-{
- my($in) = @_;
- my $base = basename($in, ".vcf");
- my $outname = "$base.filters.vcf";
- open(OUT, ">$outname") || die("Cannot open $outname");
- open(IN, $in) || die("Cannot open $in");
- my $total_out = 0;
- my %seen_hash;
-
- while(<IN>) {
- # Print header lines
- if(/^#/) {
- print OUT;
- next;
- }
-
- chomp;
-
- # If this call is exactly the same as a previous call, ignore it
- my @f = split;
- my $key = sprintf("%s.%d.%s.%s", $f[0], $f[1], $f[3], $f[4]);
- if(defined($seen_hash{$key})) {
- next;
- } else {
- $seen_hash{$key} = 1;
- }
-
- my @filter_reason;
- my ($dust) = (/Dust=(\d+\.\d+)/);
- if($dust > $dust_cutoff) {
- push @filter_reason, "LowComplexity";
- }
-
- my ($strand) = (/SB=(\d+\.\d+)/);
- if($strand > $strand_cutoff) {
- push @filter_reason, "StrandBias";
- }
-
- my ($hplen) = (/HPLen=(\d+)/);
- if($hplen > $hplen_cutoff) {
- push @filter_reason, "Homopolymer";
- }
-
- if($using_dbsnp && !defined($non_dbsnp_sites{$key})) {
- push @filter_reason, "dbSNP";
- }
-
- if(scalar(@filter_reason) > 0) {
- $f[6] = join(";", @filter_reason);
- } else {
- $total_out++;
- }
- print OUT join("\t", @f) . "\n";
- }
-
- close(IN);
- close(OUT);
-
- print "$total_out calls passed all filters\n";
- return $outname;
-}
-
-# Build a hash of calls that are NOT present in dbsnp
-sub load_nondbsnp_sites
-{
- my($in, $path) = @_;
- my $out = "$in.dbsnp_filtered.vcf";
-
- # sort, bgzip and tabix the file
- system("cat $in | vcf-sort > $in.tmp.sorted.vcf");
- system("bgzip -f $in.tmp.sorted.vcf");
- system("vcf tabix -f -p vcf $in.tmp.sorted.vcf.gz");
-
- # find the non-dbsnp sites
- open(SITES, "vcf isec -C $in.tmp.sorted.vcf.gz $path |");
- while(<SITES>) {
- chomp;
- my @fields = split;
- my $site_key = "$fields[0].$fields[1].$fields[2].$fields[3]";
- $non_dbsnp_sites{$site_key} = 1;
- }
- close(SITES);
-
- # Cleanup tmp
- unlink("$in.tmp.sorted.vcf");
- unlink("$in.tmp.sorted.vcf.gz");
- unlink("$in.tmp.sorted.vcf.gz.tbi");
-}
diff --git a/src/configure.ac b/src/configure.ac
index 73a1559..f774efc 100644
--- a/src/configure.ac
+++ b/src/configure.ac
@@ -1,7 +1,7 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT(sga, 0.10.13, js18 at sanger.ac.uk)
+AC_INIT(sga, 0.10.14, js18 at sanger.ac.uk)
AM_INIT_AUTOMAKE(foreign)
AC_CONFIG_SRCDIR([SGA/sga.cpp])
AC_CONFIG_HEADER([config.h])
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/sga.git
More information about the debian-med-commit
mailing list