[med-svn] [sga] 01/05: Imported Upstream version 0.10.14

Mon Jan 11 12:36:19 UTC 2016

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch debian
in repository sga.

commit 592d36b4c8e8c3d7a8245ebbec1e036e8d567656
Author: Andreas Tille <tille at debian.org>
Date:   Thu Dec 31 13:18:33 2015 +0100

    Imported Upstream version 0.10.14
---
 README.md                                  |   2 +-
 src/.gitignore                             |   9 +
 src/Algorithm/ErrorCorrectProcess.cpp      |  20 +-
 src/Algorithm/ErrorCorrectProcess.h        |   3 +
 src/Algorithm/KmerOverlaps.cpp             |  35 +-
 src/Algorithm/KmerOverlaps.h               |   1 +
 src/Algorithm/QCProcess.cpp                |  24 +-
 src/Algorithm/QCProcess.h                  |   2 +
 src/GraphDiff/DeBruijnHaplotypeBuilder.cpp | 145 ++++-
 src/GraphDiff/DeBruijnHaplotypeBuilder.h   |   8 +
 src/GraphDiff/DindelRealignWindow.cpp      | 293 ++++------
 src/GraphDiff/DindelRealignWindow.h        |  19 +-
 src/GraphDiff/DindelUtil.cpp               | 175 +++---
 src/GraphDiff/DindelUtil.h                 |   7 -
 src/GraphDiff/GraphCompare.cpp             |  54 +-
 src/GraphDiff/GraphCompare.h               |   4 -
 src/GraphDiff/HapgenUtil.cpp               | 257 +++++++--
 src/GraphDiff/HapgenUtil.h                 |  59 +-
 src/SGA/Makefile.am                        |  16 +-
 src/SGA/SGACommon.h                        |   3 +-
 src/SGA/correct.cpp                        |  43 +-
 src/SGA/filter.cpp                         |  16 +-
 src/SGA/fm-merge.cpp                       |   4 +-
 src/SGA/gen-ssa.cpp                        |  24 +-
 src/SGA/graph-concordance.cpp              | 487 ++++++++++++++++
 src/SGA/graph-concordance.h                |  23 +
 src/SGA/graph-diff.cpp                     |  90 ++-
 src/SGA/index.cpp                          |  42 +-
 src/SGA/kmer-count.cpp                     | 291 ++++++++++
 src/SGA/kmer-count.h                       |   8 +
 src/SGA/{overlap.cpp => overlap-long.cpp}  | 260 +++++----
 src/SGA/overlap-long.h                     |  26 +
 src/SGA/overlap.cpp                        |  19 +-
 src/SGA/preprocess.cpp                     |  40 +-
 src/SGA/preqc.cpp                          |  70 ++-
 src/SGA/rmdup.cpp                          |   2 +-
 src/SGA/sga.cpp                            |  53 +-
 src/SGA/somatic-variant-filters.cpp        | 864 +++++++++++++++++++++++++++++
 src/SGA/somatic-variant-filters.h          |  23 +
 src/SQG/ASQG.cpp                           |  11 +
 src/SQG/ASQG.h                             |   7 +
 src/SuffixTools/BWTIndexSet.h              |   3 +-
 src/SuffixTools/SparseGapArray.h           |   2 +-
 src/Thirdparty/multiple_alignment.cpp      |  82 +++
 src/Thirdparty/multiple_alignment.h        |   7 +
 src/Thirdparty/overlapper.cpp              | 299 ++++++----
 src/Thirdparty/overlapper.h                |  33 +-
 src/Util/Makefile.am                       |   1 +
 src/Util/SGAStats.h                        |   2 +-
 src/Util/SeqReader.cpp                     |   8 +-
 src/Util/SeqReader.h                       |   2 +-
 src/Util/VCFUtil.cpp                       |  44 +-
 src/Util/VCFUtil.h                         |  10 +-
 src/Util/VariantIndex.cpp                  |  96 ++++
 src/Util/VariantIndex.h                    |  59 ++
 src/bin/sga-align                          |   5 +-
 src/bin/sga-astat.py                       |   8 +-
 src/bin/sga-preqc-report.py                |  39 +-
 src/bin/sga-variant-filters.pl             | 115 ----
 src/configure.ac                           |   2 +-
 60 files changed, 3422 insertions(+), 934 deletions(-)

diff --git a/README.md b/README.md
index 9105041..b3887b5 100644
--- a/README.md
+++ b/README.md
@@ -8,4 +8,4 @@ For installation and usage instructions see [src/README](src/#readme)
 
 For running examples see src/examples and the [sga wiki](https://github.com/jts/sga/wiki)
 
-For questions or support contact jared.simpson --at-- sanger.ac.uk
+For questions or support contact jared.simpson --at-- oicr.on.ca
diff --git a/src/.gitignore b/src/.gitignore
index 5197b27..7a74d70 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -12,3 +12,12 @@ build*
 *.o
 *.a
 Makefile
+
+config.h
+config.log
+config.status
+stamp-h1
+compile
+.deps
+
+SGA/sga
\ No newline at end of file
diff --git a/src/Algorithm/ErrorCorrectProcess.cpp b/src/Algorithm/ErrorCorrectProcess.cpp
index 07fff59..ffcc99d 100644
--- a/src/Algorithm/ErrorCorrectProcess.cpp
+++ b/src/Algorithm/ErrorCorrectProcess.cpp
@@ -15,6 +15,7 @@
 #include "StringThreader.h"
 
 //#define KMER_TESTING 1
+//#define OVERLAPCORRECTION_VERBOSE 1
 
 //
 //
@@ -179,12 +180,19 @@ ErrorCorrectResult ErrorCorrectProcess::overlapCorrectionNew(const SequenceWorkI
                                                                                     m_params.indices);
 
         multiple_alignment.filterByCount(m_params.conflictCutoff);
-        
+#ifdef OVERLAPCORRECTION_VERBOSE
+        printf("---> MAF after conflict\n");
+        multiple_alignment.print(200);
+        printf("---> PILEUP\n");
+        multiple_alignment.printPileup();
+        printf("---> DONE\n\n");
+#endif
+
         bool last_round = (round == num_rounds - 1);
         if(last_round)
-            consensus = multiple_alignment.calculateBaseConsensus(10000, 0);
+	  consensus = multiple_alignment.calculateBaseConsensusMinCoverage(m_params.base_threshold, 0, m_params.min_count_max_base);
         else
-            current_sequence = multiple_alignment.calculateBaseConsensus(10000, 0);
+	  current_sequence = multiple_alignment.calculateBaseConsensusMinCoverage(m_params.base_threshold, 0, m_params.min_count_max_base);
 
         if(m_params.printOverlaps)
             multiple_alignment.print(200);
@@ -326,13 +334,13 @@ ErrorCorrectResult ErrorCorrectProcess::kmerCorrection(const SequenceWorkItem& w
                 int threshold = CorrectionThresholds::Instance().getRequiredSupport(phred);
 
                 int left_k_idx = (i + 1 >= m_params.kmerLength ? i + 1 - m_params.kmerLength : 0);
-                corrected = attemptKmerCorrection(i, left_k_idx, std::max(countVector[left_k_idx], threshold), readSequence);
+                corrected = attemptKmerCorrection(i, left_k_idx, std::max(countVector[left_k_idx] + m_params.countOffset, threshold), readSequence);
                 if(corrected)
                     break;
 
                 // base was not corrected, try using the rightmost covering kmer
                 size_t right_k_idx = std::min(i, n - m_params.kmerLength);
-                corrected = attemptKmerCorrection(i, right_k_idx, std::max(countVector[right_k_idx], threshold), readSequence);
+                corrected = attemptKmerCorrection(i, right_k_idx, std::max(countVector[right_k_idx] + m_params.countOffset, threshold), readSequence);
                 if(corrected)
                     break;
             }
@@ -386,7 +394,7 @@ bool ErrorCorrectProcess::attemptKmerCorrection(size_t i, size_t k_idx, size_t m
 #if KMER_TESTING
         printf("%c %zu\n", currBase, count);
 #endif
-        if(count > bestCount && count >= minCount)
+        if(count >= minCount)
         {
             // Multiple corrections exist, do not correct
             if(bestBase != '$')
diff --git a/src/Algorithm/ErrorCorrectProcess.h b/src/Algorithm/ErrorCorrectProcess.h
index 7d6aa24..9043ed7 100644
--- a/src/Algorithm/ErrorCorrectProcess.h
+++ b/src/Algorithm/ErrorCorrectProcess.h
@@ -47,6 +47,8 @@ struct ErrorCorrectParameters
 
     // Overlap-based corrector params
     int minOverlap;
+    int min_count_max_base;
+    int base_threshold;
     int numOverlapRounds;
     double minIdentity;
     int conflictCutoff;
@@ -55,6 +57,7 @@ struct ErrorCorrectParameters
     // k-mer based corrector params
     int numKmerRounds;
     int kmerLength;
+    int countOffset;
 
     // output options
     bool printOverlaps;
diff --git a/src/Algorithm/KmerOverlaps.cpp b/src/Algorithm/KmerOverlaps.cpp
index 9a01960..0a541f4 100644
--- a/src/Algorithm/KmerOverlaps.cpp
+++ b/src/Algorithm/KmerOverlaps.cpp
@@ -14,6 +14,8 @@
 #include "Timer.h"
 #include "Verbosity.h"
 
+//#define OVERLAPCORRECTION_VERBOSE 1
+
 //
 MultipleAlignment KmerOverlaps::buildMultipleAlignment(const std::string& query, 
                                                        size_t k,
@@ -22,11 +24,29 @@ MultipleAlignment KmerOverlaps::buildMultipleAlignment(const std::string& query,
                                                        int bandwidth,
                                                        const BWTIndexSet& indices)
 {
+#ifdef OVERLAPCORRECTION_VERBOSE
+    printf("---> buildMultipleAlignment for query: %s\n",query.c_str());
+    printf("---> parameters: min_overlap %d minID %f   k %d\n",min_overlap, min_identity, k);
+#endif
+
     SequenceOverlapPairVector overlap_vector = retrieveMatches(query, k, min_overlap, min_identity, bandwidth, indices);
     MultipleAlignment multiple_alignment;
     multiple_alignment.addBaseSequence("query", query, "");
-    for(size_t i = 0; i < overlap_vector.size(); ++i)
+    for(size_t i = 0; i < overlap_vector.size(); ++i) {
         multiple_alignment.addOverlap("null", overlap_vector[i].sequence[1], "", overlap_vector[i].overlap);
+#ifdef OVERLAPCORRECTION_VERBOSE
+        printf("---> \tadd to maf: sequence %s  \n", overlap_vector[i].sequence[1].c_str());
+#endif
+	  }
+
+#ifdef OVERLAPCORRECTION_VERBOSE
+    printf("---> MAF\n");
+    multiple_alignment.print(200);
+    printf("---> PILEUP\n");
+    multiple_alignment.printPileup();
+    printf("---> DONE\n\//n");
+#endif
+
     return multiple_alignment;
 }
 
@@ -82,6 +102,8 @@ SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query
 
     int64_t max_interval_size = 200;
     SequenceOverlapPairVector overlap_vector;
+    if(query.size() < k)
+        return overlap_vector;
 
     // Use the FM-index to look up intervals for each kmer of the read. Each index
     // in the interval is stored individually in the KmerMatchMap. We then
@@ -162,7 +184,14 @@ SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query
     // Use the overlaps that meet the thresholds to build a multiple alignment
     for(KmerMatchSet::iterator iter = matches.begin(); iter != matches.end(); ++iter)
     {
-        std::string match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
+        // If a read table is available in the index, use it to get the match sequence
+        // Otherwise get it from the BWT, which is slower
+        std::string match_sequence;
+        if(indices.pReadTable != NULL)
+            match_sequence = indices.pReadTable->getRead(iter->index).seq.toString();
+        else
+            match_sequence = BWTAlgorithms::extractString(indices.pBWT, iter->index);
+
         if(iter->is_reverse)
             match_sequence = reverseComplement(match_sequence);
         
@@ -197,6 +226,7 @@ SequenceOverlapPairVector KmerOverlaps::retrieveMatches(const std::string& query
             SequenceOverlapPair op;
             op.sequence[0] = query;
             op.sequence[1] = match_sequence;
+            op.match_idx = iter->index;
             op.overlap = overlap;
             op.is_reversed = iter->is_reverse;
             overlap_vector.push_back(op);
@@ -436,6 +466,7 @@ void _approximateSeededMatch(const std::string& in_query,
                 SequenceOverlapPair op;
                 op.sequence[0] = in_query;
                 op.sequence[1] = match_sequence;
+                op.match_idx = start_index_of_read;
                 op.overlap = overlap;
                 op.is_reversed = do_reverse;
                 out_vector.push_back(op);
diff --git a/src/Algorithm/KmerOverlaps.h b/src/Algorithm/KmerOverlaps.h
index 63dd9af..5085091 100644
--- a/src/Algorithm/KmerOverlaps.h
+++ b/src/Algorithm/KmerOverlaps.h
@@ -18,6 +18,7 @@
 struct SequenceOverlapPair
 {
     std::string sequence[2];
+    int match_idx;
     bool is_reversed;
     SequenceOverlap overlap;
 
diff --git a/src/Algorithm/QCProcess.cpp b/src/Algorithm/QCProcess.cpp
index f526998..c079fb0 100644
--- a/src/Algorithm/QCProcess.cpp
+++ b/src/Algorithm/QCProcess.cpp
@@ -13,19 +13,14 @@
 //
 struct KmerWindow
 {
-    int64_t getCount() const
+    int64_t getCount(bool bothStrand) const
     {
-        int64_t count = 0;
-        if(fwdIntervals.interval[0].isValid())
-        {
-            count += fwdIntervals.interval[0].size();
-        }
-        
-        if(rcIntervals.interval[0].isValid())
-        {
-            count += rcIntervals.interval[0].size();
-        }
-        return count;
+        int64_t fwdCount = (fwdIntervals.interval[0].isValid()) ? fwdIntervals.interval[0].size() : 0;
+        int64_t rcCount = (rcIntervals.interval[0].isValid()) ? rcIntervals.interval[0].size() : 0;
+        if (bothStrand)
+            return std::min(fwdCount,rcCount);
+        else
+            return fwdCount + rcCount;
     }
 
     int start;
@@ -144,7 +139,7 @@ bool QCProcess::performKmerCheck(const SequenceWorkItem& workItem)
             if(window.rcIntervals.interval[1].isValid())
                 BWTAlgorithms::updateBothR(window.rcIntervals, cb, m_params.pBWT);
 
-            int64_t count = window.getCount();
+            int64_t count = window.getCount(m_params.kmerBothStrand);
             if(count <= threshold)
             {
                 // The extended kmer didn't meet the threshold
@@ -186,7 +181,7 @@ bool QCProcess::performKmerCheck(const SequenceWorkItem& workItem)
         }
 
         // Check the count of this interval
-        int64_t count = window.getCount();
+        int64_t count = window.getCount(m_params.kmerBothStrand);
             
         if(count <= threshold)
         {
@@ -235,6 +230,7 @@ DuplicateCheckResult QCProcess::performDuplicateCheck(const SequenceWorkItem& wo
 
     // Calculate the canonical index for this string - the lowest
     // value in the two lexicographic index
+    assert(fwdIntervals.interval[0].isValid() || rcIntervals.interval[0].isValid());
     int64_t fi = fwdIntervals.interval[0].isValid() ? fwdIntervals.interval[0].lower : std::numeric_limits<int64_t>::max();
     int64_t ri = rcIntervals.interval[0].isValid() ? rcIntervals.interval[0].lower : std::numeric_limits<int64_t>::max();
     int64_t canonicalIdx = std::min(fi, ri);
diff --git a/src/Algorithm/QCProcess.h b/src/Algorithm/QCProcess.h
index 50d7ba9..ecb87f8 100644
--- a/src/Algorithm/QCProcess.h
+++ b/src/Algorithm/QCProcess.h
@@ -26,6 +26,7 @@ struct QCParameters
     {
         checkDuplicates = true;
         checkKmer = true;
+	kmerBothStrand = false;
         checkHPRuns = true;
         checkDegenerate = true;
         verbose = 0;
@@ -53,6 +54,7 @@ struct QCParameters
     // Control parameters
     bool checkDuplicates;
     bool checkKmer;
+    bool kmerBothStrand;
     bool checkHPRuns;
     bool checkDegenerate;
     bool substringOnly;
diff --git a/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp b/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp
index 10262fb..19f5934 100644
--- a/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp
+++ b/src/GraphDiff/DeBruijnHaplotypeBuilder.cpp
@@ -44,13 +44,12 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
     PROFILE_FUNC("GraphCompare::buildVariantStringGraph")
     assert(!m_startingKmer.empty());
 
-    std::map<std::string, int> kmerCountMap;
-
     // We search until we find the first common vertex in each direction
-    size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2;
+    //size_t MIN_TARGET_COUNT = m_parameters.bReferenceMode ? 1 : 2;
     size_t MAX_ITERATIONS = 2000;
     size_t MAX_SIMULTANEOUS_BRANCHES = 40;
     size_t MAX_TOTAL_BRANCHES = 50;
+    int MAX_DISTANCE = 100;
 
     // Tracking stats
     size_t max_simul_branches_used = 0;
@@ -84,9 +83,11 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
 
         // Calculate de Bruijn extensions for this node
         std::string vertStr = curr.pVertex->getSeq().toString();
-        AlphaCount64 extensionCounts = BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction);
+        AlphaCount64 extensionCounts = 
+            BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, curr.direction);
 
-        std::string extensionsUsed;
+        // Count valid extensions
+        std::string extensions;
         for(size_t i = 0; i < DNA_ALPHABET::size; ++i)
         {
             char b = DNA_ALPHABET::getBase(i);
@@ -95,9 +96,17 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
             if(!acceptExt)
                 continue;
 
-            extensionsUsed.push_back(b);
+            extensions.push_back(b);
+        }
+
+        // Do not allow join vertices to branch
+        if(curr.pVertex->getColor() == GC_RED && extensions.size() > 1)
+            continue;
+
+        for(size_t i = 0; i < extensions.size(); ++i)
+        {
+            char b = extensions[i];
             std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, b, curr.direction);
-            kmerCountMap[newStr] = count;
 
             // Create the new vertex and edge in the graph
             // Skip if the vertex already exists
@@ -112,28 +121,33 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
             // Add edges
             VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, curr.pVertex, pVertex, curr.direction);
             
-            // Check if this sequence is present in the FM-index of the target
+            // Check if this sequence is present in the reference genome
             // If so, it is the join point of the de Bruijn graph and we extend no further.
-            size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex);
+            size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.referenceIndex);
 
-            if(targetCount >= MIN_TARGET_COUNT)
+            if(targetCount == 1)
             {
                 if(curr.direction == ED_SENSE)
                     sense_join_vector.push_back(pVertex);
                 else
                     antisense_join_vector.push_back(pVertex);
+                pVertex->setColor(GC_RED);
             }
-            else
-            {
-                // Add the vertex to the extension queue
-                queue.push(BuilderExtensionNode(pVertex, curr.direction));
-            }
+
+            // Add the vertex to the extension queue
+            BuilderExtensionNode nextNode(pVertex, curr.direction, curr.distance + 1);
+            if(nextNode.distance < MAX_DISTANCE)
+                queue.push(nextNode);
         }
         
         // Update the total number of times we branches the search
-        if(!extensionsUsed.empty())
-            total_branches += extensionsUsed.size() - 1;
+        if(!extensions.empty())
+            total_branches += extensions.size() - 1;
     }
+    pGraph->setColors(GC_BLACK);
+ 
+    cullRedundantJoinVertices(antisense_join_vector, pGraph, !ED_ANTISENSE);
+    cullRedundantJoinVertices(sense_join_vector, pGraph, !ED_SENSE);
 
     // If the graph construction was successful, walk the graph
     // between the endpoints to make a string
@@ -157,3 +171,100 @@ HaplotypeBuilderReturnCode DeBruijnHaplotypeBuilder::run(StringVector& out_haplo
     delete pGraph;
     return HBRC_OK;
 }
+
+void DeBruijnHaplotypeBuilder::cullRedundantJoinVertices(std::vector<Vertex*>& join_vertices, StringGraph* pGraph, EdgeDir direction)
+{
+    pGraph->checkColors(GC_BLACK);
+    // Remove join nodes that lie on the unipath of each join vertex
+    for(size_t i = 0; i < join_vertices.size(); ++i)
+    {
+        std::vector<Vertex*> unipath(1, join_vertices[i]);
+
+        // Skip vertices already marked
+        if(unipath.back()->getColor() == GC_RED)
+            continue;
+
+        // Extend unipath
+        while(1)
+        {
+            EdgePtrVec edges = unipath.back()->getEdges(direction);
+            if(edges.size() != 1)
+                break;
+            unipath.push_back(edges[0]->getEnd());
+        }
+
+        // Mark the non-terminal vertices of the unipath
+        for(size_t j = 1; j < unipath.size(); ++j)
+            unipath[j]->setColor(GC_RED);
+    }
+
+    std::vector<Vertex*> trimmed;
+    for(size_t i = 0; i < join_vertices.size(); ++i)
+    {
+        if(join_vertices[i]->getColor() != GC_RED)
+            trimmed.push_back(join_vertices[i]);
+        join_vertices[i]->setColor(GC_BLACK);
+    }
+
+    pGraph->setColors(GC_BLACK);
+    join_vertices.swap(trimmed);
+}
+
+Vertex* DeBruijnHaplotypeBuilder::extendUnambiguously(Vertex* pStart, StringGraph* pGraph, 
+                                                      EdgeDir direction, size_t max_distance, size_t min_target_count)
+{
+    size_t distance = 0;
+    Vertex* pCurrent = pStart;
+
+    while(distance < max_distance)
+    {
+        std::string vertStr = pCurrent->getSeq().toString();
+        AlphaCount64 extensionCounts = 
+            BWTAlgorithms::calculateDeBruijnExtensionsSingleIndex(vertStr, m_parameters.variantIndex.pBWT, direction);
+        
+        // Count valid extensions
+        char ext_base = '\0';
+        size_t ext_count = 0;
+        size_t n_ext = 0;
+        for(size_t i = 0; i < DNA_ALPHABET::size; ++i)
+        {
+            char b = DNA_ALPHABET::getBase(i);
+            size_t count = extensionCounts.get(b);
+            if(count >= m_parameters.minDBGCount)
+            {
+                ext_base = b;
+                ext_count = count;
+                n_ext++;
+            }
+        }
+        
+        // Stop if no extension or ambiguous
+        if(n_ext != 1)
+            return pCurrent;
+
+        std::string newStr = VariationBuilderCommon::makeDeBruijnVertex(vertStr, ext_base, direction);
+        
+        // Stop if the vertex is already in the graph
+        if(pGraph->getVertex(newStr) != NULL)
+            return pCurrent;
+
+        // Stop if the vertex is not reprsented in the base sequence
+        size_t targetCount = BWTAlgorithms::countSequenceOccurrences(newStr, m_parameters.baseIndex);
+        if(targetCount < min_target_count)
+            return pCurrent;
+
+        // Create the new vertex and edge in the graph
+        // Allocate the new vertex and add it to the graph
+        Vertex* pNew = new(pGraph->getVertexAllocator()) Vertex(newStr, newStr);
+        pNew->setColor(GC_BLACK);
+        pGraph->addVertex(pNew);
+
+        // Add edges
+        VariationBuilderCommon::addSameStrandDeBruijnEdges(pGraph, pCurrent, pNew, direction);
+
+        pCurrent = pNew;
+        distance++;
+    }
+
+    return pCurrent;
+}
diff --git a/src/GraphDiff/DeBruijnHaplotypeBuilder.h b/src/GraphDiff/DeBruijnHaplotypeBuilder.h
index 46341fc..e7790b0 100644
--- a/src/GraphDiff/DeBruijnHaplotypeBuilder.h
+++ b/src/GraphDiff/DeBruijnHaplotypeBuilder.h
@@ -39,8 +39,16 @@ class DeBruijnHaplotypeBuilder
         // Returns true if the graph was successfully built between the two sequences
         HaplotypeBuilderReturnCode run(StringVector& out_haplotypes);
         
+        // Extend the graph along the unipath that pStart lies on
+        Vertex* extendUnambiguously(Vertex* pStart, StringGraph* pGraph, 
+                                    EdgeDir dir, size_t max_distance, size_t min_target_count);
+
+
     private:
 
+        // Remove join vertices that lie on the unipath to some other join vertex
+        void cullRedundantJoinVertices(std::vector<Vertex*>& join_vertices, StringGraph* pGraph, EdgeDir direction);
+
         //
         // Data
         //
diff --git a/src/GraphDiff/DindelRealignWindow.cpp b/src/GraphDiff/DindelRealignWindow.cpp
index a2e3e22..f3259fb 100644
--- a/src/GraphDiff/DindelRealignWindow.cpp
+++ b/src/GraphDiff/DindelRealignWindow.cpp
@@ -40,7 +40,6 @@
 
 const int DINDEL_DEBUG=0;
 const int QUIET=1;
-const int ADDSNPS=0;
 const int ALWAYS_REALIGN=1;
 const int DEBUG_CALLINDEL=0;
 const int REPOSITION_INDEL_WINDOW=1000;
@@ -148,17 +147,18 @@ SequenceOverlap localAlignmentToSequenceOverlap(const LocalAlignmentResult& loca
     return overlap;
 }
 
-SequenceOverlap computeHaplotypeAlignment(const std::string& base_sequence, const std::string& haplotype)
+SequenceOverlap computeHaplotypeAlignment(size_t k, const std::string& base_sequence, const std::string& haplotype)
 {
-    return HapgenUtil::alignHaplotypeToReference(base_sequence, haplotype);
+    return HapgenUtil::alignHaplotypeToReference(k, base_sequence, haplotype);
 }
 
 void addHaplotypeToMultipleAlignment(MultipleAlignment* pMA,
                                      const std::string& base_sequence, 
                                      const std::string& incoming_name, 
-                                     const std::string& incoming_sequence)
+                                     const std::string& incoming_sequence,
+                                     size_t k)
 {
-    SequenceOverlap overlap = computeHaplotypeAlignment(base_sequence, incoming_sequence);
+    SequenceOverlap overlap = computeHaplotypeAlignment(k, base_sequence, incoming_sequence);
     pMA->addOverlap(incoming_name, incoming_sequence, "", overlap);
 }
 
@@ -371,8 +371,10 @@ void DindelVariant::write(std::ostream & out) const
 
 void DindelHaplotype::alignHaplotype()
 {
-    // semi-globally align haplotypes to the first haplotype (arbitrary)
+    // align haplotypes to the reference
     const std::string& rootSequence = m_refMapping.refSeq;
+
+    // Change haplotype to the reference strand
     std::string alignSeq;
     if(m_refMapping.isRC)
         alignSeq = reverseComplement(m_seq);
@@ -385,7 +387,7 @@ void DindelHaplotype::alignHaplotype()
 
     m_pMA = new MultipleAlignment();
     m_pMA->addBaseSequence(ref_ss.str(), rootSequence, "");
-    SequenceOverlap hap_overlap = computeHaplotypeAlignment(rootSequence, alignSeq);
+    SequenceOverlap hap_overlap = computeHaplotypeAlignment(m_assembly_k, rootSequence, alignSeq);
     m_pMA->addOverlap("haplotype", alignSeq, "", hap_overlap);
     m_firstAlignedBase = hap_overlap.match[1].start;
 
@@ -393,7 +395,6 @@ void DindelHaplotype::alignHaplotype()
     {
         std::cout << "DindelHaplotype::alignHaplotype:\n";
         m_pMA->print(500);
-        //std::cout << "DindelHaplotype::DindelHaplotype globalAlignmentCigar " << alignSeq << " vs root: " << _ma.expandedCigar << std::endl;
     }
 }
 
@@ -410,7 +411,6 @@ void DindelHaplotype::extractVariants()
     m_refPos = std::vector<int>(m_seq.size(), 0);
     m_isReference = false;
 
-
     // Compute a mapping from a haplotype base to its aligned position on the reference sequence
     int first_aligned_column = m_pMA->getFirstAlignedBaseColumn(refRow, varRow);
     int last_aligned_column = m_pMA->getLastAlignedBaseColumn(refRow, varRow);
@@ -663,13 +663,15 @@ void DindelHaplotype::extractVariants()
     }
 }
 
-DindelHaplotype::DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping)
+DindelHaplotype::DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping, size_t assembly_k)
 {
     // initialize
     m_pMA = NULL;
     m_seq = haplotypeSequence;
     m_refMapping = refMapping;
     m_firstAlignedBase = 0;
+    m_assembly_k = assembly_k;
+
 #ifdef DEBUG_NEW
     std::cout << "refMapping " << m_refMapping.refName << " " 
               << m_refMapping.refStart 
@@ -707,6 +709,7 @@ void DindelHaplotype::copy(const DindelHaplotype & haplotype, int copyOptions)
    m_refMapping = haplotype.m_refMapping;
    m_isReference = haplotype.m_isReference;
    m_firstAlignedBase = haplotype.m_firstAlignedBase;
+   m_assembly_k = haplotype.m_assembly_k;
 
    if(haplotype.m_pMA != NULL)
        m_pMA = new MultipleAlignment(*haplotype.m_pMA);
@@ -824,7 +827,9 @@ int DindelHaplotype::getClosestDistance(const DindelVariant& variant, int hapPos
         std::cout << " haplotype variants: "; for (it = m_variant_to_pos.begin();it != m_variant_to_pos.end();it++) std::cout << " " << it->first; std::cout << std::endl;
         std::cout << " h2 variants:        "; for (size_t x=0;x<m_variants.size();x++) std::cout << " " << m_variants[x].getID(); std::cout << std::endl;
         return -1;
-    } else {
+    } 
+    else 
+    {
         int s = it->second.first;
         int e = it->second.second;
 
@@ -839,6 +844,7 @@ int DindelHaplotype::getClosestDistance(const DindelVariant& variant, int hapPos
             hapPos2 = t;
             */
         }
+
         // overlaps with variant?
         if (hapPosStartRead<=e && hapPosEndRead>=s)
         {
@@ -918,7 +924,9 @@ int DindelHaplotype::getHapBase(int refPos) const
 
  */
 
-DindelMultiHaplotype::DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings, const std::string & haplotypeSequence)
+DindelMultiHaplotype::DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings, 
+                                           const std::string & haplotypeSequence,
+                                           size_t assembly_k)
 {
 #ifdef DEBUG_NEW
     std::cout << "There are " << referenceMappings.size() << " reference mappings.\n";
@@ -926,7 +934,7 @@ DindelMultiHaplotype::DindelMultiHaplotype(const std::vector< DindelReferenceMap
     m_referenceMappings = referenceMappings;
     
     for (size_t i = 0; i < m_referenceMappings.size(); ++i)
-        m_haplotypes.push_back(DindelHaplotype(haplotypeSequence, m_referenceMappings[i]));
+        m_haplotypes.push_back(DindelHaplotype(haplotypeSequence, m_referenceMappings[i], assembly_k));
 
     m_seq = haplotypeSequence;
 
@@ -1062,17 +1070,18 @@ void DindelSequenceHash::makeHash(const std::string & sequence)
  *
  */
 DindelWindow::DindelWindow(const std::vector<std::string> & haplotypeSequences,
-                           const std::vector<DindelReferenceMapping>  & referenceMappings)
+                           const std::vector<DindelReferenceMapping>  &referenceMappings,
+                           size_t assembly_k)
 {
 
     m_pHaplotype_ma = NULL;
     m_referenceMappings = referenceMappings;
     
     // filter out duplicate haplotype sequences.
-    initHaplotypes(haplotypeSequences, referenceMappings);
+    initHaplotypes(haplotypeSequences, referenceMappings, assembly_k);
 
     // do multiple alignment of all haplotypes to each other.
-    doMultipleHaplotypeAlignment();
+    doMultipleHaplotypeAlignment(assembly_k);
 }
 
 DindelWindow::DindelWindow(const DindelWindow & window)
@@ -1083,7 +1092,6 @@ DindelWindow::DindelWindow(const DindelWindow & window)
 void DindelWindow::copy(const DindelWindow & window)
 {
     m_haplotypes = window.m_haplotypes;
-    m_hashAltHaps = window.m_hashAltHaps;
     m_candHapAlgorithm = window.m_candHapAlgorithm;
     m_referenceMappings = window.m_referenceMappings;
 
@@ -1095,24 +1103,25 @@ void DindelWindow::copy(const DindelWindow & window)
 }
 
 void DindelWindow::initHaplotypes(const std::vector<std::string> & haplotypeSequences,
-                                  const std::vector<DindelReferenceMapping>  & referenceMappings)
+                                  const std::vector<DindelReferenceMapping>  & referenceMappings,
+                                  size_t assembly_k)
 {
     for(size_t i = 0; i < haplotypeSequences.size(); ++i)
     {
-        if(m_hashAltHaps.find(haplotypeSequences[i]) == m_hashAltHaps.end())
-        {
-            if (DINDEL_DEBUG_3)
-                std::cout << "DindelWindow::MultiHaplotype " << m_haplotypes.size() << "\n";
-            m_haplotypes.push_back(DindelMultiHaplotype(referenceMappings, haplotypeSequences[i]));
-        }
+        if (DINDEL_DEBUG_3)
+            std::cout << "DindelWindow::MultiHaplotype " << m_haplotypes.size() << "\n";
+        m_haplotypes.push_back(DindelMultiHaplotype(referenceMappings, haplotypeSequences[i], assembly_k));
     }
 }
 
-void DindelWindow::doMultipleHaplotypeAlignment()
+void DindelWindow::doMultipleHaplotypeAlignment(size_t assembly_k)
 {
-    // globally align haplotypes to the first haplotype (arbitrary)
+    // align haplotypes to the first haplotype, which is the reference sequence
     std::vector<MAlignData> maVector;
 
+    // ensure that the first haplotype is the reference
+    assert(m_referenceMappings.front().refSeq.size() == m_haplotypes.front().getSequence().size());
+
     assert(m_haplotypes.size() >= 1);
 
     m_pHaplotype_ma = new MultipleAlignment;
@@ -1123,14 +1132,13 @@ void DindelWindow::doMultipleHaplotypeAlignment()
     {
         std::stringstream ss; 
         ss << "haplotype-" << h;
-        addHaplotypeToMultipleAlignment(m_pHaplotype_ma, rootSequence, ss.str(), m_haplotypes[h].getSequence());
+        addHaplotypeToMultipleAlignment(m_pHaplotype_ma, rootSequence, ss.str(), m_haplotypes[h].getSequence(), assembly_k);
     }
 
     if (DINDEL_DEBUG_3)
     {
         std::cout << "DindelWindow::doMultipleHaplotypeAlignment:\n";
         m_pHaplotype_ma->print(500);
-
     }
 }
 
@@ -1276,22 +1284,9 @@ void DindelRealignWindowResult::Inference::outputAsVCF(const DindelVariant & var
     hps.insert(var.getHPLen());
     int hp = *hps.rbegin();
 
-    // calculate the dust score around a 100bp window centered at the start of the variant
-    // 64-bp is the window size used in the SDUST paper
-    int dw_size = 64;
-    int dw_start = var.getPos() - dw_size/2;
-    dw_start = std::max(0, dw_start); // clamp to 0
-
-    int dw_end = var.getPos() + dw_size/2;
-    const DNAString& chromosome = pRefTable->getRead(var.getChrom()).seq;
-    dw_end = std::min(dw_end, (int)chromosome.length());
-
-    double dust_score = 0.0f;
-    if(dw_end - dw_start == dw_size)
-    {
-        std::string ref_dust_window = chromosome.substr(dw_start, dw_end - dw_start);
-        dust_score = calculateDustScore(ref_dust_window);
-    }
+    double dust_score = HapgenUtil::calculateDustScoreAtPosition(var.getChrom(),
+                                                                 var.getPos(),
+                                                                 pRefTable);
 
     // Apply filters
     std::string filter="NoCall";
@@ -1577,21 +1572,22 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
 
     const int DRHA=0;
 
-    if (DINDEL_DEBUG &&DRHA) std::cout << std::endl <<  "====> START 1 DindelRealignWindow::computeReadHaplotypeAlignment " << read.getID() << std::endl;
+    if (DINDEL_DEBUG && DRHA) 
+        std::cout << std::endl <<  "====> START 1 DindelRealignWindow::computeReadHaplotypeAlignment " << read.getID() << std::endl;
     // scenarios
     /* Capital R's are matches, r's are mismatches
-    1. The seeds are too long to detect the possible gapped alignment of read to haplotype
-           HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
-                     rrrrrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
-        which should be
-           HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
-                  RRR   rrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
+       1. The seeds are too long to detect the possible gapped alignment of read to haplotype
+       HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+       rrrrrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
+       which should be
+       HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+       RRR   rrrrRRRRRRRRRRRRRRRRRRRRRRRRRRR
 
 
-    2. Deletion in read
-           HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
-                   RRRRRRR      RRRRRRRRRRRRRRRRRRRRRRRR
-    */
+       2. Deletion in read
+       HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
+       RRRRRRR      RRRRRRRRRRRRRRRRRRRRRRRR
+     */
 
     // start and end are inclusive
     // start may be <0, and end may be > haplotype length
@@ -1604,20 +1600,21 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
 
     if (DINDEL_DEBUG && DRHA)
     {
-	    std::cout << std::endl;
-	    std::cout << "HAPLOTYPE: " << haplotype.getSequence() << " isREF " << haplotype.isReference() << std::endl;
-	    std::string pad;
-	    std::string rseq = read.getSequence();
-	    if (start>=0)
-	    {
-		    pad = std::string(start, ' ');
-	    }
-	    else {
-		   rseq = read.getSequence().substr(-start, read.length());
-	    }
-
-	    std::cout << "READ:      " << pad << rseq << std::endl;
-	    std::cout << "DLEN: " << dlen << " RLEN: " << rlen << " hlen: " << hlen << " start: " << start << " end: " << end << " read_index: " << readIndex << " " << read.getID() <<  std::endl;
+        std::cout << std::endl;
+        std::cout << "HAPLOTYPE: " << haplotype.getSequence() << " isREF " << haplotype.isReference() << std::endl;
+        std::string pad;
+        std::string rseq = read.getSequence();
+        if (start>=0)
+        {
+            pad = std::string(start, ' ');
+        }
+        else 
+        {
+            rseq = read.getSequence().substr(-start, read.length());
+        }
+
+        std::cout << "READ:      " << pad << rseq << std::endl;
+        std::cout << "DLEN: " << dlen << " RLEN: " << rlen << " hlen: " << hlen << " start: " << start << " end: " << end << " read_index: " << readIndex << " " << read.getID() <<  std::endl;
     }
 
     int nmm=-1; // number of mismatches between read and candidate haplotype
@@ -1632,57 +1629,62 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
 
         double ll=0.0;
         int b=0;
-    	double all_match = 0.0;
-	    nmm=0;
-        if (!rcReadSeq) {
-	    for (int x=start;x<=end;x++,b++)
+        double all_match = 0.0;
+        nmm=0;
+        if (!rcReadSeq) 
+        {
+            for (int x=start;x<=end;x++,b++)
             {
-		bool match = true;
-		if (x>=0 && x<=hlen-1)
-		{
-			match = (read.getBase(b) == haplotype.getSequence()[x])?true:false;
-		}
-		if (match) ll += lpCorrect[b]; else {
-		    if (DINDEL_DEBUG) std::cout << " mm " << b << " lpError: " << lpError[b] << std::endl;
-                    ll += lpError[b]; nmm++;
-                    if (countMismatch && lpCorrect[b]>minLpCorrect) mismatches.push_back(Mismatch(x, read.getBase(b)));
+                bool match = true;
+                if (x>=0 && x<=hlen-1)
+                {
+                    match = (read.getBase(b) == haplotype.getSequence()[x])?true:false;
+                }
+                if (match) 
+                    ll += lpCorrect[b]; 
+                else 
+                {
+                    if (DINDEL_DEBUG) 
+                        std::cout << " mm " << b << " lpError: " << lpError[b] << std::endl;
+                    ll += lpError[b]; 
+                    nmm++;
+                    if (countMismatch && lpCorrect[b]>minLpCorrect) 
+                        mismatches.push_back(Mismatch(x, read.getBase(b)));
+                }
+                all_match += lpCorrect[b];
+            }
+        } 
+        else 
+        {
+            for (int x=start;x<=end;x++,b++)
+            {
+                bool match = true;
+                if (x>=0 && x<=hlen-1)
+                {
+                    match = (_complement(read.getBase(rlen-1-b)) == haplotype.getSequence()[x])?true:false;
                 }
-		all_match += lpCorrect[b];
-	    }
-	} else {
-	    for (int x=start;x<=end;x++,b++)
-	    {
-	        bool match = true;
-	        if (x>=0 && x<=hlen-1)
-		{
-			match = (_complement(read.getBase(rlen-1-b)) == haplotype.getSequence()[x])?true:false;
-		}
-	        if (match) ll += lpCorrect[rlen-1-b]; else {
-                    ll += lpError[rlen-1-b]; nmm++;
-                    if (countMismatch && lpCorrect[rlen-b-1]>minLpCorrect) mismatches.push_back(Mismatch(x, _complement(read.getBase(rlen-1-b))));
+
+                if (match) 
+                {
+                    ll += lpCorrect[rlen-1-b];
+                }
+                else 
+                {
+                    ll += lpError[rlen-1-b]; 
+                    nmm++;
+                    if (countMismatch && lpCorrect[rlen-b-1]>minLpCorrect)
+                         mismatches.push_back(Mismatch(x, _complement(read.getBase(rlen-1-b))));
                 }
-	    }
-	}
-
-        if (ADDSNPS && nmm<=realignParameters.addSNPMaxMismatches) {
-            for (std::list<Mismatch>::const_iterator iter = mismatches.begin(); iter != mismatches.end(); iter++) 
-	    {
-                // FIXME Need to add wrt haplotype multiple alignment column
-		assert(1==0);    
-		/*
-		int refPos = haplotype.getRefBase(iter->first);
-		if (refPos>0)
-		{
-                    m_posToCandidateSNP.addSNP(readIndex, refPos, haplotype.getSequence()[iter->first], iter->second); // pos, ref, alt resp.
-		}
-		*/
             }
         }
-	logLik = ll;
-	isUngapped = true;
-    }
-    else { assert(1==0); }
 
+        logLik = ll;
+        isUngapped = true;
+    }
+    else 
+    { 
+        assert(1==0); 
+    }
 
     assert(!std::isnan(logLik));
     assert(logLik<0.0);
@@ -1694,61 +1696,13 @@ ReadHaplotypeAlignment DindelRealignWindow::computeReadHaplotypeAlignment(size_t
 
     if (DINDEL_DEBUG && DRHA)
     {
-	std::cout << "LOGLIK: " << logLik << std::endl;
+	    std::cout << "LOGLIK: " << logLik << std::endl;
     }
-    if (DINDEL_DEBUG &&DRHA) std::cout << std::endl <<  "====> END DindelRealignWindow::computeReadHaplotypeAlignment" << std::endl;
+    if (DINDEL_DEBUG && DRHA)
+         std::cout << std::endl <<  "====> END DindelRealignWindow::computeReadHaplotypeAlignment" << std::endl;
     return ReadHaplotypeAlignment(logLik, nmm, isUngapped);
 }
 
-void DindelRealignWindow::addSNPsToHaplotypes()
-{
-    typedef std::map<size_t, std::vector<DindelVariant> > FreqCandidates;
-    FreqCandidates freqCandidates;
-    throw std::string("IMPLEMENT ME");
-
-    for (MaPosToCandidateSNP::const_iterator iter = m_maPosToCandidateSNP.begin(); iter != m_maPosToCandidateSNP.end(); iter++)
-    {
-        for (size_t baseIdx=0;baseIdx<4;baseIdx++)
-        {
-            if (iter->second.baseToReads[baseIdx].size()>=realignParameters.addSNPMinCount)
-            {
-                // FIXME 
-                /*
-                std::string ref; ref+=iter->second.m_refBase;
-                std::string alt; alt+=iter->second.getBase(baseIdx);
-                DindelVariant snp(m_dindelWindow.getChrom(), ref, alt, iter->first);
-                snp.setPriorProb(realignParameters.variantPriors.getDefaultProbVariant(snp.getType()));
-                freqCandidates[iter->second.baseToReads[baseIdx].size()].push_back(snp);
-                if(!QUIET) std::cout << "DindelRealignWindow::addSNPsToHaplotypes adding SNP " << m_dindelWindow.getChrom() << " " << ref << " "  <<  alt << " " <<  iter->first << std::endl;
-
-                 */
-            }
-        }
-    }
-
-    // add snps until maximum is reached.
-    int numAdded=0;
-    for (FreqCandidates::const_reverse_iterator iter=freqCandidates.rbegin();iter!=freqCandidates.rend();iter++)
-    {
-        bool stop=false;
-        for(size_t x=0;x<iter->second.size();x++)
-        {
-            m_dindelWindow.addVariant(iter->second[x], true);
-            numAdded++;
-            if (numAdded == realignParameters.addSNPMaxSNPs)
-            {
-                stop =true;
-                break;
-            }
-
-        }
-        if (stop) break;
-    }
-
-}
-
-
-
 void DindelRealignWindow::algorithm_hmm(VCFCollection& out, 
                                         DindelReadReferenceAlignmentVector* pOutAlignments, 
                                         DindelRealignWindowResult * pThisResult, 
@@ -1764,15 +1718,6 @@ void DindelRealignWindow::algorithm_hmm(VCFCollection& out,
     // process the user-defined haplotypes
     computeReadHaplotypeAlignmentsUsingHMM(0, numHaps-1);
 
-    // now add SNPs identified by the high quality ungapped alignments
-    if (ADDSNPS && realignParameters.addSNPMaxSNPs>0)
-    {
-        addSNPsToHaplotypes();
-
-        // compute the likelihoods for the additional haplotypes
-        computeReadHaplotypeAlignmentsUsingHMM(numHaps, m_dindelWindow.getHaplotypes().size()-1);
-    }
-
     DindelRealignWindowResult & result = *pThisResult;
 
     // estimate haplotype frequencies using user specified max mapping thresholds
diff --git a/src/GraphDiff/DindelRealignWindow.h b/src/GraphDiff/DindelRealignWindow.h
index 6c276a0..01b7fde 100644
--- a/src/GraphDiff/DindelRealignWindow.h
+++ b/src/GraphDiff/DindelRealignWindow.h
@@ -397,7 +397,7 @@ class DindelHaplotype
     public:
 
         // Constructors
-        DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping);
+        DindelHaplotype(const std::string & haplotypeSequence, const DindelReferenceMapping & refMapping, size_t assembly_k);
         DindelHaplotype(const DindelHaplotype & haplotype, int copyOptions);
         DindelHaplotype(const DindelHaplotype & haplotype);
         ~DindelHaplotype();
@@ -457,6 +457,7 @@ class DindelHaplotype
         std::vector<int> m_hplen; // gives the homopolymer run length for any base in the haplotype sequence going left and right.
         std::vector<int> m_refPos; // position of haplotype base on reference.
         int m_firstAlignedBase;
+        size_t m_assembly_k;
 
         // list of VCF4 style variants contained in haplotype
         std::vector<DindelVariant> m_variants;
@@ -475,7 +476,7 @@ class DindelHaplotype
 class DindelMultiHaplotype
 {
 public:
-    DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings, const std::string & haplotypeSequence  );
+    DindelMultiHaplotype(const std::vector< DindelReferenceMapping > & referenceMappings, const std::string & haplotypeSequence, size_t assembly_k );
     const std::string & getSequence() const { return m_seq; }
     const std::vector<DindelVariant> & getVariants(int refIdx) const { return m_haplotypes[refIdx].getVariants(); }
     int getNumReferenceMappings() const { return int(m_referenceMappings.size()); }
@@ -512,7 +513,10 @@ class DindelWindow
         
         // Create window from a set of haplotypes and a reference sequence.
         // Uses MultipleAlignment to annotate the variations in the haplotypes with respect to the reference sequence.
-        DindelWindow(const std::vector<std::string> & haplotypeSequences, const std::vector<DindelReferenceMapping>  & referenceMappings);
+        DindelWindow(const std::vector<std::string> & haplotypeSequences, 
+                     const std::vector<DindelReferenceMapping>& referenceMappings,
+                     size_t assembly_k);
+
         DindelWindow(const DindelWindow & window);
         ~DindelWindow();
 
@@ -523,8 +527,11 @@ class DindelWindow
     private:
         
         //
-        void initHaplotypes(const std::vector<std::string> & haplotypeSequences, const std::vector<DindelReferenceMapping> & referenceMappings);
-        void doMultipleHaplotypeAlignment();
+        void initHaplotypes(const std::vector<std::string>& haplotypeSequences, 
+                            const std::vector<DindelReferenceMapping>& referenceMappings,
+                            size_t assembly_k);
+
+        void doMultipleHaplotypeAlignment(size_t assembly_k);
         void copy(const DindelWindow & window);
 
         // Data
@@ -534,7 +541,6 @@ class DindelWindow
         
         // candidate haplotypes
         std::vector<DindelMultiHaplotype> m_haplotypes;
-        std::map<std::string, int> m_hashAltHaps;
         std::vector< DindelReferenceMapping > m_referenceMappings;
 
         MultipleAlignment *m_pHaplotype_ma;
@@ -946,7 +952,6 @@ class DindelRealignWindow
 
         // Functions
         void filterHaplotypes();
-        void addSNPsToHaplotypes();
         void processHaplotypes(size_t firstHapIdx, size_t lastHapIdx);
 
         // convert seed positions into a likelihood given the candidate haplotype
diff --git a/src/GraphDiff/DindelUtil.cpp b/src/GraphDiff/DindelUtil.cpp
index 9d41c0e..51cc548 100644
--- a/src/GraphDiff/DindelUtil.cpp
+++ b/src/GraphDiff/DindelUtil.cpp
@@ -46,6 +46,7 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
     {
         HapgenAlignmentVector thisCandidateAlignments;
         HapgenUtil::alignHaplotypeToReferenceKmer(align_kmer,
+                                                  parameters.kmer,
                                                   inHaplotypes[i],
                                                   parameters.referenceIndex,
                                                   parameters.pRefTable,
@@ -60,7 +61,15 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
     if(Verbosity::Instance().getPrintLevel() > 3)
         printf("runDindel -- %zu candidate alignments found\n", candidateAlignments.size());
     
-    size_t MAX_ALIGNMENTS = 10;
+    // Check if this group of haplotypes is a SNP-only event
+    bool has_complex_variant = false;
+    for(size_t i = 0; i < candidateAlignments.size(); ++i)
+    {
+        if(candidateAlignments[i].numEvents > 0 && !candidateAlignments[i].isSNP)
+            has_complex_variant = true;
+    }
+
+    size_t MAX_ALIGNMENTS = 1;
     if(candidateAlignments.size() > MAX_ALIGNMENTS)
         return DRC_AMBIGUOUS_ALIGNMENT;
 
@@ -69,11 +78,17 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
     int FLANKING_SIZE = 0;
     if (parameters.dindelRealignParameters.realignMatePairs)
         FLANKING_SIZE = 1000;
+    
+    // all haplotypes, assembled and reference with the extra flanking sequence added
     StringVector flankingHaplotypes;
+    
+    // only the reference sequences, not the assembled haplotypes
+    StringVector referenceFlankingHaplotypes; 
 
-    // This vector contains the internal portion of the haplotypes, without the flanking sequence
+    // The internal portion of the haplotypes, without the flanking sequence.
     // It is used to extract reads
     StringVector candidateHaplotypes;
+
     for(size_t i = 0; i < candidateAlignments.size(); ++i)
     {
         HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i],
@@ -81,10 +96,30 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
                                            FLANKING_SIZE,
                                            inHaplotypes,
                                            flankingHaplotypes,
-                                           candidateHaplotypes);
+                                           candidateHaplotypes,
+                                           referenceFlankingHaplotypes);
     
     }
 
+
+    // There should be at least two flanking haplotypes, one that was assembled
+    // and one reference sequence found by alignment
+    if(flankingHaplotypes.size() < 2)
+        return DRC_NO_ALIGNMENT;
+
+    // Ensure the reference haplotype is a non-empty string
+    if(flankingHaplotypes[0].size() == 0)
+        return DRC_NO_ALIGNMENT;
+
+    // JS 26/08/14
+    // We now require the assembled haplotype to match uniquely to the reference
+    // and we require that the starting/ending k-mers of the haplotypes exactly
+    // match the reference. Enforce these checks now.
+    assert(referenceFlankingHaplotypes.size() == 1);
+
+    if(!HapgenUtil::checkHaplotypeKmersMatchReference(parameters.kmer, referenceFlankingHaplotypes.front(), inHaplotypes))
+        return DRC_AMBIGUOUS_ALIGNMENT;
+
     if(Verbosity::Instance().getPrintLevel() > 3)
         printf("runDindel -- made %zu flanking haplotypes\n", candidateHaplotypes.size());
 
@@ -149,17 +184,6 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
     if (total_reads == 0)
         return DRC_UNDER_DEPTH;
 
-    // Generate the input haplotypes for dindel
-    // We need at least 2 haplotypes (one is the reference)
-    size_t totFlankingHaplotypes = flankingHaplotypes.size();
-
-    if(totFlankingHaplotypes < 2)
-        return DRC_NO_ALIGNMENT;
-
-    // Ensure the reference haplotype is a non-empty string
-    if(flankingHaplotypes[0].size() == 0)
-        return DRC_NO_ALIGNMENT;
-
     // Make Dindel referenceMappings
     StringVector dindelHaplotypes;
     std::set<DindelReferenceMapping> refMappings;
@@ -203,16 +227,27 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
         it->referenceAlignmentScore = 1000;
     
     // make flankingHaplotypes unique
-    std::set< std::string > setFlanking(flankingHaplotypes.begin(), flankingHaplotypes.end());
-
-    for(std::set< std::string >::const_iterator it = setFlanking.begin(); it != setFlanking.end(); it++)
+    // JS 27/08/14: After changing the alignment code to require
+    // k-mer matches between the reference and haplotype, the order
+    // that the haplotypes are sent to dindel matters. The following
+    // code was changed so that the unique reference haplotype is presented
+    // to dindel first.
+    std::set<std::string> setFlanking;
+    
+    for(size_t fi = 0; fi < flankingHaplotypes.size(); ++fi)
     {
-        dindelHaplotypes.push_back(*it);
-        //dindelRefMappings[i] = std::vector<DindelReferenceMapping>(refMappings.begin(),refMappings.end());
+        std::string& curr_haplotype = flankingHaplotypes[fi];
+        if(setFlanking.find(curr_haplotype) == setFlanking.end())
+        {
+            dindelHaplotypes.push_back(curr_haplotype);
+            setFlanking.insert(curr_haplotype);
+        }
     }
 
     std::vector<DindelReferenceMapping> dRefMappings(refMappings.begin(),refMappings.end());
-    DindelWindow dWindow(dindelHaplotypes, dRefMappings);
+    
+    // Initialize DindelWindow
+    DindelWindow dWindow(dindelHaplotypes, dRefMappings, parameters.kmer);
 
     //
     // Run Dindel
@@ -271,6 +306,16 @@ DindelReturnCode DindelUtil::runDindelPairMatePair(const std::string& id,
             pPreviousResult = pThisResult;
     }
 
+    // Add additional tags to each VCF record
+    double average_hap_length = 0.0f;
+    for(size_t i = 0; i < variant_haplotypes.size(); ++i)
+        average_hap_length += variant_haplotypes[i].size();
+    average_hap_length /= variant_haplotypes.size();
+    
+    for(size_t i = 0; i <= 1; ++i)
+        for(size_t j = 0; j < vcfCollections[i].records.size(); ++j)
+            vcfCollections[i].records[j].addComment("AvgHapLen", average_hap_length);
+
     // Copy raw VCFRecords to output
     for(size_t i = 0; i <= 1; ++i)
     {
@@ -369,96 +414,6 @@ void DindelUtil::doMultipleReadHaplotypeAlignment(const std::vector<DindelRead>
     }
 }
 
-// Compute the best alignment of the haplotype collection to the reference
-DindelReturnCode DindelUtil::computeBestAlignment(const StringVector& inHaplotypes, 
-                                                  const SeqItemVector& variantMates,
-                                                  const SeqItemVector& variantRCMates,
-                                                  const GraphCompareParameters& parameters,
-                                                  HapgenAlignment& bestAlignment)
-{
-    size_t MAX_DEPTH = 2000;
-    if(variantMates.size() + variantRCMates.size() > MAX_DEPTH)
-        return DRC_OVER_DEPTH;
-
-    //
-    // Align the haplotypes to the reference genome to generate candidate alignments
-    //
-    HapgenAlignmentVector candidateAlignments;
-    for(size_t i = 0; i < inHaplotypes.size(); ++i)
-        HapgenUtil::alignHaplotypeToReferenceBWASW(inHaplotypes[i], parameters.referenceIndex, candidateAlignments);
-
-    // Remove duplicate or bad alignment pairs
-    HapgenUtil::coalesceAlignments(candidateAlignments);
-
-    if(candidateAlignments.empty())
-        return DRC_NO_ALIGNMENT;
-
-    //
-    // Score each candidate alignment against the mates of all the variant reads
-    //
-    int bestCandidate = -1;
-    double bestAverageScoreFrac = 0.0f;
-    double secondBest = 0.0f;
-    for(size_t i = 0; i < candidateAlignments.size(); ++i)
-    {
-        // Compute the average score of the reads' mates to the flanking sequence
-        StringVector referenceFlanking;
-        StringVector referenceHaplotypes;
-        HapgenUtil::makeFlankingHaplotypes(candidateAlignments[i], parameters.pRefTable, 
-                                           1000, inHaplotypes, referenceFlanking, referenceHaplotypes);
-
-        // If valid flanking haplotypes could not be made, skip this alignment
-        if(referenceFlanking.empty())
-            continue;
-
-        // Realign the mates
-        LocalAlignmentResultVector localAlignments = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantMates);
-        LocalAlignmentResultVector localAlignmentsRC = HapgenUtil::alignReadsLocally(referenceFlanking[0], variantRCMates);
-
-        // Merge alignments
-        localAlignments.insert(localAlignments.end(), localAlignmentsRC.begin(), localAlignmentsRC.end());
-
-        double sum = 0.0f;
-        double count = 0.0f;
-
-        for(size_t j = 0; j < localAlignments.size(); ++j)
-        {
-            double max_score = localAlignments[j].queryEndIndex - localAlignments[j].queryStartIndex + 1;
-            double frac = (double)localAlignments[j].score / max_score;
-            //printf("Score: %d frac: %lf\n", localAlignments[j].score, frac);
-            sum += frac;
-            count += 1;
-        }
-
-        double score = sum / count;
-        if(score > bestAverageScoreFrac)
-        {
-            secondBest = bestAverageScoreFrac;
-            bestAverageScoreFrac = score;
-            bestCandidate = i;
-        }
-        else if(score > secondBest)
-        {
-            secondBest = score;
-        }
-
-        //printf("Alignment %zu mate-score: %lf\n", i, score);
-    }
-
-    if(bestCandidate == -1)
-        return DRC_NO_ALIGNMENT;
-
-    /*
-    if(bestAverageScoreFrac < 0.9f)
-        return DRC_POOR_ALIGNMENT;
-
-    if(bestAverageScoreFrac - secondBest < 0.05f)
-        return DRC_AMBIGUOUS_ALIGNMENT;
-    */
-    bestAlignment = candidateAlignments[bestCandidate];
-    return DRC_OK;
-}
-
 // Initialize an array of code counts
 void DindelUtil::initializeCodeCounts(int counts[DRC_NUM_CODES])
 {
diff --git a/src/GraphDiff/DindelUtil.h b/src/GraphDiff/DindelUtil.h
index f1ca353..d90f344 100644
--- a/src/GraphDiff/DindelUtil.h
+++ b/src/GraphDiff/DindelUtil.h
@@ -51,13 +51,6 @@ namespace DindelUtil
     void doMultipleReadHaplotypeAlignment(const std::vector<DindelRead> & dReads,
                                           const StringVector & haplotypes);
 
-    // Compute the best alignment 
-    DindelReturnCode computeBestAlignment(const StringVector& inHaplotypes, 
-                                          const SeqItemVector& variantMates,
-                                          const SeqItemVector& variantRCMates,
-                                          const GraphCompareParameters& parameters,
-                                          HapgenAlignment& bestAlignment);
-
     // Convert a read-haplotype alignment to a reference alignment in BAM format
     void readHaplotypeAlignmentToBAM(const std::string& readID,
                                      const std::string& readSequence,
diff --git a/src/GraphDiff/GraphCompare.cpp b/src/GraphDiff/GraphCompare.cpp
index ef5bccf..ea6edfb 100644
--- a/src/GraphDiff/GraphCompare.cpp
+++ b/src/GraphDiff/GraphCompare.cpp
@@ -114,7 +114,6 @@ GraphCompareResult GraphCompare::process(const SequenceWorkItem& item) const
     GraphCompareResult result;
     SeqRecord currRead = item.read;
     std::string w = item.read.seq.toString();
-    
     int len = w.size();
     int num_kmers = len - m_parameters.kmer + 1;
 
@@ -339,8 +338,8 @@ void GraphCompare::qcVariantHaplotypes(bool bReferenceMode, StringVector& varian
             for(size_t j = 0; j < n; j++)
                 num_uncovered_bases += covered_bases.test(j) ? 0 : 1;
 
-            if(max_d < k / 2)
-                haplotype = "";
+            //if(max_d < k / 2)
+            //    haplotype = "";
 
             if(Verbosity::Instance().getPrintLevel() > 2)
             {
@@ -360,8 +359,8 @@ void GraphCompare::qcVariantHaplotypes(bool bReferenceMode, StringVector& varian
                 continue;
 
             // Calculate the largest k such that the haplotype is walk through a de Bruijn graph of this k
-            size_t max_variant_k = calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.variantIndex);
-            size_t max_base_k = calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.baseIndex);
+            size_t max_variant_k = HapgenUtil::calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.variantIndex);
+            size_t max_base_k = HapgenUtil::calculateMaxCoveringK(variant_haplotypes[i], MIN_COVERAGE, m_parameters.baseIndex);
             if(Verbosity::Instance().getPrintLevel() > 2)
                 printf("HaplotypeQC hap[%zu] MVK: %zu MBK: %zu\n", i, max_variant_k, max_base_k);
 
@@ -444,36 +443,6 @@ void GraphCompare::buildParallelBaseHaplotypes(const StringVector& variant_haplo
 }
 
 //
-size_t GraphCompare::calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices) const
-{
-    size_t min_k = 15;
-    for(size_t k = 99; k >= min_k; --k)
-    {
-        if(sequence.size() < k)
-            continue;
-        bool covered = true;
-        size_t nk = sequence.size() - k + 1;
-        for(size_t i = 0; i < nk; ++i)
-        {
-            std::string kmer = sequence.substr(i, k);
-            int c = BWTAlgorithms::countSequenceOccurrences(kmer, indices);
-
-            if(c < min_depth)
-            {
-                covered = false;
-                break;
-            }
-        }
-
-        if(covered)
-            return k;
-    }
-
-    return 0;
-}
-
-
-//
 size_t GraphCompare::calculateHaplotypeBranches(const std::string& sequence, 
                                                 size_t k, 
                                                 size_t min_branch_depth, 
@@ -505,21 +474,6 @@ size_t GraphCompare::calculateHaplotypeBranches(const std::string& sequence,
     return num_branches;
 }
 
-//
-IntVector GraphCompare::makeCountProfile(const std::string& str, size_t k, const BWT* pBWT, int max)
-{
-    IntVector out;
-    if(str.size() < k)
-        return out;
-
-    for(size_t i = 0; i < str.size() - k + 1; ++i)
-    {
-        int count = BWTAlgorithms::countSequenceOccurrences(str.substr(i, k), pBWT);
-        out.push_back(count > max ? max : count);
-    }
-    return out;
-}
-
 // test kmers from a file
 void GraphCompare::testKmersFromFile(const std::string& kmerFilename)
 {
diff --git a/src/GraphDiff/GraphCompare.h b/src/GraphDiff/GraphCompare.h
index 8f9b09a..ccab6b3 100644
--- a/src/GraphDiff/GraphCompare.h
+++ b/src/GraphDiff/GraphCompare.h
@@ -173,9 +173,6 @@ class GraphCompare
         // Mark all the kmers in str as being visited
         void markVariantSequenceKmers(const std::string& str) const;
         
-        // Calculate the largest k such that every k-mer in the sequence is present at least min_depth times in the BWT
-        size_t calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices) const;
-
         // Calculate the number of high coverage branches off a haplotype path through the de Bruijn graph
         size_t calculateHaplotypeBranches(const std::string& sequence, size_t k, size_t min_branch_depth, const BWTIndexSet& indices);
 
@@ -184,7 +181,6 @@ class GraphCompare
         
         // Debug functions
         bool transformVariantString(const std::string& inStr, std::string& outStr);
-        IntVector makeCountProfile(const std::string& str, size_t k, const BWT* pBWT, int max);
         void showMappingLocations(const std::string& str);
 
         //
diff --git a/src/GraphDiff/HapgenUtil.cpp b/src/GraphDiff/HapgenUtil.cpp
index 874d93b..3ad7064 100644
--- a/src/GraphDiff/HapgenUtil.cpp
+++ b/src/GraphDiff/HapgenUtil.cpp
@@ -16,37 +16,6 @@
 #include "Verbosity.h"
 #include "overlapper.h"
 
-// Align the haplotype to the reference genome represented by the BWT/SSA pair
-void HapgenUtil::alignHaplotypeToReferenceBWASW(const std::string& haplotype,
-                                                const BWTIndexSet& referenceIndex,
-                                                HapgenAlignmentVector& outAlignments)
-{
-    PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceBWASW")
-    LRAlignment::LRParams params;
-
-    params.zBest = 20;
-
-    for(size_t i = 0; i <= 1; ++i)
-    {
-        LRAlignment::LRHitVector hits;
-        std::string query = (i == 0) ? haplotype : reverseComplement(haplotype);
-        LRAlignment::bwaswAlignment(query, referenceIndex.pBWT, referenceIndex.pSSA, params, hits);
-
-        // Convert the hits into alignments
-        for(size_t j = 0; j < hits.size(); ++j)
-        {
-            int q_alignment_length = hits[j].q_end - hits[j].q_start;
-
-            // Skip non-complete alignments
-            if((int)haplotype.length() == q_alignment_length)
-            {
-                HapgenAlignment aln(hits[j].targetID, hits[j].t_start, hits[j].length, hits[j].G, i == 1);
-                outAlignments.push_back(aln);
-            }
-        }
-    }
-}
-
 struct CandidateKmerAlignment
 {
     size_t query_index;
@@ -61,8 +30,14 @@ struct CandidateKmerAlignment
 };
 typedef std::vector<CandidateKmerAlignment> CandidateVector;
 
+bool isValidFlankingKmerPair(size_t kidx_0, size_t kidx_1)
+{
+    return kidx_0 != std::string::npos && kidx_1 != std::string::npos && kidx_1 > kidx_0;
+}
+
 // Align the haplotype to the reference genome represented by the BWT/SSA pair
-void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
+void HapgenUtil::alignHaplotypeToReferenceKmer(size_t align_k,
+                                               size_t assemble_k,
                                                const std::string& haplotype,
                                                const BWTIndexSet& referenceIndex,
                                                const ReadTable* pReferenceTable,
@@ -71,7 +46,7 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
     PROFILE_FUNC("HapgenUtil::alignHaplotypesToReferenceKmer")
     int64_t max_interval_size = 4;
 
-    if(haplotype.size() < k)
+    if(haplotype.size() < align_k)
         return;
 
     std::vector<int> event_count_vector;
@@ -87,10 +62,10 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
         // Find shared kmers between the haplotype and the reference
         CandidateVector candidates;
 
-        size_t nqk = query.size() - k + 1;
+        size_t nqk = query.size() - align_k + 1;
         for(size_t j = 0; j < nqk; ++j)
         {
-            std::string kmer = query.substr(j, k);
+            std::string kmer = query.substr(j, align_k);
 
             // Find the interval of this kmer in the reference
             BWTInterval interval = BWTAlgorithms::findInterval(referenceIndex, kmer);
@@ -121,7 +96,7 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
         for(size_t j = 0; j < candidates.size(); ++j)
         {
             // Extract window around reference
-            size_t window_size = 200;
+            size_t window_size = 500;
             int ref_start = candidates[j].target_extrapolated_start - window_size;
             int ref_end = candidates[j].target_extrapolated_end + window_size;
             const SeqItem& ref_record = pReferenceTable->getRead(candidates[j].target_sequence_id);
@@ -134,8 +109,16 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
 
             std::string ref_substring = ref_sequence.substr(ref_start, ref_end - ref_start);
 
+            // Ensure that this alignment location is valid by checking for the presence of both unique flanking kmers
+            size_t kidx_0, kidx_1;
+            getFlankingKmerMatch(assemble_k, ref_substring, query, kidx_0, kidx_1);
+
+            if(!isValidFlankingKmerPair(kidx_0, kidx_1))
+                continue;
+
             // Align haplotype to the reference
-            SequenceOverlap overlap = alignHaplotypeToReference(ref_substring, query);
+            SequenceOverlap overlap = alignHaplotypeToReference(assemble_k, ref_substring, query);
+
             if(overlap.score < 0 || !overlap.isValid())
                 continue;
 
@@ -144,6 +127,7 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
             int alignment_length = alignment_end - alignment_start + 1;
 
             // Crude count of the number of distinct variation events
+            bool has_indel = false;
             int num_events = overlap.edit_distance;
             std::stringstream c_parser(overlap.cigar);
             int len;
@@ -154,28 +138,40 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
 
                 // Only count one event per insertion/deletion
                 if(t == 'D' || t == 'I')
+                {
                     num_events -= (len - 1);
+                    has_indel = true;
+                }
             }
 
             // Skip poor alignments
-            double mismatch_rate = 1.0f - (overlap.getPercentIdentity() / 100.f);
-            if(mismatch_rate > 0.05f || overlap.total_columns < 50)
+            double event_rate = (double)num_events / overlap.total_columns;
+            if(event_rate > 0.05f || overlap.total_columns < 50)
             {
                 if(Verbosity::Instance().getPrintLevel() > 4)
                 {
                     printf("Haplotype Alignment - Ignoring low quality alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
-                        1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
+                        1.0f - event_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
                 }
                 continue;
             }
 
-            HapgenAlignment aln(candidates[j].target_sequence_id, alignment_start, alignment_length, overlap.score, is_reverse);
+            bool is_snp = !has_indel && overlap.edit_distance == 1;
+
+            HapgenAlignment aln(candidates[j].target_sequence_id, 
+                                alignment_start, 
+                                alignment_length, 
+                                overlap.score, 
+                                num_events,
+                                is_reverse, 
+                                is_snp);
+
             tmp_alignments.push_back(aln);
             event_count_vector.push_back(num_events);
             if(Verbosity::Instance().getPrintLevel() > 4)
             {
                 printf("Haplotype Alignment - Accepting alignment (%.3lf, %dbp, %d events) to %s:%d\n", 
-                    1.0f - mismatch_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
+                    1.0f - event_rate, overlap.total_columns, num_events, ref_record.id.c_str(), ref_start);
             }            
             // Record the best edit distance
             if(num_events < min_events) 
@@ -198,13 +194,85 @@ void HapgenUtil::alignHaplotypeToReferenceKmer(size_t k,
     }
 }
 
-SequenceOverlap HapgenUtil::alignHaplotypeToReference(const std::string& reference,
+// Get the index of the first and last k-mers of the haplotype
+// in the reference sequence
+//
+// ----------==========-----------------============------  reference
+//           ==========-----------------===========         haplotype
+//           ^ start_idx                ^ end_idx
+// These will be std::string::npos if the kmers are not on the reference
+void HapgenUtil::getFlankingKmerMatch(size_t k,
+                                      const std::string& reference,
+                                      const std::string& haplotype,
+                                      size_t& start_idx,
+                                      size_t& end_idx)
+{
+    // Trim reference string so that it has flanking k-mer matches with the haplotype
+    std::string start_kmer = haplotype.substr(0, k);
+    std::string end_kmer = haplotype.substr(haplotype.size() - k, k);
+    
+    start_idx = reference.find(start_kmer);
+    end_idx = reference.find(end_kmer);
+}
+
+//
+SequenceOverlap HapgenUtil::alignHaplotypeToReference(size_t k,
+                                                      const std::string& reference,
                                                       const std::string& haplotype)
 {
-    SequenceOverlap overlap = Overlapper::computeOverlapAffine(reference, haplotype);
+
+    // Get position of flanking k-mer matches
+    size_t kidx_0, kidx_1;
+    getFlankingKmerMatch(k, reference, haplotype, kidx_0, kidx_1);
+    
+    if(Verbosity::Instance().getPrintLevel() > 5)
+    {
+        printf("R: %s\n", reference.c_str());
+        printf("H: %s\n", haplotype.c_str());
+        printf("K: %zu\n", k);
+        printf("KI0: %zu KI1: %zu\n", kidx_0, kidx_1);
+    }
+    
+    // This function assumes that the flanking k-mers are in both strings
+    assert(isValidFlankingKmerPair(kidx_0, kidx_1));
+
+    // trim reference string so we can perform global alignment
+    std::string reference_substring = reference.substr(kidx_0, kidx_1 - kidx_0 + k);
+
+    // set up scoring scheme
+    OverlapperParams params = affine_default_params;
+    params.type = ALT_GLOBAL;
+    params.gap_ext_penalty = 0;
+
+    //
+    SequenceOverlap overlap = Overlapper::computeAlignmentAffine(reference_substring, haplotype, params);
+    
+    // Offset alignment location by the location of the first kmer in the reference haplotype
+    overlap.match[0].start += kidx_0;
+    overlap.match[0].end += kidx_0;
+    
+    if(Verbosity::Instance().getPrintLevel() > 5)
+    {
+        overlap.printAlignment(reference, haplotype);
+    }
     return overlap;
 }
 
+// Returns true if the start/end kmer for all haplotypes occur in the reference string
+bool HapgenUtil::checkHaplotypeKmersMatchReference(size_t k,
+                                                   const std::string& reference,
+                                                   const StringVector& inHaplotypes)
+{
+    for(size_t i = 0; i < inHaplotypes.size(); ++i)
+    {
+        size_t kidx_0, kidx_1;
+        getFlankingKmerMatch(k, reference, inHaplotypes[i], kidx_0, kidx_1);
+
+        if(!isValidFlankingKmerPair(kidx_0, kidx_1))
+            return false;
+    }
+    return true;
+}
 
 // Coalesce a set of alignments into distinct locations
 void HapgenUtil::coalesceAlignments(HapgenAlignmentVector& alignments)
@@ -225,7 +293,7 @@ void HapgenUtil::coalesceAlignments(HapgenAlignmentVector& alignments)
     outAlignments.push_back(alignments[alignments.size()-1]);
 
     // Kees: start from back because alignments are sorted in order of increasing score
-    for(size_t i = alignments.size()-1; i-- > 0;)
+    for(size_t i = alignments.size() - 1; i-- > 0;)
     {
         // Check this alignment against the last alignment added to the output set
         HapgenAlignment& prevAlign = outAlignments.back();
@@ -305,7 +373,8 @@ bool HapgenUtil::makeFlankingHaplotypes(const HapgenAlignment& aln,
                                         int flanking,
                                         const StringVector& inHaplotypes,
                                         StringVector& outFlankingHaplotypes,
-                                        StringVector& outHaplotypes)
+                                        StringVector& outHaplotypes,
+                                        StringVector& outFlankingReferenceHaplotypes)
 {
     std::string upstream;
     std::string referenceHaplotype;
@@ -329,6 +398,7 @@ bool HapgenUtil::makeFlankingHaplotypes(const HapgenAlignment& aln,
     std::string referenceFlanking = upstream + referenceHaplotype + downstream;
     outFlankingHaplotypes.push_back(referenceFlanking);
     outHaplotypes.push_back(referenceHaplotype);
+    outFlankingReferenceHaplotypes.push_back(referenceFlanking);
 
     // Check that all sequences match the reference haplotype properly
     /*
@@ -535,6 +605,101 @@ LocalAlignmentResultVector HapgenUtil::alignReadsLocally(const std::string& targ
     return results;
 }
 
+//
+double HapgenUtil::calculateDustScoreAtPosition(const std::string& name, 
+                                                int position, 
+                                                const ReadTable* pRefTable,
+                                                int window_size)
+{
+    int dw_start = position - window_size / 2;
+    dw_start = std::max(0, dw_start); // clamp to 0
+
+    int dw_end = position + window_size / 2;
+    const DNAString& chromosome = pRefTable->getRead(name).seq;
+    dw_end = std::min(dw_end, (int)chromosome.length());
+
+    double dust_score = 0.0f;
+    if(dw_end - dw_start == window_size)
+    {
+        std::string ref_dust_window = chromosome.substr(dw_start, dw_end - dw_start);
+        dust_score = calculateDustScore(ref_dust_window);
+    }
+    return dust_score;
+}
+
+//
+size_t HapgenUtil::getMaximumOneEdit(const std::string& str, const BWTIndexSet& indices)
+{
+    size_t max = 0;
+    std::string t = str;
+
+    //
+    for(size_t i = 0; i < str.size(); ++i)
+    {
+        char tmp = t[i];
+        for(int bi = 0; bi < DNA_ALPHABET::size; ++bi)
+        {
+            char b = DNA_ALPHABET::getBase(bi);
+            if(b != tmp)
+            {
+                t[i] = b;
+                size_t count = BWTAlgorithms::countSequenceOccurrences(t, indices);
+                if(count > max)
+                    max = count;
+            }
+        }
+        t[i] = tmp;
+    }
+
+    printf("MaxOneEdit %zu\n", max);
+    return max;
+}
+
+// Calculate the largest k such that every k-mer in the sequence is present at least min_depth times in the BWT
+size_t HapgenUtil::calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices)
+{
+    size_t min_k = 15;
+    for(size_t k = 99; k >= min_k; --k)
+    {
+        if(sequence.size() < k)
+            continue;
+        bool covered = true;
+        size_t nk = sequence.size() - k + 1;
+        for(size_t i = 0; i < nk; ++i)
+        {
+            std::string kmer = sequence.substr(i, k);
+            int c = BWTAlgorithms::countSequenceOccurrences(kmer, indices);
+
+            if(c < min_depth)
+            {
+                covered = false;
+                break;
+            }
+        }
+
+        if(covered)
+            return k;
+    }
+
+    return 0;
+}
+
+//
+std::vector<int> HapgenUtil::makeCountProfile(const std::string& str, size_t k, int max, const BWTIndexSet& indices)
+{
+    std::vector<int> out;
+    if(str.size() < k)
+        return out;
+
+    for(size_t i = 0; i < str.size() - k + 1; ++i)
+    {
+        int count = BWTAlgorithms::countSequenceOccurrences(str.substr(i, k), indices);
+        out.push_back(count > max ? max : count);
+    }
+    return out;
+}
+
+
 // Print an alignment to a reference
 void HapgenUtil::printAlignment(const std::string& query, const HapgenAlignment& aln, const ReadTable* pRefTable)
 {
diff --git a/src/GraphDiff/HapgenUtil.h b/src/GraphDiff/HapgenUtil.h
index cd0a373..78822aa 100644
--- a/src/GraphDiff/HapgenUtil.h
+++ b/src/GraphDiff/HapgenUtil.h
@@ -20,8 +20,14 @@
 struct HapgenAlignment
 {
     //
-    HapgenAlignment() : referenceID(-1), position(-1), length(0), score(-1), isRC(false) {}
-    HapgenAlignment(const int& id, int p, int l, int s, bool rc) : referenceID(id), position(p), length(l), score(s), isRC(rc) {}
+    HapgenAlignment() : referenceID(-1), position(-1), length(0), score(-1), numEvents(0), isRC(false), isSNP(false) {}
+    HapgenAlignment(const int& id, int p, int l, int s, int ne, bool rc, bool snp) : referenceID(id), 
+                                                                                     position(p), 
+                                                                                     length(l), 
+                                                                                     score(s),
+                                                                                     numEvents(ne), 
+                                                                                     isRC(rc), 
+                                                                                     isSNP(snp) {}
 
     // Sort
     // Kees: added sorting on alignment score
@@ -51,27 +57,25 @@ struct HapgenAlignment
     int position;
     int length; // alignment length
     int score;
+    int numEvents;
     bool isRC;
+    bool isSNP;
 };
 typedef std::vector<HapgenAlignment> HapgenAlignmentVector;
 
 //
 namespace HapgenUtil
 {
-
-
-    // Align the haplotype to the reference genome represented by the BWT/SSA pair
-    void alignHaplotypeToReferenceBWASW(const std::string& haplotype,
-                                        const BWTIndexSet& referenceIndex,
-                                        HapgenAlignmentVector& outAlignments);
-
     //
-    void alignHaplotypeToReferenceKmer(size_t k,
+    void alignHaplotypeToReferenceKmer(size_t align_k,
+                                       size_t assemble_k,
                                        const std::string& haplotype,
                                        const BWTIndexSet& referenceIndex,
                                        const ReadTable* pReferenceTable,
                                        HapgenAlignmentVector& outAlignments);
 
+    
+
 
     // Coalesce a set of alignments into distinct locations
     void coalesceAlignments(HapgenAlignmentVector& alignments);
@@ -96,7 +100,8 @@ namespace HapgenUtil
                                 int flanking,
                                 const StringVector& inHaplotypes,
                                 StringVector& outFlankingHaplotypes,
-                                StringVector& outHaplotypes);
+                                StringVector& outHaplotypes,
+                                StringVector& outFlankingReferenceHaplotypes);
 
 
     // Extract reads from an FM-index that have a k-mer match to any given haplotypes
@@ -121,11 +126,26 @@ namespace HapgenUtil
                                        SeqRecordVector* pOutReads, 
                                        SeqRecordVector* pOutMates);
 
+    // Get the index of the first and last k-mers of the haplotype
+    // in the reference sequence
+    void getFlankingKmerMatch(size_t k,
+                              const std::string& reference,
+                              const std::string& haplotype,
+                              size_t& start_idx,
+                              size_t& end_idx);
     
     // Perform an alignment between a haplotype sequence and a reference sequence
-    SequenceOverlap alignHaplotypeToReference(const std::string& reference,
+    // This function assumes the sequences share the first/last k bases
+    SequenceOverlap alignHaplotypeToReference(size_t k,
+                                              const std::string& reference,
                                               const std::string& haplotype);
 
+    // Returns true if the start/end kmer for all haplotypes occur in the reference string
+    bool checkHaplotypeKmersMatchReference(size_t k,
+                                           const std::string& reference,
+                                           const StringVector& inHaplotypes);
+
+
     // Compute the best local alignment for each read in the array to the given sequence
     LocalAlignmentResultVector alignReadsLocally(const std::string& target, const SeqItemVector& reads);
 
@@ -133,6 +153,21 @@ namespace HapgenUtil
     // of the passed in sequence
     bool checkAlignmentsAreConsistent(const std::string& refString, const StringVector& queries);
 
+    // Calculate the dust score around a reference position
+    double calculateDustScoreAtPosition(const std::string& name, 
+                                        int position, 
+                                        const ReadTable* pRefTable,
+                                        int window_size = 64);
+
+    // Return the count of the most frequent string within one-edit distance of the given string 
+    size_t getMaximumOneEdit(const std::string& str, const BWTIndexSet& indices);
+    
+    // Calculate the largest k such that every k-mer in the sequence is present at least min_depth times in the BWT
+    size_t calculateMaxCoveringK(const std::string& sequence, int min_depth, const BWTIndexSet& indices);
+
+    // Count the number of times each k-mer of str is in the BWT
+    std::vector<int> makeCountProfile(const std::string& str, size_t k, int max, const BWTIndexSet& indices);
+
     // Print an alignment to a reference
     void printAlignment(const std::string& query, const HapgenAlignment& aln, const ReadTable* pRefTable);
 
diff --git a/src/SGA/Makefile.am b/src/SGA/Makefile.am
index 0871a96..4931ef5 100644
--- a/src/SGA/Makefile.am
+++ b/src/SGA/Makefile.am
@@ -15,9 +15,9 @@ sga_CPPFLAGS = \
 sga_LDADD = \
 	$(top_builddir)/Scaffold/libscaffold.a \
 	$(top_builddir)/GraphDiff/libgraphdiff.a \
-    $(top_builddir)/StringGraph/libstringgraph.a \
+	$(top_builddir)/StringGraph/libstringgraph.a \
 	$(top_builddir)/Concurrency/libconcurrency.a \
-    $(top_builddir)/Algorithm/libalgorithm.a \
+	$(top_builddir)/Algorithm/libalgorithm.a \
 	$(top_builddir)/SuffixTools/libsuffixtools.a \
 	$(top_builddir)/Bigraph/libbigraph.a \
 	$(top_builddir)/Util/libutil.a \
@@ -32,15 +32,16 @@ sga_SOURCES = sga.cpp \
               assemble.cpp assemble.h \
               correct.cpp correct.h \
               oview.cpp oview.h \
-			  preprocess.cpp preprocess.h \
+              preprocess.cpp preprocess.h \
               rmdup.cpp rmdup.h \
               merge.cpp merge.h \
-			  subgraph.cpp subgraph.h \
-			  scaffold.cpp scaffold.h \
-			  scaffold2fasta.cpp scaffold2fasta.h \
+              subgraph.cpp subgraph.h \
+              scaffold.cpp scaffold.h \
+              scaffold2fasta.cpp scaffold2fasta.h \
               connect.cpp connect.h \
               walk.cpp walk.h \
               filter.cpp filter.h \
+              kmer-count.cpp kmer-count.h \
               stats.cpp stats.h \
               fm-merge.cpp fm-merge.h \
               gmap.h gmap.cpp \
@@ -49,10 +50,13 @@ sga_SOURCES = sga.cpp \
               gen-ssa.h gen-ssa.cpp \
               bwt2fa.h bwt2fa.cpp \
               graph-diff.h graph-diff.cpp \
+              graph-concordance.h graph-concordance.cpp \
               gapfill.h gapfill.cpp \
               preqc.h preqc.cpp \
+              overlap-long.h overlap-long.cpp \
               variant-detectability.h variant-detectability.cpp \
               rewrite-evidence-bam.h rewrite-evidence-bam.cpp \
+              somatic-variant-filters-bam.h somatic-variant-filters.cpp \
               haplotype-filter.h haplotype-filter.cpp \
               OverlapCommon.h OverlapCommon.cpp \
               SGACommon.h 
diff --git a/src/SGA/SGACommon.h b/src/SGA/SGACommon.h
index 9cbdd05..29b0158 100644
--- a/src/SGA/SGACommon.h
+++ b/src/SGA/SGACommon.h
@@ -30,5 +30,6 @@
 // Default values
 #define DEFAULT_MIN_OVERLAP 45
 #define DEFAULT_EXTRACT_LEN 100
-
+#define DEFAULT_MIN_COUNT_MAX_BASE 4
+#define DEFAULT_COUNT_OFFSET 1
 #endif
diff --git a/src/SGA/correct.cpp b/src/SGA/correct.cpp
index 1958cea..8e15eb4 100644
--- a/src/SGA/correct.cpp
+++ b/src/SGA/correct.cpp
@@ -30,6 +30,8 @@
 // Functions
 int learnKmerParameters(const BWT* pBWT);
 
+//#define OVERLAPCORRECTION_VERBOSE 1
+
 //
 // Getopt
 //
@@ -58,10 +60,15 @@ static const char *CORRECT_USAGE_MESSAGE =
 "      -k, --kmer-size=N                The length of the kmer to use. (default: 31)\n"
 "      -x, --kmer-threshold=N           Attempt to correct kmers that are seen less than N times. (default: 3)\n"
 "      -i, --kmer-rounds=N              Perform N rounds of k-mer correction, correcting up to N bases (default: 10)\n"
+"      -O, --count-offset=N             When correcting a kmer, require the count of the new kmer is at least +N higher than the count of the old kmer. (default: 1)\n"
 "          --learn                      Attempt to learn the k-mer correction threshold (experimental). Overrides -x parameter.\n"
 "\nOverlap correction parameters:\n"
 "      -e, --error-rate                 the maximum error rate allowed between two sequences to consider them overlapped (default: 0.04)\n"
 "      -m, --min-overlap=LEN            minimum overlap required between two reads (default: 45)\n"
+"      -M, --min-count-max-base=INT     minimum count of the base that has the highest count in overlap correction.\n"
+"                                       The base of the read is only corrected if the maximum base has at least this count.\n"
+"                                       Should avoid mis-corrections in low coverage regions (default: 4)\n"
+"      -X, --base-threshold=N           Attempt to correct bases in a read that are seen less than N times (default: 2)\n"
 "      -c, --conflict=INT               use INT as the threshold to detect a conflicted base in the multi-overlap (default: 5)\n"
 "      -l, --seed-length=LEN            force the seed length to be LEN. By default, the seed length in the overlap step\n"
 "                                       is calculated to guarantee all overlaps with --error-rate differences are found.\n"
@@ -92,12 +99,15 @@ namespace opt
     
     static double errorRate = 0.04;
     static unsigned int minOverlap = DEFAULT_MIN_OVERLAP;
+    static unsigned int min_count_max_base = DEFAULT_MIN_COUNT_MAX_BASE;
+    static unsigned int countOffset = DEFAULT_COUNT_OFFSET;
     static int seedLength = 0;
     static int seedStride = 0;
     static int conflictCutoff = 5;
     static int branchCutoff = -1;
 
     static int kmerLength = 31;
+    static int base_threshold = 2;
     static int kmerThreshold = 3;
     static int numKmerRounds = 10;
     static bool bLearnKmerParams = false;
@@ -106,7 +116,7 @@ namespace opt
     static ErrorCorrectAlgorithm algorithm = ECA_KMER;
 }
 
-static const char* shortopts = "p:m:d:e:t:l:s:o:r:b:a:c:k:x:i:v";
+static const char* shortopts = "p:m:M:O:d:e:t:l:s:o:r:b:a:c:k:x:X:i:v";
 
 enum { OPT_HELP = 1, OPT_VERSION, OPT_METRICS, OPT_DISCARD, OPT_LEARN };
 
@@ -114,6 +124,8 @@ static const struct option longopts[] = {
     { "verbose",       no_argument,       NULL, 'v' },
     { "threads",       required_argument, NULL, 't' },
     { "min-overlap",   required_argument, NULL, 'm' },
+    { "min-count-max-base",   required_argument, NULL, 'M' },
+    { "count-offset",   required_argument, NULL, 'O' },
     { "rounds",        required_argument, NULL, 'r' },
     { "outfile",       required_argument, NULL, 'o' },
     { "prefix",        required_argument, NULL, 'p' },
@@ -126,6 +138,7 @@ static const struct option longopts[] = {
     { "branch-cutoff", required_argument, NULL, 'b' },
     { "kmer-size",     required_argument, NULL, 'k' },
     { "kmer-threshold",required_argument, NULL, 'x' },
+    { "base-threshold",required_argument, NULL, 'X' },
     { "kmer-rounds",   required_argument, NULL, 'i' },
     { "learn",         no_argument,       NULL, OPT_LEARN },
     { "discard",       no_argument,       NULL, OPT_DISCARD },
@@ -181,6 +194,9 @@ int correctMain(int argc, char** argv)
     ecParams.algorithm = opt::algorithm;
 
     ecParams.minOverlap = opt::minOverlap;
+    ecParams.min_count_max_base = opt::min_count_max_base;
+    ecParams.countOffset = opt::countOffset;
+    ecParams.base_threshold = opt::base_threshold;
     ecParams.numOverlapRounds = opt::numOverlapRounds;
     ecParams.minIdentity = 1.0f - opt::errorRate;
     ecParams.conflictCutoff = opt::conflictCutoff;
@@ -189,6 +205,10 @@ int correctMain(int argc, char** argv)
     ecParams.kmerLength = opt::kmerLength;
     ecParams.printOverlaps = opt::verbose > 0;
 
+	 printf("ecParams.min_count_max_base = %d\n",ecParams.min_count_max_base);
+	 printf("ecParams.base_threshold = %d\n",ecParams.base_threshold);
+	 printf("ecParams.countOffset = %d\n",ecParams.countOffset);
+
     // Setup post-processor
     bool bCollectMetrics = !opt::metricsFile.empty();
     ErrorCorrectPostProcess postProcessor(pWriter, pDiscardWriter, bCollectMetrics);
@@ -313,6 +333,8 @@ void parseCorrectOptions(int argc, char** argv)
         switch (c) 
         {
             case 'm': arg >> opt::minOverlap; break;
+            case 'M': arg >> opt::min_count_max_base; break;
+            case 'O': arg >> opt::countOffset; break;
             case 'p': arg >> opt::prefix; break;
             case 'o': arg >> opt::outFile; break;
             case 'e': arg >> opt::errorRate; break;
@@ -325,6 +347,7 @@ void parseCorrectOptions(int argc, char** argv)
             case 'c': arg >> opt::conflictCutoff; break;
             case 'k': arg >> opt::kmerLength; break;
             case 'x': arg >> opt::kmerThreshold; break;
+            case 'X': arg >> opt::base_threshold; break;
             case '?': die = true; break;
             case 'v': opt::verbose++; break;
             case 'b': arg >> opt::branchCutoff; break;
@@ -382,6 +405,24 @@ void parseCorrectOptions(int argc, char** argv)
         die = true;
     }
 
+    if(opt::base_threshold <= 0)
+    {
+        std::cerr << SUBPROGRAM ": invalid base threshold: " << opt::base_threshold << ", must be greater than zero\n";
+        die = true;
+    }
+
+    if(opt::min_count_max_base <= 0)
+    {
+        std::cerr << SUBPROGRAM ": invalid min_count_max_base: " << opt::min_count_max_base << ", must be greater than zero\n";
+        die = true;
+    }
+
+    if(opt::countOffset <= 0)
+    {
+        std::cerr << SUBPROGRAM ": invalid countOffset: " << opt::countOffset << ", must be greater than zero. Otherwise, a kmer could be corrected to a kmer with the same count.\n";
+        die = true;
+    }
+
     // Determine the correction algorithm to use
     if(!algo_str.empty())
     {
diff --git a/src/SGA/filter.cpp b/src/SGA/filter.cpp
index 9451134..f28614a 100644
--- a/src/SGA/filter.cpp
+++ b/src/SGA/filter.cpp
@@ -61,6 +61,7 @@ static const char *FILTER_USAGE_MESSAGE =
 "      --no-duplicate-check             turn off duplicate removal\n"
 "      --substring-only                 when removing duplicates, only remove substring sequences, not full-length matches\n"
 "      --no-kmer-check                  turn off the kmer check\n"
+"      --kmer-both-strand               mimimum kmer coverage is required for both strand\n"
 "      --homopolymer-check              check reads for hompolymer run length sequencing errors\n"
 "      --low-complexity-check           filter out low complexity reads\n"
 "\nK-mer filter options:\n"
@@ -84,6 +85,7 @@ namespace opt
     static bool dupCheck = true;
     static bool substringOnly = false;
     static bool kmerCheck = true;
+    static bool kmerBothStrand = false;
     static bool hpCheck = false;
     static bool lowComplexityCheck = false;
 
@@ -93,7 +95,7 @@ namespace opt
 
 static const char* shortopts = "p:d:t:o:k:x:v";
 
-enum { OPT_HELP = 1, OPT_VERSION, OPT_SUBSTRING_ONLY, OPT_NO_RMDUP, OPT_NO_KMER, OPT_CHECK_HPRUNS, OPT_CHECK_COMPLEXITY };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_SUBSTRING_ONLY, OPT_NO_RMDUP, OPT_NO_KMER, OPT_KMER_BOTH_STRAND, OPT_CHECK_HPRUNS, OPT_CHECK_COMPLEXITY };
 
 static const struct option longopts[] = {
     { "verbose",               no_argument,       NULL, 'v' },
@@ -107,6 +109,7 @@ static const struct option longopts[] = {
     { "version",               no_argument,       NULL, OPT_VERSION },
     { "no-duplicate-check",    no_argument,       NULL, OPT_NO_RMDUP },
     { "no-kmer-check",         no_argument,       NULL, OPT_NO_KMER },
+    { "kmer-both-strand",      no_argument,       NULL, OPT_KMER_BOTH_STRAND },
     { "homopolymer-check",     no_argument,       NULL, OPT_CHECK_HPRUNS },
     { "low-complexity-check",  no_argument,       NULL, OPT_CHECK_COMPLEXITY },
     { "substring-only",        no_argument,       NULL, OPT_SUBSTRING_ONLY },
@@ -145,6 +148,7 @@ int filterMain(int argc, char** argv)
     params.checkDuplicates = opt::dupCheck;
     params.substringOnly = opt::substringOnly;
     params.checkKmer = opt::kmerCheck;
+    params.kmerBothStrand = opt::kmerBothStrand;
     params.checkHPRuns = opt::hpCheck;
     params.checkDegenerate = opt::lowComplexityCheck;
 
@@ -191,7 +195,7 @@ int filterMain(int argc, char** argv)
         delete pSharedBV;
 
     // Rebuild the FM-index without the discarded reads
-    std::string out_prefix = stripFilename(opt::outFile);
+    std::string out_prefix = stripExtension(opt::outFile);
     removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, BWT_EXT, SAI_EXT, false, opt::numThreads);
     removeReadsFromIndices(opt::prefix, opt::discardFile, out_prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads);
 
@@ -223,6 +227,7 @@ void parseFilterOptions(int argc, char** argv)
             case 'x': arg >> opt::kmerThreshold; break;
             case OPT_NO_RMDUP: opt::dupCheck = false; break;
             case OPT_NO_KMER: opt::kmerCheck = false; break;
+            case OPT_KMER_BOTH_STRAND: opt::kmerBothStrand = true; break;
             case OPT_CHECK_HPRUNS: opt::hpCheck = true; break;
             case OPT_CHECK_COMPLEXITY: opt::lowComplexityCheck = true; break;
             case OPT_SUBSTRING_ONLY: opt::substringOnly = true; break;
@@ -277,7 +282,7 @@ void parseFilterOptions(int argc, char** argv)
 
     if(opt::prefix.empty())
     {
-        opt::prefix = stripFilename(opt::readsFile);
+        opt::prefix = stripExtension(opt::readsFile);
     }
 
     if(opt::outFile.empty())
@@ -287,6 +292,9 @@ void parseFilterOptions(int argc, char** argv)
     }
     else
     {
-        opt::discardFile = stripFilename(opt::outFile) + ".discard.fa";
+        if(isGzip(opt::outFile))
+            opt::discardFile = stripExtension(opt::outFile) + ".discard.fa" + GZIP_EXT;
+        else
+            opt::discardFile = stripExtension(opt::outFile) + ".discard.fa";
     }
 }
diff --git a/src/SGA/fm-merge.cpp b/src/SGA/fm-merge.cpp
index dd06119..649d9cf 100644
--- a/src/SGA/fm-merge.cpp
+++ b/src/SGA/fm-merge.cpp
@@ -39,7 +39,7 @@ SUBPROGRAM " Version " PACKAGE_VERSION "\n"
 static const char *FMMERGE_USAGE_MESSAGE =
 "Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... READSFILE\n"
 "Merge unambiguously sequences from the READSFILE using the FM-index.\n"
-"This program requires rmdup to be run before it.\n"
+"This program requires filter to be run before it and rmdup to be run after.\n"
 "\n"
 "      --help                           display this help and exit\n"
 "      -v, --verbose                    display verbose output\n"
@@ -209,7 +209,7 @@ void parseFMMergeOptions(int argc, char** argv)
 
     if(opt::prefix.empty())
     {
-        opt::prefix = stripFilename(opt::readsFile);
+        opt::prefix = stripExtension(opt::readsFile);
     }
 
     if(opt::outFile.empty())
diff --git a/src/SGA/gen-ssa.cpp b/src/SGA/gen-ssa.cpp
index 9bc916a..2125cda 100644
--- a/src/SGA/gen-ssa.cpp
+++ b/src/SGA/gen-ssa.cpp
@@ -39,6 +39,7 @@ static const char *GENSSA_USAGE_MESSAGE =
 "      --help                           display this help and exit\n"
 "  -t, --threads=NUM                    use NUM threads to construct the index (default: 1)\n"
 "  -c, --check                          validate that the suffix array/bwt is correct\n"
+"  -s, --sai-only                       only build the lexicographic index\n"
 "\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
 
 namespace opt
@@ -48,17 +49,19 @@ namespace opt
     static std::string prefix;
     static int numThreads = 1;
     static int sampleRate = 32;
+    static bool saiOnly = false;
     static bool validate = false;
 }
 
-static const char* shortopts = "t:cv";
+static const char* shortopts = "t:cvs";
 
-enum { OPT_HELP = 1, OPT_VERSION };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_SAI_ONLY };
 
 static const struct option longopts[] = {
     { "verbose",     no_argument,       NULL, 'v' },
     { "check",       no_argument,       NULL, 'c' },
     { "threads",     required_argument, NULL, 't' },
+    { "sai-only",    no_argument,       NULL, OPT_SAI_ONLY },
     { "help",        no_argument,       NULL, OPT_HELP },
     { "version",     no_argument,       NULL, OPT_VERSION },
     { NULL, 0, NULL, 0 }
@@ -70,19 +73,27 @@ int genSSAMain(int argc, char** argv)
     parseGenSSAOptions(argc, argv);
     
     BWT* pBWT = new BWT(opt::prefix + BWT_EXT);
-    ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID);
     pBWT->printInfo();
 
     SampledSuffixArray* pSSA = new SampledSuffixArray();
-    pSSA->build(pBWT, pRIT, opt::sampleRate);
+    if(opt::saiOnly)
+    {
+        pSSA->buildLexicoIndex(pBWT, opt::numThreads);
+        pSSA->writeLexicoIndex(opt::prefix + SAI_EXT);
+    }
+    else
+    {
+        ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pBWT->getNumStrings(), RIO_NUMERICID);
+        pSSA->build(pBWT, pRIT, opt::sampleRate);
+        pSSA->writeSSA(opt::prefix + SSA_EXT);
+        delete pRIT;
+    }
     pSSA->printInfo();
-    pSSA->writeSSA(opt::prefix + SSA_EXT);
 
     if(opt::validate)
         pSSA->validate(opt::readsFile, pBWT);
 
     delete pBWT;
-    delete pRIT;
     delete pSSA;
 
     return 0;
@@ -103,6 +114,7 @@ void parseGenSSAOptions(int argc, char** argv)
             case 'c': opt::validate = true; break;
             case 't': arg >> opt::numThreads; break;
             case 'v': opt::verbose++; break;
+            case OPT_SAI_ONLY: opt::saiOnly = true; break;
             case OPT_HELP:
                 std::cout << GENSSA_USAGE_MESSAGE;
                 exit(EXIT_SUCCESS);
diff --git a/src/SGA/graph-concordance.cpp b/src/SGA/graph-concordance.cpp
new file mode 100644
index 0000000..9809b57
--- /dev/null
+++ b/src/SGA/graph-concordance.cpp
@@ -0,0 +1,487 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// graph-concordance - check variants in a VCF file
+// against an assembly graph
+//
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iterator>
+#include "Util.h"
+#include "graph-concordance.h"
+#include "BWTAlgorithms.h"
+#include "BWTIndexSet.h"
+#include "SGACommon.h"
+#include "Timer.h"
+#include "SequenceProcessFramework.h"
+#include "HapgenUtil.h"
+#include "SGAStats.h"
+#include "HashMap.h"
+#include "DindelRealignWindow.h"
+#include "VariantIndex.h"
+#include "HapgenUtil.h"
+
+//
+// Getopt
+//
+#define SUBPROGRAM "graph-concordance"
+static const char *GRAPH_CONCORDANCE_VERSION_MESSAGE =
+SUBPROGRAM " Version " PACKAGE_VERSION "\n"
+"Written by Jared Simpson.\n"
+"\n"
+"Copyright 2013 Wellcome Trust Sanger Institute\n";
+
+static const char *GRAPH_CONCORDANCE_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... -r variant.fastq -b base.fastq VCF_FILE\n"
+"Count read support for variants in a vcf file\n"
+"\n"
+"      --help                           display this help and exit\n"
+"      -v, --verbose                    display verbose output\n"
+"          --reference=STR              load the reference genome from FILE\n"
+"      -t, --threads=NUM                use NUM threads to compute the overlaps (default: 1)\n"
+"      -g, --germline=FILE              load germline variants from FILE\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+static const char* PROGRAM_IDENT =
+PACKAGE_NAME "::" SUBPROGRAM;
+
+namespace opt
+{
+    static unsigned int verbose;
+    static int k = 51;
+    static std::string variantFile;
+    static std::string baseFile;
+    static std::string haplotypeFile;
+    static std::string vcfFile;
+    static std::string referenceFile;
+    static std::string germlineFile;
+}
+
+static const char* shortopts = "b:r:o:k:t:g:v";
+
+enum { OPT_HELP = 1, OPT_VERSION, OPT_REFERENCE, OPT_HAPLOID };
+
+static const struct option longopts[] = {
+    { "verbose",               no_argument,       NULL, 'v' },
+    { "threads",               required_argument, NULL, 't' },
+    { "outfile",               required_argument, NULL, 'o' },
+    { "reads",                 required_argument, NULL, 'r' },
+    { "base",                  required_argument, NULL, 'b' },
+    { "variant",               required_argument, NULL, 'r' },
+    { "germline",              required_argument, NULL, 'g' },
+    { "reference",             required_argument, NULL, OPT_REFERENCE },
+    { "haploid",               no_argument,       NULL, OPT_HAPLOID },
+    { "help",                  no_argument,       NULL, OPT_HELP },
+    { "version",               no_argument,       NULL, OPT_VERSION },
+    { NULL, 0, NULL, 0 }
+};
+
+std::string applyVariant(const std::string& in, int pos,
+                         const std::string& ref, const std::string& alt,
+                         std::vector<int>& coordinate_map, 
+                         bool update_map, int variant_tag = -1)
+{
+    assert(variant_tag < 0);
+    std::string out = in;
+
+    // Search the coordinate map for the index of the ref string to be substituted
+    size_t idx = 0;
+    while(idx < coordinate_map.size())
+    {
+        if(coordinate_map[idx] == pos)
+            break;
+        idx += 1;
+    }
+
+    // If we could not find the position in the coordinate map
+    // then the input variant is for a reference base that has been removed
+    // already. Skip it.
+    if(idx == coordinate_map.size())
+        return "";
+
+    /*
+    std::cout << "In: " << in << "-\n";
+    std::cout << "Current map: \n";
+    for(size_t i = 0; i < coordinate_map.size(); ++i)
+        std::cout << i << " " << coordinate_map[i] << "\n";
+    std::cout << "\n";
+    std::cout << "Replace idx: " << idx << "\n";
+    std::cout << "Replace coordinate: " << coordinate_map[idx] << "\n";
+    std::cout << "ref: " << ref << "\n";
+    std::cout << "Alt: " << alt << "\n";
+    */
+
+    // Ensure that the reference string at the variant matches the expected
+    if(out.substr(idx, ref.length()) != ref)
+    {
+        std::cerr << "Warning: REF does not match reference\n";
+        return "";
+    }
+
+    // Replace the reference sequence with the alternative
+    out.replace(idx, ref.length(), alt);
+
+    if(update_map)
+    {
+        // shift coordinate map to account for indels
+
+        // Make an iterator to the first position that is replaced
+        std::vector<int>::iterator fi = coordinate_map.begin() + idx;
+
+        // Iterator which is one past the last position to remove
+        std::vector<int>::iterator li = fi + ref.size();
+        
+        // erase returns an iterator pointing to the position
+        // following the last position removed, this is where
+        // we want to insert 
+        std::vector<int>::iterator ii = coordinate_map.erase(fi, li);
+
+        // Insert new elements for the incoming bases
+        // These elements can be tagged with an integer
+        // by the calling function to indicate the sample
+        // the new bases came from
+        coordinate_map.insert(ii, alt.size(), variant_tag);
+        
+        /*
+        std::cout << "Updated map: \n";
+        for(size_t i = 0; i < coordinate_map.size(); ++i)
+            std::cout << coordinate_map[i] << " ";
+        std::cout << "\n";
+        */
+        assert(coordinate_map.size() == out.size());
+    }
+    return out;
+}
+
+// Extract somatic k-mers from the haplotype accordining to the tagged
+// entries in the coordinate map
+StringVector extractSomaticKmers(const std::string& haplotype,
+                                 const std::vector<int> coordinate_map,
+                                 int somatic_tag)
+{
+    StringVector kmers;
+
+    assert(haplotype.size() == coordinate_map.size());
+
+    // Get the index in the vector of the first somatic variant
+    int first_idx = 0;
+    while(coordinate_map[first_idx] != somatic_tag && 
+          first_idx != (int)coordinate_map.size()) {
+        first_idx++;
+    }
+    
+    // no somatic variants?
+    if(first_idx == (int)coordinate_map.size())
+        return kmers;
+    
+    // Get the index of the last somatic variant
+    int last_idx = first_idx;
+    while(coordinate_map[last_idx] == somatic_tag)
+        last_idx++;
+    last_idx -= 1;
+
+    // Start k - 1 bases before the first index
+    first_idx -= (opt::k - 1);
+    if(first_idx < 0)
+        first_idx = 0;
+
+    for(size_t i = first_idx; i <= (size_t)last_idx; ++i)
+    {
+        std::string t = haplotype.substr(i, opt::k);
+        if(t.size() == (size_t)opt::k)
+            kmers.push_back(t);
+    }
+
+    return kmers;
+}
+
+std::string reconstructHaplotype(const VCFRecord& somatic_record, 
+                                 const VariantRecordVector nearby_germline, 
+                                 const ReadTable& refTable,
+                                 const BWTIndexSet& variantIndex,
+                                 size_t flanking_size)
+{
+    // Extract the reference haplotype
+    int eventLength = somatic_record.varStr.length();
+    int zeroBasedPos = somatic_record.refPosition - 1;
+    int start = zeroBasedPos - flanking_size - 1;
+    int end = zeroBasedPos + eventLength + 2 * flanking_size;
+    const SeqItem& chr = refTable.getRead(somatic_record.refName);
+    if(end > chr.seq.length())
+        return "";
+
+    std::string reference_haplotype = chr.seq.substr(start, end - start);
+    
+    // Set up a vector mapping positions on the current haplotype
+    // to original reference coordinates. This allows use to apply
+    // the germline variants to the haplotype when the coordinates
+    // shift due to indels
+    std::vector<int> coordinate_map(reference_haplotype.size());
+    for(int i = 0; i < (int)reference_haplotype.size(); ++i)
+        coordinate_map[i] = i + start;
+
+    // Do not attempt haplotype reconstruction if there are too many variants
+    if(nearby_germline.size() > 4)
+        return "";
+    
+    int SOMATIC_TAG = -2;
+    int GERMLINE_TAG = -3;
+
+    // Iterate over all possible combinations of germline variants
+    size_t total_haplotypes = 1 << nearby_germline.size(); // 2 ^ n
+
+    for(size_t haplotype_id = 0; haplotype_id < total_haplotypes; haplotype_id++)
+    {
+        std::string haplotype = reference_haplotype;
+        std::vector<int> haplotype_coordinates = coordinate_map;
+
+        // Iterate over the germline variants and check if the i-th bit
+        // is set in the haplotype id. If so, apply the variant to the haplotype
+        for(size_t var_id = 0; var_id < nearby_germline.size(); ++var_id)
+        {
+            if( (haplotype_id >> var_id) & 1)
+            {
+                std::string new_haplotype = 
+                    applyVariant(haplotype, 
+                                 nearby_germline[var_id].position - 1,
+                                 nearby_germline[var_id].ref_sequence, 
+                                 nearby_germline[var_id].alt_sequence,
+                                 haplotype_coordinates, 
+                                 true, 
+                                 GERMLINE_TAG);
+                haplotype = new_haplotype;
+
+                if(haplotype.empty())
+                    break;
+            }
+        }
+
+        if(haplotype.empty())
+            continue;
+
+        // Apply the somatic variant to this haplotype
+        std::string somatic_haplotype = 
+            applyVariant(haplotype, 
+                         somatic_record.refPosition - 1,
+                         somatic_record.refStr, 
+                         somatic_record.varStr,
+                         haplotype_coordinates, 
+                         true, 
+                         SOMATIC_TAG);
+    
+        if(somatic_haplotype.empty())
+            continue;
+
+        StringVector kmers = extractSomaticKmers(somatic_haplotype, haplotype_coordinates, SOMATIC_TAG);
+        
+        // Count the number of somatic kmers that are present in the variant reads
+        size_t read_kmers = 0;
+        for(size_t ki = 0; ki < kmers.size(); ++ki)
+        {
+            if(BWTAlgorithms::countSequenceOccurrences(kmers[ki], variantIndex) > 0)
+                read_kmers += 1;
+        }
+
+        // Check if all somatic kmers are present in the reads
+        if(read_kmers == kmers.size())
+            return somatic_haplotype;
+    }
+
+    return "";
+}
+
+//
+// Main
+//
+int graphConcordanceMain(int argc, char** argv)
+{
+    parseGraphConcordanceOptions(argc, argv);
+
+    Timer* pTimer = new Timer(PROGRAM_IDENT);
+
+    // Load the reference
+    ReadTable refTable(opt::referenceFile, SRF_NO_VALIDATION);
+    refTable.indexReadsByID();
+
+    // Index the germline variants so we can make germline haplotypes
+    std::cerr << "Loading germline variant index..." << std::flush;
+    VariantIndex germlineIndex(opt::germlineFile, refTable);
+    std::cerr << "done\n";
+
+    // Load FM-index of the reads
+    BWTIndexSet variantIndex;
+    std::cerr << "Loading variant read index... " << std::flush;
+    std::string variantPrefix = stripGzippedExtension(opt::variantFile);
+    variantIndex.pBWT = new BWT(variantPrefix + BWT_EXT, 256);
+    variantIndex.pCache = new BWTIntervalCache(11, variantIndex.pBWT);
+    std::cerr << "done\n";
+    
+    //
+    BWTIndexSet baseIndex;
+    std::cerr << "Loading base read index index... " << std::flush;
+    std::string basePrefix = stripGzippedExtension(opt::baseFile);
+    baseIndex.pBWT = new BWT(basePrefix + BWT_EXT, 256);
+    baseIndex.pCache = new BWTIntervalCache(11, baseIndex.pBWT);
+    std::cerr << "done\n";
+
+    std::ifstream input(opt::vcfFile.c_str());
+    std::string line;
+    while(getline(input, line))
+    {
+        if(line.empty())
+            continue;
+
+        if(line[0] == '#')
+        {
+            std::cout << line << "\n";
+            continue;
+        }
+        
+        // parse record
+        VCFRecord record(line);
+        if(record.isMultiAllelic())
+        {
+            std::cerr << "Error: multi-allelic VCF found, please run vcfbreakmulti\n";
+            exit(EXIT_FAILURE);
+        }
+
+        // Grab nearby variants
+        size_t flanking_size = opt::k;
+        VariantRecordVector nearby_vector = germlineIndex.getNearVariants(record.refName, 
+                                                                          record.refPosition, 
+                                                                          flanking_size);
+        
+        if(opt::verbose > 0)
+        {
+            std::cerr << "\n\n Variant: " << record << "\n";
+            std::cerr << "Nearby: " << nearby_vector.size() << "\n";
+        }
+
+        std::string classification = "UNKNOWN";
+
+        std::string haplotype;
+        if(nearby_vector.size() <= 4)
+            haplotype = reconstructHaplotype(record, nearby_vector, refTable, variantIndex, flanking_size);
+        else
+            classification = "TOO_COMPLEX";
+
+        size_t max_unique_variant_kmers = 0;
+        if(!haplotype.empty())
+        {    
+            IntVector v_cp = HapgenUtil::makeCountProfile(haplotype, opt::k, 9, variantIndex);
+            IntVector b_cp = HapgenUtil::makeCountProfile(haplotype, opt::k, 9, baseIndex);
+            
+        
+            size_t unique_variant_kmers = 0;
+            for(size_t i = 0; i < v_cp.size(); ++i)
+            {
+                if(v_cp[i] > 0 && b_cp[i] == 0)
+                    unique_variant_kmers++;
+            }
+
+            if(unique_variant_kmers > max_unique_variant_kmers)
+                max_unique_variant_kmers = unique_variant_kmers;
+            
+            if(opt::verbose > 0)
+            {
+                std::cerr << "k-mer profile somatic (variant):  ";
+                for(size_t j = 0; j < v_cp.size(); ++j)
+                    std::cerr << v_cp[j];
+                std::cerr << "\n";
+
+                std::cerr << "k-mer profile somatic (base):     ";
+                for(size_t j = 0; j < b_cp.size(); ++j)
+                    std::cerr << b_cp[j];
+                std::cerr << "\n";
+                fprintf(stderr, "unique variant kmers: %zu\n\n", unique_variant_kmers);
+                fprintf(stderr, "max unique variant kmers: %zu\n\n", max_unique_variant_kmers);
+            }
+        }
+        else
+        {
+            classification = "CANNOT_RECONSTRUCT";
+        }
+
+        if(max_unique_variant_kmers > 5)
+            classification = "SOMATIC";
+        else
+            classification = "GERMLINE";
+        
+        // write out the record
+        record.addComment("MaxUniqueVariantKmers", (int)max_unique_variant_kmers);
+        record.addComment("KmerClassification", classification);
+        std::cout << record << "\n";
+    }
+    
+    // Cleanup
+    delete variantIndex.pBWT;
+    delete variantIndex.pCache;
+    delete baseIndex.pBWT;
+    delete baseIndex.pCache;
+    delete pTimer;
+
+    return 0;
+}
+
+// 
+// Handle command line arguments
+//
+void parseGraphConcordanceOptions(int argc, char** argv)
+{
+    std::string algo_str;
+    bool die = false;
+    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) 
+    {
+        std::istringstream arg(optarg != NULL ? optarg : "");
+        switch (c) 
+        {
+            case 'b': arg >> opt::baseFile; break;
+            case 'r': arg >> opt::variantFile; break;
+            case 'g': arg >> opt::germlineFile; break;
+            case OPT_REFERENCE: arg >> opt::referenceFile; break;
+            case '?': die = true; break;
+            case 'v': opt::verbose++; break;
+            case OPT_HELP:
+                std::cout << GRAPH_CONCORDANCE_USAGE_MESSAGE;
+                exit(EXIT_SUCCESS);
+            case OPT_VERSION:
+                std::cout << GRAPH_CONCORDANCE_VERSION_MESSAGE;
+                exit(EXIT_SUCCESS);
+        }
+    }
+
+    if (argc - optind < 1) 
+    {
+        std::cerr << SUBPROGRAM ": missing arguments\n";
+        die = true;
+    } 
+    else if (argc - optind > 1) 
+    {
+        std::cerr << SUBPROGRAM ": too many arguments\n";
+        die = true;
+    }
+
+    if(opt::referenceFile.empty())
+    {
+        std::cerr << SUBPROGRAM ": a reference file must be given.\n";
+        die = true;
+    }
+
+    if(opt::germlineFile.empty())
+    {
+        std::cerr << SUBPROGRAM ": a germline variants file must be given.\n";
+        die = true;
+    }
+
+    opt::vcfFile = argv[optind++];
+
+    if (die) 
+    {
+        std::cout << "\n" << GRAPH_CONCORDANCE_USAGE_MESSAGE;
+        exit(EXIT_FAILURE);
+    }
+}
diff --git a/src/SGA/graph-concordance.h b/src/SGA/graph-concordance.h
new file mode 100644
index 0000000..a0954cc
--- /dev/null
+++ b/src/SGA/graph-concordance.h
@@ -0,0 +1,23 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// vcf-read-count - count the number of reads
+// supporting variant haplotypes
+//
+#ifndef VCF_READ_COUNT_H
+#define VCF_READ_COUNT_H
+#include <getopt.h>
+#include "config.h"
+
+// functions
+
+//
+int graphConcordanceMain(int argc, char** argv);
+
+// options
+void parseGraphConcordanceOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SGA/graph-diff.cpp b/src/SGA/graph-diff.cpp
index 693ce76..103f847 100644
--- a/src/SGA/graph-diff.cpp
+++ b/src/SGA/graph-diff.cpp
@@ -31,6 +31,7 @@
 void runVCFTester(GraphCompareParameters& parameters);
 void runGraphDiff(GraphCompareParameters& parameters);
 void runDebug(GraphCompareParameters& parameters);
+void runInteractive(GraphCompareParameters& parameters);
 void preloadBloomFilter(const ReadTable* pReadTable, size_t k, BloomFilter* pBloomFilter);
 
 // Defines to clarify awful template function calls
@@ -76,8 +77,7 @@ static const char *GRAPH_DIFF_USAGE_MESSAGE =
 "Algorithm options:\n"
 "      -k, --kmer=K                     use K-mers to discover variants\n"
 "      -x, --min-discovery-count=T      require a variant k-mer to be seen at least T times\n"
-"          --debruijn                   use the de Bruijn graph assembly algorithm (default: string graph)\n"
-"          --paired-debruijn            use the de Bruijn graph assembly algorithm with paired-end constraints(default: string graph)\n"
+"      -a, --algorithm=STR              select the assembly algorithm to use from: debruijn, string\n"
 "      -m, --min-overlap=N              require at least N bp overlap when assembling using a string graph\n" 
 "          --min-dbg-count=T            only use k-mers seen T times when assembling using a de Bruijn graph\n"
 "\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
@@ -99,11 +99,11 @@ namespace opt
     static int minDBGCount = 2;
 
     // Calling modes
-    static bool deBruijnMode = false;
-    static bool deBruijnPairedMode = false;
+    static GraphCompareAlgorithm algorithm = GCA_DEBRUIJN_GRAPH;
     static bool lowCoverage = false;
     static bool referenceMode = false;
     static bool useQualityScores = false;
+    static bool interactiveMode = false;
 
     // I/O
     static std::string outPrefix = "sgavariants";
@@ -115,7 +115,7 @@ namespace opt
     static std::string inputVCFFile;
 }
 
-static const char* shortopts = "b:r:o:k:d:t:x:y:p:m:v";
+static const char* shortopts = "b:r:o:k:d:t:x:y:p:m:a:v";
 
 enum { OPT_HELP = 1, 
        OPT_VERSION, 
@@ -124,12 +124,11 @@ enum { OPT_HELP = 1,
        OPT_DEBUG, 
        OPT_MIN_DBG_COUNT, 
        OPT_INDEX, 
-       OPT_DEBRUIJN, 
-       OPT_PAIRED_DEBRUIJN, 
        OPT_LOWCOVERAGE, 
        OPT_QUALSCORES,
        OPT_BLOOM_GENOME,
-       OPT_PRECACHE_REFERENCE };
+       OPT_PRECACHE_REFERENCE,
+       OPT_INTERACTIVE };
 
 static const struct option longopts[] = {
     { "verbose",              no_argument,       NULL, 'v' },
@@ -141,10 +140,10 @@ static const struct option longopts[] = {
     { "sample-rate",          required_argument, NULL, 'd' },
     { "prefix",               required_argument, NULL, 'p' },
     { "min-overlap",          required_argument, NULL, 'm' },
-    { "debruijn",             no_argument,       NULL, OPT_DEBRUIJN },
-    { "paired-debruijn",      no_argument,       NULL, OPT_PAIRED_DEBRUIJN },
+    { "algorithm",            required_argument, NULL, 'a' },
     { "low-coverage",         no_argument,       NULL, OPT_LOWCOVERAGE },
     { "use-quality-scores",   no_argument,       NULL, OPT_QUALSCORES},
+    { "interactive",          no_argument,       NULL, OPT_INTERACTIVE},
     { "index",                required_argument, NULL, OPT_INDEX },
     { "min-dbg-count",        required_argument, NULL, OPT_MIN_DBG_COUNT },
     { "debug",                required_argument, NULL, OPT_DEBUG },
@@ -248,14 +247,7 @@ int graphDiffMain(int argc, char** argv)
     sharedParameters.referenceIndex = referenceIndex;
     sharedParameters.pRefTable = &refTable;
     sharedParameters.bReferenceMode = opt::referenceMode;
-
-    if(opt::deBruijnPairedMode)
-        sharedParameters.algorithm = GCA_PAIRED_DEBRUIJN_GRAPH;
-    else if(opt::deBruijnMode)
-        sharedParameters.algorithm = GCA_DEBRUIJN_GRAPH;
-    else
-        sharedParameters.algorithm = GCA_STRING_GRAPH;
-
+    sharedParameters.algorithm = opt::algorithm;
     sharedParameters.kmer = opt::kmer;
     sharedParameters.minDiscoveryCount = opt::minDiscoveryCount;
     sharedParameters.minDBGCount = opt::minDBGCount;
@@ -278,6 +270,10 @@ int graphDiffMain(int argc, char** argv)
     {
         runDebug(sharedParameters);
     }
+    else if(opt::interactiveMode)
+    {
+        runInteractive(sharedParameters);
+    }
     else
     {
         // If a VCF file was provided, just test the variants in the file without any discovery
@@ -445,11 +441,54 @@ void runDebug(GraphCompareParameters& parameters)
     graphCompare.testKmersFromFile(opt::debugFile);
 }
 
+// Run interactively, parse a sequence from cin, process it then wait
+void runInteractive(GraphCompareParameters& parameters)
+{
+    // Initialize a bloom filter
+    size_t expected_bits = 0;
+    if(opt::bloomGenomeSize == -1)
+    {
+        // Calculate the expected bits using the number of bases in the reference
+        for(size_t i = 0; i < parameters.pRefTable->getCount(); ++i)
+            expected_bits += parameters.pRefTable->getReadLength(i);
+    }
+    else 
+    {
+        expected_bits = opt::bloomGenomeSize;
+    }
+
+    size_t occupancy_factor = 20;
+    size_t bloom_size = occupancy_factor * expected_bits;
+
+    BloomFilter* pBloomFilter = new BloomFilter(bloom_size, 5);
+    parameters.pBloomFilter = pBloomFilter;
+    preloadBloomFilter(parameters.pRefTable, parameters.kmer, pBloomFilter);
+    
+    GraphCompare graphCompare(parameters); 
+    while(1) 
+    {
+        std::string sequence;
+        std::cout << "Input a sequence: ";
+        std::cin >> sequence;
+
+        if(sequence == "done")
+            break;
+        SequenceWorkItem item;
+        item.read.seq = sequence;
+        item.read.id = "input";
+
+        std::cout << "processing: " << sequence << "\n";
+
+        graphCompare.process(item);
+    }
+}
+
 // 
 // Handle command line arguments
 //
 void parseGraphDiffOptions(int argc, char** argv)
 {
+    std::string algorithmString;
     bool die = false;
     for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) 
     {
@@ -464,11 +503,10 @@ void parseGraphDiffOptions(int argc, char** argv)
             case 'd': arg >> opt::sampleRate; break;
             case 'p': arg >> opt::outPrefix; break;
             case 'm': arg >> opt::minOverlap; break;
+            case 'a': arg >> algorithmString; break;
             case '?': die = true; break;
             case 'v': opt::verbose++; break;
             case OPT_REFERENCE: arg >> opt::referenceFile; break;
-            case OPT_DEBRUIJN: opt::deBruijnMode = true; break;
-            case OPT_PAIRED_DEBRUIJN: opt::deBruijnPairedMode = true; break;
             case OPT_LOWCOVERAGE: opt::lowCoverage = true; break;
             case OPT_MIN_DBG_COUNT: arg >> opt::minDBGCount; break;
             case OPT_BLOOM_GENOME: arg >> opt::bloomGenomeSize; break;
@@ -477,6 +515,7 @@ void parseGraphDiffOptions(int argc, char** argv)
             case OPT_TESTVCF: arg >> opt::inputVCFFile; break;
             case OPT_INDEX: arg >> opt::indexPrefix; break;
             case OPT_QUALSCORES:  opt::useQualityScores = true; break;
+            case OPT_INTERACTIVE:  opt::interactiveMode = true; break;
             case OPT_HELP:
                 std::cout << GRAPH_DIFF_USAGE_MESSAGE;
                 exit(EXIT_SUCCESS);
@@ -508,6 +547,17 @@ void parseGraphDiffOptions(int argc, char** argv)
     if(opt::baseFile.empty())
         opt::referenceMode = true;
 
+    if(algorithmString == "paired") {
+        opt::algorithm = GCA_PAIRED_DEBRUIJN_GRAPH;
+    } else if(algorithmString == "debruijn") {
+        opt::algorithm = GCA_DEBRUIJN_GRAPH;
+    } else if(algorithmString == "string") {
+        opt::algorithm = GCA_STRING_GRAPH;
+    } else {
+        std::cerr << "Error: unrecognized algorithm string: " << algorithmString << "\n";
+        die = true;
+    }
+
     if(opt::referenceFile.empty())
     {
         std::cerr << SUBPROGRAM ": error, a --reference file must be provided\n";
diff --git a/src/SGA/index.cpp b/src/SGA/index.cpp
index 4224c8a..bc9b9a3 100644
--- a/src/SGA/index.cpp
+++ b/src/SGA/index.cpp
@@ -51,6 +51,7 @@ static const char *INDEX_USAGE_MESSAGE =
 "      --no-reverse                     suppress construction of the reverse BWT. Use this option when building the index\n"
 "                                       for reads that will be error corrected using the k-mer corrector, which only needs the forward index\n"
 "      --no-forward                     suppress construction of the forward BWT. Use this option when building the forward and reverse index separately\n"
+"      --no-sai                         suppress construction of the SAI file. This option only applies to -a ropebwt\n"
 "  -g, --gap-array=N                    use N bits of storage for each element of the gap array. Acceptable values are 4,8,16 or 32. Lower\n"
 "                                       values can substantially reduce the amount of memory required at the cost of less predictable memory usage.\n"
 "                                       When this value is set to 32, the memory requirement is essentially deterministic and requires ~5N bytes where\n"
@@ -69,13 +70,14 @@ namespace opt
     static bool bDiskAlgo = false;
     static bool bBuildReverse = true;
     static bool bBuildForward = true;
+    static bool bBuildSAI = true;
     static bool validate;
     static int gapArrayStorage = 4;
 }
 
 static const char* shortopts = "p:a:m:t:d:g:cv";
 
-enum { OPT_HELP = 1, OPT_VERSION, OPT_NO_REVERSE,OPT_NO_FWD };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_NO_REVERSE, OPT_NO_FWD, OPT_NO_SAI };
 
 static const struct option longopts[] = {
     { "verbose",     no_argument,       NULL, 'v' },
@@ -87,6 +89,7 @@ static const struct option longopts[] = {
     { "algorithm",   required_argument, NULL, 'a' },
     { "no-reverse",  no_argument,       NULL, OPT_NO_REVERSE },
     { "no-forward",  no_argument,       NULL, OPT_NO_FWD },
+    { "no-sai",      no_argument,       NULL, OPT_NO_SAI },
     { "help",        no_argument,       NULL, OPT_HELP },
     { "version",     no_argument,       NULL, OPT_VERSION },
     { NULL, 0, NULL, 0 }
@@ -154,13 +157,16 @@ void indexInMemoryRopebwt()
         std::string bwt_filename = opt::prefix + BWT_EXT;
         std::string sai_filename = opt::prefix + SAI_EXT;
         BWTCA::runRopebwt(opt::readsFile, bwt_filename, use_threads, false);
-        std::cout << "\t done bwt construction, generating .sai file\n";
 
-        BWT* pBWT = new BWT(bwt_filename);
-        SampledSuffixArray ssa;
-        ssa.buildLexicoIndex(pBWT, opt::numThreads);
-        ssa.writeLexicoIndex(sai_filename);
-        delete pBWT;
+        if(opt::bBuildSAI)
+        {
+            std::cout << "\t done bwt construction, generating .sai file\n";
+            BWT* pBWT = new BWT(bwt_filename);
+            SampledSuffixArray ssa;
+            ssa.buildLexicoIndex(pBWT, opt::numThreads);
+            ssa.writeLexicoIndex(sai_filename);
+            delete pBWT;
+        }
     }
 
     if(opt::bBuildReverse)
@@ -169,12 +175,15 @@ void indexInMemoryRopebwt()
         std::string rsai_filename = opt::prefix + RSAI_EXT;
         BWTCA::runRopebwt(opt::readsFile, rbwt_filename, use_threads, true);
 
-        std::cout << "\t done rbwt construction, generating .rsai file\n";
-        BWT* pRBWT = new BWT(rbwt_filename);
-        SampledSuffixArray ssa;
-        ssa.buildLexicoIndex(pRBWT, opt::numThreads);
-        ssa.writeLexicoIndex(rsai_filename);
-        delete pRBWT;
+        if(opt::bBuildSAI)
+        {
+            std::cout << "\t done rbwt construction, generating .rsai file\n";
+            BWT* pRBWT = new BWT(rbwt_filename);
+            SampledSuffixArray ssa;
+            ssa.buildLexicoIndex(pRBWT, opt::numThreads);
+            ssa.writeLexicoIndex(rsai_filename);
+            delete pRBWT;
+        }
     }
 }
 
@@ -279,6 +288,7 @@ void parseIndexOptions(int argc, char** argv)
             case 'v': opt::verbose++; break;
             case OPT_NO_REVERSE: opt::bBuildReverse = false; break;
             case OPT_NO_FWD: opt::bBuildForward = false; break;
+            case OPT_NO_SAI: opt::bBuildSAI = false; break;
             case OPT_HELP:
                 std::cout << INDEX_USAGE_MESSAGE;
                 exit(EXIT_SUCCESS);
@@ -321,6 +331,12 @@ void parseIndexOptions(int argc, char** argv)
         die = true;
     }
 
+    if(opt::algorithm == "ropebwt" && opt::bDiskAlgo)
+    {
+        std::cerr << SUBPROGRAM ": the options -a ropebwt and -d are not compatible, please only use one.\n";
+        die = true;
+    }
+
     if (die) 
     {
         std::cout << "\n" << INDEX_USAGE_MESSAGE;
diff --git a/src/SGA/kmer-count.cpp b/src/SGA/kmer-count.cpp
new file mode 100644
index 0000000..b6de14d
--- /dev/null
+++ b/src/SGA/kmer-count.cpp
@@ -0,0 +1,291 @@
+//-----------------------------------------------
+// Copyright 2009 Wellcome Trust Sanger Institute
+// Written by Jared Simpson/James Gurtowski (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// Kmer-count - Counting kmers in BWT or Sequence Files
+//
+
+#include <kmer-count.h>
+#include <iostream>
+#include <stack>
+#include <memory>
+#include <getopt.h>
+#include <BWT.h>
+#include <BWTInterval.h>
+#include <BWTAlgorithms.h>
+
+//
+// Getopt
+//
+
+#define SUBPROGRAM "kmer-count"
+static const char *KMERCOUNT_VERSION_MESSAGE = SUBPROGRAM " Version " PACKAGE_VERSION "\n";
+static const char *KMERCOUNT_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] src.{bwt,fa,fq} [test1.bwt] [test2.bwt]\n"
+"Generate a table of the k-mers in src.{bwt,fa,fq}, and optionaly count the number of time they appears in testX.bwt.\n"
+"Output on stdout the canonical kmers and their counts on forward and reverse strand if input is .bwt\n"
+"If src is a sequence file output forward and reverse counts for each kmer in the file\n"
+"\n"
+"      --help                           display this help and exit\n"
+"      --version                        display program version\n"
+"      -k, --kmer-size=N                The length of the kmer to use. (default: 27)\n"
+"      -d, --sample-rate=N              use occurrence array sample rate of N in the FM-index. Higher values use significantly\n"
+"                                       less memory at the cost of higher runtime. This value must be a power of 2 (default: 128)\n"
+"      -c, --cache-length=N             Cache Length for bwt lookups (default: 10)\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+
+
+namespace opt {
+    static std::string inputSequenceFile;
+    static std::vector<std::string> bwtFiles;
+    static int sampleRate = BWT::DEFAULT_SAMPLE_RATE_SMALL;
+    static unsigned int kmerLength = 27;
+    static int intervalCacheLength = 10;
+}
+
+static const char* shortopts = "d:k:c:x:";
+enum { OPT_HELP = 1, OPT_VERSION };
+static const struct option longopts[] = {
+    { "sample-rate",           required_argument, NULL, 'd' },
+    { "kmer-size",             required_argument, NULL, 'k' },
+    { "cache-length",          required_argument, NULL, 'c' },
+    { "help",                  no_argument,       NULL, OPT_HELP },
+    { "version",               no_argument,       NULL, OPT_VERSION },
+    { NULL, 0, NULL, 0 }
+};
+
+
+void parseKmerCountOptions(int argc, char** argv) {
+
+    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
+    {
+        std::istringstream arg(optarg != NULL ? optarg : "");
+        switch (c)
+        {
+            case 'd': arg >> opt::sampleRate; break;
+            case 'k': arg >> opt::kmerLength; break;
+            case 'c': arg >> opt::intervalCacheLength; break;
+            case OPT_HELP:
+                std::cout << KMERCOUNT_USAGE_MESSAGE;
+                exit(EXIT_SUCCESS);
+            case OPT_VERSION:
+                std::cout << KMERCOUNT_VERSION_MESSAGE;
+                exit(EXIT_SUCCESS);
+        }
+    }
+
+    if(opt::kmerLength <= 0 || opt::kmerLength % 2 == 0)
+    {
+        std::cerr << SUBPROGRAM ": invalid kmer length: " << opt::kmerLength << ", must be greater than zero and odd\n";
+        std::cout << "\n" << KMERCOUNT_USAGE_MESSAGE;
+        exit(EXIT_FAILURE);
+    }
+
+    if(optind >= argc)
+    {
+      std::cerr << SUBPROGRAM ": missing input bwt/sequence file" << std::endl;
+      std::cout << "\n" << KMERCOUNT_USAGE_MESSAGE;
+      exit(EXIT_FAILURE);
+    }
+
+    std::string inFile(argv[optind]);
+    
+    if(inFile.substr(inFile.find_last_of(".")) != ".bwt")
+    {
+      opt::inputSequenceFile = inFile;
+      ++optind;
+    }
+
+    for(;optind<argc;++optind)
+    {
+        opt::bwtFiles.push_back(argv[optind]);
+    }
+
+    if (opt::inputSequenceFile.empty() && opt::bwtFiles.size() < 1)
+    {
+        std::cerr << SUBPROGRAM ": missing arguments\n";
+        std::cout << "\n" << KMERCOUNT_USAGE_MESSAGE;
+        exit(EXIT_FAILURE);
+    }
+}
+
+
+
+//
+// BWT Traversal algorithm
+//
+
+// Stack structure used in the depth first search of kmers
+struct stack_elt_t
+{
+    size_t str_sz;
+    char bp;
+    BWTInterval range;
+    stack_elt_t(size_t strsz,char bp):str_sz(strsz),bp(bp){}
+    stack_elt_t(size_t strsz,char bp,const BWTInterval& range):str_sz(strsz),bp(bp),range(range){}
+};
+
+
+
+// extract all canonical kmers of a bwt by performing a backward depth-first-search
+void traverse_kmer(const std::vector<BWTIndexSet> &indicies)
+{
+    std::stack< stack_elt_t > stack;
+    std::string str; // string storing the current path
+
+    // Intitialize the search with root elements
+    for(size_t i = 0; i < DNA_ALPHABET_SIZE; ++i)
+    {
+        stack_elt_t e(str.size(),ALPHABET[i]);
+        BWTAlgorithms::initInterval(e.range,e.bp,indicies[0].pBWT);
+        if (e.range.isValid()) stack.push(e);
+    }
+
+    std::vector<BWTIndexSet>::const_iterator idx_it;
+
+    // Perform the kmer search
+    while(!stack.empty())
+    {
+        // pop an element from the stack, and update the string path accordingly
+        stack_elt_t top = stack.top();
+        stack.pop();
+        str.resize(top.str_sz);
+        str.push_back(top.bp);
+        if (str.length()>= opt::kmerLength) {
+            // we found a kmer, retreive the number of occurence
+            std::string seq(reverse(str));
+            int64_t seq_count = top.range.size();
+
+            // look for the count of its reverse complement
+            std::string seq_rc(complement(str));
+            BWTInterval range_rc = BWTAlgorithms::findInterval(indicies[0].pBWT,seq_rc);
+            int64_t seq_rc_count = range_rc.isValid()?range_rc.size():0;
+
+            // print the current kmer if canonical
+            if (seq<seq_rc) {
+              std::cout << seq << '\t' << seq_count << '\t' << seq_rc_count;
+              
+              for(idx_it = indicies.begin() + 1;
+                  idx_it != indicies.end(); ++idx_it)
+              {
+                std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq, *idx_it);
+                std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq_rc, *idx_it);
+              }
+              std::cout << std::endl;
+
+            } else if (seq_rc_count<=0) {
+                // the current kmer is not canonical, but the reverse complement doesn't exists
+                // so print it now as it will never be traversed by the searching algorithm
+                std::cout << seq_rc << '\t' << seq_rc_count << '\t' << seq_count;
+                for(idx_it = indicies.begin() + 1;
+                    idx_it != indicies.end(); ++idx_it)
+                {
+                  std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq_rc, *idx_it);
+                  std::cout << "\t" << BWTAlgorithms::countSequenceOccurrencesSingleStrand(seq, *idx_it);
+                }
+                std::cout << std::endl;
+            }
+        } else
+        {
+            // not yet a suffix of size k, push next candidates
+            for(size_t i = 0; i < DNA_ALPHABET_SIZE; ++i)
+            {
+                stack_elt_t e(str.size(),ALPHABET[i],top.range);
+                BWTAlgorithms::updateInterval(e.range,e.bp,indicies[0].pBWT);
+                if (e.range.isValid()) stack.push(e);
+            }
+        }
+    }
+}
+
+
+
+
+void kmers_from_file(const std::string &inputFile,
+                     const std::vector<BWTIndexSet> bwtIndicies)
+{
+
+    //Init sequence reader
+    SeqReader reader(inputFile, SRF_NO_VALIDATION);
+    SeqRecord record;
+    std::string kmer, kmer_rc;
+    std::cerr << "Processing " << inputFile << std::endl;
+
+    size_t kmer_idx;
+    std::vector<BWTIndexSet>::const_iterator indexset_it;
+
+    //read the sequences from the file and split into kmers
+    while(reader.get(record))
+    {
+      for(kmer_idx=0; kmer_idx < record.seq.length()-opt::kmerLength+1; ++kmer_idx)
+      {
+        kmer = record.seq.substr(kmer_idx, opt::kmerLength);
+        kmer_rc = reverseComplement(kmer);
+
+        std::cout << record.id << "\t" << kmer_idx << "\t" << kmer;
+          
+        //print out kmer count for all the bwts
+        for(indexset_it = bwtIndicies.begin(); 
+            indexset_it != bwtIndicies.end(); 
+            ++indexset_it)
+        {
+          std::cout << '\t' << BWTAlgorithms::countSequenceOccurrencesSingleStrand(kmer, *indexset_it);
+          std::cout << '\t' << BWTAlgorithms::countSequenceOccurrencesSingleStrand(kmer_rc, *indexset_it);
+        }
+        
+        std::cout << std::endl;
+
+      }
+    }
+}
+
+//
+// Main
+//
+
+int kmerCountMain(int argc, char** argv)
+{
+    // parse command line arguments
+    parseKmerCountOptions(argc,argv);
+
+    // allocate BWT objects
+    std::vector<BWTIndexSet> bwtIndicies;
+    BWTIndexSet tmpIdx;
+
+    for(std::vector<std::string>::iterator it = opt::bwtFiles.begin();
+        it != opt::bwtFiles.end(); ++it)
+    {
+      std::cerr << "Loading " << *it << std::endl;
+
+      tmpIdx.pBWT = new BWT(*it, opt::sampleRate);
+      tmpIdx.pCache = new BWTIntervalCache(opt::intervalCacheLength, tmpIdx.pBWT);
+      
+      bwtIndicies.push_back(tmpIdx);
+    }
+
+    if( opt::inputSequenceFile.empty() )
+    {
+      // run kmer search
+      traverse_kmer(bwtIndicies);
+    }else
+    {
+      kmers_from_file(opt::inputSequenceFile, bwtIndicies);
+    }
+
+    // clean memory
+    std::vector<BWTIndexSet>::const_iterator indexset_it;
+    for(indexset_it = bwtIndicies.begin(); 
+        indexset_it != bwtIndicies.end();
+        ++indexset_it)
+    {
+      delete (*indexset_it).pBWT;
+      delete (*indexset_it).pCache;
+    }
+    return 0;
+}
+
+
+
diff --git a/src/SGA/kmer-count.h b/src/SGA/kmer-count.h
new file mode 100644
index 0000000..e44ba29
--- /dev/null
+++ b/src/SGA/kmer-count.h
@@ -0,0 +1,8 @@
+#ifndef KMERCOUNT_H
+#define KMERCOUNT_H
+#include "config.h"
+
+int kmerCountMain(int argc, char** argv);
+void parseKmerCountOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SGA/overlap.cpp b/src/SGA/overlap-long.cpp
similarity index 65%
copy from src/SGA/overlap.cpp
copy to src/SGA/overlap-long.cpp
index 0e65452..81c9652 100644
--- a/src/SGA/overlap.cpp
+++ b/src/SGA/overlap-long.cpp
@@ -1,17 +1,17 @@
-//-----------------------------------------------
-// Copyright 2009 Wellcome Trust Sanger Institute
-// Written by Jared Simpson (js18 at sanger.ac.uk)
+//-----------------------------------------------------
+// Copyright 2013 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
 // Released under the GPL
-//-----------------------------------------------
+//-----------------------------------------------------
 //
-// overlap - compute pairwise overlaps between reads
+// overlap-long - compute overlaps for long reads
 //
 #include <iostream>
 #include <fstream>
 #include <sstream>
 #include <iterator>
 #include "Util.h"
-#include "overlap.h"
+#include "overlap-long.h"
 #include "SuffixArray.h"
 #include "BWT.h"
 #include "SGACommon.h"
@@ -23,13 +23,7 @@
 #include "SequenceProcessFramework.h"
 #include "OverlapProcess.h"
 #include "ReadInfoTable.h"
-
-//
-enum OutputType
-{
-    OT_ASQG,
-    OT_RAW
-};
+#include "KmerOverlaps.h"
 
 // Functions
 size_t computeHitsSerial(const std::string& prefix, const std::string& readsFile, 
@@ -47,16 +41,16 @@ void convertHitsToASQG(const std::string& indexPrefix, const StringVector& hitsF
 //
 // Getopt
 //
-#define SUBPROGRAM "overlap"
-static const char *OVERLAP_VERSION_MESSAGE =
+#define SUBPROGRAM "overlap-long"
+static const char *OVERLAP_LONG_VERSION_MESSAGE =
 SUBPROGRAM " Version " PACKAGE_VERSION "\n"
 "Written by Jared Simpson.\n"
 "\n"
-"Copyright 2009 Wellcome Trust Sanger Institute\n";
+"Copyright 2013 Ontario Institute for Cancer Research\n";
 
-static const char *OVERLAP_USAGE_MESSAGE =
+static const char *OVERLAP_LONG_USAGE_MESSAGE =
 "Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... READSFILE\n"
-"Compute pairwise overlap between all the sequences in READS\n"
+"Compute pairwise overlap between all the long reads in READS\n"
 "\n"
 "      --help                           display this help and exit\n"
 "      -v, --verbose                    display verbose output\n"
@@ -64,9 +58,6 @@ static const char *OVERLAP_USAGE_MESSAGE =
 "      -e, --error-rate                 the maximum error rate allowed to consider two sequences aligned (default: exact matches only)\n"
 "      -m, --min-overlap=LEN            minimum overlap required between two reads (default: 45)\n"
 "      -f, --target-file=FILE           perform the overlap queries against the reads in FILE\n"
-"      -x, --exhaustive                 output all overlaps, including transitive edges\n"
-"          --exact                      force the use of the exact-mode irreducible block algorithm. This is faster\n"
-"                                       but requires that no substrings are present in the input set.\n"
 "      -l, --seed-length=LEN            force the seed length to be LEN. By default, the seed length in the overlap step\n"
 "                                       is calculated to guarantee all overlaps with --error-rate differences are found.\n"
 "                                       This option removes the guarantee but will be (much) faster. As SGA can tolerate some\n"
@@ -84,14 +75,13 @@ namespace opt
 {
     static unsigned int verbose;
     static int numThreads = 1;
-    static OutputType outputType = OT_ASQG;
     static std::string readsFile;
     static std::string targetFile;
     static std::string outFile;
     
     static double errorRate = 0.0f;
     static unsigned int minOverlap = DEFAULT_MIN_OVERLAP;
-    static int seedLength = 0;
+    static int seedLength = 100;
     static int seedStride = 0;
     static int sampleRate = BWT::DEFAULT_SAMPLE_RATE_SMALL;
     static bool bIrreducibleOnly = true;
@@ -120,15 +110,42 @@ static const struct option longopts[] = {
     { NULL, 0, NULL, 0 }
 };
 
+// Write an ascii pictogram of an overlap
+// O = overhang of length block_size (region of no overlap)
+// P = perfect match of length block_size
+// I = matched block with mismatches
+std::string ascii_overlap(const std::string& s0, const std::string& s1,
+                          SequenceOverlap& ovr, int block_size = 50)
+{
+    std::string out;
+
+    int pre_overhang = ovr.match[0].start / block_size;
+    int post_overhang = (ovr.length[0] - ovr.match[0].end - 1) / block_size;
+
+    std::string p0;
+    std::string p1;
+
+    ovr.makePaddedMatches(s0, s1, &p0, &p1);
+
+    out.append(pre_overhang, '-');
+
+    for(size_t i = 0; i < p0.size(); i += block_size)
+    {
+        if(p0.substr(i, block_size) == p1.substr(i, block_size))
+            out.append(1, '=');
+        else
+            out.append(1, 'x');
+    }
+    out.append(post_overhang, '-');
+    return out;
+}
+
 //
 // Main
 //
-int overlapMain(int argc, char** argv)
+int overlapLongMain(int argc, char** argv)
 {
-    parseOverlapOptions(argc, argv);
-
-    // Prepare the output ASQG file
-    assert(opt::outputType == OT_ASQG);
+    parseOverlapLongOptions(argc, argv);
 
     // Open output file
     std::ostream* pASQGWriter = createWriter(opt::outFile);
@@ -138,13 +155,9 @@ int overlapMain(int argc, char** argv)
     headerRecord.setOverlapTag(opt::minOverlap);
     headerRecord.setErrorRateTag(opt::errorRate);
     headerRecord.setInputFileTag(opt::readsFile);
-    headerRecord.setContainmentTag(true); // containments are always present
-    headerRecord.setTransitiveTag(!opt::bIrreducibleOnly);
+    headerRecord.setTransitiveTag(true);
     headerRecord.write(*pASQGWriter);
 
-    // Compute the overlap hits
-    StringVector hitsFilenames;
-
     // Determine which index files to use. If a target file was provided,
     // use the index of the target reads
     std::string indexPrefix;
@@ -154,46 +167,102 @@ int overlapMain(int argc, char** argv)
         indexPrefix = stripFilename(opt::readsFile);
 
     BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
-    BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate);
-    OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, 
-                                                         opt::errorRate, opt::seedLength, 
-                                                         opt::seedStride, opt::bIrreducibleOnly);
-
-    pOverlapper->setExactModeOverlap(opt::errorRate <= 0.0001);
-    pOverlapper->setExactModeIrreducible(opt::errorRate <= 0.0001);
-
+    SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI);
+    
     Timer* pTimer = new Timer(PROGRAM_IDENT);
     pBWT->printInfo();
 
-    // Make a prefix for the temporary hits files
-    std::string outPrefix;
-    outPrefix = stripFilename(opt::readsFile);
-    if(!opt::targetFile.empty())
+    // Read the sequence file and write vertex records for each
+    // Also store the read names in a vector of strings
+    ReadTable reads;
+    
+    SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION);
+    SeqRecord record;
+    while(pReader->get(record))
     {
-        outPrefix.append(1, '.');
-        outPrefix.append(stripFilename(opt::targetFile));
-    }
+        reads.addRead(record.toSeqItem());
+        ASQG::VertexRecord vr(record.id, record.seq.toString());
+        vr.write(*pASQGWriter);
 
-    if(opt::numThreads <= 1)
-    {
-        printf("[%s] starting serial-mode overlap computation\n", PROGRAM_IDENT);
-        computeHitsSerial(outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
-    }
-    else
-    {
-        printf("[%s] starting parallel-mode overlap computation with %d threads\n", PROGRAM_IDENT, opt::numThreads);
-        computeHitsParallel(opt::numThreads, outPrefix, opt::readsFile, pOverlapper, opt::minOverlap, hitsFilenames, pASQGWriter);
+        if(reads.getCount() % 100000 == 0)
+            printf("Read %zu sequences\n", reads.getCount());
     }
 
-    // Get the number of strings in the BWT, this is used to pre-allocated the read table
-    delete pOverlapper;
-    delete pBWT; 
-    delete pRBWT;
+    delete pReader;
+    pReader = NULL;
 
-    // Parse the hits files and write the overlaps to the ASQG file
-    convertHitsToASQG(indexPrefix, hitsFilenames, pASQGWriter);
+    BWTIndexSet index;
+    index.pBWT = pBWT;
+    index.pSSA = pSSA;
+    index.pReadTable = &reads;
+
+    // Make a prefix for the temporary hits files
+    size_t n_reads = reads.getCount();
+
+    omp_set_num_threads(opt::numThreads);
+
+#pragma omp parallel for
+    for(size_t read_idx = 0; read_idx < n_reads; ++read_idx)
+    {
+        const SeqItem& curr_read = reads.getRead(read_idx);
+
+        printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length());
+        SequenceOverlapPairVector sopv = 
+            KmerOverlaps::retrieveMatches(curr_read.seq.toString(),
+                                          opt::seedLength,
+                                          opt::minOverlap,
+                                          1 - opt::errorRate,
+                                          100,
+                                          index);
+
+        printf("Found %zu matches\n", sopv.size());
+        for(size_t i = 0; i < sopv.size(); ++i)
+        {
+            std::string match_id = reads.getRead(sopv[i].match_idx).id;
+
+            // We only want to output each edge once so skip this overlap
+            // if the matched read has a lexicographically lower ID
+            if(curr_read.id > match_id)
+                continue;
+
+            std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50);
+            printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(),
+                                                                sopv[i].overlap.match[0].start,
+                                                                sopv[i].overlap.match[0].end,
+                                                                match_id.c_str(),
+                                                                sopv[i].overlap.getOverlapLength(),
+                                                                sopv[i].overlap.getPercentIdentity(),
+                                                                sopv[i].overlap.cigar.c_str());
+
+            // Convert to ASQG
+            SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]);
+            SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]);
+            
+            // KmerOverlaps returns the coordinates of the overlap after flipping the reads
+            // to ensure the strand matches. The ASQG file wants the coordinate of the original
+            // sequencing strand. Flip here if necessary
+            if(sopv[i].is_reversed)
+                sc2.flip();
+
+            // Convert the SequenceOverlap the ASQG's overlap format
+            Overlap ovr(curr_read.id, sc1, match_id,  sc2, sopv[i].is_reversed, -1);
+
+            ASQG::EdgeRecord er(ovr);
+            er.setCigarTag(sopv[i].overlap.cigar);
+            er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity());
+
+#pragma omp critical
+            {
+                er.write(*pASQGWriter);
+            }
+        }
+    }
 
     // Cleanup
+    delete pReader;
+    delete pBWT; 
+    delete pSSA;
+    
     delete pASQGWriter;
     delete pTimer;
     if(opt::numThreads > 1)
@@ -202,6 +271,7 @@ int overlapMain(int argc, char** argv)
     return 0;
 }
 
+/*
 // Compute the hits for each read in the input file without threading
 // Return the number of reads processed
 size_t computeHitsSerial(const std::string& prefix, const std::string& readsFile, 
@@ -256,66 +326,12 @@ size_t computeHitsParallel(int numThreads, const std::string& prefix, const std:
         delete processorVector[i];
     return numProcessed;
 }
-
-//
-void convertHitsToASQG(const std::string& indexPrefix, const StringVector& hitsFilenames, std::ostream* pASQGWriter)
-{
-    // Load the suffix array index and the reverse suffix array index
-    // Note these are not the full suffix arrays
-    SuffixArray* pFwdSAI = new SuffixArray(indexPrefix + SAI_EXT);
-    SuffixArray* pRevSAI = new SuffixArray(indexPrefix + RSAI_EXT);
-
-    // Load the ReadInfoTable for the queries to look up the ID and lengths of the hits
-    ReadInfoTable* pQueryRIT = new ReadInfoTable(opt::readsFile);
-
-    // If the target file is not the query file, load its ReadInfoTable
-    ReadInfoTable* pTargetRIT;
-    if(!opt::targetFile.empty() && opt::targetFile != opt::readsFile)
-        pTargetRIT = new ReadInfoTable(opt::targetFile);
-    else
-        pTargetRIT = pQueryRIT;
-
-    bool bIsSelfCompare = pTargetRIT == pQueryRIT;
-
-    // Convert the hits to overlaps and write them to the asqg file as initial edges
-    for(StringVector::const_iterator iter = hitsFilenames.begin(); iter != hitsFilenames.end(); ++iter)
-    {
-        printf("[%s] parsing file %s\n", PROGRAM_IDENT, iter->c_str());
-        std::istream* pReader = createReader(*iter);
-    
-        // Read each hit sequentially, converting it to an overlap
-        std::string line;
-        while(getline(*pReader, line))
-        {
-            size_t readIdx;
-            size_t totalEntries;
-            bool isSubstring;
-            OverlapVector ov;
-            OverlapCommon::parseHitsString(line, pQueryRIT, pTargetRIT, pFwdSAI, pRevSAI, bIsSelfCompare, readIdx, totalEntries, ov, isSubstring);
-            for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter)
-            {
-                ASQG::EdgeRecord edgeRecord(*iter);
-                edgeRecord.write(*pASQGWriter);
-            }
-        }
-        delete pReader;
-
-        // delete the hits file
-        unlink(iter->c_str());
-    }
-
-    // Deallocate data
-    if(pTargetRIT != pQueryRIT)
-        delete pTargetRIT;
-    delete pFwdSAI;
-    delete pRevSAI;
-    delete pQueryRIT;
-}
+*/
 
 // 
 // Handle command line arguments
 //
-void parseOverlapOptions(int argc, char** argv)
+void parseOverlapLongOptions(int argc, char** argv)
 {
     bool die = false;
     for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) 
@@ -336,10 +352,10 @@ void parseOverlapOptions(int argc, char** argv)
             case '?': die = true; break;
             case 'v': opt::verbose++; break;
             case OPT_HELP:
-                std::cout << OVERLAP_USAGE_MESSAGE;
+                std::cout << OVERLAP_LONG_USAGE_MESSAGE;
                 exit(EXIT_SUCCESS);
             case OPT_VERSION:
-                std::cout << OVERLAP_VERSION_MESSAGE;
+                std::cout << OVERLAP_LONG_VERSION_MESSAGE;
                 exit(EXIT_SUCCESS);
         }
     }
@@ -369,7 +385,7 @@ void parseOverlapOptions(int argc, char** argv)
 
     if (die) 
     {
-        std::cout << "\n" << OVERLAP_USAGE_MESSAGE;
+        std::cout << "\n" << OVERLAP_LONG_USAGE_MESSAGE;
         exit(EXIT_FAILURE);
     }
 
diff --git a/src/SGA/overlap-long.h b/src/SGA/overlap-long.h
new file mode 100644
index 0000000..9bdf838
--- /dev/null
+++ b/src/SGA/overlap-long.h
@@ -0,0 +1,26 @@
+//-----------------------------------------------------
+// Copyright 2013 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+// Released under the GPL
+//-----------------------------------------------------
+//
+// overlap-long - compute overlaps for long reads
+//
+#ifndef OVERLAP_LONG_H
+#define OVERLAP_LONG_H
+#include <getopt.h>
+#include "config.h"
+#include "BWT.h"
+#include "Match.h"
+#include "BWTAlgorithms.h"
+#include "OverlapAlgorithm.h"
+
+// functions
+
+//
+int overlapLongMain(int argc, char** argv);
+
+// options
+void parseOverlapLongOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SGA/overlap.cpp b/src/SGA/overlap.cpp
index 0e65452..f1cb50c 100644
--- a/src/SGA/overlap.cpp
+++ b/src/SGA/overlap.cpp
@@ -63,6 +63,7 @@ static const char *OVERLAP_USAGE_MESSAGE =
 "      -t, --threads=NUM                use NUM worker threads to compute the overlaps (default: no threading)\n"
 "      -e, --error-rate                 the maximum error rate allowed to consider two sequences aligned (default: exact matches only)\n"
 "      -m, --min-overlap=LEN            minimum overlap required between two reads (default: 45)\n"
+"      -p, --prefix=PREFIX              use PREFIX for the names of the index files (default: prefix of the input file)\n"
 "      -f, --target-file=FILE           perform the overlap queries against the reads in FILE\n"
 "      -x, --exhaustive                 output all overlaps, including transitive edges\n"
 "          --exact                      force the use of the exact-mode irreducible block algorithm. This is faster\n"
@@ -88,7 +89,8 @@ namespace opt
     static std::string readsFile;
     static std::string targetFile;
     static std::string outFile;
-    
+    static std::string prefix;
+  
     static double errorRate = 0.0f;
     static unsigned int minOverlap = DEFAULT_MIN_OVERLAP;
     static int seedLength = 0;
@@ -98,7 +100,7 @@ namespace opt
     static bool bExactIrreducible = false;
 }
 
-static const char* shortopts = "m:d:e:t:l:s:o:f:vix";
+static const char* shortopts = "m:d:e:t:l:s:o:f:p:vix";
 
 enum { OPT_HELP = 1, OPT_VERSION, OPT_EXACT };
 
@@ -148,11 +150,15 @@ int overlapMain(int argc, char** argv)
     // Determine which index files to use. If a target file was provided,
     // use the index of the target reads
     std::string indexPrefix;
-    if(!opt::targetFile.empty())
-        indexPrefix = stripFilename(opt::targetFile);
+    if(!opt::prefix.empty())
+      indexPrefix = opt::prefix;
     else
-        indexPrefix = stripFilename(opt::readsFile);
-
+    {
+      if(!opt::targetFile.empty())
+        indexPrefix = stripExtension(opt::targetFile);
+      else
+        indexPrefix = stripExtension(opt::readsFile);
+    }
     BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate);
     BWT* pRBWT = new BWT(indexPrefix + RBWT_EXT, opt::sampleRate);
     OverlapAlgorithm* pOverlapper = new OverlapAlgorithm(pBWT, pRBWT, 
@@ -325,6 +331,7 @@ void parseOverlapOptions(int argc, char** argv)
         {
             case 'm': arg >> opt::minOverlap; break;
             case 'o': arg >> opt::outFile; break;
+            case 'p': arg >> opt::prefix; break;
             case 'e': arg >> opt::errorRate; break;
             case 't': arg >> opt::numThreads; break;
             case 'l': arg >> opt::seedLength; break;
diff --git a/src/SGA/preprocess.cpp b/src/SGA/preprocess.cpp
index 503e171..93c0282 100644
--- a/src/SGA/preprocess.cpp
+++ b/src/SGA/preprocess.cpp
@@ -36,6 +36,7 @@ static const char *PREPROCESS_USAGE_MESSAGE =
 "\n"
 "      --help                           display this help and exit\n"
 "      -v, --verbose                    display verbose output\n"
+"          --seed                       set random seed\n"
 "\nInput/Output options:\n"
 "      -o, --out=FILE                   write the reads to FILE (default: stdout)\n"
 "      -p, --pe-mode=INT                0 - do not treat reads as paired (default)\n"
@@ -45,6 +46,7 @@ static const char *PREPROCESS_USAGE_MESSAGE =
 "          --pe-orphans=FILE            if one half of a read pair fails filtering, write the passed half to FILE\n"
 "\nConversions/Filtering:\n"
 "          --phred64                    convert quality values from phred-64 to phred-33.\n"
+"          --discard-quality            do not output quality scores\n"
 "      -q, --quality-trim=INT           perform Heng Li's BWA quality trim algorithm. \n"
 "                                       Reads are trimmed according to the formula:\n"
 "                                       argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l<INT\n"
@@ -80,6 +82,7 @@ enum QualityScaling
 namespace opt
 {
     static unsigned int verbose;
+    static unsigned int seed = 0;
     static std::string outFile;
     static unsigned int qualityTrim = 0;
     static unsigned int hardClip = 0;
@@ -89,6 +92,7 @@ namespace opt
     static double sampleFreq = 1.0f;
 
     static bool bDiscardAmbiguous = true;
+    static bool bDiscardQuality = false;
     static QualityScaling qualityScale = QS_SANGER;
 
     static bool bFilterGC = false;
@@ -106,8 +110,11 @@ namespace opt
 
 static const char* shortopts = "o:q:m:h:p:r:c:s:f:vi";
 
-enum { OPT_HELP = 1, OPT_VERSION, OPT_PERMUTE, OPT_QSCALE, OPT_MINGC, OPT_MAXGC, 
-       OPT_DUST, OPT_DUST_THRESHOLD, OPT_SUFFIX, OPT_PHRED64, OPT_OUTPUTORPHANS, OPT_DISABLE_PRIMER };
+enum { OPT_HELP = 1, OPT_SEED, OPT_VERSION, OPT_PERMUTE,
+       OPT_QSCALE, OPT_MINGC, OPT_MAXGC,
+       OPT_DUST, OPT_DUST_THRESHOLD, OPT_SUFFIX,
+       OPT_PHRED64, OPT_OUTPUTORPHANS, OPT_DISABLE_PRIMER,
+       OPT_DISCARD_QUALITY };
 
 static const struct option longopts[] = {
     { "verbose",                no_argument,       NULL, 'v' },
@@ -130,7 +137,9 @@ static const struct option longopts[] = {
     { "help",                   no_argument,       NULL, OPT_HELP },
     { "version",                no_argument,       NULL, OPT_VERSION },
     { "permute-ambiguous",      no_argument,       NULL, OPT_PERMUTE },
+    { "discard-quality",        no_argument,       NULL, OPT_DISCARD_QUALITY },
     { "no-primer-check",        no_argument,       NULL, OPT_DISABLE_PRIMER },
+    { "seed",                   required_argument, NULL, OPT_SEED },
     { NULL, 0, NULL, 0 }
 };
 
@@ -150,6 +159,12 @@ int preprocessMain(int argc, char** argv)
     Timer* pTimer = new Timer("sga preprocess");
     parsePreprocessOptions(argc, argv);
 
+    // set random seed
+    if (opt::seed == 0)
+    {
+        opt::seed = time(NULL);
+    }
+
     std::cerr << "Parameters:\n";
     std::cerr << "QualTrim: " << opt::qualityTrim << "\n";
 
@@ -169,6 +184,8 @@ int preprocessMain(int argc, char** argv)
     std::cerr << "Orphan file: " << (opt::orphanFile.empty() ? "none" : opt::orphanFile) << "\n";
     if(opt::bDiscardAmbiguous)
         std::cerr << "Discarding sequences with ambiguous bases\n";
+    if(opt::bDiscardQuality)
+        std::cerr << "Discarding quality scores\n";
     if(opt::bDustFilter)
         std::cerr << "Dust threshold: " << opt::dustThreshold << "\n";
     if(!opt::suffix.empty())
@@ -179,9 +196,10 @@ int preprocessMain(int argc, char** argv)
         std::cerr << "Adapter sequence fwd: " << opt::adapterF << "\n";
         std::cerr << "Adapter sequence rev: " << opt::adapterR << "\n";
     }
+    std::cerr << "Seed: " << opt::seed << "\n";
 
     // Seed the RNG
-    srand(time(NULL));
+    srand(opt::seed);
 
     std::ostream* pWriter;
     if(opt::outFile.empty())
@@ -244,6 +262,13 @@ int preprocessMain(int argc, char** argv)
                 // Read from separate files
                 std::string filename1 = argv[optind++];
                 std::string filename2 = argv[optind++];
+                
+                if(filename1 == "-" || filename2 == "-")
+                {
+                    std::cerr << "Reading from stdin is not supported in --pe-mode 1\n";
+                    std::cerr << "Maybe you meant --pe-mode 2 (interleaved pairs?)\n";
+                    exit(EXIT_FAILURE);
+                }
 
                 pReader1 = new SeqReader(filename1, SRF_NO_VALIDATION);
                 pReader2 = new SeqReader(filename2, SRF_NO_VALIDATION);
@@ -321,7 +346,6 @@ int preprocessMain(int argc, char** argv)
             }
             delete pReader1;
             pReader1 = NULL;
-
         }
 
     }
@@ -500,7 +524,11 @@ bool processRead(SeqRecord& record)
     }
 
     record.seq = seqStr;
-    record.qual = qualStr;
+
+    if(opt::bDiscardQuality)
+        record.qual.clear();
+    else
+        record.qual = qualStr;
 
     if(record.seq.length() == 0 || record.seq.length() < opt::minLength)
         return false;
@@ -611,6 +639,8 @@ void parsePreprocessOptions(int argc, char** argv)
             case OPT_PERMUTE: opt::bDiscardAmbiguous = false; break;
             case OPT_DUST: opt::bDustFilter = true; break;
             case OPT_DISABLE_PRIMER: opt::bDisablePrimerCheck = true; break;
+            case OPT_DISCARD_QUALITY: opt::bDiscardQuality = true; break;
+            case OPT_SEED: arg >> opt::seed; break;
             case OPT_HELP:
                 std::cout << PREPROCESS_USAGE_MESSAGE;
                 exit(EXIT_SUCCESS);
diff --git a/src/SGA/preqc.cpp b/src/SGA/preqc.cpp
index ec8c006..4c39e4a 100644
--- a/src/SGA/preqc.cpp
+++ b/src/SGA/preqc.cpp
@@ -158,6 +158,7 @@ static const char *PREQC_USAGE_MESSAGE =
 "      --help                           display this help and exit\n"
 "      -v, --verbose                    display verbose output\n"
 "      -t, --threads=NUM                use NUM threads (default: 1)\n"
+"          --simple                     only compute the metrics that do not need the FM-index\n"
 "          --max-contig-length=N        stop contig extension at N bp (default: 50000)\n"
 "          --reference=FILE             use the reference FILE to calculate GC plot\n"
 "          --diploid-reference-mode     generate metrics assuming that the input data\n"
@@ -181,11 +182,12 @@ namespace opt
     static std::string referenceFile;
     static int diploidReferenceMode = 0;
     static bool forceEM = false;
+    static bool simple = false;
 }
 
 static const char* shortopts = "p:d:t:o:k:n:b:v";
 
-enum { OPT_HELP = 1, OPT_VERSION, OPT_REFERENCE, OPT_MAX_CONTIG, OPT_DIPLOID, OPT_FORCE_EM };
+enum { OPT_HELP = 1, OPT_VERSION, OPT_REFERENCE, OPT_MAX_CONTIG, OPT_DIPLOID, OPT_FORCE_EM, OPT_SIMPLE };
 
 static const struct option longopts[] = {
     { "verbose",                no_argument,       NULL, 'v' },
@@ -197,6 +199,7 @@ static const struct option longopts[] = {
     { "branch-cutoff",          required_argument, NULL, 'b' },
     { "max-contig-length",      required_argument, NULL, OPT_MAX_CONTIG },
     { "reference",              required_argument, NULL, OPT_REFERENCE },
+    { "simple",                 no_argument,       NULL, OPT_SIMPLE },
     { "force-EM",               no_argument,       NULL, OPT_FORCE_EM },
     { "diploid-reference-mode", no_argument,       NULL, OPT_DIPLOID },
     { "help",                   no_argument,       NULL, OPT_HELP },
@@ -689,6 +692,9 @@ void generate_position_of_first_error(JSONWriter* pWriter, const BWTIndexSet& in
     for(size_t i = 0; i < n_samples; ++i)
     {
         std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);
+        if(s.length() < k)
+            continue;
+
         size_t nk = s.size() - k + 1;
         size_t first_kmer_count = 
             BWTAlgorithms::countSequenceOccurrences(s.substr(0, k), index_set.pBWT);
@@ -760,6 +766,9 @@ void generate_errors_per_base(JSONWriter* pWriter, const BWTIndexSet& index_set)
     for(int i = 0; i < n_samples; ++i)
     {
         std::string s = BWTAlgorithms::sampleRandomString(index_set.pBWT);
+        if(s.length() < k)
+            continue;
+
         KmerOverlaps::retrieveMatches(s, k, min_overlap, max_error_rate, 2, index_set);
         //KmerOverlaps::approximateMatch(s, min_overlap, max_error_rate, 2, 200, index_set);
 
@@ -1963,43 +1972,51 @@ int preQCMain(int argc, char** argv)
     parsePreQCOptions(argc, argv);
     Timer* pTimer = new Timer(PROGRAM_IDENT);
 
-    fprintf(stderr, "Loading FM-index of %s\n", opt::readsFile.c_str());
-    BWTIndexSet index_set;
-    index_set.pBWT = new BWT(opt::prefix + BWT_EXT);
-    index_set.pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);
-    index_set.pCache = new BWTIntervalCache(10, index_set.pBWT);
-    
     rapidjson::FileStream f(stdout);
     JSONWriter writer(f);
 
     // Top-level document
     writer.StartObject();
     
-    if(!opt::diploidReferenceMode)
-    {
-        GenomeEstimates estimates = generate_genome_size(&writer, index_set);
-        generate_branch_classification(&writer, estimates, index_set);
-        generate_de_bruijn_simulation(&writer, estimates, index_set);
-        generate_gc_distribution(&writer, index_set);
-        generate_quality_stats(&writer, opt::readsFile);
-        generate_pe_fragment_sizes(&writer, index_set);
-        generate_kmer_coverage(&writer, index_set);
-        generate_position_of_first_error(&writer, index_set);
-        generate_errors_per_base(&writer, index_set);
-        generate_unipath_length_data(&writer, index_set);
-        generate_duplication_rate(&writer, index_set);
-    }
-    else
+    // In simple mode we only compute metrics that do not need the FM-index
+    generate_quality_stats(&writer, opt::readsFile);
+
+    // Load the FM-index and compute the rest of the metrics if requested
+    if(!opt::simple)
     {
-        generate_reference_branch_classification(&writer, index_set);
+        BWTIndexSet index_set;
+        fprintf(stderr, "Loading FM-index of %s\n", opt::readsFile.c_str());
+        index_set.pBWT = new BWT(opt::prefix + BWT_EXT);
+        index_set.pSSA = new SampledSuffixArray(opt::prefix + SAI_EXT, SSA_FT_SAI);
+        index_set.pCache = new BWTIntervalCache(10, index_set.pBWT);
+
+        if(!opt::diploidReferenceMode)
+        {
+            GenomeEstimates estimates = generate_genome_size(&writer, index_set);
+
+            generate_branch_classification(&writer, estimates, index_set);
+            generate_de_bruijn_simulation(&writer, estimates, index_set);
+            generate_gc_distribution(&writer, index_set);
+            generate_pe_fragment_sizes(&writer, index_set);
+            generate_kmer_coverage(&writer, index_set);
+            generate_position_of_first_error(&writer, index_set);
+            generate_errors_per_base(&writer, index_set);
+            generate_unipath_length_data(&writer, index_set);
+            generate_duplication_rate(&writer, index_set);
+        }
+        else
+        {
+            generate_reference_branch_classification(&writer, index_set);
+        }
+
+        delete index_set.pBWT;
+        delete index_set.pSSA;
+        delete index_set.pCache;
     }
 
     // End document
     writer.EndObject();
 
-    delete index_set.pBWT;
-    delete index_set.pSSA;
-    delete index_set.pCache;
     delete pTimer;
     return 0;
 }
@@ -2079,6 +2096,7 @@ void parsePreQCOptions(int argc, char** argv)
             case 't': arg >> opt::numThreads; break;
             case '?': die = true; break;
             case 'v': opt::verbose++; break;
+            case OPT_SIMPLE: opt::simple = true; break;
             case OPT_MAX_CONTIG: arg >> opt::maxContigLength; break;
             case OPT_DIPLOID: opt::diploidReferenceMode = true; break;
             case OPT_REFERENCE: arg >> opt::referenceFile; break;
diff --git a/src/SGA/rmdup.cpp b/src/SGA/rmdup.cpp
index cc7727d..425ddb2 100644
--- a/src/SGA/rmdup.cpp
+++ b/src/SGA/rmdup.cpp
@@ -120,7 +120,7 @@ void rmdup()
     delete pRBWT;
     delete pTimer;
     
-    std::string out_prefix = stripFilename(opt::outFile);
+    std::string out_prefix = stripExtension(opt::outFile);
     std::string dupsFile = parseDupHits(hitsFilenames, out_prefix);
 
     // Rebuild the indices without the duplicated sequences
diff --git a/src/SGA/sga.cpp b/src/SGA/sga.cpp
index d932233..1d5f174 100644
--- a/src/SGA/sga.cpp
+++ b/src/SGA/sga.cpp
@@ -10,6 +10,7 @@
 #include <iostream>
 #include "index.h" 
 #include "overlap.h"
+#include "overlap-long.h"
 #include "assemble.h"
 #include "oview.h"
 #include "rmdup.h"
@@ -35,6 +36,9 @@
 #include "rewrite-evidence-bam.h"
 #include "preqc.h"
 #include "haplotype-filter.h"
+#include "graph-concordance.h"
+#include "somatic-variant-filters.h"
+#include "kmer-count.h"
 
 #define PROGRAM_BIN "sga"
 #define AUTHOR "Jared Simpson"
@@ -51,31 +55,34 @@ static const char *SGA_USAGE_MESSAGE =
 "Contact: " AUTHOR " [" PACKAGE_BUGREPORT "]\n"
 "Usage: " PROGRAM_BIN " <command> [options]\n\n"
 "Commands:\n"
-"           preprocess            filter and quality-trim reads\n"
-"           index                 build the BWT and FM-index for a set of reads\n"
-"           merge                 merge multiple BWT/FM-index files into a single index\n"
-"           bwt2fa                transform a bwt back into a set of sequences\n"
-"           correct               correct sequencing errors in a set of reads\n"
-"           fm-merge              merge unambiguously overlapped sequences using the FM-index\n"
-"           overlap               compute overlaps between reads\n"
-"           assemble              generate contigs from an assembly graph\n"
-"           oview                 view overlap alignments\n"
-"           subgraph              extract a subgraph from a graph\n"
-"           filter                remove reads from a data set\n"
-"           rmdup                 duplicate read removal\n"
-"           gen-ssa               generate a sampled suffix array for the given set of reads\n"
-"           scaffold              generate ordered sets of contigs using distance estimates\n"
-"           scaffold2fasta        convert the output of the scaffold subprogram into a fasta file\n"
-"           gapfill               fill intra-scaffold gaps\n"
+"           preprocess               filter and quality-trim reads\n"
+"           index                    build the BWT and FM-index for a set of reads\n"
+"           merge                    merge multiple BWT/FM-index files into a single index\n"
+"           bwt2fa                   transform a bwt back into a set of sequences\n"
+"           correct                  correct sequencing errors in a set of reads\n"
+"           fm-merge                 merge unambiguously overlapped sequences using the FM-index\n"
+"           overlap                  compute overlaps between reads\n"
+"           assemble                 generate contigs from an assembly graph\n"
+"           oview                    view overlap alignments\n"
+"           subgraph                 extract a subgraph from a graph\n"
+"           filter                   remove reads from a data set\n"
+"           rmdup                    duplicate read removal\n"
+"           gen-ssa                  generate a sampled suffix array for the given set of reads\n"
+"           scaffold                 generate ordered sets of contigs using distance estimates\n"
+"           scaffold2fasta           convert the output of the scaffold subprogram into a fasta file\n"
+"           gapfill                  fill intra-scaffold gaps\n"
 "\n\nVariant Calling Commands:\n"
-"           graph-diff            compare reads to find sequence variants\n"
-"           rewrite-evidence-bam  fill in sequence and quality information for a variant evidence BAM\n"
-"           haplotype-filter      filter out low-quality haplotypes\n"
+"           graph-diff               compare reads to find sequence variants\n"
+"           graph-concordance        check called variants for representation in the assembly graph\n"
+"           rewrite-evidence-bam     fill in sequence and quality information for a variant evidence BAM\n"
+"           haplotype-filter         filter out low-quality haplotypes\n"
+"           somatic-variant-filters  filter out low-quality variants\n"
 "\n\nExperimental commands:\n"
 "           preqc                 perform pre-assembly quality checks on a set of reads\n"
 "           stats                 print summary statistics about a read set\n"
 "           filterBAM             filter out contaminating mate-pair data in a BAM file\n"
 "           cluster               find clusters of reads belonging to the same connected component in an assembly graph\n"
+"           kmer-count            extract all kmers from a BWT file\n"
 //"           connect         resolve the complete sequence of a paired-end fragment\n"
 "\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
 
@@ -116,6 +123,8 @@ int main(int argc, char** argv)
             FMMergeMain(argc - 1, argv + 1);
         else if(command == "overlap")
             overlapMain(argc - 1, argv + 1);
+        else if(command == "overlap-long")
+            overlapLongMain(argc - 1, argv + 1);
         else if(command == "correct")
             correctMain(argc - 1, argv + 1);
         else if(command == "assemble")
@@ -154,6 +163,12 @@ int main(int argc, char** argv)
             preQCMain(argc - 1, argv + 1);
         else if(command == "haplotype-filter")
             haplotypeFilterMain(argc - 1, argv + 1);
+        else if(command == "graph-concordance")
+            graphConcordanceMain(argc - 1, argv + 1);
+        else if(command == "somatic-variant-filters")
+            somaticVariantFiltersMain(argc - 1, argv + 1);
+        else if(command == "kmer-count")
+            kmerCountMain(argc - 1, argv + 1);
         else
         {
             std::cerr << "Unrecognized command: " << command << "\n";
diff --git a/src/SGA/somatic-variant-filters.cpp b/src/SGA/somatic-variant-filters.cpp
new file mode 100644
index 0000000..3fca50b
--- /dev/null
+++ b/src/SGA/somatic-variant-filters.cpp
@@ -0,0 +1,864 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// variant-filtering - apply various filters
+// to a somatic VCF file
+//
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iterator>
+#include "Util.h"
+#include "somatic-variant-filters.h"
+#include "BWTAlgorithms.h"
+#include "BWTIndexSet.h"
+#include "SGACommon.h"
+#include "Timer.h"
+#include "SequenceProcessFramework.h"
+#include "HapgenUtil.h"
+#include "SGAStats.h"
+#include "HashMap.h"
+#include "DindelRealignWindow.h"
+#include "VariantIndex.h"
+#include "HapgenUtil.h"
+#include "MultiAlignment.h"
+#include "api/BamReader.h"
+
+// Types
+typedef HashMap<std::string, std::string> StringStringHash;
+
+//
+// Getopt
+//
+#define SUBPROGRAM "somatic-variant-filters"
+static const char *SOMATIC_VARIANT_FILTERS_VERSION_MESSAGE =
+SUBPROGRAM " Version " PACKAGE_VERSION "\n"
+"Written by Jared Simpson.\n"
+"\n"
+"Copyright 2014 Ontario Institute for Cancer Research\n";
+
+static const char *SOMATIC_VARIANT_FILTERS_USAGE_MESSAGE =
+"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... VCF_FILE\n"
+"Count read support for variants in a vcf file\n"
+"\n"
+"      --help                           display this help and exit\n"
+"      -v, --verbose                    display verbose output\n"
+"      -t, --threads=NUM                use NUM threads to compute the overlaps (default: 1)\n"
+"          --reference=STR              load the reference genome from FILE\n"
+"          --tumor-bam=STR              load the aligned tumor reads from FILE\n"
+"          --normal-bam=STR             load the aligned normal reads from FILE\n"
+"          --annotate-only              only annotate with INFO tags, rather than filter\n"
+"\n"
+"Filtering cutoffs:\n"
+"          --min-af=FLOAT               minimum allele frequency (AF tag)\n"
+"          --min-var-dp=INT             minimum variant depth (VarDP)\n"
+"          --max-strand-bias=FLOAT      maximum strand bias (SB)\n"
+"          --max-hp-length=INT          maximum homopolymer context length (HPlen)\n"
+"          --max-dust=FLOAT             maximum dust scores (Dust)\n"
+"          --max-normal-reads=INT       maximum normal reads showing variant\n"
+"          --min-normal-depth=INT       minimum normal depth at the locus\n"
+"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
+
+static const char* PROGRAM_IDENT =
+PACKAGE_NAME "::" SUBPROGRAM;
+
+namespace opt
+{
+    static unsigned int verbose;
+    static std::string vcfFile;
+    static std::string referenceFile;
+    static std::string tumorBamFile;
+    static std::string normalBamFile;
+
+    static int numThreads = 1;
+    static double minAF = 0.0f;
+    static double errorBound = 0.02;
+    static int minVarDP = 0;
+    static double maxStrandBias = 2.0f;
+    static int maxHPLen = 7;
+    static double maxDust = 2.0f;
+    static size_t maxNormalReads = 1;
+    static size_t minNormalDepth = 5;
+    static int minMedianQuality = 15;
+    static size_t maximumAlignments = 500;
+    static double minHaplotypeLength = 125.0f;
+    static bool annotateOnly = false;
+}
+
+static const char* shortopts = "t:v";
+
+enum { OPT_HELP = 1, 
+       OPT_VERSION, 
+       OPT_REFERENCE, 
+       OPT_TUMOR_BAM, 
+       OPT_NORMAL_BAM,
+       OPT_ANNOTATE,
+       OPT_MIN_AF,
+       OPT_MIN_VARDP,
+       OPT_MAX_SB,
+       OPT_MAX_HP,
+       OPT_MAX_DUST,
+       OPT_MAX_NORMAL_READS,
+       OPT_MIN_NORMAL_DEPTH };
+
+static const struct option longopts[] = {
+    { "verbose",               no_argument,       NULL, 'v' },
+    { "annotate-only",         no_argument,       NULL, OPT_ANNOTATE },
+    { "threads",               required_argument, NULL, 't' },
+    { "reference",             required_argument, NULL, OPT_REFERENCE },
+    { "tumor-bam",             required_argument, NULL, OPT_TUMOR_BAM },
+    { "normal-bam",            required_argument, NULL, OPT_NORMAL_BAM },
+    { "min-af",                required_argument, NULL, OPT_MIN_AF },
+    { "min-var-dp",            required_argument, NULL, OPT_MIN_VARDP },
+    { "max-strand-bias",       required_argument, NULL, OPT_MAX_SB },
+    { "max-hp-length",         required_argument, NULL, OPT_MAX_HP },
+    { "max-dust",              required_argument, NULL, OPT_MAX_DUST },
+    { "max-normal-reads",      required_argument, NULL, OPT_MAX_NORMAL_READS },
+    { "min-normal-depth",      required_argument, NULL, OPT_MIN_NORMAL_DEPTH },
+    { "help",                  required_argument, NULL, OPT_HELP },
+    { "version",               required_argument, NULL, OPT_VERSION },
+    { NULL, 0, NULL, 0 }
+};
+
+// Calculate the median value of the vector
+template<typename T>
+double median(const std::vector<T> v)
+{
+    std::vector<T> c = v;
+    std::sort(c.begin(), c.end());
+    double r = 0.0f;
+    int m = c.size() / 2;
+    if(c.size() % 2 == 1)
+        r = c[m];
+    else
+        r = (c[m - 1] + c[m]) / 2.0f;
+    return r;
+}
+
+struct CoverageStats
+{
+    CoverageStats() : n_total_reads(0), n_evidence_reads(0), too_many_alignments(false) {} 
+
+    // functions
+    double calculateVAF() const
+    {
+        return n_total_reads > 0 ? 
+            n_evidence_reads / (double)n_total_reads : 0.0f;
+    }
+
+    //
+    size_t n_total_reads;
+    size_t n_evidence_reads;
+    double median_mapping_quality;
+    std::vector<int> snv_evidence_quals;
+    bool too_many_alignments;
+};
+
+// This struct holds segments
+// of a read that match a variant
+// The read is broken into (up to) 3 pieces
+//  -the segment that aligns to the reference before the variant
+//  -the segment that aligns to the variant region
+//  -the segment that aligns after the variant
+struct VariantReadSegments
+{
+    std::string preSegment;
+    std::string variantSegment;
+    std::string postSegment;
+
+    std::string preQual;
+    std::string variantQual;
+    std::string postQual;
+};
+
+VariantReadSegments splitReadAtVariant(const BamTools::BamAlignment& alignment,
+                                       const VCFRecord& record)
+{
+    // Set up a vector of aligned reference positions
+    // for every base of the read
+    std::vector<int> reference_positions(alignment.QueryBases.size(), -1);
+    size_t read_idx = 0;
+    size_t ref_idx = alignment.Position;
+
+    std::stringstream cigar;
+    // 
+    for(size_t ci = 0; ci < alignment.CigarData.size(); ++ci)
+    {
+        BamTools::CigarOp op = alignment.CigarData[ci];
+        switch(op.Type) {
+            case 'M':
+                for(size_t l = 0; l < op.Length; ++l)
+                    reference_positions[read_idx++] = ref_idx++;
+                break;
+            case 'D':
+            case 'N':
+                ref_idx += op.Length;
+                break;
+            case 'I':
+            case 'S':
+                read_idx += op.Length;
+                break;
+        }
+
+        cigar << op.Length << op.Type;
+    }
+
+    // Find the last aligned position before the variant and the first position after
+    int rightmost_before_variant = -1;
+    int leftmost_after_variant = std::numeric_limits<int>::max();
+
+    int variant_start = record.refPosition - 1; // VCF is 1-based, bamtools is 0-based
+    int variant_end = variant_start + record.refStr.size();
+
+    for(int i = 0; i < (int)reference_positions.size(); ++i) {
+        if(reference_positions[i] != -1 && reference_positions[i] < variant_start && i > rightmost_before_variant)
+            rightmost_before_variant = i;
+        if(reference_positions[i] != -1 && reference_positions[i] >= variant_end && i < leftmost_after_variant)
+            leftmost_after_variant = i;
+    }
+
+    VariantReadSegments ret;
+    ret.preSegment = alignment.QueryBases.substr(0, rightmost_before_variant + 1);
+    ret.preQual = alignment.Qualities.substr(0, rightmost_before_variant + 1);
+
+    ret.variantSegment = alignment.QueryBases.substr(rightmost_before_variant + 1, leftmost_after_variant - rightmost_before_variant - 1);
+    ret.variantQual = alignment.Qualities.substr(rightmost_before_variant + 1, leftmost_after_variant - rightmost_before_variant - 1);
+
+    if(leftmost_after_variant < (int)alignment.QueryBases.size())
+    {
+        ret.postSegment = alignment.QueryBases.substr(leftmost_after_variant);
+        ret.postQual = alignment.Qualities.substr(leftmost_after_variant);
+    }
+
+    return ret;
+}
+
+//
+//
+//
+CoverageStats getVariantCoverage(BamTools::BamReader* pReader, const VCFRecord& record, const ReadTable* refTable)
+{
+    CoverageStats stats;
+    
+    static const int flankingSize = 100;
+    static const double minPercentIdentity = 95.0f;
+
+    bool is_snv = record.refStr.size() == 1 && record.varStr.size() == 1;
+
+    // Grab the reference haplotype
+    int zeroBasedPos = record.refPosition - 1;
+    int start = zeroBasedPos - flankingSize - 1;
+    if(start < 0)
+        start = 0;
+
+    int end = zeroBasedPos + record.refStr.length() + 2 * flankingSize;
+    const SeqItem& chr = refTable->getRead(record.refName);
+    if(end > (int)chr.seq.length())
+        end = (int)chr.seq.length();
+
+    std::string reference_haplotype = chr.seq.substr(start, end - start);
+    int translatedPos = zeroBasedPos - start;
+
+    std::string variant_haplotype = reference_haplotype;
+    
+    // Ensure that the reference string at the variant matches the expected
+    if(variant_haplotype.substr(translatedPos, record.refStr.length()) != record.refStr)
+    {
+        std::cerr << "Warning, reference base does not match VCF record.\n";
+        std::cerr << record << "\n";
+    }
+    variant_haplotype.replace(translatedPos, record.refStr.length(), record.varStr);
+
+    // Grab all reads in reference region
+    int refID = pReader->GetReferenceID(record.refName);
+    if(refID < 0)
+        return stats;
+
+    int refStart = record.refPosition;
+    int refEnd = record.refPosition;
+    pReader->SetRegion(refID, refStart, refID, refEnd);
+    BamTools::BamAlignment aln;
+
+    std::vector<double> mapping_quality;
+    std::vector<BamTools::BamAlignment> alignments;
+    while(pReader->GetNextAlignment(aln)) {
+        if(aln.MapQuality > 0)
+            alignments.push_back(aln);
+        mapping_quality.push_back(aln.MapQuality);
+
+        if(alignments.size() > opt::maximumAlignments)
+        {
+            stats.too_many_alignments = true;
+            return stats;
+        }
+    }
+
+    if(!mapping_quality.empty())
+        stats.median_mapping_quality = median(mapping_quality);
+    else
+        stats.median_mapping_quality = 60;
+
+    // Shuffle and take the first N alignments only
+    std::random_shuffle(alignments.begin(), alignments.end());
+
+#if HAVE_OPENMP
+    #pragma omp parallel for
+#endif    
+    for(size_t i = 0; i < alignments.size(); ++i) {
+        BamTools::BamAlignment alignment = alignments[i];
+
+        VariantReadSegments segments = splitReadAtVariant(alignment, record);
+
+        if(opt::verbose > 1)
+        {
+            fprintf(stderr, "var: %zu %s -> %s\n",  record.refPosition, record.refStr.c_str(), record.varStr.c_str());
+            fprintf(stderr, "pos: %d\n",  alignment.Position);
+            fprintf(stderr, "strand: %s\n", alignment.IsReverseStrand() ? "-" : "+");
+            fprintf(stderr, "read: %s\n", alignment.QueryBases.c_str());
+            fprintf(stderr, "qual: %s\n", alignment.Qualities.c_str());
+            fprintf(stderr, "alnb: %s\n", alignment.AlignedBases.c_str());
+            
+            fprintf(stderr, "Pre: %s\n",  segments.preSegment.c_str());
+            fprintf(stderr, "Var: %s\n",  segments.variantSegment.c_str());
+            fprintf(stderr, "Pos: %s\n",  segments.postSegment.c_str());
+            
+            fprintf(stderr, "PreQual: %s\n",  segments.preQual.c_str());
+            fprintf(stderr, "VarQual: %s\n",  segments.variantQual.c_str());
+            fprintf(stderr, "PosQual: %s\n",  segments.postQual.c_str());
+        }
+
+        bool aligned_at_variant = segments.variantSegment.size() > 0 && 
+                                  (segments.preSegment.size() > 0 || segments.postSegment.size() > 0);
+
+        if(!aligned_at_variant)
+            continue;
+        
+        bool is_evidence_read = false;
+        if(segments.variantSegment != record.refStr)
+        {
+            if(segments.variantSegment == record.varStr)
+            {
+                // Evidence read via the current alignment
+                is_evidence_read = true;
+            }
+            else
+            {
+                // Check for evidence via realignment to the variant haplotype
+                SequenceOverlap ref_overlap = 
+                    Overlapper::computeAlignmentAffine(alignment.QueryBases, reference_haplotype);
+                SequenceOverlap var_overlap = 
+                    Overlapper::computeAlignmentAffine(alignment.QueryBases, variant_haplotype);
+                
+                bool quality_alignment = (ref_overlap.getPercentIdentity() >= minPercentIdentity || 
+                                          var_overlap.getPercentIdentity() >= minPercentIdentity);
+
+                is_evidence_read = quality_alignment && var_overlap.score > ref_overlap.score;
+            }
+        }
+
+        #pragma omp critical
+        {
+            stats.n_total_reads += 1;
+            if(is_evidence_read)
+            {
+                stats.n_evidence_reads += 1;
+                if(is_snv && segments.variantQual.size() == 1)
+                {
+                    char qb = segments.variantQual[0];
+                    int q = Quality::char2phred(qb);
+                    stats.snv_evidence_quals.push_back(q);
+                }
+            }
+        }
+    }
+
+    return stats;
+}
+
+void makeTagHash(const VCFRecord& record, StringStringHash& tagHash)
+{
+    for(size_t i = 0; i < record.comments.size(); i++)
+    {
+        StringVector kv = split(record.comments[i], '=');
+        assert(kv.size() == 2 || kv.size() == 1);
+        if(kv.size() == 2)
+            tagHash[kv[0]] = kv[1];
+        else
+            tagHash[kv[0]] = "true";
+    }
+}
+
+
+template<typename T>
+bool getTagValue(StringStringHash& tagHash, const std::string& key, T& out)
+{
+    StringStringHash::iterator iter = tagHash.find(key);
+    if(iter == tagHash.end())
+        return false; // tag not available
+    std::stringstream parser(iter->second);
+    parser >> out;
+    return true;
+}
+
+std::string makeVariantKey(const VCFRecord& record)
+{
+    std::stringstream ss;
+    ss << record.refName << ":" << record.refPosition << ":"
+       << record.refStr << ":" << record.varStr;
+    return ss.str(); 
+}
+
+// Get coordinates of the reference haplotype for this
+// variant, plus some flanking sequence
+void getReferenceInterval(const VCFRecord& record,
+                          int flankingSize,
+                          int& zeroBasedPos,
+                          int& flankStart,
+                          int& flankEnd)
+{
+    zeroBasedPos = record.refPosition - 1;
+    flankStart = zeroBasedPos - flankingSize - 1;
+    flankEnd = zeroBasedPos + record.refStr.length() + 2 * flankingSize;
+}
+
+// Modify the reference coordinates to avoid going over the ends
+void clampReferenceInterval(const SeqItem& chr, int& start, int&end)
+{
+    if(start < 0)
+        start = 0;
+ 
+    if(end > (int)chr.seq.length())
+        end = (int)chr.seq.length();
+}
+
+// Calculate Homopolymer lengths at the variant position
+int calculateHomopolymerLength(const VCFRecord& record, const ReadTable* refTable)
+{
+    static const int flankingSize = 100;
+
+    // Get coordinates of reference haplotype
+    int zeroBasedPos, start, end;
+    getReferenceInterval(record, flankingSize, zeroBasedPos, start, end);
+    const SeqItem& chr = refTable->getRead(record.refName);
+
+    clampReferenceInterval(chr, start, end);
+    std::string reference_haplotype = chr.seq.substr(start, end - start);
+
+    // we use the MA class for the homopolymer counting code
+    MAlignDataVector mav;
+    MultiAlignment ma(reference_haplotype, mav);
+
+    // Count run lengths of nucleotides
+    std::vector<size_t> run_lengths;
+
+    // Count runs
+    char curr_char = reference_haplotype[0];
+    size_t curr_run = 1;
+    for(size_t i = 1; i < reference_haplotype.size(); ++i)
+    {
+        if(reference_haplotype[i] != curr_char)
+        {
+            run_lengths.push_back(curr_run);
+            curr_run = 1;
+            curr_char = reference_haplotype[i];
+        }
+        else
+        {
+            curr_run++;
+        }
+    }
+    run_lengths.push_back(curr_run); // last run
+
+    // set up a map from the reference base to the run length its in
+    std::vector<size_t> index_to_run_lengths(reference_haplotype.size());
+    size_t j = 0;
+    for(size_t i = 0; i < run_lengths.size(); ++i)
+    {
+        size_t stop = j + run_lengths[i];
+        while(j < stop)
+        {
+            index_to_run_lengths[j] = i;
+            j++;
+        }
+    }
+    assert(j == reference_haplotype.size());
+
+    // Calculate the maximum homopolymer run as the max over the variant reference region
+    size_t eventStart = zeroBasedPos - start;
+    size_t eventEnd = eventStart + record.refStr.size();
+    size_t maxhp = 0;
+    for(size_t i = eventStart; i <= eventEnd; ++i)
+    {
+        size_t idx = index_to_run_lengths[i];
+        if(run_lengths[idx] > maxhp)
+            maxhp = run_lengths[idx];
+    }
+    return maxhp;
+}
+
+struct RepeatCounts
+{
+    std::string unit;
+    size_t numRefUnits;
+};
+
+RepeatCounts getRepeatCounts(const VCFRecord& record, const ReadTable* refTable)
+{
+    // Get coordinates of reference haplotype
+    int zeroBasedPos, start, end;
+    getReferenceInterval(record, 100, zeroBasedPos, start, end);
+    const SeqItem& chr = refTable->getRead(record.refName);
+
+    clampReferenceInterval(chr, start, end);
+    std::string reference_haplotype = chr.seq.substr(start, end - start);
+
+    // VCF encodes indels as TAC -> T
+    // Calculate the inserted deleted (or substituted) sequence
+    // without including the matching reference base 
+    std::string repeat_unit;
+    if(record.varStr[0] == record.refStr[0])
+    {
+        // indel
+        if(record.varStr.size() > record.refStr.size())
+            repeat_unit = record.varStr.substr(1);
+        else
+            repeat_unit = record.refStr.substr(1);
+    }
+    else
+    {
+        repeat_unit = record.varStr;
+    }
+    
+    size_t repeat_len = repeat_unit.size();
+    size_t event_start = zeroBasedPos - start;
+    size_t event_end = event_start + record.refStr.size();
+
+    // Get the first position of the repeat unit near the variant
+    size_t first_pos = reference_haplotype.find(repeat_unit, event_start);
+
+    size_t num_units = 0;
+
+    if(first_pos <= event_end)
+    {
+        // Count forwards
+        int p = (int)first_pos;
+        while(p < (int)reference_haplotype.size() && reference_haplotype.compare(p, repeat_len, repeat_unit) == 0)
+            p += repeat_len;
+        num_units = (p - first_pos) / repeat_len;
+
+        // Count backwards
+        p = first_pos - repeat_len;
+        while(p >= 0 && reference_haplotype.compare(p, repeat_len, repeat_unit) == 0)
+            p -= repeat_len;
+        num_units += ((first_pos - p) / repeat_len) - 1;
+    }
+
+    RepeatCounts out;
+    out.unit = repeat_unit;
+    out.numRefUnits = num_units;
+    return out;
+}
+
+// Get the 3-mer preceding and following the variant
+void getVariantContext(const VCFRecord& record, const ReadTable* refTable,
+                       std::string& prefix, std::string& suffix)
+{
+    static const int flankingSize = 100;
+
+    // Grab the reference haplotype
+    int zeroBasedPos = record.refPosition - 1;
+    int start = zeroBasedPos - flankingSize - 1;
+    if(start < 0)
+        start = 0;
+
+    int end = zeroBasedPos + record.refStr.length() + 2 * flankingSize;
+    const SeqItem& chr = refTable->getRead(record.refName);
+    if(end > (int)chr.seq.length())
+        end = (int)chr.seq.length();
+
+    std::string reference_haplotype = chr.seq.substr(start, end - start);
+
+    size_t eventStart = zeroBasedPos - start;
+    size_t eventEnd = eventStart + record.refStr.size();
+    static const size_t k = 3;
+    prefix = reference_haplotype.substr(eventStart - k, k);
+    suffix = reference_haplotype.substr(eventEnd, k);
+}
+
+//
+// Main
+//
+int somaticVariantFiltersMain(int argc, char** argv)
+{
+    parseSomaticVariantFiltersOptions(argc, argv);
+
+    Timer* pTimer = new Timer(PROGRAM_IDENT);
+
+#if HAVE_OPENMP
+    omp_set_num_threads(opt::numThreads);
+#endif 
+    // Load Reference
+    ReadTable refTable(opt::referenceFile, SRF_NO_VALIDATION);
+    refTable.indexReadsByID();
+
+    // Load BAMs
+    BamTools::BamReader* pTumorBamReader = new BamTools::BamReader;
+    pTumorBamReader->Open(opt::tumorBamFile);
+    pTumorBamReader->LocateIndex();
+    assert(pTumorBamReader->HasIndex());
+
+    BamTools::BamReader* pNormalBamReader = new BamTools::BamReader;
+    pNormalBamReader->Open(opt::normalBamFile);
+    pNormalBamReader->LocateIndex();
+    assert(pNormalBamReader->HasIndex());
+
+    // Track duplicated variants
+    HashSet<std::string> duplicateHash;
+
+    std::istream* pInput = createReader(opt::vcfFile.c_str());
+    std::string line;
+
+    while(getline(*pInput, line))
+    {
+        if(line.empty())
+            continue;
+
+        if(line[0] == '#')
+        {
+            std::cout << line << "\n";
+            continue;
+        }
+        
+        // parse record
+        VCFRecord record(line);
+        if(record.isMultiAllelic())
+        {
+            std::cerr << "Error: multi-allelic VCF found, please run vcfbreakmulti\n";
+            exit(EXIT_FAILURE);
+        }
+
+        // Check if we've seen this variant already
+        std::string key = makeVariantKey(record);
+        if(duplicateHash.find(key) != duplicateHash.end())
+            continue;
+        else
+            duplicateHash.insert(key);
+
+        if(opt::verbose > 0)
+        {
+            std::stringstream ss;
+            ss << "Variant: " << record << "\n";
+            fprintf(stderr, "===============================================\n%s", ss.str().c_str());
+        }
+
+        StringStringHash tagHash;
+        makeTagHash(record, tagHash);
+
+        StringVector fail_reasons;
+
+        int hplen = 0;
+        if(!getTagValue(tagHash, "HPLen", hplen))
+            hplen = calculateHomopolymerLength(record, &refTable);
+
+        if(hplen > opt::maxHPLen)
+            fail_reasons.push_back("Homopolymer");
+
+        double dust = 0.0f;
+        if(!getTagValue(tagHash, "Dust", dust))
+            dust = HapgenUtil::calculateDustScoreAtPosition(record.refName, 
+                                                            record.refPosition, 
+                                                            &refTable);
+
+        if(dust > opt::maxDust)
+            fail_reasons.push_back("LowComplexity");
+        
+        double af;
+        if(getTagValue(tagHash, "AF", af) && af < opt::minAF)
+            fail_reasons.push_back("LowAlleleFrequency");
+
+        int varDP;
+        if(getTagValue(tagHash, "VarDP", varDP) && varDP < opt::minVarDP)
+            fail_reasons.push_back("LowVarDP");
+        
+        double avgHapLen;
+        if(getTagValue(tagHash, "AvgHapLen", avgHapLen) && avgHapLen < opt::minHaplotypeLength)
+            fail_reasons.push_back("ShortHaplotype");
+
+        double strandBias;
+        if(getTagValue(tagHash, "SB", strandBias) && strandBias >= opt::maxStrandBias)
+            fail_reasons.push_back("StrandBias");
+        
+        // Count the number of copies of the inserted/deleted sequence in the reference
+        RepeatCounts repeatCounts = getRepeatCounts(record, &refTable);
+
+        // Realignment-based stats
+        CoverageStats tumor_stats = getVariantCoverage(pTumorBamReader, record, &refTable);
+        CoverageStats normal_stats = getVariantCoverage(pNormalBamReader, record, &refTable);
+
+        if(opt::verbose > 0)
+        {
+            fprintf(stderr, "Tumor: [%zu %zu]\n",  tumor_stats.n_total_reads, tumor_stats.n_evidence_reads);
+            fprintf(stderr, "Normal: [%zu %zu]\n", normal_stats.n_total_reads, normal_stats.n_evidence_reads);
+        }
+
+        if(!tumor_stats.too_many_alignments && !normal_stats.too_many_alignments)
+        {
+            // Check that there is not evidence for the variant in the normal sample
+            if(normal_stats.n_evidence_reads > opt::maxNormalReads)
+                fail_reasons.push_back("NormalEvidence");
+            
+            // If the normal is poorly covered in this region, we may call a germline variant as a variant
+            if(normal_stats.n_total_reads < opt::minNormalDepth)
+                fail_reasons.push_back("LowNormalDepth");
+
+            if(!tumor_stats.snv_evidence_quals.empty())
+            {
+                // Check that the base scores of SNVs are reasonably high
+                double median_quality = median(tumor_stats.snv_evidence_quals);
+                if(median_quality < opt::minMedianQuality)
+                    fail_reasons.push_back("LowQuality");
+
+                // For very deep regions, errors can be mistaken for SNVs
+                // Check if the variant frequency is very low. This check is not performed
+                // for indels and MNPs as they might not be aligned to this region.
+                double vf_estimate = tumor_stats.n_evidence_reads / (double)tumor_stats.n_total_reads;
+                if(vf_estimate < opt::errorBound)
+                    fail_reasons.push_back("PossibleError");
+            }
+            
+            // Check that the mapping quality of reads in the region is reasonable
+            if(tumor_stats.median_mapping_quality < opt::minMedianQuality)
+                fail_reasons.push_back("LowMappingQuality");
+        }
+        else 
+        {
+            // If the depth in the region is excessively high, the statistical models may break down
+            fail_reasons.push_back("DepthLimitReached");
+        }
+
+        if(!fail_reasons.empty() && !opt::annotateOnly)
+        {
+            if(record.passStr != "PASS" && record.passStr != ".")
+                fail_reasons.insert(fail_reasons.begin(), record.passStr);
+
+            std::stringstream strss;
+            std::copy(fail_reasons.begin(), fail_reasons.end(), std::ostream_iterator<std::string>(strss, ";"));
+            record.passStr = strss.str();
+            record.passStr.erase(record.passStr.size() - 1); // erase trailing ;
+        }
+        
+        // Add INFO tags indicating allele coverage
+        if(opt::annotateOnly)
+        {
+            double tumor_vf = tumor_stats.calculateVAF();
+            double normal_vf = normal_stats.calculateVAF();
+
+            std::string prefix;
+            std::string suffix;
+            getVariantContext(record, &refTable, prefix, suffix);
+
+            record.addComment("TumorVAF", tumor_vf);
+            record.addComment("NormalVAF", normal_vf);
+            record.addComment("TumorVarDepth", (int)tumor_stats.n_evidence_reads);
+            record.addComment("TumorTotalDepth", (int)tumor_stats.n_total_reads);
+            record.addComment("NormalVarDepth", (int)normal_stats.n_evidence_reads);
+            record.addComment("NormalTotalDepth", (int)normal_stats.n_total_reads);
+            record.addComment("5pContext", prefix);
+            record.addComment("3pContext", suffix);
+            record.addComment("RepeatUnit", repeatCounts.unit);
+            record.addComment("RepeatRefCount", (int)repeatCounts.numRefUnits);
+        }
+
+        std::cout << record << "\n";
+    }
+    
+    // Cleanup
+    delete pInput;
+    delete pTumorBamReader;
+    delete pNormalBamReader;
+    delete pTimer;
+
+    return 0;
+}
+
+// 
+// Handle command line arguments
+//
+void parseSomaticVariantFiltersOptions(int argc, char** argv)
+{
+    std::string algo_str;
+    bool die = false;
+    for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) 
+    {
+        std::istringstream arg(optarg != NULL ? optarg : "");
+        switch (c) 
+        {
+            case OPT_REFERENCE: arg >> opt::referenceFile; break;
+            case OPT_TUMOR_BAM: arg >> opt::tumorBamFile; break;
+            case OPT_NORMAL_BAM: arg >> opt::normalBamFile; break;
+            case OPT_MIN_AF: arg >> opt::minAF; break;
+            case OPT_MIN_VARDP: arg >> opt::minVarDP; break;
+            case OPT_MAX_SB: arg >> opt::maxStrandBias; break;
+            case OPT_MAX_HP: arg >> opt::maxHPLen; break;
+            case OPT_MAX_DUST: arg >> opt::maxDust; break;
+            case OPT_MAX_NORMAL_READS: arg >> opt::maxNormalReads; break;
+            case OPT_MIN_NORMAL_DEPTH: arg >> opt::minNormalDepth; break;
+            case OPT_ANNOTATE: opt::annotateOnly = true; break;
+            case 't': arg >> opt::numThreads; break;
+            case '?': die = true; break;
+            case 'v': opt::verbose++; break;
+            case OPT_HELP:
+                std::cout << SOMATIC_VARIANT_FILTERS_USAGE_MESSAGE;
+                exit(EXIT_SUCCESS);
+            case OPT_VERSION:
+                std::cout << SOMATIC_VARIANT_FILTERS_VERSION_MESSAGE;
+                exit(EXIT_SUCCESS);
+        }
+    }
+
+    if (argc - optind < 1) 
+    {
+        std::cerr << SUBPROGRAM ": missing arguments\n";
+        die = true;
+    } 
+    else if (argc - optind > 1) 
+    {
+        std::cerr << SUBPROGRAM ": too many arguments\n";
+        die = true;
+    }
+
+    if(opt::numThreads <= 0)
+    {
+        std::cerr << SUBPROGRAM ": invalid number of threads: " << opt::numThreads << "\n";
+        die = true;
+    }
+
+    if(opt::tumorBamFile.empty())
+    {
+        std::cerr << SUBPROGRAM ": a --tumor-bam must be provided\n";
+        die = true;
+    }
+    
+    if(opt::normalBamFile.empty())
+    {
+        std::cerr << SUBPROGRAM ": a --normal-bam must be provided\n";
+        die = true;
+    }
+
+    if(opt::referenceFile.empty())
+    {
+        std::cerr << SUBPROGRAM ": a --reference must be provided\n";
+        die = true;
+    }
+
+    opt::vcfFile = argv[optind++];
+
+    if (die) 
+    {
+        std::cout << "\n" << SOMATIC_VARIANT_FILTERS_USAGE_MESSAGE;
+        exit(EXIT_FAILURE);
+    }
+}
diff --git a/src/SGA/somatic-variant-filters.h b/src/SGA/somatic-variant-filters.h
new file mode 100644
index 0000000..95f225e
--- /dev/null
+++ b/src/SGA/somatic-variant-filters.h
@@ -0,0 +1,23 @@
+//-----------------------------------------------
+// Copyright 2013 Wellcome Trust Sanger Institute
+// Written by Jared Simpson (js18 at sanger.ac.uk)
+// Released under the GPL
+//-----------------------------------------------
+//
+// variant-filtering - apply various filters
+// to a somatic VCF file
+//
+#ifndef SOMATIC_VARIANT_FILTERS_H
+#define SOMATIC_VARIANT_FILTERS_H
+#include <getopt.h>
+#include "config.h"
+
+// functions
+
+//
+int somaticVariantFiltersMain(int argc, char** argv);
+
+// options
+void parseSomaticVariantFiltersOptions(int argc, char** argv);
+
+#endif
diff --git a/src/SQG/ASQG.cpp b/src/SQG/ASQG.cpp
index c4a3911..d73558f 100644
--- a/src/SQG/ASQG.cpp
+++ b/src/SQG/ASQG.cpp
@@ -33,6 +33,10 @@ static char TRANSITIVE_TAG[] = "TE"; // 1 if the graph has transitive edges
 // Vertex tags
 static char SUBSTRING_TAG[] = "SS";
 
+// Edge tags
+static char CIGAR_TAG[] = "CG";
+static char PERCENT_IDENTITY_TAG[] = "PI";
+
 //
 // Header Record
 //
@@ -251,6 +255,13 @@ void EdgeRecord::write(std::ostream& out)
     std::stringstream ss;
     ss << m_overlap;
     fields.push_back(ss.str());
+
+    if(m_cigarTag.isInitialized())
+        fields.push_back(m_cigarTag.toTagString(CIGAR_TAG));
+    
+    if(m_percentIdentityTag.isInitialized())
+        fields.push_back(m_percentIdentityTag.toTagString(PERCENT_IDENTITY_TAG));
+
     writeFields(out, fields);
 }
 
diff --git a/src/SQG/ASQG.h b/src/SQG/ASQG.h
index c385114..3f9ddd5 100644
--- a/src/SQG/ASQG.h
+++ b/src/SQG/ASQG.h
@@ -92,12 +92,19 @@ namespace ASQG
 
             const Overlap& getOverlap() const { return m_overlap; }
 
+            void setCigarTag(const std::string& cigar) { m_cigarTag.set(cigar); }
+            void setPercentIdentityTag(const float f) { m_percentIdentityTag.set(f); }
+
             void write(std::ostream& out);
             void parse(const std::string& record);
 
         private:
         
             Overlap m_overlap;
+
+            // Optional tags
+            SQG::FloatTag m_percentIdentityTag;
+            SQG::StringTag m_cigarTag;
     };
 
     // Parsing functions
diff --git a/src/SuffixTools/BWTIndexSet.h b/src/SuffixTools/BWTIndexSet.h
index 5b55527..1aa05b2 100644
--- a/src/SuffixTools/BWTIndexSet.h
+++ b/src/SuffixTools/BWTIndexSet.h
@@ -24,7 +24,7 @@
 struct BWTIndexSet
 {
     // Constructor
-    BWTIndexSet() : pBWT(NULL), pRBWT(NULL), pCache(NULL), pSSA(NULL), pPopIdx(NULL), pQualityTable(NULL) {}
+    BWTIndexSet() : pBWT(NULL), pRBWT(NULL), pCache(NULL), pSSA(NULL), pPopIdx(NULL), pQualityTable(NULL), pReadTable(NULL) {}
 
     // Data
     const BWT* pBWT;
@@ -33,6 +33,7 @@ struct BWTIndexSet
     const SampledSuffixArray* pSSA;
     const PopulationIndex* pPopIdx;
     const QualityTable* pQualityTable;
+    const ReadTable* pReadTable;
 };
 
 #endif
diff --git a/src/SuffixTools/SparseGapArray.h b/src/SuffixTools/SparseGapArray.h
index deebfcc..1262fc3 100644
--- a/src/SuffixTools/SparseGapArray.h
+++ b/src/SuffixTools/SparseGapArray.h
@@ -320,7 +320,7 @@ class SparseGapArray : public GapArray
         //
         void initOverflow(size_t i, OverflowStorage c)
         {
-            m_overflow.insert(std::make_pair<size_t, OverflowStorage>(i, c));
+            m_overflow.insert(std::make_pair(i, c));
         }
 
         //
diff --git a/src/Thirdparty/multiple_alignment.cpp b/src/Thirdparty/multiple_alignment.cpp
index a33848f..6421cb6 100644
--- a/src/Thirdparty/multiple_alignment.cpp
+++ b/src/Thirdparty/multiple_alignment.cpp
@@ -512,6 +512,88 @@ std::string MultipleAlignment::calculateCigarBetweenRows(size_t row0, size_t row
     return Overlapper::compactCigar(cigar);
 }
 
+// new function based on the standard SGA function calculateBaseConsensus()
+// we modified the function for more accurate overlap correction results
+// 1. We only attempt to correct a base in the read, if it is seen less than base_threshold times (default 2 --> we only correct the base in the read if no other read has the same base).
+// 2. We do not correct a position in the read if there is >1 possibility. Possibility is defined as the number of different bases with at least a count of min_count_max_base.
+// 3. We only correct the base in the read if the base with the maximum count occurs at least min_count_max_base times.
+std::string MultipleAlignment::calculateBaseConsensusMinCoverage(int base_threshold, int min_trim_coverage, int min_count_max_base)
+{
+    assert(!m_sequences.empty());
+    std::string consensus_sequence;
+    MultipleAlignmentElement& base_element = m_sequences.front();
+    size_t start_column = base_element.getStartColumn();
+    size_t end_column = base_element.getEndColumn();
+    
+    // This index records the last base in the consensus that had coverage greater than
+    // min_trim_coverage. After the consensus calculation the read is trimmed back to this position
+    int last_good_base = -1;
+    
+    for(size_t c = start_column; c <= end_column; ++c) {
+        std::vector<int> counts = getColumnBaseCounts(c);
+        
+        char max_symbol = '\0';
+        int max_count = -1;
+        int total_depth = 0;
+        size_t noPossibleCorrections = 0;
+
+#ifdef MA_DEBUG_CONSENSUS
+        printf("%zu\t", c);
+#endif
+        for(size_t a = 0; a < m_alphabet_size; ++a) {
+            char symbol = m_alphabet[a];
+            total_depth += counts[a];
+            
+            if(symbol != 'N' && counts[a] >= min_count_max_base) {
+                noPossibleCorrections ++;
+            }
+            if(symbol != 'N' && counts[a] > max_count) {
+                max_symbol = symbol;
+                max_count = counts[a];
+            }
+#ifdef MA_DEBUG_CONSENSUS
+            printf("%c:%d ", symbol, counts[a]);
+#endif
+        }
+        
+        char base_symbol = base_element.getColumnSymbol(c);
+        int base_count = counts[symbol2index(base_symbol)];
+        
+        // Choose a consensus base for this column. Only change a base
+        // if has been seen less than base_threshold times and the max
+        // base in the column has been seen more times than the base symbol
+        char consensus_symbol;
+        // Here we require the 3 conditions listed above
+        if(max_count > base_count && base_count < base_threshold && max_count >= min_count_max_base && noPossibleCorrections == 1)
+            consensus_symbol = max_symbol;
+        else
+            consensus_symbol = base_symbol;
+        
+        // Output a symbol to the consensus. Skip padding symbols and leading
+        // bases that are less than the minimum required depth to avoid trimming
+        if(consensus_symbol != '-' &&
+           (!consensus_sequence.empty() || total_depth >= min_trim_coverage))
+            consensus_sequence.push_back(consensus_symbol);
+        
+        // Record the position of the last good base
+        if(total_depth >= min_trim_coverage) {
+            int consensus_index = consensus_sequence.size() - 1;
+            if(consensus_index > last_good_base)
+                last_good_base = consensus_index;
+        }
+#ifdef MA_DEBUG_CONSENSUS
+        printf("CALL: %c\n", consensus_symbol);
+#endif 
+    }
+    
+    if(last_good_base != -1)
+        consensus_sequence.erase(last_good_base + 1);
+    else
+        consensus_sequence.clear();
+    
+    return consensus_sequence;
+}
+
 std::string MultipleAlignment::calculateBaseConsensus(int min_call_coverage, int min_trim_coverage)
 {
     assert(!m_sequences.empty());
diff --git a/src/Thirdparty/multiple_alignment.h b/src/Thirdparty/multiple_alignment.h
index 175ac52..a5b5c87 100644
--- a/src/Thirdparty/multiple_alignment.h
+++ b/src/Thirdparty/multiple_alignment.h
@@ -204,6 +204,13 @@ class MultipleAlignment
         // min_trim_coverage depth at the ends of the base sequence.
         std::string calculateBaseConsensus(int min_call_coverage, int min_trim_coverage);
 
+        // Calculate a new consensus sequence for the base sequence of the multiple alignment
+        // A base call is changed only if it has been seen in less than min_call_coverage sequences
+        // Leading/trailing bases are trimmed from the consensus sequence if there is less than
+        // min_trim_coverage depth at the ends of the base sequence.
+        // Also require a minimum count for the consensus base (min_count_base_overlap+1) and only one frequent base
+        std::string calculateBaseConsensusMinCoverage(int min_call_coverage, int min_trim_coverage, int min_count_base_overlap);
+    
         // Calculate consensus sequence of the base element that maximizes the likelihood of the multiple alignment
         void calculateBaseConsensusLikelihood(std::string* consensus_sequence, std::string* consensus_quality);
         
diff --git a/src/Thirdparty/overlapper.cpp b/src/Thirdparty/overlapper.cpp
index 1ab6076..497f24b 100644
--- a/src/Thirdparty/overlapper.cpp
+++ b/src/Thirdparty/overlapper.cpp
@@ -31,19 +31,18 @@
 #include <sstream>
 #include <limits>
 #include <stdio.h>
+#include <inttypes.h>
 
-OverlapperParams default_params = { 2, -6, -3 };
-OverlapperParams ungapped_params = { 2, -10000, -3 };
-
-
-
+// 
+OverlapperParams default_params = { 2, -6, -3, -2, ALT_OVERLAP };
+OverlapperParams ungapped_params = { 2, -10000, -3, -2, ALT_OVERLAP };
+OverlapperParams affine_default_params = { 2, -5, -3, -2, ALT_OVERLAP, true, true, true, true, true };
 
 //
 #define max3(x,y,z) std::max(std::max(x,y), z)
 //#define DEBUG_OVERLAPPER 1
 //#define DEBUG_EXTEND 1
 
-
 // 
 SequenceInterval::SequenceInterval() : start(0), end(-1)
 {
@@ -170,7 +169,10 @@ int SequenceOverlap::calculateTotalColumns() const
 //
 void SequenceOverlap::printAlignment(const std::string& s1, const std::string& s2) const
 {
-    assert(isValid());
+    if(!isValid()) {
+        std::cerr << "Alignment is not valid\n";
+        return;
+    }
 
     std::string out_1;
     std::string out_2;
@@ -199,7 +201,7 @@ void SequenceOverlap::printAlignment(const std::string& s1, const std::string& s
     char code;
     while(cigar_parser >> length >> code) {
         assert(length > 0);
-        if(code == 'M') {
+        if(code == 'M' || code == 'X' || code == '=') {
             out_1.append(s1.substr(current_1, length));
             out_2.append(s2.substr(current_2, length));
             current_1 += length;
@@ -627,7 +629,12 @@ SequenceOverlap Overlapper::extendMatch(const std::string& s1, const std::string
     return output;
 }
 
-// The score for this cell coming from a match, deletion and insertion
+static const uint8_t FROM_DIAG = 0;
+static const uint8_t FROM_LEFT = 1;
+static const uint8_t FROM_UP = 2;
+static const uint8_t FROM_INVALID = 3;
+
+// The score for this cell coming from a match, deletion and insertion and its direction
 struct AffineCell
 {
     AffineCell() : G(0), I(-std::numeric_limits<int>::max()), D(-std::numeric_limits<int>::max()) {}
@@ -636,164 +643,244 @@ struct AffineCell
     int G;
     int I;
     int D;
+    
+    uint8_t Gt;
+    uint8_t It;
+    uint8_t Dt;
 };
 
 typedef std::vector<AffineCell> AffineCells;
 typedef std::vector<AffineCells> AffineMatrix;
 
-SequenceOverlap Overlapper::computeOverlapAffine(const std::string& s1, const std::string& s2, const OverlapperParams params)
+SequenceOverlap Overlapper::computeAlignmentAffine(const std::string& s1, const std::string& s2, const OverlapperParams params)
 {
     // Exit with invalid intervals if either string is zero length
     SequenceOverlap output;
     if(s1.empty() || s2.empty()) {
-        std::cerr << "Overlapper::computeOverlap error: empty input sequence\n";
+        std::cerr << "Overlapper::computeAlignmentAffine error: empty input sequence\n";
         exit(EXIT_FAILURE);
     }
 
+    bool gap_s1_start;
+    bool gap_s1_end;
+    bool gap_s2_start;
+    bool gap_s2_end;
+    
+    // Set the bools for the explicit alignment types
+    if (params.type == ALT_GLOBAL)
+    {
+        gap_s1_start = false;
+        gap_s1_end = false;
+        gap_s2_start = false;
+        gap_s2_end = false;
+    }
+    else if (params.type == ALT_OVERLAP)
+    {
+        gap_s1_start = true;
+        gap_s1_end = true;
+        gap_s2_start = true;
+        gap_s2_end = true;
+    }
+    else if (params.type == ALT_CONTAINMENT)
+    {
+        gap_s1_start = true;
+        gap_s1_end = true;
+        gap_s2_start = false;
+        gap_s2_end = false;
+    }
+    else if (params.type == ALT_CUSTOM)
+    {
+        gap_s1_start = params.gap_s1_start;
+        gap_s1_end = params.gap_s1_end;
+        gap_s2_start = params.gap_s2_start;
+        gap_s2_end = params.gap_s2_end;
+    }
+    else
+    {
+        // Unknown alignment type
+        abort();
+    }
+
     // Initialize the scoring matrix
     size_t num_columns = s1.size() + 1;
     size_t num_rows = s2.size() + 1;
-
-    int gap_open = 5;
-    int gap_ext = 2;
-
+    int gap_open = -params.gap_penalty;
+    int gap_ext = -params.gap_ext_penalty;
+    
     AffineMatrix score_matrix;
     score_matrix.resize(num_columns);
+    
     for(size_t i = 0; i < score_matrix.size(); ++i)
         score_matrix[i].resize(num_rows);
+    
+    // Initialze first row and column
+    // Penalties in first row iff gap_s1_start==false
+    int c = (gap_s1_start == false ? 1 : 0);
+    for(size_t i = 1; i < num_columns; ++i) {
+        int v = -(gap_open + i * gap_ext) * c;
+        score_matrix[i][0].D = v;
+        score_matrix[i][0].Dt = (i == 1? FROM_DIAG : FROM_LEFT);
+        score_matrix[i][0].G = v;
+        score_matrix[i][0].Gt = FROM_LEFT;
+    }
+
+    // Penalties in first column iff gap_s2_start==false
+    c = (gap_s2_start == false ? 1 : 0);
+    for(size_t j = 1; j < num_rows; ++j) {
+        int v = -(gap_open + j * gap_ext) * c;
+        score_matrix[0][j].I = v;
+        score_matrix[0][j].It = (j == 1? FROM_DIAG : FROM_UP);
+        score_matrix[0][j].G = v;
+        score_matrix[0][j].Gt = FROM_UP;
+    }
 
     // Calculate scores
     for(size_t i = 1; i < num_columns; ++i) {
         for(size_t j = 1; j < num_rows; ++j) {
-
+            
             // Calculate the score for entry (i,j)
             int idx_1 = i - 1;
             int idx_2 = j - 1;
-
             int diagonal = score_matrix[i-1][j-1].G + (s1[idx_1] == s2[idx_2] ? params.match_score : params.mismatch_penalty);
-
-            // When computing the score starting from the left/right cells, we have to determine
-            // whether to extend an existing gap or start a new one.
+            
             AffineCell& curr = score_matrix[i][j];
-
             AffineCell& up = score_matrix[i][j-1];
-            if(up.I > up.G - gap_open)
-                curr.I = up.I - gap_ext;
-            else
-                curr.I = up.G - (gap_open + gap_ext);
-
             AffineCell& left = score_matrix[i-1][j];
-            if(left.D > left.G - gap_open)
-                curr.D = left.D - gap_ext;
-            else
-                curr.D = left.G - (gap_open + gap_ext);
+            
+            // When computing the score starting from the left/right cells, we have to determine
+            // whether to extend an existing gap or start a new one.
+            // In the last column, insertion costs are controlled by gap_s2_end
+            int ins_open = (i < num_columns - 1 or not gap_s2_end? gap_open : 0);
+            int ins_ext = (i < num_columns - 1 or not gap_s2_end? gap_ext : 0);
+            
+            // In the last row, deletion costs are controlled by gap_s1_end
+            int del_open = (j < num_rows - 1 or not gap_s1_end? gap_open : 0);
+            int del_ext = (j < num_rows - 1 or not gap_s1_end? gap_ext : 0);
+            
+            if(up.I > up.G - ins_open) {
+                curr.I = up.I - ins_ext;
+                curr.It = FROM_UP;
+            } else {
+                curr.I = up.G - (ins_open + ins_ext);
+                curr.It = FROM_DIAG;
+            }
+            if(left.D > left.G - del_open) {
+                curr.D = left.D - del_ext;
+                curr.Dt = FROM_LEFT;
+            } else {
+                curr.D = left.G - (del_open + del_ext);
+                curr.Dt = FROM_DIAG;
+            }
             
             curr.G = max3(curr.D, curr.I, diagonal);
+            if(curr.G == curr.I)
+                curr.Gt = FROM_UP;
+            else if(curr.G == curr.D)
+                curr.Gt = FROM_LEFT;
+            else
+                curr.Gt = FROM_DIAG;
         }
     }
- 
-    // The location of the highest scoring match in the
-    // last row or last column is the maximum scoring overlap
-    // for the pair of strings. We start the backtracking from
-    // that cell
-    int max_row_value = std::numeric_limits<int>::min();
-    int max_column_value = std::numeric_limits<int>::min();
-    size_t max_row_index = 0;
-    size_t max_column_index = 0;
-
-    // Check every column of the last row
-    // The first column is skipped to avoid empty alignments
-    for(size_t i = 1; i < num_columns; ++i) {
-        int v = score_matrix[i][num_rows - 1].G;
-        if(v > max_row_value) {
-            max_row_value = v;
-            max_row_index = i;
+    
+    // With the new scores, the max score is always in the bottom right cell
+    size_t i = num_columns - 1;
+    size_t j = num_rows - 1;
+    
+    output.score = score_matrix[i][j].G;
+    uint8_t direction = score_matrix[i][j].Gt;
+    
+    // However, the alignment might contain free end gaps which we now remove
+    if (gap_s2_end)
+    {
+        while (j >= 1 and direction == FROM_UP)
+        {
+            direction = score_matrix[i][j].It;
+            --j;
+            if (direction == FROM_DIAG)
+                direction = score_matrix[i][j].Gt;
         }
     }
 
-    // Check every row of the last column
-    for(size_t j = 1; j < num_rows; ++j) {
-        int v = score_matrix[num_columns - 1][j].G;
-        if(v > max_column_value) {
-            max_column_value = v;
-            max_column_index = j;
+    if (gap_s1_end and j == num_rows - 1)
+    {
+        while (i >= 1 and direction == FROM_LEFT)
+        {
+            direction = score_matrix[i][j].Dt;
+            --i;
+            if (direction == FROM_DIAG)
+                direction = score_matrix[i][j].Gt;
         }
     }
 
-    // Compute the location at which to start the backtrack
-    size_t i;
-    size_t j;
-
-    if(max_column_value > max_row_value) {
-        i = num_columns - 1;
-        j = max_column_index;
-        output.score = max_column_value;
-    } else {
-        i = max_row_index;
-        j = num_rows - 1;
-        output.score = max_row_value;
-    }
-
     // Set the alignment endpoints to be the index of the last aligned base
     output.match[0].end = i - 1;
     output.match[1].end = j - 1;
     output.length[0] = s1.length();
     output.length[1] = s2.length();
+
 #ifdef DEBUG_OVERLAPPER
     printf("Endpoints selected: (%d %d) with score %d\n", output.match[0].end, output.match[1].end, output.score);
 #endif
-
+    
     output.edit_distance = 0;
     output.total_columns = 0;
-
     std::string cigar;
-    while(i > 0 && j > 0) {
-        // Compute the possible previous locations of the path
-        int idx_1 = i - 1;
-        int idx_2 = j - 1;
-
-        bool is_match = s1[idx_1] == s2[idx_2];
-        int diagonal = score_matrix[i - 1][j - 1].G + (is_match ? params.match_score : params.mismatch_penalty);
-        int up1 = score_matrix[i][j-1].G - (gap_open + gap_ext);
-        int up2 = score_matrix[i][j-1].I - gap_ext;
-
-        int left1 = score_matrix[i-1][j].G - (gap_open + gap_ext);
-        int left2 = score_matrix[i-1][j].D - gap_ext;
-
-        int curr = score_matrix[i][j].G;
-
-        // If there are multiple possible paths to this cell
-        // we break ties in order of insertion,deletion,match
-        // this helps left-justify matches for homopolymer runs
-        // of unequal lengths
-        if(curr == up1 || curr == up2) {
+    
+    // We stop when we hit an edge along which gaps are free
+    while (not (i == 0 and j == 0) // absolute stop, regardless of free gaps
+            and not (i == 0 and gap_s2_start) // stop at left edge if s2 start gaps are free
+            and not (j == 0 and gap_s1_start)) // stop at top edge if s1 start gaps are free
+    {
+        if (direction == FROM_UP)
+        {
             cigar.push_back('I');
-            j -= 1;
-            output.edit_distance += 1;
-        } else if(curr == left1 || curr == left2) {
+            ++output.edit_distance;
+            direction = score_matrix[i][j].It;
+            --j;
+
+            if (direction == FROM_DIAG)
+                direction = score_matrix[i][j].Gt;
+        }
+        else if (direction == FROM_LEFT)
+        {
             cigar.push_back('D');
-            i -= 1;
-            output.edit_distance += 1;
-        } else {
-            assert(curr == diagonal);
-            if(!is_match)
-                output.edit_distance += 1;
-            cigar.push_back('M');
-            i -= 1;
-            j -= 1;
+            ++output.edit_distance;
+            direction = score_matrix[i][j].Dt;
+            --i;
+         
+            if (direction == FROM_DIAG)
+                direction = score_matrix[i][j].Gt;
         }
-
-        output.total_columns += 1;
+        else
+        {
+            assert(direction == FROM_DIAG);
+            assert(i > 0);
+            assert(j > 0);
+            
+            bool is_match = s1[i - 1] == s2[j - 1];
+            
+            if (is_match)
+            {
+                cigar.push_back(params.use_m_ops? 'M' : '=');
+            }
+            else
+            {
+                cigar.push_back(params.use_m_ops? 'M' : 'X');
+                ++output.edit_distance;
+            }
+            --i;
+            --j;
+            direction = score_matrix[i][j].Gt;
+        }
+        ++output.total_columns;
     }
-
+    
     // Set the alignment startpoints
     output.match[0].start = i;
     output.match[1].start = j;
-
     // Compact the expanded cigar string into the canonical run length encoding
     // The backtracking produces a cigar string in reversed order, flip it
     std::reverse(cigar.begin(), cigar.end());
-    assert(!cigar.empty());
     output.cigar = compactCigar(cigar);
     return output;
 }
diff --git a/src/Thirdparty/overlapper.h b/src/Thirdparty/overlapper.h
index 1834ef9..1cd6a59 100644
--- a/src/Thirdparty/overlapper.h
+++ b/src/Thirdparty/overlapper.h
@@ -127,16 +127,40 @@ struct SequenceOverlap
 
 };
 
+enum AffineAlignmentType
+{
+    ALT_OVERLAP,
+    ALT_GLOBAL,
+    ALT_CONTAINMENT,
+    ALT_CUSTOM
+};
+
 struct OverlapperParams
 {
     int match_score;
     int gap_penalty;
     int mismatch_penalty;
+    int gap_ext_penalty;
+    AffineAlignmentType type;
+    // The following 4 bools indicate whether it is free to align the respective positions to gaps.
+    // The existing alignment types are a subset of the 16 possible settings, as follows:
+    // ALT_GLOBAL: all false (no free gaps)
+    // ALT_OVERLAP: all true
+    // ALT_CONTAINMENT: s1 gaps are free, but not s2
+    bool gap_s1_start;
+    bool gap_s1_end;
+    bool gap_s2_start;
+    bool gap_s2_end;
+    // The following controls the cigar op used for matches and mismatches
+    // if true: use 'M' for both matches and mismatches (default)
+    // if false: use '=' for matches, 'X' for mismatches
+    bool use_m_ops;
 };
 
 // Global variables
-extern OverlapperParams default_params; // { 2, -5, -3 };
-extern OverlapperParams ungapped_params; // { 2, -10000, -3 };
+extern OverlapperParams default_params; // { 2, -6, -3, -2 };
+extern OverlapperParams ungapped_params; // { 2, -10000, -3, -2 };
+extern OverlapperParams affine_default_params; // { 2, -5, -3, -2 }; 
 
 //
 namespace Overlapper
@@ -153,7 +177,10 @@ SequenceOverlap computeOverlap(const std::string& s1, const std::string& s2, con
 SequenceOverlap extendMatch(const std::string& s1, const std::string& s2, int start_1, int start_2, int bandwidth);
 
 // Perform an alignment using affine gap penalties
-SequenceOverlap computeOverlapAffine(const std::string& s1, const std::string& s2, const OverlapperParams params = default_params);
+// This can perform either "overlap" type alignments (suffix to prefix)
+// or global type alignments (end-to-end).
+SequenceOverlap computeAlignmentAffine(const std::string& s1, const std::string& s2, const OverlapperParams params = affine_default_params);
+SequenceOverlap computeAlignmentAffine2(const std::string& s1, const std::string& s2, const OverlapperParams params);
 
 // Compact an expanded CIGAR string into a regular cigar string
 std::string compactCigar(const std::string& ecigar);
diff --git a/src/Util/Makefile.am b/src/Util/Makefile.am
index f382b25..5126c89 100644
--- a/src/Util/Makefile.am
+++ b/src/Util/Makefile.am
@@ -31,6 +31,7 @@ libutil_a_SOURCES = \
         VCFUtil.h VCFUtil.cpp \
         QualityTable.h QualityTable.cpp \
         BloomFilter.h BloomFilter.cpp \
+        VariantIndex.h VariantIndex.cpp \
         Verbosity.h \
         Timer.h \
         EncodedString.h \
diff --git a/src/Util/SGAStats.h b/src/Util/SGAStats.h
index 55d3744..1937513 100644
--- a/src/Util/SGAStats.h
+++ b/src/Util/SGAStats.h
@@ -9,7 +9,7 @@
 //
 
 #ifndef SGASTATS_H
-#define STATS_H
+#define SGASTATS_H
 
 namespace SGAStats
 {
diff --git a/src/Util/SeqReader.cpp b/src/Util/SeqReader.cpp
index 0dcaeac..804244d 100644
--- a/src/Util/SeqReader.cpp
+++ b/src/Util/SeqReader.cpp
@@ -13,12 +13,16 @@
 
 SeqReader::SeqReader(std::string filename, uint32_t flags) : m_flags(flags)
 {
-    m_pHandle = createReader(filename);
+    if(filename == "-")
+        m_pHandle = &std::cin;
+    else
+        m_pHandle = createReader(filename);
 }
     
 SeqReader::~SeqReader()
 {
-    delete m_pHandle;
+    if(m_pHandle != &std::cin)
+        delete m_pHandle;
 }
 
 // Extract an element from the file
diff --git a/src/Util/SeqReader.h b/src/Util/SeqReader.h
index ed8c38d..e2547e9 100644
--- a/src/Util/SeqReader.h
+++ b/src/Util/SeqReader.h
@@ -1,7 +1,7 @@
 //-----------------------------------------------
 // Copyright 2009 Wellcome Trust Sanger Institute
 // Written by Jared Simpson (js18 at sanger.ac.uk)
-// Released under the GPL license
+// Released under the GPL
 //-----------------------------------------------
 //
 // SeqReader - Reads fasta or fastq sequence files
diff --git a/src/Util/VCFUtil.cpp b/src/Util/VCFUtil.cpp
index 163b480..ad02450 100644
--- a/src/Util/VCFUtil.cpp
+++ b/src/Util/VCFUtil.cpp
@@ -10,6 +10,7 @@
 #include <iostream>
 #include <stdlib.h>
 #include <iterator>
+#include <sstream>
 #include "VCFUtil.h"
 #include "StdAlnTools.h"
 #include "MultiAlignment.h"
@@ -23,6 +24,47 @@ VCFRecord::VCFRecord()
     quality = 20.0f;
 }
 
+// parse from a line
+VCFRecord::VCFRecord(const std::string& line)
+{
+    std::stringstream parser(line);
+
+    parser >> refName;
+    parser >> refPosition;
+    parser >> id;
+    parser >> refStr;
+    parser >> varStr;
+
+    // Non-conformant VCF might represent unknown qualities with '.'
+    std::string quality_str;
+    parser >> quality_str;
+    if(quality_str == ".")
+    {
+        quality = 255;
+    }
+    else
+    {
+        std::stringstream qparser(quality_str);
+        qparser >> quality;
+    }
+
+    parser >> passStr;
+    
+    std::string tmp;
+    parser >> tmp;
+
+    // Split the comments on the semicolon delimiter
+    comments = split(tmp, ';');
+    
+    parser >> formatStr;
+    if(!formatStr.empty())
+    {
+        // parse genotypes
+        while(parser >> tmp)
+            sampleStr.push_back(tmp);
+    }
+}
+
 void VCFRecord::addComment(const std::string& key, const std::string& value)
 {
     std::string out = key;
@@ -67,7 +109,7 @@ void VCFRecord::printVerbose() const
 }
 
 // Write the VCF record to the stream
-std::ostream& operator<<(std::ostream& o, VCFRecord& record)
+std::ostream& operator<<(std::ostream& o, const VCFRecord& record)
 {
     o.precision(5);
     o.setf(std::ios::fixed,std::ios::floatfield);
diff --git a/src/Util/VCFUtil.h b/src/Util/VCFUtil.h
index dcdba7b..85eed7b 100644
--- a/src/Util/VCFUtil.h
+++ b/src/Util/VCFUtil.h
@@ -39,7 +39,13 @@ enum VCFReturnCode
 struct VCFRecord
 {
     // functions
+
+    // initialize an empty record
     VCFRecord();
+
+    // parse a record from a tab-delimited string
+    VCFRecord(const std::string& line);
+
     void addComment(const std::string& key, const std::string& value);
     void addComment(const std::string& key, const int& value);
     void addComment(const std::string& key, const double& value);
@@ -49,7 +55,7 @@ struct VCFRecord
 
     void printVerbose() const;
     VCFClassification classify() const;
-    friend std::ostream& operator<<(std::ostream& o, VCFRecord& record);
+    friend std::ostream& operator<<(std::ostream& o, const VCFRecord& record);
 
     static bool sort(const VCFRecord& a, const VCFRecord& b) 
     { 
@@ -59,6 +65,8 @@ struct VCFRecord
             return a.refPosition < b.refPosition;
     }
 
+    bool isMultiAllelic() const { return varStr.find(",") != std::string::npos; }
+
     // data
     std::string refName;
     size_t refPosition;
diff --git a/src/Util/VariantIndex.cpp b/src/Util/VariantIndex.cpp
new file mode 100644
index 0000000..09b9c71
--- /dev/null
+++ b/src/Util/VariantIndex.cpp
@@ -0,0 +1,96 @@
+//-----------------------------------------------------
+// Copyright 2014 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+// Released under the GPL
+//-----------------------------------------------------
+//
+// Data structure for performing proximity queries
+// against a set of variants
+//
+#include <iostream>
+#include <algorithm>
+#include <assert.h>
+#include "VariantIndex.h"
+
+VariantIndex::VariantIndex(const std::string& filename, const ReadTable& refTable)
+{
+    std::ifstream input(filename.c_str());
+    std::string line;
+
+    // Parse the VCF file
+    while(getline(input, line))
+    {
+        if(line.empty())
+            continue;
+
+        if(line[0] == '#')
+            continue;
+        
+        VCFRecord record(line);
+
+        // Do not allow multi-allelic records
+        if(record.isMultiAllelic())
+        {
+            std::cerr << "Error: multi-allelic Variant found, please run vcfbreakmulti\n";
+            exit(EXIT_FAILURE);
+        }
+
+        // Convert to a minimal representation of the change
+        VariantRecord mvf = { record.refName, record.refStr, record.varStr, record.refPosition };
+        m_records.push_back(mvf);
+    }
+
+    buildIndex(refTable);
+}
+
+void VariantIndex::buildIndex(const ReadTable& refTable)
+{
+    for(size_t i = 0; i < refTable.getCount(); ++i)
+    {
+        const SeqItem& seq_record = refTable.getRead(i);
+        std::string chromosome = seq_record.id;
+        size_t length = seq_record.seq.length();
+        int buckets = (length / BUCKET_SIZE) + 1;
+
+        m_map[chromosome].resize(buckets);
+    }
+
+    for(size_t i = 0; i < m_records.size(); ++i)
+    {
+        VariantIndexMap::iterator iter = m_map.find(m_records[i].reference);
+        assert(iter != m_map.end());
+
+        int bucket_id = m_records[i].position / BUCKET_SIZE;
+        assert(bucket_id < (int)iter->second.size());
+        IntVector& bucket = iter->second[bucket_id];
+        bucket.push_back(i);
+    }
+}
+
+VariantRecordVector VariantIndex::getNearVariants(const std::string& reference,
+                                                  int position,
+                                                  int distance) const
+{
+    VariantRecordVector out;
+
+    VariantIndexMap::const_iterator iter = m_map.find(reference);
+    assert(iter != m_map.end());
+    const IntVectorVector& chr_buckets = iter->second;
+
+    int lower_index = std::max(position - distance, 0) / BUCKET_SIZE;
+    int upper_index = std::min((position + distance) / BUCKET_SIZE, (int)chr_buckets.size() - 1);
+
+    for(int bi = lower_index; bi <= upper_index; ++bi)
+    {
+        const IntVector& bucket = chr_buckets[bi];
+        for(size_t vi = 0; vi < bucket.size(); ++vi)
+        {
+            const VariantRecord& record = m_records[bucket[vi]];
+            if(abs(record.position - position) < distance)
+                out.push_back(record);
+        }
+    }
+
+    return out;
+}
+
diff --git a/src/Util/VariantIndex.h b/src/Util/VariantIndex.h
new file mode 100644
index 0000000..0de6b01
--- /dev/null
+++ b/src/Util/VariantIndex.h
@@ -0,0 +1,59 @@
+//-----------------------------------------------------
+// Copyright 2014 Ontario Institute for Cancer Research
+// Written by Jared Simpson (jared.simpson at oicr.on.ca)
+// Released under the GPL
+//-----------------------------------------------------
+//
+// Data structure for performing proximity queries
+// against a set of variants
+//
+#ifndef VARIANTINDEX_H
+#define VARIANTINDEX_H
+#include "Util.h"
+#include "SeqReader.h"
+#include "VCFUtil.h"
+#include "ReadTable.h"
+#include <map>
+
+// To save space we store a vcf-like record
+// with only the required fields
+struct VariantRecord
+{
+    std::string reference;
+    std::string ref_sequence;
+    std::string alt_sequence;
+    size_t position;
+};
+
+typedef std::vector<VariantRecord> VariantRecordVector;
+typedef std::vector<int> IntVector;
+typedef std::vector<IntVector> IntVectorVector;
+
+// map from chromosome to a vector of buckets, one every 10kbp
+typedef std::map<std::string, IntVectorVector> VariantIndexMap;
+
+class VariantIndex
+{
+    public:
+        //
+        VariantIndex(const std::string& filename, const ReadTable& refTable);
+
+        // Return a vector of the variants that are close to the input variant
+        VariantRecordVector getNearVariants(const std::string& reference, 
+                                            int position, 
+                                            int distance = 30) const;
+
+    private:
+        
+        //
+        void buildIndex(const ReadTable& refTable);
+
+        //
+        VariantRecordVector m_records;
+        VariantIndexMap m_map;
+
+        // 
+        static const int BUCKET_SIZE = 10000;
+};
+
+#endif
diff --git a/src/bin/sga-align b/src/bin/sga-align
index 323a2ba..d8fcd3d 100755
--- a/src/bin/sga-align
+++ b/src/bin/sga-align
@@ -28,7 +28,10 @@ def runBWAIndex(inFile):
 
 def runBWAAln(inFiles, contigsFile, outFile):
     assert len(inFiles) == 2
-    
+    if inFiles[0] == inFiles[1]:
+        print 'Error: the two reads files are the same'
+        sys.exit(1)
+
     contigsBasename = getBasename(contigsFile)
     tempSAI = map(lambda x : getBasename(x) + "." + contigsBasename + ".bwsai", inFiles)
 
diff --git a/src/bin/sga-astat.py b/src/bin/sga-astat.py
index aa2b807..819b6a2 100755
--- a/src/bin/sga-astat.py
+++ b/src/bin/sga-astat.py
@@ -75,9 +75,9 @@ contigData = list()
 # Read the contig names and lengths from the bam
 bamFile = pysam.Samfile(bamFilename, "rb")
 
-for (name, len) in zip(bamFile.references, bamFile.lengths):
-    t = name, len, 0
-    contigData.append(ContigData(name, len))
+for (name, length) in zip(bamFile.references, bamFile.lengths):
+    t = name, length, 0
+    contigData.append(ContigData(name, length))
     #print 'Name: ' + name
     #print 'Length: ' + str(len)
 
@@ -126,7 +126,7 @@ bootstrapLen = 0;
 bootstrapReads = 0;
 
 if genomeSize == 0:
-    for i in range(0, numContigsForInitialEstimate):
+    for i in range(0, min(numContigsForInitialEstimate, len(contigData))):
         cd = contigData[i]
         bootstrapLen += cd.nlen
         bootstrapReads += cd.n
diff --git a/src/bin/sga-preqc-report.py b/src/bin/sga-preqc-report.py
index 87758a6..4d45472 100755
--- a/src/bin/sga-preqc-report.py
+++ b/src/bin/sga-preqc-report.py
@@ -2,6 +2,8 @@
 """Generate a readable report from preqc output.
 """
 
+from __future__ import print_function, division
+
 import sys, os.path
 import matplotlib as MPL
 import argparse
@@ -73,17 +75,17 @@ def main(argv=None):
     # misc: listing available plots
     if( args.list_plots ):
         plot_funcs, plotsample_funcs = _get_available_plot_functions()
-        print "Available plots:"
-        print '\n'.join(['\t'+k for k in sorted(plot_funcs)])
-        print "Available per-sample plots:"
-        print '\n'.join(['\t'+k for k in sorted(plotsample_funcs)])
+        print("Available plots:")
+        print('\n'.join(['\t'+k for k in sorted(plot_funcs)]))
+        print("Available per-sample plots:")
+        print('\n'.join(['\t'+k for k in sorted(plotsample_funcs)]))
         sys.exit(0)
     # main mode
     output_pfx = args.output
     preqc_files = args.preqc_file
-    print 'output_pfx: ', output_pfx
-    print 'preqc_files:'
-    print '\t' + '\n\t'.join(preqc_files)
+    print('output_pfx: ', output_pfx)
+    print('preqc_files:')
+    print('\t' + '\n\t'.join(preqc_files))
     # load data
     data = load_preqc_datafiles(preqc_files)
     # make the report
@@ -670,7 +672,7 @@ def plotsample_gc_distribution(ax, d):
         ax.set_title(d['name'] + ' GC Bias')
         ax.set_xlabel("GC %")
         ax.set_ylabel("k-mer coverage")
-        return True 
+        return True
     else:
         # no data, draw an empty plot
         return _finish_plot(ax, [], 0, 'No GC data for %s' % d['name'])
@@ -683,15 +685,21 @@ def load_preqc_datafiles(preqc_files):
         also assign colors and markers"""
     # load the data
     data = []
+    deserial_fail = []
     for f in preqc_files:
         f = os.path.abspath(f)
         if( os.path.getsize(f) <= 0 ):
-            print "Warning: empty file '%s' ... skipping"%f
+            print("Warning: empty file '%s' ... skipping"%f)
             continue
         if( f in [d['file'] for d in data] ):
-            print "Warning: duplicate file '%s' ... skipping"%f
+            print("Warning: duplicate file '%s' ... skipping"%f)
+            continue
+        try:
+            deserial = json.load(open(f, 'r'))
+        except ValueError:
+            deserial_fail.append(f)
             continue
-        data.append(json.load(open(f, 'r')))
+        data.append(deserial)
         data[-1]['file'] = f
     # create unique names for each entry
     names = unique_names_from_filenames([d['file'] for d in data], splitext=True)
@@ -701,6 +709,8 @@ def load_preqc_datafiles(preqc_files):
         data[i]['name'] = names[i]
         data[i]['plot_color'] = plot_colors[i%len(plot_colors)]
         data[i]['plot_marker'] = PLOT_MARKERS[i%len(PLOT_MARKERS)]
+    for failed in deserial_fail:
+        print("Warning: failed to de-serialize file:", failed)
     return data
 
 
@@ -754,7 +764,7 @@ def make_report_with_subplots(output_pfx, data, save_png=False, pylab_show=False
     plot_legend(subplots[2][0], data)
     # Coverage plots
     plot_kmer_distribution(subplots[2][2], data, use_markers=False, legend_loc=None)
-    #plot_random_walk(subplots[2][4], data, use_markers=False, legend_loc=None) #@jts deprecated 
+    #plot_random_walk(subplots[2][4], data, use_markers=False, legend_loc=None) #@jts deprecated
 
     # use the rest of the subplots for gc
     gc_subplots = []
@@ -821,7 +831,8 @@ def make_report_plot_per_page(plots, output_pfx, data, save_png=False, pylab_sho
             func = plotsample_funcs[plotname]
             data_list = data
         else: # unrecogized, skip
-            print >>sys.stderr, "Warning: plot name '"+plotname+"' not recognized ... skipping"
+            print("Warning: plot name '"+plotname+"' not recognized ... skipping",
+                  file=sys.stderr)
             continue
 
         # loop over each sample if needed, else d == data (data_list=[data])
@@ -831,7 +842,7 @@ def make_report_plot_per_page(plots, output_pfx, data, save_png=False, pylab_sho
             if( len(data_list) > 1 ):
                 figname += '_'+d['name'].replace('/','_').replace('\\','_')
             # make the fig
-            print 'plotting:',figname
+            print('plotting:',figname)
             fig, subplots = _create_fig_and_subplots(figname, 1, 1)
             # plot
             rv = func(subplots[0], d)
diff --git a/src/bin/sga-variant-filters.pl b/src/bin/sga-variant-filters.pl
deleted file mode 100755
index 6f8f964..0000000
--- a/src/bin/sga-variant-filters.pl
+++ /dev/null
@@ -1,115 +0,0 @@
-#! /usr/bin/perl
-# Perform filtering of SGA variant calls
-
-use strict;
-use Getopt::Long;
-use File::Basename;
-
-my $dbsnp_path = ""; # Filter variants against dbSNP at the given directory
-my $sga_file = "";
-my $dust_cutoff = 2.0;
-my $strand_cutoff = 2.0;
-my $hplen_cutoff = 7;
-
-GetOptions("dbsnp=s" => \$dbsnp_path,
-           "sga=s" => \$sga_file);
-
-die("The --sga option is mandatory") if($sga_file eq "");
-my %non_dbsnp_sites;
-my $using_dbsnp = 0;
-if($dbsnp_path ne "") {
-    $using_dbsnp = 1;
-    load_nondbsnp_sites($sga_file, $dbsnp_path);
-}
-
-filter_annotations($sga_file);
-
-sub filter_annotations
-{
-    my($in) = @_;
-    my $base = basename($in, ".vcf");
-    my $outname = "$base.filters.vcf";
-    open(OUT, ">$outname") || die("Cannot open $outname");
-    open(IN, $in) || die("Cannot open $in");
-    my $total_out = 0;
-    my %seen_hash;
-
-    while(<IN>) {
-        # Print header lines
-        if(/^#/) {
-            print OUT;
-            next;
-        }
-
-        chomp;
-    
-        # If this call is exactly the same as a previous call, ignore it
-        my @f = split;
-        my $key = sprintf("%s.%d.%s.%s", $f[0], $f[1], $f[3], $f[4]);
-        if(defined($seen_hash{$key})) {
-            next;
-        } else {
-            $seen_hash{$key} = 1;
-        }
-
-        my @filter_reason;
-        my ($dust) = (/Dust=(\d+\.\d+)/);
-        if($dust > $dust_cutoff) {
-            push @filter_reason, "LowComplexity";
-        }
-
-        my ($strand) = (/SB=(\d+\.\d+)/);
-        if($strand > $strand_cutoff) {
-            push @filter_reason, "StrandBias";
-        }
-
-        my ($hplen) = (/HPLen=(\d+)/);
-        if($hplen > $hplen_cutoff) {
-            push @filter_reason, "Homopolymer";
-        }
-
-        if($using_dbsnp && !defined($non_dbsnp_sites{$key})) { 
-            push @filter_reason, "dbSNP";
-        }
-
-        if(scalar(@filter_reason) > 0) {
-            $f[6] = join(";", @filter_reason);
-        } else {
-            $total_out++;
-        }
-        print OUT join("\t", @f) . "\n";
-    }
-    
-    close(IN);
-    close(OUT);
-
-    print "$total_out calls passed all filters\n";
-    return $outname;
-}
-
-# Build a hash of calls that are NOT present in dbsnp
-sub load_nondbsnp_sites
-{
-    my($in, $path) = @_;
-    my $out = "$in.dbsnp_filtered.vcf";
-    
-    # sort, bgzip and tabix the file
-    system("cat $in | vcf-sort > $in.tmp.sorted.vcf");
-    system("bgzip -f $in.tmp.sorted.vcf");
-    system("vcf tabix -f -p vcf $in.tmp.sorted.vcf.gz");
-    
-    # find the non-dbsnp sites
-    open(SITES, "vcf isec -C $in.tmp.sorted.vcf.gz $path |");
-    while(<SITES>) {
-        chomp;
-        my @fields = split;
-        my $site_key = "$fields[0].$fields[1].$fields[2].$fields[3]";
-        $non_dbsnp_sites{$site_key} = 1;
-    }
-    close(SITES);
-
-    # Cleanup tmp
-    unlink("$in.tmp.sorted.vcf");
-    unlink("$in.tmp.sorted.vcf.gz");
-    unlink("$in.tmp.sorted.vcf.gz.tbi");
-}
diff --git a/src/configure.ac b/src/configure.ac
index 73a1559..f774efc 100644
--- a/src/configure.ac
+++ b/src/configure.ac
@@ -1,7 +1,7 @@
 #                                               -*- Autoconf -*-
 # Process this file with autoconf to produce a configure script.
 AC_PREREQ(2.59)
-AC_INIT(sga, 0.10.13, js18 at sanger.ac.uk)
+AC_INIT(sga, 0.10.14, js18 at sanger.ac.uk)
 AM_INIT_AUTOMAKE(foreign)
 AC_CONFIG_SRCDIR([SGA/sga.cpp])
 AC_CONFIG_HEADER([config.h])

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/sga.git