[med-svn] [pbgenomicconsensus] 01/05: Imported Upstream version 2.0.0+20160420
Afif Elghraoui
afif at moszumanska.debian.org
Fri Apr 22 06:40:14 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository pbgenomicconsensus.
commit 0b4bc9ed31bc19fd00a3e990bcd6fb4ac90418cf
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Thu Apr 21 23:05:02 2016 -0700
Imported Upstream version 2.0.0+20160420
---
CHANGELOG | 3 +
GenomicConsensus/ResultCollector.py | 6 +-
GenomicConsensus/Worker.py | 8 +-
GenomicConsensus/__init__.py | 2 +-
.../{__init__.py => algorithmSelection.py} | 43 ++-
GenomicConsensus/{ => arrow}/__init__.py | 6 +-
.../{quiver/quiver.py => arrow/arrow.py} | 138 +++----
GenomicConsensus/arrow/diploid.py | 223 +++++++++++
GenomicConsensus/arrow/evidence.py | 173 +++++++++
GenomicConsensus/arrow/model.py | 109 ++++++
GenomicConsensus/{quiver => arrow}/utils.py | 232 ++++++------
GenomicConsensus/consensus.py | 12 +
GenomicConsensus/main.py | 125 ++++---
GenomicConsensus/options.py | 81 ++--
GenomicConsensus/plurality/plurality.py | 4 +-
GenomicConsensus/{ => poa}/__init__.py | 1 -
GenomicConsensus/poa/poa.py | 413 +++++++++++++++++++++
GenomicConsensus/quiver/model.py | 10 +-
GenomicConsensus/quiver/quiver.py | 21 +-
GenomicConsensus/quiver/utils.py | 44 +--
GenomicConsensus/reference.py | 52 +--
GenomicConsensus/utils.py | 40 +-
GenomicConsensus/windows.py | 3 +
README.md | 59 ++-
bin/arrow | 2 +
bin/gffToBed | 15 +-
bin/gffToVcf | 17 +-
bin/poa | 2 +
bin/summarizeConsensus | 130 +++----
circle.yml | 31 ++
doc/{QuiverFAQ.rst => FAQ.rst} | 174 +++++----
doc/HowTo.rst | 128 +++++++
doc/HowToQuiver.rst | 161 --------
doc/VariantCallerFunctionalSpecification.rst | 9 +-
doc/params-template.xml | 51 +++
setup.py | 11 +-
tests/cram/arrow-all4mer.t | 38 ++
tests/cram/bad_input.t | 6 +
tests/cram/best-all4mer.t | 23 ++
tests/cram/internal/alignment_summary_scaling.t | 25 ++
tests/cram/internal/arrow-staph.t | 32 ++
tests/cram/internal/quiver-compatibility.t | 3 +-
tests/cram/internal/quiver-ecoli.t | 2 +-
tests/cram/internal/quiver-eichler-bac.t | 2 +-
tests/cram/internal/quiver-fluidigm-amplicons.t | 2 +-
tests/cram/internal/quiver-mruber.t | 12 +-
.../internal/quiver-tinyLambda-coverage-islands.t | 2 -
tests/cram/plurality-all4mer.t | 37 ++
tests/cram/plurality-hcv.t | 55 ---
tests/cram/plurality-pbcore-lambda.t | 17 -
tests/cram/poa-all4mer.t | 37 ++
tests/cram/quiver-all4mer.t | 37 ++
tests/cram/quiver-hcv.t | 66 ----
tests/cram/version.t | 2 +-
tests/data/all4mer/All4mer.V2.01_Insert.fa | 5 +
tests/data/all4mer/All4mer.V2.01_Insert.fa.fai | 1 +
tests/data/all4mer/README | 5 +
tests/data/all4mer/hole-numbers.txt | 14 +
tests/data/all4mer/out.aligned_subreads.bam | Bin 0 -> 559393 bytes
tests/data/all4mer/out.aligned_subreads.bam.pbi | Bin 0 -> 9926 bytes
tests/data/sanity/empty.subreads.bam | Bin 0 -> 728 bytes
tests/data/sanity/empty.subreads.bam.pbi | Bin 0 -> 67 bytes
tests/data/sanity/mapped.subreads.bam | Bin 0 -> 3334 bytes
tests/data/sanity/mapped.subreads.bam.pbi | Bin 0 -> 127 bytes
tests/data/sanity/mixed.alignmentset.xml | 19 +
tests/unit/AlignmentHitStubs.py | 32 +-
tests/unit/test_algorithm_selection.py | 14 +
tests/unit/test_quiver.py | 48 +++
tests/unit/test_summarize_consensus.py | 78 ++++
tests/unit/test_tool_contract.py | 17 +-
70 files changed, 2268 insertions(+), 902 deletions(-)
diff --git a/CHANGELOG b/CHANGELOG
index b6eed56..1e0cfea 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,6 @@
+Version 2.0.0
+ * Working support for Arrow and POA-only consensus models
+
Version 1.1.0
* Working support for DataSet read and reference files
diff --git a/GenomicConsensus/ResultCollector.py b/GenomicConsensus/ResultCollector.py
index 45b3692..2e1b3ad 100644
--- a/GenomicConsensus/ResultCollector.py
+++ b/GenomicConsensus/ResultCollector.py
@@ -43,8 +43,9 @@ class ResultCollector(object):
"""
Gathers results and writes to a file.
"""
- def __init__(self, resultsQueue, algorithmConfig):
+ def __init__(self, resultsQueue, algorithmName, algorithmConfig):
self._resultsQueue = resultsQueue
+ self._algorithmName = algorithmName
self._algorithmConfig = algorithmConfig
def _run(self):
@@ -140,8 +141,7 @@ class ResultCollector(object):
spanName = refName
else:
spanName = refName + "_%d_%d" % (s, e)
- cssName = consensus.consensusContigName(spanName,
- options.algorithm)
+ cssName = consensus.consensusContigName(spanName, self._algorithmName)
# Gather just the chunks pertaining to this span
chunksThisSpan = [ chunk for chunk in self.consensusChunksByRefId[refId]
if windows.windowsIntersect(chunk.refWindow, span) ]
diff --git a/GenomicConsensus/Worker.py b/GenomicConsensus/Worker.py
index 37e6c03..a647716 100644
--- a/GenomicConsensus/Worker.py
+++ b/GenomicConsensus/Worker.py
@@ -54,10 +54,10 @@ class Worker(object):
def _run(self):
if options.usingBam:
- self._inCmpH5 = loadBam(options.inputFilename, options.referenceFilename)
+ self._inAlnFile = loadBam(options.inputFilename, options.referenceFilename)
else:
- self._inCmpH5 = loadCmpH5(options.inputFilename, options.referenceFilename,
- disableChunkCache=options.disableHdf5ChunkCache)
+ self._inAlnFile = loadCmpH5(options.inputFilename, options.referenceFilename,
+ disableChunkCache=options.disableHdf5ChunkCache)
self.onStart()
while True:
@@ -81,7 +81,7 @@ class Worker(object):
def run(self):
- if options.debug:
+ if options.pdb:
import ipdb
with ipdb.launch_ipdb_on_exception():
self._run()
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/__init__.py
index abed5b6..0e66fb7 100644
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/__init__.py
@@ -30,4 +30,4 @@
# Author: David Alexander
-__VERSION__ = "1.1.0"
+__VERSION__ = "2.0.0"
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/algorithmSelection.py
similarity index 54%
copy from GenomicConsensus/__init__.py
copy to GenomicConsensus/algorithmSelection.py
index abed5b6..e9dfe49 100644
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/algorithmSelection.py
@@ -1,5 +1,6 @@
+#!/usr/bin/env python
#################################################################################
-# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+# Copyright (c) 2011-2016, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
@@ -30,4 +31,42 @@
# Author: David Alexander
-__VERSION__ = "1.1.0"
+from .utils import die
+
+def bestAlgorithm_(sequencingChemistries):
+ """
+ Identify the (de novo) consensus algorithm we expect to deliver
+ the best results, given the sequencing chemistries represented in
+ an alignment file.
+
+ We key off the sequencing chemistries as follows:
+
+ - Just RS chemistry data? Then use quiver (at least for now, until
+ we get arrow > quiver on P6-C4)
+ - Else (either all Sequel data, or a mix of Sequel and RS data),
+ use arrow.
+ - Unknown chemistry found? Return None; the caller should abort in this case
+
+ Note that the handling/rejection of chemistry mixtures (including
+ mixtures of Sequel and RS data) is left to the algorithm itself.
+ """
+ if len(sequencingChemistries) == 0:
+ raise ValueError("sequencingChemistries must be a nonempty list or set")
+ chems = set(sequencingChemistries)
+ anyUnknown = "unknown" in chems
+ allRS = all(not(chem.startswith("S/")) for chem in chems) and (not anyUnknown)
+
+ if anyUnknown:
+ return None
+ elif allRS:
+ return "quiver"
+ else:
+ return "arrow"
+
+def bestAlgorithm(sequencingChemistries):
+ ba = bestAlgorithm_(sequencingChemistries)
+ if ba is None:
+ die("Unidentifiable sequencing chemistry present in dataset. " +
+ "Check if your SMRTanalysis installation is out-of-date.")
+ else:
+ return ba
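
For reference, a minimal sketch of how this selection behaves on a few
example chemistry sets (the "S/" prefix marking Sequel chemistries is the
convention keyed on above; "S/P1-C1" is a hypothetical chemistry string):

    from GenomicConsensus.algorithmSelection import bestAlgorithm_

    assert bestAlgorithm_(["P6-C4"]) == "quiver"            # RS-only data
    assert bestAlgorithm_(["S/P1-C1"]) == "arrow"           # Sequel data
    assert bestAlgorithm_(["P6-C4", "S/P1-C1"]) == "arrow"  # mixed RS/Sequel
    assert bestAlgorithm_(["unknown"]) is None              # caller should abort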
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/arrow/__init__.py
old mode 100644
new mode 100755
similarity index 95%
copy from GenomicConsensus/__init__.py
copy to GenomicConsensus/arrow/__init__.py
index abed5b6..9ab16cd
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/arrow/__init__.py
@@ -28,6 +28,8 @@
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
-# Author: David Alexander
+# Authors: David Alexander, Lance Hepler
-__VERSION__ = "1.1.0"
+import utils
+import model
+# import evidence
diff --git a/GenomicConsensus/quiver/quiver.py b/GenomicConsensus/arrow/arrow.py
old mode 100644
new mode 100755
similarity index 59%
copy from GenomicConsensus/quiver/quiver.py
copy to GenomicConsensus/arrow/arrow.py
index 1e2fea5..ced2e25
--- a/GenomicConsensus/quiver/quiver.py
+++ b/GenomicConsensus/arrow/arrow.py
@@ -28,27 +28,27 @@
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
-# Author: David Alexander
+# Authors: David Alexander, Lance Hepler
import logging
-import ConsensusCore as cc, numpy as np
+import ConsensusCore2 as cc, numpy as np
from .. import reference
from ..options import options
from ..Worker import WorkerProcess, WorkerThread
from ..ResultCollector import ResultCollectorProcess, ResultCollectorThread
-from GenomicConsensus.consensus import Consensus, QuiverConsensus, join
+from GenomicConsensus.consensus import Consensus, ArrowConsensus, join
from GenomicConsensus.windows import kSpannedIntervals, holes, subWindow
from GenomicConsensus.variants import filterVariants, annotateVariants
-from GenomicConsensus.quiver.evidence import dumpEvidence
-from GenomicConsensus.quiver import diploid
+from GenomicConsensus.arrow.evidence import dumpEvidence
+from GenomicConsensus.arrow import diploid
-import GenomicConsensus.quiver.model as M
-import GenomicConsensus.quiver.utils as U
+import GenomicConsensus.arrow.model as M
+import GenomicConsensus.arrow.utils as U
-def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
- depthLimit, quiverConfig):
+def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
+ depthLimit, arrowConfig):
"""
High-level routine for calling the consensus for a
window of the genome given a cmp.h5.
@@ -59,21 +59,21 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
inadequate coverage.
"""
winId, winStart, winEnd = refWindow
- logging.info("Quiver operating on %s" %
+ logging.info("Arrow operating on %s" %
reference.windowToString(refWindow))
if options.fancyChunking:
- # 1) identify the intervals with adequate coverage for quiver
+ # 1) identify the intervals with adequate coverage for arrow
# consensus; restrict to intervals of length > 10
- alnHits = U.readsInWindow(cmpH5, refWindow,
+ alnHits = U.readsInWindow(alnFile, refWindow,
depthLimit=20000,
- minMapQV=quiverConfig.minMapQV,
- strategy="long-and-strand-balanced",
+ minMapQV=arrowConfig.minMapQV,
+ strategy="longest",
stratum=options.readStratum,
barcode=options.barcode)
starts = np.fromiter((hit.tStart for hit in alnHits), np.int)
ends = np.fromiter((hit.tEnd for hit in alnHits), np.int)
- intervals = kSpannedIntervals(refWindow, quiverConfig.minPoaCoverage,
+ intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
starts, ends, minLength=10)
coverageGaps = holes(refWindow, intervals)
allIntervals = sorted(intervals + coverageGaps)
@@ -95,17 +95,17 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
subWin = subWindow(refWindow, interval)
windowRefSeq = referenceContig[intStart:intEnd]
- alns = U.readsInWindow(cmpH5, subWin,
+ alns = U.readsInWindow(alnFile, subWin,
depthLimit=depthLimit,
- minMapQV=quiverConfig.minMapQV,
- strategy="long-and-strand-balanced",
+ minMapQV=arrowConfig.minMapQV,
+ strategy="longest",
stratum=options.readStratum,
barcode=options.barcode)
clippedAlns_ = [ aln.clippedTo(*interval) for aln in alns ]
- clippedAlns = U.filterAlns(subWin, clippedAlns_, quiverConfig)
+ clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)
if len([ a for a in clippedAlns
- if a.spansReferenceRange(*interval) ]) >= quiverConfig.minPoaCoverage:
+ if a.spansReferenceRange(*interval) ]) >= arrowConfig.minPoaCoverage:
logging.debug("%s: Reads being used: %s" %
(reference.windowToString(subWin),
@@ -114,20 +114,14 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
css = U.consensusForAlignments(subWin,
intRefSeq,
clippedAlns,
- quiverConfig)
+ arrowConfig)
siteCoverage = U.coverageInWindow(subWin, alns)
- if options.diploid:
- variants_ = diploid.variantsFromConsensus(subWin, windowRefSeq,
- css.sequence, css.confidence, siteCoverage,
- options.aligner,
- css.mms)
- else:
- variants_ = U.variantsFromConsensus(subWin, windowRefSeq,
- css.sequence, css.confidence, siteCoverage,
- options.aligner,
- mms=None)
+ variants_ = U.variantsFromConsensus(subWin, windowRefSeq,
+ css.sequence, css.confidence, siteCoverage,
+ options.aligner,
+ ai=None)
filteredVars = filterVariants(options.minCoverage,
options.minConfidence,
@@ -143,12 +137,13 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
((options.dumpEvidence == "all") or
(options.dumpEvidence == "variants") and (len(variants) > 0))
if shouldDumpEvidence:
- dumpEvidence(options.evidenceDirectory,
- subWin, windowRefSeq,
- clippedAlns, css)
+ logging.info("Arrow does not yet support --dumpEvidence")
+# dumpEvidence(options.evidenceDirectory,
+# subWin, windowRefSeq,
+# clippedAlns, css)
else:
- css = QuiverConsensus.noCallConsensus(quiverConfig.noEvidenceConsensus,
- subWin, intRefSeq)
+ css = ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
+ subWin, intRefSeq)
subConsensi.append(css)
# 4) glue the subwindow consensus objects together to form the
@@ -159,10 +154,10 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
return css, variants
-class QuiverWorker(object):
+class ArrowWorker(object):
@property
- def quiverConfig(self):
+ def arrowConfig(self):
return self._algorithmConfig
def onChunk(self, workChunk):
@@ -173,8 +168,8 @@ class QuiverWorker(object):
# Quick cutout for no-coverage case
if not workChunk.hasCoverage:
- noCallCss = QuiverConsensus.noCallConsensus(self.quiverConfig.noEvidenceConsensus,
- referenceWindow, refSeqInWindow)
+ noCallCss = ArrowConsensus.noCallConsensus(self.arrowConfig.noEvidenceConsensus,
+ referenceWindow, refSeqInWindow)
return (referenceWindow, (noCallCss, []))
# General case
@@ -193,8 +188,8 @@ class QuiverWorker(object):
# Get the consensus for the enlarged window.
#
css_, variants_ = \
- consensusAndVariantsForWindow(self._inCmpH5, eWindow,
- refContig, options.coverage, self.quiverConfig)
+ consensusAndVariantsForWindow(self._inAlnFile, eWindow,
+ refContig, options.coverage, self.arrowConfig)
#
# Restrict the consensus and variants to the reference window.
@@ -220,8 +215,8 @@ class QuiverWorker(object):
#
# Slave process/thread classes
#
-class QuiverWorkerProcess(QuiverWorker, WorkerProcess): pass
-class QuiverWorkerThread(QuiverWorker, WorkerThread): pass
+class ArrowWorkerProcess(ArrowWorker, WorkerProcess): pass
+class ArrowWorkerThread(ArrowWorker, WorkerThread): pass
#
@@ -232,52 +227,27 @@ __all__ = [ "name",
"configure",
"slaveFactories" ]
-name = "Quiver"
+name = "arrow"
availability = (True, "OK")
-def configure(options, cmpH5):
- if options.verbosity > 1:
- cc.Logging.EnableDiagnosticLogging()
-
- if cmpH5.readType != "standard":
+def configure(options, alnFile):
+ if alnFile.readType != "standard":
raise U.IncompatibleDataException(
- "The Quiver algorithm requires a cmp.h5 file containing standard (non-CCS) reads." )
+ "The Arrow algorithm requires a BAM file containing standard (non-CCS) reads." )
- if options.parametersSpec == "auto":
- if options.diploid:
- logging.info("Diploid analysis--resorting to unknown.NoQVsModel until other " +
- "parameter sets can be recalibrated.")
- params = M.loadParameterSets(options.parametersFile, spec="unknown.NoQVsModel")
- else:
- params = M.loadParameterSets(options.parametersFile, cmpH5=cmpH5)
- qvMsg = "This .cmp.h5 file lacks some of the QV data tracks that are required " + \
- "for optimal performance of the Quiver algorithm. For optimal results" + \
- " use the ResequencingQVs workflow in SMRTPortal with bas.h5 files " + \
- "from an instrument using software version 1.3.1 or later, or the " + \
- "--forQuiver option to pbalign."
- if not M.enoughQVsLoaded(cmpH5):
- raise U.IncompatibleDataException(qvMsg)
- elif not M.allQVsLoaded(cmpH5):
- logging.warn(qvMsg)
- else:
- params = M.loadParameterSets(options.parametersFile,
- spec=options.parametersSpec,
- cmpH5=cmpH5)
- if not all(ps.model.isCompatibleWithCmpH5(cmpH5) for ps in params.values()):
- raise U.IncompatibleDataException(
- "Selected Quiver parameter set is incompatible with this cmp.h5 file " +
- "due to missing data tracks.")
-
- logging.info("Using Quiver parameter set(s): %s" % (", ".join(ps.name for ps in params.values())))
- return M.QuiverConfig(minMapQV=options.minMapQV,
- noEvidenceConsensus=options.noEvidenceConsensusCall,
- refineDinucleotideRepeats=(not options.fastMode) and options.refineDinucleotideRepeats,
- computeConfidence=(not options.fastMode),
- parameterSets=params)
+ if options.diploid:
+ logging.warn("Diploid analysis not yet supported under the Arrow model.")
+
+ return M.ArrowConfig(minMapQV=options.minMapQV,
+ noEvidenceConsensus=options.noEvidenceConsensusCall,
+ computeConfidence=(not options.fastMode),
+ minReadScore=options.minReadScore,
+ minHqRegionSnr=options.minHqRegionSnr,
+ minZScore=options.minZScore)
def slaveFactories(threaded):
# By default we use slave processes. The tuple ordering is important.
if threaded:
- return (QuiverWorkerThread, ResultCollectorThread)
+ return (ArrowWorkerThread, ResultCollectorThread)
else:
- return (QuiverWorkerProcess, ResultCollectorProcess)
+ return (ArrowWorkerProcess, ResultCollectorProcess)
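
The fancy-chunking step above splits a window into adequately covered
intervals and coverage gaps. A minimal sketch of that bookkeeping, with
made-up read extents, assuming only the call signatures shown in the hunk
above:

    import numpy as np
    from GenomicConsensus.windows import kSpannedIntervals, holes

    refWindow = (0, 0, 1000)              # (refId, winStart, winEnd), made up
    starts = np.array([0, 10, 400, 420])  # hypothetical tStart values
    ends   = np.array([300, 320, 900, 950])

    # Intervals spanned end-to-end by >= 2 reads, at least 10 bp long...
    covered = kSpannedIntervals(refWindow, 2, starts, ends, minLength=10)
    # ...plus the complementary gaps, which receive no-call consensus.
    allIntervals = sorted(covered + holes(refWindow, covered))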
diff --git a/GenomicConsensus/arrow/diploid.py b/GenomicConsensus/arrow/diploid.py
new file mode 100755
index 0000000..7c88f33
--- /dev/null
+++ b/GenomicConsensus/arrow/diploid.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Authors: David Alexander, Lance Hepler
+
+from GenomicConsensus.arrow.utils import allSingleBaseMutations
+from GenomicConsensus.variants import Variant
+
+import numpy as np
+import ConsensusCore2 as cc
+
+# IUPAC reference:
+# http://www.bioinformatics.org/sms/iupac.html
+
+_packIupac = { ("A", "G") : "R" ,
+ ("G", "A") : "R" ,
+ ("C", "T") : "Y" ,
+ ("T", "C") : "Y" ,
+ ("G", "C") : "S" ,
+ ("C", "G") : "S" ,
+ ("A", "T") : "W" ,
+ ("T", "A") : "W" ,
+ ("G", "T") : "K" ,
+ ("T", "G") : "K" ,
+ ("A", "C") : "M" ,
+ ("C", "A") : "M" }
+
+_unpackIupac = { "R" : ("A", "G") ,
+ "Y" : ("C", "T") ,
+ "S" : ("G", "C") ,
+ "W" : ("A", "T") ,
+ "K" : ("G", "T") ,
+ "M" : ("A", "C") }
+
+def packIUPAC(bases):
+ return _packIupac[bases]
+
+def unpackIUPAC(iupacCode):
+ return _unpackIupac[iupacCode]
+
+def isHeterozygote(base):
+ return (base in _unpackIupac)
+
+def packMuts(cssBase, mut1, mut2):
+ # Turn two muts (with same Start, End, LengthDiff) into a single
+ # IUPAC-coded mutation. The no-op mutation is coded as None.
+ #
+ # Example1: (_, Subs A, Subs T) -> Subs W
+ # Example2: (_, Ins A, Ins T) -> Ins W
+ # Example3: (A, None, Subs T) -> Subs W
+ #
+ nonNullMut = mut1 or mut2
+ start = nonNullMut.Start()
+ mutType = nonNullMut.Type
+ newBase1 = mut1.Base if mut1 else cssBase
+ newBase2 = mut2.Base if mut2 else cssBase
+ newBasePacked = packIUPAC((newBase1, newBase2))
+ return cc.Mutation(mutType, start, newBasePacked)
+
+
+def scoresForPosition(ai, pos):
+ muts = allSingleBaseMutations(str(ai), positions=[pos])
+ noMutScore = [0] * ai.NumReads()
+ mutScores_ = [ ai.ReadLLs(mut)
+ for mut in muts ]
+ mutScores = np.column_stack([noMutScore] + mutScores_).astype(np.float32)
+ return mutScores
+
+
+def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
+ cssQvInWindow=None, siteCoverage=None, aligner="affine",
+ ai=None):
+ """
+ Compare the consensus and the reference in this window, returning
+ a list of variants.
+
+ Uses the integrator to identify heterozygous variants.
+ """
+ assert (cssQvInWindow is None) == (siteCoverage is None) # Both or none
+
+ refId, refStart, refEnd = refWindow
+
+ if ai is not None:
+ #
+ # Hunting diploid variants:
+ # 1. find confident heterozygous sites;
+ # 2. build a "diploid consensus" using IUPAC encoding
+ # for het sites; mark cssQv accordingly
+ # 3. align diploid consensus to reference
+ # 4. extract and decorate variants
+ #
+ assert str(ai) == cssSequenceInWindow
+ iupacMutations = [] # List of (Mutation, confidence)
+ for pos in xrange(0, ai.Length()):
+ ds = cc.IsSiteHeterozygous(scoresForPosition(ai, pos), 40)
+ if ds:
+ muts = [None] + list(allSingleBaseMutations(cssSequenceInWindow, positions=[pos]))
+ mut0 = muts[ds.Allele0]
+ mut1 = muts[ds.Allele1]
+ cssBase = cssSequenceInWindow[pos]
+ packedMut = packMuts(cssBase, mut0, mut1)
+ iupacMutations.append((packedMut, 40))
+
+ # Create diploidCss by applying mutations, meanwhile updating the
+ # confidence vector accordingly.
+ diploidCss = cc.ApplyMutations([pair[0] for pair in iupacMutations],
+ cssSequenceInWindow)
+
+ diploidQv = list(cssQvInWindow) if cssQvInWindow is not None else None
+
+ runningLengthDiff = 0
+ for (mut, conf) in iupacMutations:
+ start = mut.Start() + runningLengthDiff
+ end = mut.End() + runningLengthDiff
+ diploidQv[start:end] = [conf]
+ assert len(diploidCss) == len(diploidQv)
+
+ cssSequenceInWindow = diploidCss
+ cssQvInWindow = diploidQv
+
+ vars = variantsFromAlignment(refWindow,
+ refSequenceInWindow, cssSequenceInWindow,
+ cssQvInWindow, siteCoverage)
+ return vars
+
+
+def variantsFromAlignment(refWindow, refSeq, cssSeq,
+ cssQV=None, refCoverage=None):
+ """
+ Extract the variants implied by a pairwise alignment of cssSeq to
+ refSeq reference. If cssQV, refCoverage are provided, they will
+ be used to decorate the variants with those attributes.
+
+ Arguments:
+ - cssQV: QV array, same length as css
+ - refCoverage: coverage array, same length as reference window
+
+ This is trickier than in the haploid case. We have to break out
+ diploid variants as single bases, in order to avoid implying
+ phase.
+ """
+ variants = []
+ refId, refStart, refEnd = refWindow
+
+ aln = cc.AlignAffineIupac(refSeq, cssSeq);
+ alnTarget = aln.Target()
+ alnQuery = aln.Query()
+
+ assert (cssQV is None) == (refCoverage is None) # Both or none
+ assert len(refSeq) == refEnd - refStart
+ assert cssQV is None or len(cssSeq) == len(cssQV)
+ assert refCoverage is None or len(refSeq) == len(refCoverage)
+
+ transcript = [ X if (Q != "N" and T != "N") else "N"
+ for (X, T, Q) in zip(aln.Transcript(),
+ alnTarget,
+ alnQuery) ]
+ variants = []
+ runStart = -1
+ runStartRefPos = None
+ runX = None
+ refPos = refStart
+ for pos, (X, T, Q) in enumerate(zip(transcript,
+ alnTarget,
+ alnQuery)):
+ if X != runX or isHeterozygote(Q):
+ if runStart >= 0 and runX not in "MN":
+ # Package up the run and dump a variant
+ ref = alnTarget[runStart:pos].replace("-", "")
+ read = alnQuery [runStart:pos].replace("-", "")
+ if isHeterozygote(read):
+ allele1, allele2 = unpackIUPAC(read)
+ var = Variant(refId, runStartRefPos, refPos, ref, allele1, allele2)
+ else:
+ var = Variant(refId, runStartRefPos, refPos, ref, read)
+ variants.append(var)
+ runStart = pos
+ runStartRefPos = refPos
+ runX = X
+ if T != "-": refPos += 1
+
+
+ # This might be better handled within the loop above, just keeping
+ # track of Qpos, Tpos
+ if cssQV is not None:
+ cssPosition = cc.TargetToQueryPositions(aln)
+ for v in variants:
+ # HACK ALERT: we are not really handling the confidence or
+ # coverage for variants at last position of the window
+ # correctly here.
+ refPos_ = min(v.refStart-refStart, len(refCoverage)-1)
+ cssPos_ = min(cssPosition[v.refStart-refStart], len(cssQV)-1)
+
+ if refCoverage is not None: v.coverage = refCoverage[refPos_]
+ if cssQV is not None: v.confidence = cssQV[cssPos_]
+
+ return variants
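
A minimal sketch of the IUPAC helpers defined above; the ambiguity codes
are exactly those in the hard-coded tables:

    from GenomicConsensus.arrow.diploid import (packIUPAC, unpackIUPAC,
                                                isHeterozygote)

    assert packIUPAC(("A", "T")) == "W"    # an A/T het site packs to W
    assert unpackIUPAC("W") == ("A", "T")  # and unpacks to the base pair
    assert isHeterozygote("W") and not isHeterozygote("A")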
diff --git a/GenomicConsensus/arrow/evidence.py b/GenomicConsensus/arrow/evidence.py
new file mode 100755
index 0000000..450f9c5
--- /dev/null
+++ b/GenomicConsensus/arrow/evidence.py
@@ -0,0 +1,173 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Authors: David Alexander, Lance Hepler
+
+__all__ = [ "dumpEvidence",
+ "ArrowEvidence" ]
+
+import h5py, logging, os.path, numpy as np
+from collections import namedtuple
+from itertools import groupby
+from bisect import bisect_left, bisect_right
+from pbcore.io import FastaReader, FastaWriter
+from .utils import scoreMatrix
+from .. import reference
+
+def dumpEvidence(evidenceDumpBaseDirectory,
+ refWindow, refSequence, alns,
+ arrowConsensus):
+ # Format of evidence dump:
+ # evidence_dump/
+ # ref000001/
+ # 0-1005/
+ # reference.fa
+ # reads.fa
+ # consensus.fa
+ # arrow-scores.h5
+ # 995-2005/
+ # ...
+ join = os.path.join
+ refId, refStart, refEnd = refWindow
+ refName = reference.idToName(refId)
+ windowDirectory = join(evidenceDumpBaseDirectory,
+ refName,
+ "%d-%d" % (refStart, refEnd))
+ logging.info("Dumping evidence to %s" % (windowDirectory,))
+
+ if os.path.exists(windowDirectory):
+ raise Exception, "Evidence dump does not expect directory %s to exist." % windowDirectory
+ os.makedirs(windowDirectory)
+ refFasta = FastaWriter(join(windowDirectory, "reference.fa"))
+ readsFasta = FastaWriter(join(windowDirectory, "reads.fa"))
+ consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
+
+ windowName = refName + (":%d-%d" % (refStart, refEnd))
+ refFasta.writeRecord(windowName, refSequence)
+ refFasta.close()
+
+ consensusFasta.writeRecord(windowName + "|arrow", arrowConsensus.sequence)
+ consensusFasta.close()
+
+ rowNames, columnNames, baselineScores, scores = scoreMatrix(arrowConsensus.ai)
+ arrowScoreFile = h5py.File(join(windowDirectory, "arrow-scores.h5"))
+ arrowScoreFile.create_dataset("Scores", data=scores)
+ vlen_str = h5py.special_dtype(vlen=str)
+ arrowScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
+ arrowScoreFile.create_dataset("ColumnNames", data=columnNames, dtype=vlen_str)
+ arrowScoreFile.create_dataset("BaselineScores", data=baselineScores)
+ arrowScoreFile.close()
+ for aln in alns:
+ readsFasta.writeRecord(str(aln.rowNumber),
+ aln.read(orientation="genomic", aligned=False))
+ readsFasta.close()
+
+
+class ArrowEvidence(object):
+ """
+ An experimental reader class for arrow evidence dumps produced by
+ arrow --dumpEvidence
+ """
+
+ Mutation = namedtuple("Mutation", ("Position", "Type", "FromBase", "ToBase"))
+
+ @staticmethod
+ def _parseMutName(mutName):
+ fields = mutName.split(" ")
+ pos = int(fields[0])
+ type, fromBase, _, toBase = fields[1:]
+ return ArrowEvidence.Mutation(pos, type, fromBase, toBase)
+
+ def __init__(self, path, refStart, consensus, rowNames, colNames, baselineScores, scores):
+ self.path = path
+ self.refStart = refStart
+ self.consensus = consensus
+ self.rowNames = rowNames
+ self.colNames = colNames
+ self.baselineScores = baselineScores
+ self.scores = scores
+ self.muts = map(ArrowEvidence._parseMutName, self.colNames)
+
+ @property
+ def positions(self):
+ return [ mut.Position for mut in self.muts ]
+
+ @property
+ def uniquePositions(self):
+ return sorted(list(set(self.positions)))
+
+ @property
+ def totalScores(self):
+ return self.baselineScores[:, np.newaxis] + self.scores
+
+ @staticmethod
+ def load(path):
+ if path.endswith("/"): path = path[:-1]
+
+ refWin_ = path.split("/")[-1].split("-")
+ refStart = int(refWin_[0])
+
+ with FastaReader(path + "/consensus.fa") as fr:
+ consensus = next(iter(fr)).sequence
+
+ with h5py.File(path + "/arrow-scores.h5", "r") as f:
+ scores = f["Scores"].value
+ baselineScores = f["BaselineScores"].value
+ colNames = f["ColumnNames"].value
+ rowNames = f["RowNames"].value
+ return ArrowEvidence(path, refStart, consensus,
+ rowNames, colNames,
+ baselineScores, scores)
+
+ def forPosition(self, pos):
+ posStart = bisect_left(self.positions, pos)
+ posEnd = bisect_right(self.positions, pos)
+ return ArrowEvidence(self.path,
+ self.refStart,
+ self.consensus,
+ self.rowNames,
+ self.colNames[posStart:posEnd],
+ self.baselineScores,
+ self.scores[:, posStart:posEnd])
+
+
+ def justSubstitutions(self):
+ colMask = np.array(map(lambda s: ("Sub" in s), self.colNames))
+ return ArrowEvidence(self.path,
+ self.refStart,
+ self.consensus,
+ self.rowNames,
+ self.colNames[colMask],
+ self.baselineScores,
+ self.scores[:, colMask])
+
+ def rowNumbers(self):
+ with FastaReader(self.path + "/reads.fa") as fr:
+ return [ int(ctg.name) for ctg in fr ]
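
A minimal sketch of reading one of these dumps back in; the window
directory path is hypothetical and follows the evidence_dump/ layout
documented in dumpEvidence above:

    from GenomicConsensus.arrow.evidence import ArrowEvidence

    ev = ArrowEvidence.load("evidence_dump/ref000001/0-1005")
    total = ev.totalScores           # baseline scores plus per-mutation deltas
    subs = ev.justSubstitutions()    # keep only substitution columns
    print(subs.uniquePositions[:5])  # a few positions with substitution evidence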
diff --git a/GenomicConsensus/arrow/model.py b/GenomicConsensus/arrow/model.py
new file mode 100755
index 0000000..2138c65
--- /dev/null
+++ b/GenomicConsensus/arrow/model.py
@@ -0,0 +1,109 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Authors: David Alexander, Lance Hepler
+
+import numpy as np, ConfigParser, collections, logging
+from glob import glob
+from os.path import join
+from pkg_resources import resource_filename, Requirement
+
+from GenomicConsensus.utils import die
+from GenomicConsensus.arrow.utils import fst, snd
+from pbcore.chemistry import ChemistryLookupError
+from pbcore.io import CmpH5Alignment
+import ConsensusCore2 as cc
+
+__all__ = [ "ArrowConfig" ]
+
+
+#
+# ArrowConfig: the kitchen sink class of arrow options
+#
+
+class ArrowConfig(object):
+ """
+ Arrow configuration options
+ """
+ def __init__(self,
+ minMapQV=10,
+ minPoaCoverage=3,
+ maxPoaCoverage=11,
+ mutationSeparation=10,
+ mutationNeighborhood=20,
+ maxIterations=40,
+ noEvidenceConsensus="nocall",
+ computeConfidence=True,
+ readStumpinessThreshold=0.1,
+ minReadScore=0.75,
+ minHqRegionSnr=3.75,
+ minZScore=-3.5):
+
+ self.minMapQV = minMapQV
+ self.minPoaCoverage = minPoaCoverage
+ self.maxPoaCoverage = maxPoaCoverage
+ self.mutationSeparation = mutationSeparation
+ self.mutationNeighborhood = mutationNeighborhood
+ self.maxIterations = maxIterations
+ self.noEvidenceConsensus = noEvidenceConsensus
+ self.computeConfidence = computeConfidence
+ self.readStumpinessThreshold = readStumpinessThreshold
+ self.minReadScore = minReadScore
+ self.minHqRegionSnr = minHqRegionSnr
+ self.minZScore = minZScore
+
+ @staticmethod
+ def extractFeatures(aln):
+ """
+ Extract the data in a cmp.h5 alignment record into a
+ native-orientation gapless string.
+ """
+ if isinstance(aln, CmpH5Alignment):
+ die("Arrow does not support CmpH5 files!")
+ else:
+ return aln.read(aligned=False, orientation="native")
+
+ @staticmethod
+ def extractMappedRead(aln, windowStart):
+ """
+ Given a clipped alignment, convert its coordinates into template
+ space (starts with 0), bundle it up with its features as a
+ MappedRead.
+ """
+ assert aln.referenceSpan > 0
+ name = aln.readName
+ chemistry = aln.sequencingChemistry
+ strand = cc.StrandEnum_REVERSE if aln.isReverseStrand else cc.StrandEnum_FORWARD
+ read = cc.Read(name, ArrowConfig.extractFeatures(aln), chemistry)
+ return (cc.MappedRead(read,
+ strand,
+ int(aln.referenceStart - windowStart),
+ int(aln.referenceEnd - windowStart)),
+ cc.SNR(aln.hqRegionSnr))
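
A minimal sketch of building this configuration by hand, restating the
defaults above (configure() in arrow.py passes the corresponding
command-line options instead):

    from GenomicConsensus.arrow.model import ArrowConfig

    config = ArrowConfig(minMapQV=10,         # drop poorly mapped alignments
                         minReadScore=0.75,   # read-quality floor for filterAlns
                         minHqRegionSnr=3.75, # per-channel SNR floor
                         minZScore=-3.5)      # z-score gate used by AddRead
    assert config.noEvidenceConsensus == "nocall"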
diff --git a/GenomicConsensus/quiver/utils.py b/GenomicConsensus/arrow/utils.py
old mode 100644
new mode 100755
similarity index 63%
copy from GenomicConsensus/quiver/utils.py
copy to GenomicConsensus/arrow/utils.py
index 9d8470c..c8bbe8e
--- a/GenomicConsensus/quiver/utils.py
+++ b/GenomicConsensus/arrow/utils.py
@@ -28,16 +28,17 @@
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
-# Author: David Alexander
+# Authors: David Alexander, Lance Hepler
-import numpy as np, itertools, logging, re
+import numpy as np, itertools, logging, re, sys
from collections import Counter
+from traceback import format_exception
from GenomicConsensus.variants import *
from GenomicConsensus.utils import *
-from GenomicConsensus.consensus import QuiverConsensus
+from GenomicConsensus.consensus import ArrowConsensus
from pbcore.io.rangeQueries import projectIntoRange
-import ConsensusCore as cc
+import ConsensusCore2 as cc
def uniqueSingleBaseMutations(templateSequence, positions=None):
"""
@@ -52,15 +53,19 @@ def uniqueSingleBaseMutations(templateSequence, positions=None):
# snvs
for subsBase in allBases:
if subsBase != tplBase:
- yield cc.Mutation(cc.SUBSTITUTION, tplStart, subsBase)
+ yield cc.Mutation(cc.MutationType_SUBSTITUTION,
+ tplStart,
+ subsBase)
# Insertions---only allowing insertions that are not cognate
# with the previous base.
for insBase in allBases:
if insBase != prevTplBase:
- yield cc.Mutation(cc.INSERTION, tplStart, insBase)
+ yield cc.Mutation(cc.MutationType_INSERTION,
+ tplStart,
+ insBase)
# Deletion--only allowed if refBase does not match previous tpl base
if tplBase != prevTplBase:
- yield cc.Mutation(cc.DELETION, tplStart, "-")
+ yield cc.Mutation(cc.MutationType_DELETION, tplStart)
def allSingleBaseMutations(templateSequence, positions=None):
"""
@@ -74,12 +79,16 @@ def allSingleBaseMutations(templateSequence, positions=None):
# snvs
for subsBase in allBases:
if subsBase != tplBase:
- yield cc.Mutation(cc.SUBSTITUTION, tplStart, subsBase)
+ yield cc.Mutation(cc.MutationType_SUBSTITUTION,
+ tplStart,
+ subsBase)
# Insertions
for insBase in allBases:
- yield cc.Mutation(cc.INSERTION, tplStart, insBase)
+ yield cc.Mutation(cc.MutationType_INSERTION,
+ tplStart,
+ insBase)
# Deletion
- yield cc.Mutation(cc.DELETION, tplStart, "-")
+ yield cc.Mutation(cc.MutationType_DELETION, tplStart)
def nearbyMutations(mutations, tpl, neighborhoodSize):
"""
@@ -92,9 +101,6 @@ def nearbyMutations(mutations, tpl, neighborhoodSize):
min(len(tpl), mp + neighborhoodSize)))
return uniqueSingleBaseMutations(tpl, sorted(nearbyPositions))
-def asFloatFeature(arr):
- return cc.FloatFeature(np.array(arr, dtype=np.float32))
-
def bestSubset(mutationsAndScores, separation):
"""
Given a list of (mutation, score) tuples, this utility method
@@ -118,61 +124,27 @@ def bestSubset(mutationsAndScores, separation):
return output
-def refineConsensus(mms, quiverConfig):
+def refineConsensus(ai, arrowConfig):
"""
Given a MultiReadMutationScorer, identify and apply favorable
template mutations. Return (consensus, didConverge) :: (str, bool)
"""
- isConverged = cc.RefineConsensus(mms)
- return mms.Template(), isConverged
-
-def _buildDinucleotideRepeatPattern(minRepeatCount):
- allDinucs = [ a + b for a in "ACGT" for b in "ACGT" if a != b ]
- pattern = "(" + "|".join(["(?:%s){%d,}" % (dinuc, minRepeatCount)
- for dinuc in allDinucs]) + ")"
- return pattern
-
-dinucleotideRepeatPattern = _buildDinucleotideRepeatPattern(3)
-
-def findDinucleotideRepeats(s):
- """
- string -> list( (start_position, end_position), length-2 string )
+ cfg = cc.PolishConfig(arrowConfig.maxIterations,
+ arrowConfig.mutationSeparation,
+ arrowConfig.mutationNeighborhood)
+ isConverged, nTested, nApplied = cc.Polish(ai, cfg)
+ return str(ai), isConverged
- List is sorted, and [start_position, end_position) intervals are
- disjoint
- """
- repeatsFound = [ (m.span(), s[m.start():m.start()+2])
- for m in re.finditer(dinucleotideRepeatPattern, s) ]
- return sorted(repeatsFound)
-
-
-def refineDinucleotideRepeats(mms):
- """
- We have observed a couple instances where we call the consensus to
- be off the truth by +/- 1 dinucleotide repeat---we are getting
- trapped in an inferior local optimum, like so:
-
- likelihood
- truth ATATATAT 100
- quiver AT--ATAT 90
- quiver+A ATA-ATAT 85
- quiver+T AT-TATAT 85
-
- To resolve this issue, we need to explore the likelihood change
- for wobbling on every dinucleotide repeat in the window.
- """
- return cc.RefineDinucleotideRepeats(mms)
-
-def consensusConfidence(mms, positions=None):
+def consensusConfidence(ai, positions=None):
"""
Returns an array of QV values reflecting the consensus confidence
at each position specified. If the `positions` argument is
omitted, confidence values are returned for all positions in the
- consensus (mms.Template()).
+ consensus (str(ai)).
"""
- return np.array(cc.ConsensusQVs(mms), dtype=np.uint8)
+ return np.array(np.clip(cc.ConsensusQVs(ai), 0, 93), dtype=np.uint8)
-def variantsFromAlignment(a, refWindow):
+def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
"""
Extract the variants implied by a pairwise alignment to the
reference.
@@ -180,6 +152,7 @@ def variantsFromAlignment(a, refWindow):
variants = []
refId, refStart, _ = refWindow
refPos = refStart
+ cssPos = 0
tbl = zip(a.Transcript(),
a.Target(),
a.Query())
@@ -193,19 +166,33 @@ def variantsFromAlignment(a, refWindow):
run = list(run)
ref = "".join(map(snd, run))
refLen = len(ref) - Counter(ref)["-"]
- read = "".join(map(third, run))
+ css = "".join(map(third, run))
+ cssLen = len(css) - Counter(css)["-"]
+ variant = None
if code == "M" or code == "N":
pass
elif code == "R":
- assert len(read)==len(ref)
- variants.append(Variant(refId, refPos, refPos+len(read), ref, read))
+ assert len(css)==len(ref)
+ variant = Variant(refId, refPos, refPos+len(css), ref, css)
elif code == "I":
- variants.append(Variant(refId, refPos, refPos, "", read))
+ variant = Variant(refId, refPos, refPos, "", css)
elif code == "D":
- variants.append(Variant(refId, refPos, refPos + len(ref), ref, ""))
+ variant = Variant(refId, refPos, refPos + len(ref), ref, "")
+
+ if variant is not None:
+ # HACK ALERT: variants at the first and last position
+ # are not handled correctly
+ if siteCoverage is not None and np.size(siteCoverage) > 0:
+ refPos_ = min(refPos-refStart, len(siteCoverage)-1)
+ variant.coverage = siteCoverage[refPos_]
+ if cssQvInWindow is not None and np.size(cssQvInWindow) > 0:
+ cssPos_ = min(cssPos, len(cssQvInWindow)-1)
+ variant.confidence = cssQvInWindow[cssPos_]
+ variants.append(variant)
refPos += refLen
+ cssPos += cssLen
return variants
@@ -231,9 +218,9 @@ def lifted(queryPositions, mappedRead):
return copy
-_typeMap = { cc.INSERTION : "Ins",
- cc.DELETION : "Del",
- cc.SUBSTITUTION : "Sub" }
+_typeMap = { cc.MutationType_INSERTION : "Ins",
+ cc.MutationType_DELETION : "Del",
+ cc.MutationType_SUBSTITUTION : "Sub" }
def _shortMutationDescription(mut, tpl):
"""
@@ -244,15 +231,15 @@ def _shortMutationDescription(mut, tpl):
201 Sub C > T
201 Del C > .
"""
- _type = _typeMap[mut.Type()]
+ _type = _typeMap[mut.Type]
_pos = mut.Start()
- _oldBase = "." if mut.Type() == cc.INSERTION \
+ _oldBase = "." if mut.Type() == cc.MutationType_INSERTION \
else tpl[_pos]
- _newBase = "." if mut.Type() == cc.DELETION \
- else mut.NewBases()
+ _newBase = "." if mut.Type() == cc.MutationType_DELETION \
+ else mut.Base
return "%d %s %s > %s" % (_pos, _type, _oldBase, _newBase)
-def scoreMatrix(mms):
+def scoreMatrix(ai):
"""
Returns (rowNames, columnNames, S)
@@ -264,16 +251,16 @@ def scoreMatrix(mms):
- columnNames[j] is an identifier for mutation j, encoding the
position, type, and base change
"""
- css = mms.Template()
+ css = str(ai)
allMutations = sorted(allSingleBaseMutations(css))
- shape = (mms.NumReads(), len(allMutations))
+ shape = (ai.NumReads(), len(allMutations))
scoreMatrix = np.zeros(shape)
for j, mut in enumerate(allMutations):
- mutScores = mms.Scores(mut)
+ mutScores = ai.ReadLLs(mut)
scoreMatrix[:, j] = mutScores
- baselineScores = np.array(mms.BaselineScores())
- rowNames = [ mms.Read(i).Name
- for i in xrange(mms.NumReads()) ]
+ baselineScores = np.array(ai.ReadLLs())
+ rowNames = [ ai.Read(i).Name
+ for i in xrange(ai.NumReads()) ]
columnNames = [ _shortMutationDescription(mut, css)
for mut in allMutations ]
return (rowNames, columnNames, baselineScores, scoreMatrix)
@@ -281,7 +268,7 @@ def scoreMatrix(mms):
def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
cssQvInWindow=None, siteCoverage=None, aligner="affine",
- mms=None):
+ ai=None):
"""
Compare the consensus and the reference in this window, returning
a list of variants.
@@ -294,29 +281,14 @@ def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
align = cc.Align
ga = align(refSequenceInWindow, cssSequenceInWindow)
- cssPosition = cc.TargetToQueryPositions(ga)
-
- vars = variantsFromAlignment(ga, refWindow)
- for v in vars:
- # HACK ALERT: we are not really handling the confidence or
- # coverage for variants at last position of the window
- # correctly here.
- refPos_ = min(v.refStart-refStart, len(siteCoverage)-1)
- cssPos_ = min(cssPosition[v.refStart-refStart], len(cssQvInWindow)-1)
-
- # make sure the arrays are non-empty before indexing into them
- if siteCoverage is not None and np.size(siteCoverage) > 0:
- v.coverage = siteCoverage[refPos_]
- if cssQvInWindow is not None and np.size(cssQvInWindow) > 0:
- v.confidence = cssQvInWindow[cssPos_]
+ return variantsFromAlignment(ga, refWindow, cssQvInWindow, siteCoverage)
- return vars
-def filterAlns(refWindow, alns, quiverConfig):
+def filterAlns(refWindow, alns, arrowConfig):
"""
Given alns (already clipped to the window bounds), filter out any
- that are incompatible with Quiver.
+ that are incompatible with Arrow.
By and large we avoid doing any filtering to avoid potential
reference bias in variant calling.
@@ -332,10 +304,12 @@ def filterAlns(refWindow, alns, quiverConfig):
Win. [ )
"""
return [ a for a in alns
- if a.readLength >= (quiverConfig.readStumpinessThreshold * a.referenceSpan) ]
+ if a.readLength >= (arrowConfig.readStumpinessThreshold * a.referenceSpan) and
+ min(a.hqRegionSnr) >= arrowConfig.minHqRegionSnr and
+ a.readScore >= arrowConfig.minReadScore ]
-def consensusForAlignments(refWindow, refSequence, alns, quiverConfig):
+def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
"""
Call consensus on this interval---without subdividing the interval
further.
@@ -351,49 +325,59 @@ def consensusForAlignments(refWindow, refSequence, alns, quiverConfig):
fwdSequences = [ a.read(orientation="genomic", aligned=False)
for a in alns
if a.spansReferenceRange(refStart, refEnd) ]
- assert len(fwdSequences) >= quiverConfig.minPoaCoverage
+ assert len(fwdSequences) >= arrowConfig.minPoaCoverage
try:
- p = cc.PoaConsensus.FindConsensus(fwdSequences[:quiverConfig.maxPoaCoverage])
+ p = cc.PoaConsensus.FindConsensus(fwdSequences[:arrowConfig.maxPoaCoverage])
except:
logging.info("%s: POA could not be generated" % (refWindow,))
- return QuiverConsensus.noCallConsensus(quiverConfig.noEvidenceConsensus,
- refWindow, refSequence)
+ return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
+ refWindow, refSequence)
ga = cc.Align(refSequence, p.Sequence)
numPoaVariants = ga.Errors()
poaCss = p.Sequence
- # Extract reads into ConsensusCore-compatible objects, and map them into the
+ # Extract reads into ConsensusCore2-compatible objects, and map them into the
# coordinates relative to the POA consensus
- mappedReads = [ quiverConfig.extractMappedRead(aln, refStart) for aln in alns ]
+ mappedReads = [ arrowConfig.extractMappedRead(aln, refStart) for aln in alns ]
queryPositions = cc.TargetToQueryPositions(ga)
- mappedReads = [ lifted(queryPositions, mr) for mr in mappedReads ]
+ mappedReads = [ (lifted(queryPositions, mr), snr) for (mr, snr) in mappedReads ]
# Load the mapped reads into the mutation scorer, and iterate
# until convergence.
- configTbl = quiverConfig.ccQuiverConfigTbl
- mms = cc.SparseSseQvMultiReadMutationScorer(configTbl, poaCss)
- for mr in mappedReads:
- mms.AddRead(mr)
+ ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
+ coverage = 0
+ for (mr, snr) in mappedReads:
+ if (mr.TemplateEnd <= mr.TemplateStart or
+ mr.TemplateEnd - mr.TemplateStart < 2 or
+ mr.Length() < 2):
+ continue
+ coverage += 1 if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS else 0
+
+ # TODO(lhepler, dalexander): propagate coverage around somehow
# Iterate until convergence
- _, quiverConverged = refineConsensus(mms, quiverConfig)
- if quiverConverged:
- if quiverConfig.refineDinucleotideRepeats:
- refineDinucleotideRepeats(mms)
- quiverCss = mms.Template()
- if quiverConfig.computeConfidence:
- confidence = consensusConfidence(mms)
+ try:
+ if coverage < arrowConfig.minPoaCoverage:
+ logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
+ return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
+ refWindow, refSequence)
+ _, converged = refineConsensus(ai, arrowConfig)
+ assert converged, "Arrow did not converge to MLE"
+ arrowCss = str(ai)
+ if arrowConfig.computeConfidence:
+ confidence = consensusConfidence(ai)
else:
- confidence = np.zeros(shape=len(quiverCss), dtype=int)
- return QuiverConsensus(refWindow,
- quiverCss,
- confidence,
- mms)
- else:
- logging.info("%s: Quiver did not converge to MLE" % (refWindow,))
- return QuiverConsensus.noCallConsensus(quiverConfig.noEvidenceConsensus,
- refWindow, refSequence)
+ confidence = np.zeros(shape=len(arrowCss), dtype=int)
+ return ArrowConsensus(refWindow,
+ arrowCss,
+ confidence,
+ ai)
+ except:
+ traceback = ''.join(format_exception(*sys.exc_info()))
+ logging.info("%s: %s" % (refWindow, traceback))
+ return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
+ refWindow, refSequence)
def coverageInWindow(refWin, hits):
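
A minimal sketch of the POA-then-polish call sequence used by
consensusForAlignments, with made-up subread strings; real code maps each
read into POA coordinates and adds it via ai.AddRead(mappedRead, snr)
before polishing, so this only illustrates the API shape:

    import ConsensusCore2 as cc
    from GenomicConsensus.arrow.model import ArrowConfig
    from GenomicConsensus.arrow.utils import refineConsensus, consensusConfidence

    fwdSequences = ["ACGTACGTTACGT", "ACGTACGTACGT", "ACGTTACGTACGT"]
    poa = cc.PoaConsensus.FindConsensus(fwdSequences)
    ai = cc.MultiMolecularIntegrator(poa.Sequence, cc.IntegratorConfig(-3.5))
    # ... ai.AddRead(...) calls would go here ...
    css, converged = refineConsensus(ai, ArrowConfig())  # polish to MLE
    confidence = consensusConfidence(ai)                 # QVs clipped to [0, 93]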
diff --git a/GenomicConsensus/consensus.py b/GenomicConsensus/consensus.py
index 9a2a6a4..da2a426 100644
--- a/GenomicConsensus/consensus.py
+++ b/GenomicConsensus/consensus.py
@@ -34,6 +34,7 @@ import numpy as np
__all__ = [ "Consensus",
"QuiverConsensus",
+ "ArrowConsensus",
"totalLength",
"areContiguous",
"join" ]
@@ -96,6 +97,17 @@ class QuiverConsensus(Consensus):
self.mms = mms
+class ArrowConsensus(Consensus):
+ """
+ An ArrowConsensus object carries an additional field, `ai`: the
+ ConsensusCore2 abstract integrator object, which can be used to
+ perform some post-hoc analyses (diploid, sample mixture, etc.)
+ """
+ def __init__(self, refWindow, sequence, confidence, ai=None):
+ super(ArrowConsensus, self).__init__(refWindow, sequence, confidence)
+ self.ai = ai
+
+
def totalLength(consensi):
"""
Total length of reference/scaffold coordinate windows
diff --git a/GenomicConsensus/main.py b/GenomicConsensus/main.py
index 988220e..4c5ca14 100644
--- a/GenomicConsensus/main.py
+++ b/GenomicConsensus/main.py
@@ -35,12 +35,13 @@ from __future__ import absolute_import
import argparse, atexit, cProfile, gc, glob, h5py, logging, multiprocessing
import os, pstats, random, shutil, tempfile, time, threading, Queue, traceback
+import functools
import re
import sys
import pysam
-from pbcommand.utils import setup_log
+from pbcommand.utils import setup_log, Constants as LogFormats
from pbcommand.cli import pbparser_runner
from pbcore.io import AlignmentSet, ContigSet
@@ -49,14 +50,12 @@ from GenomicConsensus.options import (options, Constants,
get_parser,
processOptions,
resolveOptions,
- consensusCoreVersion)
+ consensusCoreVersion,
+ consensusCore2Version)
from GenomicConsensus.utils import (IncompatibleDataException,
datasetCountExceedsThreshold,
die)
-from GenomicConsensus.quiver import quiver
-from GenomicConsensus.plurality import plurality
-
class ToolRunner(object):
"""
The main driver class for the GenomicConsensus tool. It is assumed that
@@ -64,7 +63,7 @@ class ToolRunner(object):
'options' namespace before instantiating this class.
"""
def __init__(self):
- self._inCmpH5 = None
+ self._inAlnFile = None
self._resultsQueue = None
self._workQueue = None
self._slaves = None
@@ -72,18 +71,6 @@ class ToolRunner(object):
self._algorithmConfiguration = None
self._aborting = False
- def _setupLogging(self):
- if options.quiet:
- logLevel = logging.ERROR
- elif options.verbosity >= 2:
- logLevel = logging.DEBUG
- elif options.verbosity == 1:
- logLevel = logging.INFO
- else:
- logLevel = logging.WARNING
- log = logging.getLogger()
- log.setLevel(logLevel)
-
def _makeTemporaryDirectory(self):
"""
Make a temp dir where we can stash things if necessary.
@@ -91,16 +78,30 @@ class ToolRunner(object):
options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp")
logging.info("Created temporary directory %s" % (options.temporaryDirectory,) )
- def _algorithmByName(self, name):
- if name=="plurality":
+ def _algorithmByName(self, name, cmpH5):
+ if name == "plurality":
+ from GenomicConsensus.plurality import plurality
algo = plurality
- elif name=="quiver":
+ elif name == "quiver":
+ from GenomicConsensus.quiver import quiver
algo = quiver
+ elif name == "arrow":
+ from GenomicConsensus.arrow import arrow
+ algo = arrow
+ elif name == "poa":
+ from GenomicConsensus.poa import poa
+ algo = poa
+ elif name == "best":
+ logging.info("Identifying best algorithm based on input data")
+ from GenomicConsensus import algorithmSelection
+ algoName = algorithmSelection.bestAlgorithm(cmpH5.sequencingChemistry)
+ return self._algorithmByName(algoName, cmpH5)
else:
die("Failure: unrecognized algorithm %s" % name)
isOK, msg = algo.availability
if not isOK:
die("Failure: %s" % msg)
+ logging.info("Will use {a} algorithm".format(a=name))
return algo
def _launchSlaves(self):
@@ -130,7 +131,7 @@ class ToolRunner(object):
p.start()
logging.info("Launched compute slaves.")
- rcp = ResultCollectorType(self._resultsQueue, self._algorithmConfiguration)
+ rcp = ResultCollectorType(self._resultsQueue, self._algorithm.name, self._algorithmConfiguration)
rcp.start()
self._slaves.append(rcp)
logging.info("Launched collector slave.")
@@ -143,17 +144,17 @@ class ToolRunner(object):
self._workQueue = multiprocessing.Queue(options.queueSize)
self._resultsQueue = multiprocessing.Queue(options.queueSize)
- def _readCmpH5Input(self):
+ def _readAlignmentInput(self):
"""
- Read the CmpH5 input file into a CmpH5 object and
- store it as self._inCmpH5.
+ Read the AlignmentSet input file and
+ store it as self._inAlnFile.
"""
fname = options.inputFilename
- self._inCmpH5 = AlignmentSet(fname)
+ self._inAlnFile = AlignmentSet(fname)
- def _loadReference(self, cmpH5):
+ def _loadReference(self, alnFile):
logging.info("Loading reference")
- err = reference.loadFromFile(options.referenceFilename, cmpH5)
+ err = reference.loadFromFile(options.referenceFilename, alnFile)
if err:
die("Error loading reference")
# Grok the referenceWindow spec, if any.
@@ -172,27 +173,27 @@ class ToolRunner(object):
options.referenceWindows = map(reference.stringToWindow,
options.referenceWindowsAsString.split(","))
if options.referenceWindowsFromAlignment:
- options.referenceWindows = cmpH5.refWindows
+ options.referenceWindows = alnFile.refWindows
- def _checkFileCompatibility(self, cmpH5):
- if not cmpH5.isSorted:
- die("Input CmpH5 file must be sorted.")
- if cmpH5.isEmpty:
- die("Input CmpH5 file must be nonempty.")
+ def _checkFileCompatibility(self, alnFile):
+ if not alnFile.isSorted:
+ die("Input Alignment file must be sorted.")
+ if alnFile.isEmpty:
+ die("Input Alignment file must be nonempty.")
- def _shouldDisableChunkCache(self, cmpH5):
- #if isinstance(cmpH5, CmpH5Reader):
- #if cmpH5.isCmpH5:
+ def _shouldDisableChunkCache(self, alnFile):
+ #if isinstance(alnFile, CmpH5Reader):
+ #if alnFile.isCmpH5:
# threshold = options.autoDisableHdf5ChunkCache
- # return datasetCountExceedsThreshold(cmpH5, threshold)
+ # return datasetCountExceedsThreshold(alnFile, threshold)
#else:
# return False
return True
- def _configureAlgorithm(self, options, cmpH5):
+ def _configureAlgorithm(self, options, alnFile):
assert self._algorithm != None
try:
- self._algorithmConfiguration = self._algorithm.configure(options, cmpH5)
+ self._algorithmConfiguration = self._algorithm.configure(options, alnFile)
except IncompatibleDataException as e:
die("Failure: %s" % e.message)
@@ -203,7 +204,7 @@ class ToolRunner(object):
ids = reference.enumerateIds(options.referenceWindows)
for _id in ids:
if options.fancyChunking:
- chunks = reference.fancyEnumerateChunks(self._inCmpH5,
+ chunks = reference.fancyEnumerateChunks(self._inAlnFile,
_id,
options.referenceChunkSize,
options.minCoverage,
@@ -266,14 +267,27 @@ class ToolRunner(object):
# essentially harmless.
gc.disable()
- self._algorithm = self._algorithmByName(options.algorithm)
- self._setupLogging()
random.seed(42)
+ if options.pdb or options.pdbAtStartup:
+ print >>sys.stderr, "Process ID: %d" % os.getpid()
+ try:
+ import ipdb
+ except ImportError:
+ die("Debugging options require 'ipdb' package installed.")
+
+ if not options.threaded:
+ die("Debugging only works with -T (threaded) mode")
+
+ if options.pdbAtStartup:
+ ipdb.set_trace()
+
logging.info("h5py version: %s" % h5py.version.version)
logging.info("hdf5 version: %s" % h5py.version.hdf5_version)
logging.info("ConsensusCore version: %s" %
(consensusCoreVersion() or "ConsensusCore unavailable"))
+ logging.info("ConsensusCore2 version: %s" %
+ (consensusCore2Version() or "ConsensusCore2 unavailable"))
logging.info("Starting.")
atexit.register(self._cleanup)
@@ -281,6 +295,8 @@ class ToolRunner(object):
self._makeTemporaryDirectory()
with AlignmentSet(options.inputFilename) as peekFile:
+ if options.algorithm == "arrow" and peekFile.isCmpH5:
+ die("Arrow does not support CmpH5 files")
if not peekFile.isCmpH5 and not peekFile.hasPbi:
die("Genomic Consensus only works with cmp.h5 files and BAM "
"files with accompanying .pbi files")
@@ -289,6 +305,7 @@ class ToolRunner(object):
resolveOptions(peekFile)
self._loadReference(peekFile)
self._checkFileCompatibility(peekFile)
+ self._algorithm = self._algorithmByName(options.algorithm, peekFile)
self._configureAlgorithm(options, peekFile)
options.disableHdf5ChunkCache = True
#options.disableHdf5ChunkCache = self._shouldDisableChunkCache(peekFile)
@@ -300,7 +317,7 @@ class ToolRunner(object):
self._setupEvidenceDumpDirectory(options.evidenceDirectory)
self._launchSlaves()
- self._readCmpH5Input()
+ self._readAlignmentInput()
monitoringThread = threading.Thread(target=monitorSlaves, args=(self,))
monitoringThread.start()
@@ -313,11 +330,7 @@ class ToolRunner(object):
filename=os.path.join(options.temporaryDirectory,
"profile-main.out"))
- elif options.debug:
- if not options.threaded:
- die("Debugging only works with -T (threaded) mode")
- logging.info("PID: %d", os.getpid())
- import ipdb
+ elif options.pdb:
with ipdb.launch_ipdb_on_exception():
self._mainLoop()
@@ -339,7 +352,7 @@ class ToolRunner(object):
self._printProfiles()
# close h5 file.
- self._inCmpH5.close()
+ self._inAlnFile.close()
return 0
def monitorSlaves(driver):
@@ -377,6 +390,7 @@ def resolved_tool_contract_runner(resolved_contract):
fastq_path = rc.task.output_files[2]
args = [
alignment_path,
+ "--verbose",
"--reference", reference_path,
"--outputFilename", gff_path,
"--outputFilename", fasta_path,
@@ -398,18 +412,15 @@ def resolved_tool_contract_runner(resolved_contract):
return rc
def main(argv=sys.argv):
- logFormat = '[%(levelname)s] %(message)s'
- logging.basicConfig(level=logging.WARN, format=logFormat)
- log = logging.getLogger()
- def dummy_setup(*args, **kwargs):
- pass
+ setup_log_ = functools.partial(setup_log,
+ str_formatter=LogFormats.LOG_FMT_LVL)
return pbparser_runner(
argv=argv[1:],
parser=get_parser(),
args_runner_func=args_runner,
contract_runner_func=resolved_tool_contract_runner,
- alog=log,
- setup_log_func=dummy_setup)
+ alog=logging.getLogger(),
+ setup_log_func=setup_log_)
if __name__ == "__main__":
sys.exit(main(sys.argv))
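For context on the new `--algorithm=best` dispatch above: the actual rules live in the new GenomicConsensus/algorithmSelection.py (see the diffstat). A minimal sketch of the idea, assuming Sequel chemistries carry an "S/" prefix as in the quiver.py check later in this patch; the real bestAlgorithm may apply finer-grained rules:

    def bestAlgorithmSketch(sequencingChemistries):
        # Hypothetical: prefer arrow for Sequel data (Quiver has no
        # trainings for it), fall back to quiver for RS data.
        chems = set(sequencingChemistries)
        if not chems:
            return None
        if any(c.startswith("S/") for c in chems):
            return "arrow"
        return "quiver"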
diff --git a/GenomicConsensus/options.py b/GenomicConsensus/options.py
index ed6672e..7303e89 100644
--- a/GenomicConsensus/options.py
+++ b/GenomicConsensus/options.py
@@ -51,9 +51,9 @@ from __future__ import absolute_import
import argparse, h5py, os, os.path, sys, json
from pbcommand.models import FileTypes, SymbolTypes, get_pbparser
-from pbcommand.common_options import (add_resolved_tool_contract_option,)
-# FIXME add_subcomponent_versions_option)
-from pbcommand.cli import get_default_argparser
+from pbcommand.common_options import (add_resolved_tool_contract_option,
+ add_debug_option)
+# FIXME add_subcomponent_versions_option)
from .utils import fileFormat
from . import __VERSION__
@@ -67,6 +67,13 @@ def consensusCoreVersion():
except:
return None
+def consensusCore2Version():
+ try:
+ import ConsensusCore2
+ return ConsensusCore2.__version__
+ except:
+ return None
+
class Constants(object):
TOOL_ID = "genomic_consensus.tasks.variantcaller"
DRIVER_EXE = "variantCaller --resolved-tool-contract "
@@ -75,11 +82,14 @@ class Constants(object):
MIN_COVERAGE_ID = "genomic_consensus.task_options.min_coverage"
DIPLOID_MODE_ID = "genomic_consensus.task_options.diploid"
- DEFAULT_ALGORITHM = "quiver"
+ DEFAULT_ALGORITHM = "best"
DEFAULT_MIN_CONFIDENCE = 40
DEFAULT_MIN_COVERAGE = 5
DEFAULT_MAX_COVERAGE = 100
DEFAULT_MIN_MAPQV = 10
+ DEFAULT_MIN_READSCORE = 0.65
+ DEFAULT_MIN_HQREGIONSNR = 3.75
+ DEFAULT_MIN_ZSCORE = -3.5
def get_parser():
"""
@@ -93,7 +103,8 @@ def get_parser():
description="Compute genomic consensus and call variants relative to the reference.",
driver_exe=Constants.DRIVER_EXE,
nproc=SymbolTypes.MAX_NPROC,
- resource_types=())
+ resource_types=(),
+ default_level="WARN")
tcp = p.tool_contract_parser
tcp.add_input_file_type(FileTypes.DS_ALIGN, "infile",
"Alignment DataSet", "BAM or Alignment DataSet")
@@ -102,21 +113,21 @@ def get_parser():
tcp.add_output_file_type(FileTypes.GFF, "variants",
name="Consensus GFF",
description="Consensus GFF",
- default_name="variants.gff")
+ default_name="variants")
tcp.add_output_file_type(FileTypes.DS_CONTIG, "consensus",
name="Consensus ContigSet",
description="Consensus sequence in Fasta format",
- default_name="consensus.contigset.xml")
+ default_name="consensus")
tcp.add_output_file_type(FileTypes.FASTQ, "consensus_fastq",
name="Consensus fastq",
description="Consensus fastq",
- default_name="consensus.fastq")
+ default_name="consensus")
tcp.add_str(
option_id=Constants.ALGORITHM_ID,
option_str="algorithm",
default=Constants.DEFAULT_ALGORITHM,
name="Algorithm",
- description="Algorithm name") # FIXME
+ description="Variant calling algorithm")
tcp.add_int(
option_id=Constants.MIN_CONFIDENCE_ID,
option_str="minConfidence",
@@ -135,7 +146,7 @@ def get_parser():
tcp.add_boolean(
option_id=Constants.DIPLOID_MODE_ID,
option_str="diploid",
- default=True, # XXX i.e. --diploid=True, not the actual default!
+ default=False,
name="Diploid mode (experimental)",
description="Enable detection of heterozygous variants (experimental)")
add_options_to_argument_parser(p.arg_parser.parser)
@@ -268,7 +279,27 @@ def add_options_to_argument_parser(parser):
dest="readStratum",
default=None,
type=parseReadStratum)
-
+ readSelection.add_argument(
+ "--minReadScore",
+ action="store",
+ dest="minReadScore",
+ type=float,
+ default=Constants.DEFAULT_MIN_READSCORE,
+ help="The minimum ReadScore for reads that will be used for analysis (arrow-only).")
+ readSelection.add_argument(
+ "--minSnr",
+ action="store",
+ dest="minHqRegionSnr",
+ type=float,
+ default=Constants.DEFAULT_MIN_HQREGIONSNR,
+ help="The minimum acceptable signal-to-noise over all channels for reads that will be used for analysis (arrow-only).")
+ readSelection.add_argument(
+ "--minZScore",
+ action="store",
+ dest="minZScore",
+ type=float,
+ default=Constants.DEFAULT_MIN_ZSCORE,
+ help="The minimum acceptable z-score for reads that will be used for analysis (arrow-only).")
algorithm = parser.add_argument_group("Algorithm and parameter settings")
algorithm.add_argument(
@@ -276,7 +307,8 @@ def add_options_to_argument_parser(parser):
action="store",
dest="algorithm",
type=str,
- default="quiver")
+ choices=["quiver", "arrow", "plurality", "poa", "best"],
+ default="best")
algorithm.add_argument(
"--parametersFile", "-P",
dest="parametersFile",
@@ -298,16 +330,13 @@ def add_options_to_argument_parser(parser):
"which selects the best parameter set from the cmp.h5")
debugging = parser.add_argument_group("Verbosity and debugging/profiling")
+ add_debug_option(debugging)
debugging.add_argument(
- "--verbose",
- dest="verbosity",
- action="count",
- help="Set the verbosity level.")
- debugging.add_argument(
- "--quiet",
- dest="quiet",
+ "--pdbAtStartup",
action="store_true",
- help="Turn off all logging, including warnings")
+ dest="pdbAtStartup",
+ default=False,
+ help="Drop into Python debugger at startup (requires ipdb)")
debugging.add_argument(
"--profile",
action="store_true",
@@ -455,7 +484,7 @@ def processOptions():
options.shellCommand = " ".join(sys.argv)
-def resolveOptions(cmpH5):
+def resolveOptions(alnFile):
"""
Some of the options are provided as strings by the user, but need
to be translated into internal identifiers. These options are
@@ -465,10 +494,10 @@ def resolveOptions(cmpH5):
This is essentially just an order-of-initialization issue.
"""
if options._barcode != None:
- if not cmpH5.isBarcoded:
- raise Exception("cmp.h5 file is not barcoded!")
- if options._barcode not in cmpH5.barcode:
- raise Exception("Barcode with given name not present in cmp.h5 file!")
- options.barcode = cmpH5.barcode[options._barcode]
+ if not alnFile.isBarcoded:
+ raise Exception("input file is not barcoded!")
+ if options._barcode not in alnFile.barcode:
+ raise Exception("Barcode with given name not present in input file!")
+ options.barcode = alnFile.barcode[options._barcode]
else:
options.barcode = None
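The three new read-selection thresholds above (--minReadScore, --minSnr, --minZScore) gate which alignments the arrow pipeline will use. A minimal sketch of how the first two compose, assuming alignment records expose readScore and per-channel hqRegionSnr as in the poa filter later in this patch (the z-score cut is applied inside the model and is not shown):

    def passesReadFilters(aln,
                          minReadScore=0.65,      # DEFAULT_MIN_READSCORE
                          minHqRegionSnr=3.75):   # DEFAULT_MIN_HQREGIONSNR
        # Drop reads with a weak overall score or a weak channel SNR.
        return (aln.readScore >= minReadScore and
                min(aln.hqRegionSnr) >= minHqRegionSnr)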
diff --git a/GenomicConsensus/plurality/plurality.py b/GenomicConsensus/plurality/plurality.py
index 5051683..c7013ba 100644
--- a/GenomicConsensus/plurality/plurality.py
+++ b/GenomicConsensus/plurality/plurality.py
@@ -385,7 +385,7 @@ class PluralityWorker(object):
referenceWindow, refSeqInWindow)
return (referenceWindow, (noCallCss, []))
- alnHits = readsInWindow(self._inCmpH5, referenceWindow,
+ alnHits = readsInWindow(self._inAlnFile, referenceWindow,
depthLimit=options.coverage,
minMapQV=options.minMapQV,
strategy="long-and-strand-balanced",
@@ -420,7 +420,7 @@ __all__ = [ "name",
"configure",
"slaveFactories" ]
-name = "Plurality"
+name = "plurality"
availability = (True, "OK")
def slaveFactories(threaded):
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/poa/__init__.py
similarity index 98%
copy from GenomicConsensus/__init__.py
copy to GenomicConsensus/poa/__init__.py
index abed5b6..effd567 100644
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/poa/__init__.py
@@ -30,4 +30,3 @@
# Author: David Alexander
-__VERSION__ = "1.1.0"
diff --git a/GenomicConsensus/poa/poa.py b/GenomicConsensus/poa/poa.py
new file mode 100644
index 0000000..833f97e
--- /dev/null
+++ b/GenomicConsensus/poa/poa.py
@@ -0,0 +1,413 @@
+#################################################################################
+# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+# Author: David Alexander
+
+from __future__ import absolute_import
+
+import itertools, logging, math, random
+from collections import Counter
+
+import ConsensusCore as cc, numpy as np
+
+from ..utils import readsInWindow, snd, third
+from .. import reference
+from ..options import options
+from ..consensus import Consensus, join
+from ..windows import kSpannedIntervals, holes, subWindow
+from ..variants import Variant, filterVariants, annotateVariants
+from ..Worker import WorkerProcess, WorkerThread
+from ..ResultCollector import ResultCollectorProcess, ResultCollectorThread
+
+
+#
+# --------------- Configuration ----------------------
+#
+
+class PoaConfig(object):
+ """
+ Poa configuration options
+ """
+ def __init__(self,
+ aligner="affine",
+ minMapQV=10,
+ minPoaCoverage=3,
+ maxPoaCoverage=100,
+ noEvidenceConsensus="nocall",
+ readStumpinessThreshold=0.1,
+ minReadScore=0.75,
+ minHqRegionSnr=3.75):
+ self.aligner = aligner
+ self.minMapQV = minMapQV
+ self.minPoaCoverage = minPoaCoverage
+ self.maxPoaCoverage = maxPoaCoverage
+ self.noEvidenceConsensus = noEvidenceConsensus
+ self.readStumpinessThreshold = readStumpinessThreshold
+ self.minReadScore = minReadScore
+ self.minHqRegionSnr = minHqRegionSnr
+
+
+#
+# ----------- The actual algorithm code -------------
+#
+
+def filterAlns(alns, poaConfig):
+ """
+ Given alns (already clipped to the window bounds), filter out any
+ that are deemed insufficiently high-quality for POA.
+
+ By and large we avoid doing any filtering to avoid potential
+ reference bias in variant calling.
+
+ However at the moment the POA (and potentially other components)
+ breaks when there is a read of zero length. So we throw away
+ reads that are "stumpy", where the aligner has inserted a large
+ gap, such that while the alignment technically spans the window,
+ it may not have any read content therein:
+
+ Ref ATGATCCAGTTACTCCGATAAA
+ Read ATG---------------TA-A
+ Win. [ )
+ """
+ return [ a for a in alns
+ if a.readLength >= (poaConfig.readStumpinessThreshold * a.referenceSpan) and
+ min(a.hqRegionSnr) >= poaConfig.minHqRegionSnr and
+ a.readScore >= poaConfig.minReadScore ]
+
+
+def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
+ """
+ Extract the variants implied by a pairwise alignment to the
+ reference.
+ """
+ variants = []
+ refId, refStart, _ = refWindow
+ refPos = refStart
+ cssPos = 0
+ tbl = zip(a.Transcript(),
+ a.Target(),
+ a.Query())
+
+ # We don't call variants where either the reference or css is 'N'
+ grouper = lambda row: "N" if (row[1]=="N" or row[2]=="N") else row[0]
+ runs = itertools.groupby(tbl, grouper)
+
+ for code, run in runs:
+ assert code in "RIDMN"
+ run = list(run)
+ ref = "".join(map(snd, run))
+ refLen = len(ref) - Counter(ref)["-"]
+ css = "".join(map(third, run))
+ cssLen = len(css) - Counter(css)["-"]
+ variant = None
+
+ if code == "M" or code == "N":
+ pass
+ elif code == "R":
+ assert len(css)==len(ref)
+ variant = Variant(refId, refPos, refPos+len(css), ref, css)
+ elif code == "I":
+ variant = Variant(refId, refPos, refPos, "", css)
+ elif code == "D":
+ variant = Variant(refId, refPos, refPos + len(ref), ref, "")
+
+ if variant is not None:
+ # HACK ALERT: variants at the first and last position
+ # are not handled correctly
+ if siteCoverage is not None and np.size(siteCoverage) > 0:
+ refPos_ = min(refPos-refStart, len(siteCoverage)-1)
+ variant.coverage = siteCoverage[refPos_]
+ if cssQvInWindow is not None and np.size(cssQvInWindow) > 0:
+ cssPos_ = min(cssPos, len(cssQvInWindow)-1)
+ variant.confidence = cssQvInWindow[cssPos_]
+ variants.append(variant)
+
+ refPos += refLen
+ cssPos += cssLen
+
+ return variants
+
+
+def variantsAndConfidence(refWindow, refSequence, cssSequence, aligner="affine"):
+ """
+ Compute the confidence for each position, and compare
+ the consensus and reference in this window, returning a list of variants
+ """
+ refId, refStart, refEnd = refWindow
+
+ if aligner == "affine":
+ align = cc.AlignAffine
+ else:
+ align = cc.Align
+
+ ga = align(refSequence, cssSequence)
+ confidence = np.ones((len(cssSequence),), dtype=np.uint8) * 20
+ variants = variantsFromAlignment(ga, refWindow, confidence)
+ return (confidence, variants)
+
+
+def consensusAndVariantsForAlignments(refWindow, refSequence, alns, poaConfig):
+ """
+ Call consensus on this interval---without subdividing the interval
+ further.
+
+ Testable!
+
+ Clipping has already been done!
+ """
+ _, refStart, refEnd = refWindow
+
+ # Compute the POA consensus, which is our initial guess, and
+ # should typically be > 99.5% accurate
+ fwdSequences = [ a.read(orientation="genomic", aligned=False)
+ for a in alns
+ if a.spansReferenceRange(refStart, refEnd) ]
+
+ try:
+ assert len(fwdSequences) >= poaConfig.minPoaCoverage
+ p = cc.PoaConsensus.FindConsensus(fwdSequences[:poaConfig.maxPoaCoverage])
+ except:
+ logging.info("%s: POA could not be generated" % (refWindow,))
+ css = Consensus.noCallConsensus(poaConfig.noEvidenceConsensus,
+ refWindow, refSequence)
+ return (css, [])
+
+ poaCss = p.Sequence
+ confidence, variants = \
+ variantsAndConfidence(refWindow, refSequence, poaCss, poaConfig.aligner)
+ css = Consensus(refWindow, poaCss, confidence)
+
+ return (css, variants)
+
+
+def poaConsensusAndVariants(alnFile, refWindow, referenceContig,
+ depthLimit, poaConfig):
+ """
+ High-level routine for calling the consensus for a
+ window of the genome given an alignment.
+
+ Identifies the coverage contours of the window in order to
+ identify subintervals where a good consensus can be called.
+ Creates the desired "no evidence consensus" where there is
+ inadequate coverage.
+ """
+ winId, winStart, winEnd = refWindow
+ logging.info("POA operating on %s" %
+ reference.windowToString(refWindow))
+
+ if options.fancyChunking:
+ # 1) identify the intervals with adequate coverage for poa
+ # consensus; restrict to intervals of length > 10
+ alnHits = readsInWindow(alnFile, refWindow,
+ depthLimit=20000,
+ minMapQV=poaConfig.minMapQV,
+ strategy="longest",
+ stratum=options.readStratum,
+ barcode=options.barcode)
+ starts = np.fromiter((hit.tStart for hit in alnHits), np.int)
+ ends = np.fromiter((hit.tEnd for hit in alnHits), np.int)
+ intervals = kSpannedIntervals(refWindow, poaConfig.minPoaCoverage,
+ starts, ends, minLength=10)
+ coverageGaps = holes(refWindow, intervals)
+ allIntervals = sorted(intervals + coverageGaps)
+ if len(allIntervals) > 1:
+ logging.info("Usable coverage in %s: %r" %
+ (reference.windowToString(refWindow), intervals))
+
+ else:
+ allIntervals = [ (winStart, winEnd) ]
+
+ # 2) pull out the reads we will use for each interval
+ # 3) call consensusForAlignments on the interval
+ subConsensi = []
+ variants = []
+
+ for interval in allIntervals:
+ intStart, intEnd = interval
+ intRefSeq = referenceContig[intStart:intEnd]
+ subWin = subWindow(refWindow, interval)
+
+ windowRefSeq = referenceContig[intStart:intEnd]
+ alns = readsInWindow(alnFile, subWin,
+ depthLimit=depthLimit,
+ minMapQV=poaConfig.minMapQV,
+ strategy="longest",
+ stratum=options.readStratum,
+ barcode=options.barcode)
+ clippedAlns_ = [ aln.clippedTo(*interval) for aln in alns ]
+ clippedAlns = filterAlns(clippedAlns_, poaConfig)
+
+ if len([ a for a in clippedAlns
+ if a.spansReferenceRange(*interval) ]) >= poaConfig.minPoaCoverage:
+
+ logging.debug("%s: Reads being used: %s" %
+ (reference.windowToString(subWin),
+ " ".join([str(hit.readName) for hit in alns])))
+
+ css, variants_ = \
+ consensusAndVariantsForAlignments(subWin, intRefSeq,
+ clippedAlns, poaConfig)
+
+ filteredVars = filterVariants(options.minCoverage,
+ options.minConfidence,
+ variants_)
+ # Annotate?
+ if options.annotateGFF:
+ annotateVariants(filteredVars, clippedAlns)
+
+ variants += filteredVars
+
+ # Dump?
+ shouldDumpEvidence = \
+ ((options.dumpEvidence == "all") or
+ ((options.dumpEvidence == "variants") and (len(variants) > 0)))
+ if shouldDumpEvidence:
+ logging.info("POA does not yet support --dumpEvidence")
+# dumpEvidence(options.evidenceDirectory,
+# subWin, windowRefSeq,
+# clippedAlns, css)
+ else:
+ css = Consensus.noCallConsensus(poaConfig.noEvidenceConsensus,
+ subWin, intRefSeq)
+ subConsensi.append(css)
+
+ # 4) glue the subwindow consensus objects together to form the
+ # full window consensus
+ css = join(subConsensi)
+
+ # 5) Return
+ return css, variants
+
+
+#
+# -------------- Poa Worker class --------------------
+#
+
+class PoaWorker(object):
+
+ @property
+ def poaConfig(self):
+ return self._algorithmConfig
+
+ def onStart(self):
+ random.seed(42)
+
+ def onChunk(self, workChunk):
+ referenceWindow = workChunk.window
+ refId, refStart, refEnd = referenceWindow
+
+ refSeqInWindow = reference.sequenceInWindow(referenceWindow)
+
+ # Quick cutout for no-coverage case
+ if not workChunk.hasCoverage:
+ noCallCss = Consensus.noCallConsensus(self.poaConfig.noEvidenceConsensus,
+ referenceWindow, refSeqInWindow)
+ return (referenceWindow, (noCallCss, []))
+
+ # General case
+ eWindow = reference.enlargedReferenceWindow(referenceWindow,
+ options.referenceChunkOverlap)
+ _, eStart, eEnd = eWindow
+
+ # We call consensus on the enlarged window and then map back
+ # to the reference and clip the consensus at the implied
+ # bounds. This seems to be more reliable than cutting the
+ # consensus bluntly.
+ refContig = reference.byName[refId].sequence
+ refSequenceInEnlargedWindow = refContig[eStart:eEnd]
+
+ #
+ # Get the consensus for the enlarged window.
+ #
+ css_, variants_ = \
+ poaConsensusAndVariants(self._inAlnFile, eWindow, refContig,
+ options.coverage, self.poaConfig)
+
+ #
+ # Restrict the consensus and variants to the reference window.
+ #
+ ga = cc.Align(refSequenceInEnlargedWindow, css_.sequence)
+ targetPositions = cc.TargetToQueryPositions(ga)
+ cssStart = targetPositions[refStart-eStart]
+ cssEnd = targetPositions[refEnd-eStart]
+
+ cssSequence = css_.sequence[cssStart:cssEnd]
+ cssQv = css_.confidence[cssStart:cssEnd]
+ variants = [ v for v in variants_
+ if refStart <= v.refStart < refEnd ]
+
+ consensusObj = Consensus(referenceWindow,
+ cssSequence,
+ cssQv)
+
+ return (referenceWindow, (consensusObj, variants))
+
+
+# define both process and thread-based plurality callers
+class PoaWorkerProcess(PoaWorker, WorkerProcess): pass
+class PoaWorkerThread(PoaWorker, WorkerThread): pass
+
+#
+# --------------------- Plugin API --------------------------------
+#
+
+# Pluggable module API for algorithms:
+# - Algorithm lives in a package
+# - Package must never fail to import, even if some of
+# its dependencies are not installed.
+# - Package must provide a main module exposing these top level
+# variables/methods:
+# - name = str
+# - availability = (bool, str)
+# - configure -> options -> cmph5 -> algorithm specific config object;
+# (can raise IncompatibleDataException)
+# - slaveFactories -> bool -> (class, class)
+
+__all__ = [ "name",
+ "availability",
+ "configure",
+ "slaveFactories" ]
+
+name = "poa"
+availability = (True, "OK")
+
+def slaveFactories(threaded):
+ if threaded:
+ return (PoaWorkerThread, ResultCollectorThread)
+ else:
+ return (PoaWorkerProcess, ResultCollectorProcess)
+
+def configure(options, _):
+ poaConfig = PoaConfig(aligner=options.aligner,
+ minMapQV=options.minMapQV,
+ noEvidenceConsensus=options.noEvidenceConsensusCall,
+ minReadScore=options.minReadScore,
+ minHqRegionSnr=options.minHqRegionSnr)
+ return poaConfig
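A note on variantsFromAlignment above: it walks the pairwise-alignment transcript (M=match, R=substitution, I=insertion, D=deletion, with N masking no-call columns) and groups consecutive codes into runs, emitting one Variant per non-match run. A self-contained toy illustration of that grouping, using made-up (transcript, reference, consensus) rows rather than a real ConsensusCore alignment:

    import itertools

    tbl = [("M", "A", "A"), ("R", "C", "T"), ("R", "A", "G"),
           ("I", "-", "G"), ("M", "T", "T"), ("D", "C", "-")]
    grouper = lambda row: "N" if (row[1] == "N" or row[2] == "N") else row[0]
    runs = [(code, list(run)) for code, run in itertools.groupby(tbl, grouper)]
    # One 2-bp substitution (CA->TG), one insertion of G, one deletion of C:
    assert [code for code, _ in runs] == ["M", "R", "I", "M", "D"]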
diff --git a/GenomicConsensus/quiver/model.py b/GenomicConsensus/quiver/model.py
index f12453c..d720d82 100644
--- a/GenomicConsensus/quiver/model.py
+++ b/GenomicConsensus/quiver/model.py
@@ -92,7 +92,7 @@ class Model(object):
@classmethod
def isCompatibleWithCmpH5(cls, cmpH5):
- return all(cmpH5.hasPulseFeature(feature) for feature in cls.requiredFeatures)
+ return all(cmpH5.hasBaseFeature(feature) for feature in cls.requiredFeatures)
@classmethod
def extractFeatures(cls, aln):
@@ -112,7 +112,7 @@ class Model(object):
_args = [ alnRead[~gapMask].tostring() ]
for feature in ALL_FEATURES:
if feature in cls.requiredFeatures:
- _args.append(asFloatFeature(aln.pulseFeature(feature)[~gapMask]))
+ _args.append(asFloatFeature(aln.baseFeature(feature)[~gapMask]))
else:
_args.append(cc.FloatFeature(int(aln.readLength)))
return cc.QvSequenceFeatures(*_args)
@@ -121,7 +121,7 @@ class Model(object):
_args = [ aln.read(aligned=False, orientation="native") ]
for feature in ALL_FEATURES:
if feature in cls.requiredFeatures:
- _args.append(asFloatFeature(aln.pulseFeature(feature, aligned=False)))
+ _args.append(asFloatFeature(aln.baseFeature(feature, aligned=False)))
else:
_args.append(cc.FloatFeature(int(aln.readLength)))
return cc.QvSequenceFeatures(*_args)
@@ -402,7 +402,7 @@ def loadParameterSets(parametersFile=None, spec=None, cmpH5=None):
die("Quiver: no available parameter set named %s" % \
spec)
elif chemistryName:
- qvsAvailable = cmpH5.pulseFeaturesAvailable()
+ qvsAvailable = cmpH5.baseFeaturesAvailable()
p = _bestParameterSet(sets, chemistryName, qvsAvailable)
if p.chemistry != chemistryName:
die("Quiver: no parameter set available compatible with this " + \
@@ -416,7 +416,7 @@ def loadParameterSets(parametersFile=None, spec=None, cmpH5=None):
"this version of SMRTanalysis is too old to recognize a new chemistry.")
if not _isChemistryMixSupported(chemistryNames):
die("Unsupported chemistry mix, cannot proceed.")
- qvsAvailable = cmpH5.pulseFeaturesAvailable()
+ qvsAvailable = cmpH5.baseFeaturesAvailable()
bestParams = [ _bestParameterSet(sets, chemistryName, qvsAvailable)
for chemistryName in chemistryNames ]
params = dict(zip(chemistryNames, bestParams))
diff --git a/GenomicConsensus/quiver/quiver.py b/GenomicConsensus/quiver/quiver.py
index 1e2fea5..cbea95c 100644
--- a/GenomicConsensus/quiver/quiver.py
+++ b/GenomicConsensus/quiver/quiver.py
@@ -51,7 +51,7 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
depthLimit, quiverConfig):
"""
High-level routine for calling the consensus for a
- window of the genome given a cmp.h5.
+ window of the genome given an alignment file.
Identifies the coverage contours of the window in order to
identify subintervals where a good consensus can be called.
@@ -193,7 +193,7 @@ class QuiverWorker(object):
# Get the consensus for the enlarged window.
#
css_, variants_ = \
- consensusAndVariantsForWindow(self._inCmpH5, eWindow,
+ consensusAndVariantsForWindow(self._inAlnFile, eWindow,
refContig, options.coverage, self.quiverConfig)
#
@@ -232,7 +232,7 @@ __all__ = [ "name",
"configure",
"slaveFactories" ]
-name = "Quiver"
+name = "quiver"
availability = (True, "OK")
def configure(options, cmpH5):
@@ -241,16 +241,25 @@ def configure(options, cmpH5):
if cmpH5.readType != "standard":
raise U.IncompatibleDataException(
- "The Quiver algorithm requires a cmp.h5 file containing standard (non-CCS) reads." )
+ "The Quiver algorithm requires an alignment file containing standard (non-CCS) reads." )
if options.parametersSpec == "auto":
+ # Reject Sequel chemistries explicitly---there are no Quiver
+ # trainings for Sequel. Arrow should be used.
+ # (Note that power-users can bypass this requirement using an explicit parameter set.)
+ for chem in cmpH5.sequencingChemistry:
+ if chem.startswith("S/"):
+ raise U.IncompatibleDataException(
+ "The Quiver algorithm is not trained for Sequel data. " +
+ "Please use the Arrow algorithm instead.")
+
if options.diploid:
logging.info("Diploid analysis--resorting to unknown.NoQVsModel until other " +
"parameter sets can be recalibrated.")
params = M.loadParameterSets(options.parametersFile, spec="unknown.NoQVsModel")
else:
params = M.loadParameterSets(options.parametersFile, cmpH5=cmpH5)
- qvMsg = "This .cmp.h5 file lacks some of the QV data tracks that are required " + \
+ qvMsg = "This alignment file file lacks some of the QV data tracks that are required " + \
"for optimal performance of the Quiver algorithm. For optimal results" + \
" use the ResequencingQVs workflow in SMRTPortal with bas.h5 files " + \
"from an instrument using software version 1.3.1 or later, or the " + \
@@ -265,7 +274,7 @@ def configure(options, cmpH5):
cmpH5=cmpH5)
if not all(ps.model.isCompatibleWithCmpH5(cmpH5) for ps in params.values()):
raise U.IncompatibleDataException(
- "Selected Quiver parameter set is incompatible with this cmp.h5 file " +
+ "Selected Quiver parameter set is incompatible with this alignment file " +
"due to missing data tracks.")
logging.info("Using Quiver parameter set(s): %s" % (", ".join(ps.name for ps in params.values())))
diff --git a/GenomicConsensus/quiver/utils.py b/GenomicConsensus/quiver/utils.py
index 9d8470c..c231c44 100644
--- a/GenomicConsensus/quiver/utils.py
+++ b/GenomicConsensus/quiver/utils.py
@@ -172,7 +172,7 @@ def consensusConfidence(mms, positions=None):
"""
return np.array(cc.ConsensusQVs(mms), dtype=np.uint8)
-def variantsFromAlignment(a, refWindow):
+def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
"""
Extract the variants implied by a pairwise alignment to the
reference.
@@ -180,6 +180,7 @@ def variantsFromAlignment(a, refWindow):
variants = []
refId, refStart, _ = refWindow
refPos = refStart
+ cssPos = 0
tbl = zip(a.Transcript(),
a.Target(),
a.Query())
@@ -193,19 +194,33 @@ def variantsFromAlignment(a, refWindow):
run = list(run)
ref = "".join(map(snd, run))
refLen = len(ref) - Counter(ref)["-"]
- read = "".join(map(third, run))
+ css = "".join(map(third, run))
+ cssLen = len(css) - Counter(css)["-"]
+ variant = None
if code == "M" or code == "N":
pass
elif code == "R":
- assert len(read)==len(ref)
- variants.append(Variant(refId, refPos, refPos+len(read), ref, read))
+ assert len(css)==len(ref)
+ variant = Variant(refId, refPos, refPos+len(css), ref, css)
elif code == "I":
- variants.append(Variant(refId, refPos, refPos, "", read))
+ variant = Variant(refId, refPos, refPos, "", css)
elif code == "D":
- variants.append(Variant(refId, refPos, refPos + len(ref), ref, ""))
+ variant = Variant(refId, refPos, refPos + len(ref), ref, "")
+
+ if variant is not None:
+ # HACK ALERT: variants at the first and last position
+ # are not handled correctly
+ if siteCoverage is not None and np.size(siteCoverage) > 0:
+ refPos_ = min(refPos-refStart, len(siteCoverage)-1)
+ variant.coverage = siteCoverage[refPos_]
+ if cssQvInWindow is not None and np.size(cssQvInWindow) > 0:
+ cssPos_ = min(cssPos, len(cssQvInWindow)-1)
+ variant.confidence = cssQvInWindow[cssPos_]
+ variants.append(variant)
refPos += refLen
+ cssPos += cssLen
return variants
@@ -294,24 +309,9 @@ def variantsFromConsensus(refWindow, refSequenceInWindow, cssSequenceInWindow,
align = cc.Align
ga = align(refSequenceInWindow, cssSequenceInWindow)
- cssPosition = cc.TargetToQueryPositions(ga)
- vars = variantsFromAlignment(ga, refWindow)
- for v in vars:
- # HACK ALERT: we are not really handling the confidence or
- # coverage for variants at last position of the window
- # correctly here.
- refPos_ = min(v.refStart-refStart, len(siteCoverage)-1)
- cssPos_ = min(cssPosition[v.refStart-refStart], len(cssQvInWindow)-1)
+ return variantsFromAlignment(ga, refWindow, cssQvInWindow, siteCoverage)
- # make sure the arrays are non-empty before indexing into them
- if siteCoverage is not None and np.size(siteCoverage) > 0:
- v.coverage = siteCoverage[refPos_]
-
- if cssQvInWindow is not None and np.size(cssQvInWindow) > 0:
- v.confidence = cssQvInWindow[cssPos_]
-
- return vars
def filterAlns(refWindow, alns, quiverConfig):
"""
diff --git a/GenomicConsensus/reference.py b/GenomicConsensus/reference.py
index 5c265d6..0a9698d 100644
--- a/GenomicConsensus/reference.py
+++ b/GenomicConsensus/reference.py
@@ -103,7 +103,7 @@ filename = None
def isLoaded():
return filename != None
-def loadFromFile(filename_, cmpH5):
+def loadFromFile(filename_, alnFile):
"""
Reads reference from FASTA file, loading
lookup tables that can be used any time later.
@@ -130,15 +130,15 @@ def loadFromFile(filename_, cmpH5):
except IOError as e:
die(e)
- cmpContigNames = set(cmpH5.refNames)
+ cmpContigNames = set(alnFile.refNames)
for fastaRecord in f.contigs:
refName = fastaRecord.id
if refName in cmpContigNames:
- cmpH5RefEntry = cmpH5.referenceInfo(refName)
- refId = cmpH5RefEntry.ID
- pacBioName = cmpH5RefEntry.Name
- refFullName = cmpH5RefEntry.FullName
+ refEntry = alnFile.referenceInfo(refName)
+ refId = refEntry.ID
+ pacBioName = refEntry.Name
+ refFullName = refEntry.FullName
sequence = UppercasingMmappedFastaSequence(fastaRecord.sequence)
length = len(fastaRecord.sequence)
contig = ReferenceContig(refId, refName, refFullName, sequence, length)
@@ -207,38 +207,24 @@ def enumerateChunks(refId, referenceStride, referenceWindows=()):
for (s, e) in enumerateIntervals(span[1:], referenceStride):
yield WorkChunk((refId, s, e), True)
-def fancyEnumerateChunks(cmpH5, refId, referenceStride,
+def fancyEnumerateChunks(alnFile, refId, referenceStride,
minCoverage, minMapQV, referenceWindows=()):
"""
Enumerate chunks, creating chunks with hasCoverage=False for
coverage cutouts.
"""
-
- # The abstraction frays here, unless we want to push all of this into
- # pbdataset. With the fairly specific nature of this query, a composition
- # of standard accessors is probably ok:
- tStart = []
- tEnd = []
- for reader in cmpH5.resourceReaders(refId):
- if cmpH5.isCmpH5:
- startRow = reader.referenceInfo(refId).StartRow
- endRow = reader.referenceInfo(refId).EndRow
- rows = slice(startRow, endRow)
- else:
- refLen = reader.referenceInfo(refId).Length
- rows = reader.readsInRange(refId, 0, refLen, justIndices=True)
- goodMapQVs = (reader.mapQV[rows] >= minMapQV)
- tStart.extend(reader.tStart[rows][goodMapQVs].view(np.int32))
- tEnd.extend(reader.tEnd[rows][goodMapQVs].view(np.int32))
-
- # Sort the intervals (not sure if it matters, but might as well be
- # consistent with the inputs:
- if tStart and tEnd:
- tStart = np.array(tStart)
- tEnd = np.array(tEnd)
- sort_order = tStart.argsort()
- tStart = tStart[sort_order]
- tEnd = tEnd[sort_order]
+ # Pull out rows with this refId and good enough MapQV
+ rows = alnFile.index[
+ ((alnFile.tId == alnFile.referenceInfo(refId).ID) &
+ (alnFile.mapQV >= minMapQV))]
+
+ unsorted_tStart = rows.tStart
+ unsorted_tEnd = rows.tEnd
+
+ # Sort (expected by CoveredIntervals)
+ sort_order = np.lexsort((unsorted_tEnd, unsorted_tStart))
+ tStart = unsorted_tStart[sort_order].tolist()
+ tEnd = unsorted_tEnd[sort_order].tolist()
for span in enumerateSpans(refId, referenceWindows):
_, spanStart, spanEnd = span
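The rewrite of fancyEnumerateChunks above replaces per-reader row iteration with one vectorized query over the dataset index. A condensed sketch of the pattern, with plain numpy arrays standing in for the .pbi columns:

    import numpy as np

    tId    = np.array([0, 0, 1, 0])    # stand-ins for pbi index columns
    mapQV  = np.array([30, 5, 40, 20])
    tStart = np.array([100, 10, 7, 50])
    tEnd   = np.array([180, 90, 60, 55])

    refId, minMapQV = 0, 10
    keep = (tId == refId) & (mapQV >= minMapQV)
    # Sort by (tStart, tEnd), as CoveredIntervals expects sorted intervals.
    order = np.lexsort((tEnd[keep], tStart[keep]))
    starts, ends = tStart[keep][order], tEnd[keep][order]
    assert starts.tolist() == [50, 100] and ends.tolist() == [55, 180]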
diff --git a/GenomicConsensus/utils.py b/GenomicConsensus/utils.py
index 5661c31..38614fa 100644
--- a/GenomicConsensus/utils.py
+++ b/GenomicConsensus/utils.py
@@ -35,7 +35,7 @@ import ast
import math, numpy as np, os.path, sys, itertools
def die(msg):
- print msg
+ print >>sys.stderr, msg
sys.exit(-1)
class CommonEqualityMixin(object):
@@ -96,7 +96,7 @@ def rowNumberIsInReadStratum(readStratum, rowNumber):
n, N = readStratum
return (rowNumber % N) == n
-def readsInWindow(cmpH5, window, depthLimit=None,
+def readsInWindow(alnFile, window, depthLimit=None,
minMapQV=0, strategy="fileorder",
stratum=None, barcode=None):
"""
@@ -117,32 +117,28 @@ def readsInWindow(cmpH5, window, depthLimit=None,
def depthCap(iter):
if depthLimit is not None:
- return cmpH5[list(itertools.islice(iter, 0, depthLimit))]
+ return alnFile[list(itertools.islice(iter, 0, depthLimit))]
else:
- return cmpH5[list(iter)]
+ return alnFile[list(iter)]
def lengthInWindow(hit):
- return (min(cmpH5.index.tEnd[hit], winEnd) -
- max(cmpH5.index.tStart[hit], winStart))
+ return (min(alnFile.index.tEnd[hit], winEnd) -
+ max(alnFile.index.tStart[hit], winStart))
winId, winStart, winEnd = window
- alnHits = np.array(list(cmpH5.readsInRange(winId, winStart, winEnd,
- justIndices=True)))
+ alnHits = np.array(list(alnFile.readsInRange(winId, winStart, winEnd,
+ justIndices=True)))
if len(alnHits) == 0:
return []
- # Different naming between cmpH5 and pbi
- mapQV = lambda x: x.mapQV
- if cmpH5.isCmpH5:
- mapQV = lambda x: x.MapQV
if barcode == None:
- alnHits = alnHits[mapQV(cmpH5.index)[alnHits] >= minMapQV]
+ alnHits = alnHits[alnFile.mapQV[alnHits] >= minMapQV]
else:
- # this wont work with cmpH5 (no bc in index):
+ # this won't work with CmpH5 (no bc in index):
barcode = ast.literal_eval(barcode)
- alnHits = alnHits[(mapQV(cmpH5.index)[alnHits] >= minMapQV) &
- (cmpH5.index.bcLeft[alnHits] == barcode[0]) &
- (cmpH5.index.bcRight[alnHits] == barcode[1])]
+ alnHits = alnHits[(alnFile.mapQV[alnHits] >= minMapQV) &
+ (alnFile.index.bcLeft[alnHits] == barcode[0]) &
+ (alnFile.index.bcRight[alnHits] == barcode[1])]
if strategy == "fileorder":
return depthCap(alnHits)
@@ -159,8 +155,8 @@ def readsInWindow(cmpH5, window, depthLimit=None,
# either case.
# lexical sort:
- ends = cmpH5.index.tEnd[alnHits]
- starts = cmpH5.index.tStart[alnHits]
+ ends = alnFile.index.tEnd[alnHits]
+ starts = alnFile.index.tStart[alnHits]
lex_sort = np.lexsort((ends, starts))
# reorder based on sort:
@@ -180,14 +176,14 @@ def readsInWindow(cmpH5, window, depthLimit=None,
return depthCap(sorted_alnHits[win_sort])
-def datasetCountExceedsThreshold(cmpH5, threshold):
+def datasetCountExceedsThreshold(alnFile, threshold):
"""
Does the file contain more than `threshold` datasets? This
impacts whether or not we should disable the chunk cache.
"""
total = 0
- for i in np.unique(cmpH5.AlnGroupID):
- total += len(cmpH5._alignmentGroup(i))
+ for i in np.unique(alnFile.AlnGroupID):
+ total += len(alnFile._alignmentGroup(i))
if total > threshold:
return True
return False
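readsInWindow above caps coverage lazily: the prioritized hit iterator is sliced rather than materialized, so a deep pile-up never has to be fully listed. A minimal sketch of that depth-cap idiom, with hits standing in for any iterable of row indices:

    import itertools

    def depthCap(hits, depthLimit=None):
        # Take at most depthLimit hits, preserving iteration order
        # (file order, or the sorted order chosen by the strategy).
        if depthLimit is not None:
            return list(itertools.islice(hits, 0, depthLimit))
        return list(hits)

    assert depthCap(iter(range(10)), depthLimit=3) == [0, 1, 2]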
diff --git a/GenomicConsensus/windows.py b/GenomicConsensus/windows.py
index e56d50f..aa9e375 100644
--- a/GenomicConsensus/windows.py
+++ b/GenomicConsensus/windows.py
@@ -42,6 +42,9 @@ import numpy as np, math
from pbcore.io.rangeQueries import projectIntoRange
from ConsensusCore import CoveredIntervals
+# TODO(lhepler): replace the above with the following:
+# from ConsensusCore2 import CoveredIntervals
+
def intervalToPair(v):
return (v.Begin, v.End)
diff --git a/README.md b/README.md
index e756ad3..2701540 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,34 @@
-GenomicConsensus (quiver)
+GenomicConsensus (quiver, arrow) [![Circle CI](https://circleci.com/gh/PacificBiosciences/GenomicConsensus.svg?style=svg)](https://circleci.com/gh/PacificBiosciences/GenomicConsensus)
-------------------------
-The ``GenomicConsensus`` package provides the ``quiver`` tool,
-PacBio's flagship consensus and variant caller. The backend logic is
-provided by the ``ConsensusCore`` library, which you must install
-first.
+The ``GenomicConsensus`` package provides the ``variantCaller`` tool,
+which allows you to apply the Quiver or Arrow algorithm to mapped
+PacBio reads to get consensus and variant calls.
+Background on Quiver and Arrow
+------------------------------
-Installing
-----------
-Make sure you have set up and activated your virtualenv, and installed
-``pbcore`` and ``ConsensusCore`` (which cannot be installed
-automatically by pip or setuptools at this time). Then:
+*Quiver* is the legacy consensus model based on a conditional random
+field approach. Quiver enables consensus accuracy on genome
+assemblies approaching or even exceeding Q60 (one error
+per million bases). If you use the HGAP assembly protocol in
+SMRTportal 2.0 or later, Quiver runs automatically as the final
+"assembly polishing" step.
+
+Over the years Quiver has proven difficult to train and develop, so we are
+phasing it out in favor of the new model, Arrow. *Arrow* is an
+improved consensus model based on a more straightforward hidden Markov
+model approach.
+
+Quiver is supported for PacBio RS data. Arrow is supported for PacBio
+Sequel data and RS data with the P6-C4 chemistry.
+
+
+Getting GenomicConsensus
+------------------------
+Casual users should get ``GenomicConsensus`` from the
+[SMRTanalysis software bundle](http://www.pacb.com/support/software-downloads/).
-```sh
-% python setup.py install
-````
Running
-------
@@ -27,6 +40,10 @@ Basic usage is as follows:
> -o consensus.fasta -o consensus.fastq
```
+``quiver`` is a shortcut for ``variantCaller --algorithm=quiver``.
+Naturally, to use arrow you could use the ``arrow`` shortcut or
+``variantCaller --algorithm=arrow``.
+
in this example we perform haploid consensus and variant calling on
the mapped reads in the ``aligned_reads.bam`` which was aligned to
``reference.fasta``. The ``reference.fasta`` is only used for
@@ -34,11 +51,15 @@ designating variant calls, not for computing the consensus. The
consensus quality score for every position can be found in the output
FASTQ file.
+*Note that SMRTanalysis 2.3 does not support "dataset" input (FOFN
+ or XML files); those who need this feature should wait for the forthcoming
+ release of SMRTanalysis 3.0 or build from GitHub sources.*
+
-Documentation
--------------
+More documentation
+------------------
-- [More detailed installation instructions](https://github.com/PacificBiosciences/GenomicConsensus/blob/master/doc/HowToQuiver.rst)
-- [Quiver FAQ](https://github.com/PacificBiosciences/GenomicConsensus/blob/master/doc/QuiverFAQ.rst)
-- [variants.gff spec](https://github.com/PacificBiosciences/GenomicConsensus/blob/master/doc/VariantsGffSpecification.rst)
-- [CHANGELOG](https://github.com/PacificBiosciences/GenomicConsensus/blob/master/CHANGELOG)
+- [More detailed installation and running instructions](./doc/HowTo.rst)
+- [FAQ](./doc/FAQ.rst)
+- [variants.gff spec](./doc/VariantsGffSpecification.rst)
+- [CHANGELOG](./CHANGELOG)
diff --git a/bin/arrow b/bin/arrow
new file mode 100755
index 0000000..dba88f6
--- /dev/null
+++ b/bin/arrow
@@ -0,0 +1,2 @@
+#!/bin/sh
+variantCaller --algorithm=arrow $*
diff --git a/bin/gffToBed b/bin/gffToBed
old mode 100755
new mode 100644
index b4911ac..1694c73
--- a/bin/gffToBed
+++ b/bin/gffToBed
@@ -19,6 +19,8 @@ from pbcore.io import GffReader, WriterBase
__version__ = "3.0"
+log = logging.getLogger(__name__)
+
class Constants(object):
TASK_ID = "genomic_consensus.tasks.gff2bed"
PURPOSE_ID = "genomic_consensus.task_options.gff2bed_purpose"
@@ -164,7 +166,8 @@ def get_contract_parser():
version=__version__,
name="gffToBed",
description=__doc__,
- driver_exe=Constants.DRIVER_EXE)
+ driver_exe=Constants.DRIVER_EXE,
+ default_level="ERROR")
ap = p.arg_parser.parser
tcp = p.tool_contract_parser
ap.add_argument("purpose", choices=["variants","coverage"],
@@ -172,9 +175,9 @@ def get_contract_parser():
p.add_input_file_type(FileTypes.GFF, "gff",
"GFF file", "GFF file")
tcp.add_output_file_type(FileTypes.BED, "bed",
- "BED file", "BED file", "output.bed")
+ "BED file", "BED file", "output")
tcp.add_str(Constants.PURPOSE_ID, "purpose",
- default="",
+ default="variants",
name="Purpose",
description="Run mode ('variants' or 'coverage')")
p.add_str(Constants.TRACK_NAME_ID, "name",
@@ -192,18 +195,14 @@ def get_contract_parser():
return p
def main(argv=sys.argv):
- logging.basicConfig(level=logging.WARN)
- log = logging.getLogger()
mp = get_contract_parser()
- def dummy_log(*args, **kwargs):
- pass
return pbparser_runner(
argv=argv[1:],
parser=mp,
args_runner_func=args_runner,
contract_runner_func=resolved_tool_contract_runner,
alog=log,
- setup_log_func=dummy_log)
+ setup_log_func=setup_log)
if __name__ == '__main__':
sys.exit(main())
diff --git a/bin/gffToVcf b/bin/gffToVcf
index ed1d947..69a9346 100755
--- a/bin/gffToVcf
+++ b/bin/gffToVcf
@@ -21,6 +21,9 @@ from pbcore.io import GffReader, WriterBase
__version__ = "3.0"
+log = logging.getLogger(__name__)
+
+
class Constants(object):
TASK_ID = "genomic_consensus.tasks.gff2vcf"
DRIVER_EXE = "gffToVcf --resolved-tool-contract "
@@ -155,30 +158,26 @@ def get_contract_parser():
version=__version__,
name="gffToVcf",
description=__doc__,
- driver_exe=Constants.DRIVER_EXE)
+ driver_exe=Constants.DRIVER_EXE,
+ default_level="ERROR")
ap = p.arg_parser.parser
tcp = p.tool_contract_parser
p.add_input_file_type(FileTypes.GFF, "gffFile",
"GFF file", "GFF file")
tcp.add_output_file_type(FileTypes.VCF, "vcf", "VCF file", "VCF file",
- default_name="output.vcf")
+ default_name="output")
ap.add_argument("--globalReference", action="store", default=None,
help="Name of global reference to put in Meta field")
return p
def main(argv=sys.argv):
- logging.basicConfig(level=logging.WARN)
- log = logging.getLogger()
- p = get_contract_parser()
- def dummy_log(*args, **kwargs):
- pass
return pbparser_runner(
argv=argv[1:],
- parser=p,
+ parser=get_contract_parser(),
args_runner_func=args_runner,
contract_runner_func=resolved_tool_contract_runner,
alog=log,
- setup_log_func=dummy_log)
+ setup_log_func=setup_log)
if __name__ == '__main__':
sys.exit(main())
diff --git a/bin/poa b/bin/poa
new file mode 100755
index 0000000..39c6573
--- /dev/null
+++ b/bin/poa
@@ -0,0 +1,2 @@
+#!/bin/sh
+variantCaller --algorithm=poa $*
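One caveat on the two sh wrappers above (bin/arrow and bin/poa), not a change this patch makes: an unquoted $* re-splits quoted arguments, so a path containing spaces would break. A more robust form of the same one-liner would be:

    #!/bin/sh
    exec variantCaller --algorithm=poa "$@"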
diff --git a/bin/summarizeConsensus b/bin/summarizeConsensus
index 78fc434..6686a2d 100755
--- a/bin/summarizeConsensus
+++ b/bin/summarizeConsensus
@@ -4,9 +4,10 @@
Augment the alignment_summary.gff file with consensus and variants information.
"""
-from collections import namedtuple
+from collections import namedtuple, defaultdict
import argparse
import logging
+import bisect
import json
import gzip
import sys
@@ -14,7 +15,7 @@ import sys
import numpy as np
from pbcommand.utils import setup_log
-from pbcommand.cli import pacbio_args_or_contract_runner, get_default_argparser
+from pbcommand.cli import pbparser_runner, get_default_argparser
from pbcommand.models import FileTypes, get_pbparser
from pbcommand.common_options import add_resolved_tool_contract_option
from pbcore.io import GffReader, GffWriter, Gff3Record
@@ -27,61 +28,41 @@ from GenomicConsensus import __VERSION__
#
Region = namedtuple("Region", ("seqid", "start", "end"))
+log = logging.getLogger(__name__)
+
class Constants(object):
TOOL_ID = "genomic_consensus.tasks.summarize_consensus"
DRIVER_EXE = "summarizeConsensus --resolved-tool-contract "
-def get_argument_parser():
- parser = get_default_argparser(__VERSION__, __doc__)
- parser.add_argument("inputAlignmentSummaryGff",
- type=str,
- help="Input alignment_summary.gff filename")
- # FIXME not optional, should be positional
- parser.add_argument("--variantsGff",
- type=str,
- help="Input variants.gff or variants.gff.gz filename",
- required=True)
- parser.add_argument("--output",
- "-o",
- type=str,
- help="Output alignment_summary.gff filename")
- add_resolved_tool_contract_option(parser)
- # FIXME temporary workaround for parser chaos
- class EmitToolContractAction(argparse.Action):
- def __call__(self, parser, namespace, values, option_string=None):
- parser2 = get_contract_parser()
- sys.stdout.write(json.dumps(parser2.to_contract().to_dict(), indent=4)+'\n')
- sys.exit(0)
- parser.add_argument("--emit-tool-contract",
- nargs=0,
- action=EmitToolContractAction)
- return parser
def get_contract_parser():
- """
- Used to generate emitted tool contract, but not (yet) to actually process
- command-line options.
- """
- driver_exe = "variantCaller --resolved-tool-contract "
p = get_pbparser(
- "genomic_consensus.tasks.summarize_consensus",
+ Constants.TOOL_ID,
__VERSION__,
"Summarize Consensus",
__doc__,
- Constants.DRIVER_EXE)
+ Constants.DRIVER_EXE,
+ default_level="ERROR")
p.add_input_file_type(FileTypes.GFF, "alignment_summary",
"Alignment summary GFF", "Alignment summary GFF file")
- p.add_input_file_type(FileTypes.GFF, "variants",
+ p.tool_contract_parser.add_input_file_type(FileTypes.GFF, "variants",
"Variants GFF", "Variants GFF file")
- p.add_output_file_type(FileTypes.GFF, "output",
+ p.arg_parser.parser.add_argument("--variantsGff",
+ type=str,
+ help="Input variants.gff or variants.gff.gz filename",
+ required=True)
+ p.tool_contract_parser.add_output_file_type(FileTypes.GFF, "output",
name="Output GFF file",
description="New alignment summary GFF file",
- default_name="alignment_summary_variants.gff")
+ default_name="alignment_summary_variants")
+ p.arg_parser.parser.add_argument("-o", "--output",
+ type=str,
+ help="Output alignment_summary.gff filename")
return p
def get_args_from_resolved_tool_contract(resolved_tool_contract):
rtc = resolved_tool_contract
- p = get_argument_parser()
+ p = get_contract_parser().arg_parser.parser
args = [
rtc.task.input_files[0],
"--variantsGff", rtc.task.input_files[1],
@@ -97,7 +78,7 @@ def run(options):
]
inputVariantsGff = GffReader(options.variantsGff)
- inputAlignmentSummaryGff = GffReader(options.inputAlignmentSummaryGff)
+ inputAlignmentSummaryGff = GffReader(options.alignment_summary)
summaries = {}
for gffRecord in inputAlignmentSummaryGff:
@@ -105,26 +86,51 @@ def run(options):
summaries[region] = { "ins" : 0,
"del" : 0,
"sub" : 0,
- "cQv" : (0, 0, 0)
+ # TODO: base consensusQV on effective coverage
+ "cQv" : (20, 20, 20)
}
inputAlignmentSummaryGff.close()
counterNames = { "insertion" : "ins",
"deletion" : "del",
"substitution" : "sub" }
+ regions_by_contig = defaultdict(list)
+ for region in summaries:
+ regions_by_contig[region.seqid].append(region)
+ for seqid in regions_by_contig.keys():
+ r = regions_by_contig[seqid]
+ regions_by_contig[seqid] = sorted(r, key=lambda a: a.start)
+ logging.info("Processing variant records")
+ i = 0
+ have_contigs = set(regions_by_contig.keys())
for variantGffRecord in inputVariantsGff:
- for region in summaries:
- summary = summaries[region]
- if (region.seqid == variantGffRecord.seqid and
- region.start <= variantGffRecord.start <= region.end):
- counterName = counterNames[variantGffRecord.type]
- variantLength = max(len(variantGffRecord.reference),
- len(variantGffRecord.variantSeq))
- summary[counterName] += variantLength
- # TODO: base consensusQV on effective coverage
- summary["cQv"] = (20, 20, 20)
-
- inputAlignmentSummaryGff = open(options.inputAlignmentSummaryGff)
+ if variantGffRecord.seqid not in have_contigs:
+ raise KeyError(
+ "Can't find alignment summary for contig '{s}'".format(
+ s=variantGffRecord.seqid))
+ positions = [r.start for r in regions_by_contig[variantGffRecord.seqid]]
+ idx = bisect.bisect_right(positions, variantGffRecord.start) - 1
+ # XXX we have to be a little careful here - an insertion at the start
+ # of a contig will have start=0 versus start=1 for the first region
+ if idx < 0:
+ idx = 0
+ region = regions_by_contig[variantGffRecord.seqid][idx]
+ assert ((region.start <= variantGffRecord.start <= region.end) or
+ (region.start == 1 and variantGffRecord.start == 0 and
+ variantGffRecord.type == "insertion")), \
+ (variantGffRecord.seqid, region.start, variantGffRecord.start,
+ region.end, variantGffRecord.type, idx)
+ summary = summaries[region]
+ counterName = counterNames[variantGffRecord.type]
+ variantLength = max(len(variantGffRecord.reference),
+ len(variantGffRecord.variantSeq))
+ summary[counterName] += variantLength
+ i += 1
+ if i % 1000 == 0:
+ logging.info("{i} records...".format(i=i))
+
+
+ inputAlignmentSummaryGff = open(options.alignment_summary)
outputAlignmentSummaryGff = open(options.output, "w")
inHeader = True
@@ -157,25 +163,23 @@ def run(options):
print >>outputAlignmentSummaryGff, line
return 0
+
def args_runner(args):
return run(options=args)
+
def resolved_tool_contract_runner(resolved_tool_contract):
args = get_args_from_resolved_tool_contract(resolved_tool_contract)
return run(options=args)
+
def main(argv=sys.argv):
- logging.basicConfig(level=logging.WARN)
- mp = get_argument_parser()
- log = logging.getLogger()
- def dummy_log(*args, **kwargs):
- pass
- return pacbio_args_or_contract_runner(argv[1:],
- mp,
- args_runner,
- resolved_tool_contract_runner,
- log,
- dummy_log)
+ return pbparser_runner(argv[1:],
+ get_contract_parser(),
+ args_runner,
+ resolved_tool_contract_runner,
+ log,
+ setup_log)
if __name__ == "__main__":
sys.exit(main())
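The summarizeConsensus rewrite above swaps a per-variant scan over every region for a per-contig binary search on sorted region starts. A minimal sketch of that lookup, assuming non-overlapping regions sorted by start and the same clamp for the insertion-at-position-0 edge case:

    import bisect
    from collections import namedtuple

    Region = namedtuple("Region", ("seqid", "start", "end"))
    regions = [Region("chr1", 1, 100), Region("chr1", 101, 200)]
    positions = [r.start for r in regions]

    def regionFor(pos):
        # Rightmost region whose start is <= pos, clamped at index 0.
        idx = max(bisect.bisect_right(positions, pos) - 1, 0)
        return regions[idx]

    assert regionFor(150) == Region("chr1", 101, 200)
    assert regionFor(0).start == 1   # insertion at contig start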
diff --git a/circle.yml b/circle.yml
new file mode 100644
index 0000000..b1db9e5
--- /dev/null
+++ b/circle.yml
@@ -0,0 +1,31 @@
+dependencies:
+ cache_directories:
+ - "_deps/cmake-3.3.0-Linux-x86_64"
+ - "_deps/boost_1_58_0"
+ - "_deps/swig-3.0.8"
+ pre:
+ - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+ - sudo apt-get update
+ - sudo apt-get install g++-4.8
+ - if [ ! -d _deps ] ; then mkdir _deps ; fi # Create a directory for dependencies, These are static, cache them.
+ - pushd _deps ; if [ ! -d cmake-3.3.0-Linux-x86_64 ] ; then wget --no-check-certificate http://www.cmake.org/files/v3.3/cmake-3.3.0-Linux-x86_64.tar.gz ; tar xzf cmake-3.3.0-Linux-x86_64.tar.gz ; fi
+ - pushd _deps ; if [ ! -d boost_1_58_0 ] ; then wget http://downloads.sourceforge.net/project/boost/boost/1.58.0/boost_1_58_0.tar.bz2 ; tar xjf boost_1_58_0.tar.bz2 ; fi
+ - pushd _deps ; if [ ! -f swig-3.0.8/bin/swig ] ; then rm -fr swig-3.0.8* ; mkdir dl ; pushd dl ; wget http://downloads.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz ; tar xzf swig-3.0.8.tar.gz ; pushd swig-3.0.8 ; ./configure --prefix $(readlink -f ../../swig-3.0.8) ; make ; make install ; fi
+ - pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore.git
+ - pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore2.git
+ - pip install --upgrade pip
+ - pip install numpy
+ - pip install cython
+ - pip install h5py
+ - pip install pysam
+ - pip install --upgrade --no-deps git+https://github.com/PacificBiosciences/pbcommand.git
+ - pip install --upgrade --no-deps git+https://github.com/PacificBiosciences/pbcore.git
+ - pip install cram nose
+ override:
+ - pushd _deps/ConsensusCore ; python setup.py install --boost=$(readlink -f ../boost_1_58_0) --swig=$(readlink -f ../swig-3.0.8/bin/swig)
+ - pushd _deps/ConsensusCore2 ; CC=gcc-4.8 CXX=g++-4.8 CMAKE_COMMAND=$(readlink -f ../cmake-3.3.0-Linux-x86_64/bin/cmake) Boost_INCLUDE_DIRS=$(readlink -f ../boost_1_58_0) SWIG_COMMAND=$(readlink -f ../swig-3.0.8/bin/swig) pip install --verbose --upgrade --no-deps .
+test:
+ pre:
+ - pip install --verbose .
+ override:
+ - make check
diff --git a/doc/QuiverFAQ.rst b/doc/FAQ.rst
similarity index 70%
rename from doc/QuiverFAQ.rst
rename to doc/FAQ.rst
index 706551e..0312c0e 100644
--- a/doc/QuiverFAQ.rst
+++ b/doc/FAQ.rst
@@ -3,12 +3,20 @@ Quiver FAQ
What are EviCons? GenomicConsensus? Quiver? Plurality?
------------------------------------------------------------
-**GenomicConsensus** is the current PacBio consensus and variant calling suite. It contains a main program, ``variantCaller.py``,
-which provides two consensus / variant calling algorithms: **Plurality** and **Quiver**. These algorithms can be run by calling ``variantCaller.py --algorithm=[quiver|plurality]`` or by going through the convenience wrapper scripes ``quiver`` and ``plurality``.
+**GenomicConsensus** is the current PacBio consensus and variant
+calling suite. It contains a main driver program, ``variantCaller``,
+which provides two consensus/variant calling algorithms: **Arrow** and
+**Quiver**. These algorithms can be run by calling
+``variantCaller --algorithm=[arrow|quiver|plurality]`` or by going
+through the convenience wrapper scripts ``quiver`` and ``arrow``.
-**EviCons** was the previous generation PacBio variant caller (removed in software release v1.3.1).
+**EviCons** was the previous generation PacBio variant caller (removed
+in software release v1.3.1).
-A separate package called **ConsensusCore** is a C++ library where all the computation behind Quiver is done (and is transparent to the user after installation).
+Separate packages called **ConsensusCore** and **ConsensusCore2** are
+C++ libraries where all the computation behind Quiver and Arrow are
+done, respectively. This is transparent to the user after
+installation.
What is Plurality?
@@ -50,9 +58,9 @@ challenges.
What is Quiver?
---------------
**Quiver** is a more sophisticated algorithm that finds the maximum
-likelihood template sequence given PacBio reads of the template.
+quasi-likelihood template sequence given PacBio reads of the template.
PacBio reads are modeled using a conditional random field approach that
-prescribes a probability to a read given a template sequence. In
+scores the quasi-likelihood of a read given a template sequence. In
addition to the base sequence of each read, Quiver uses several
additional *QV* covariates that the basecaller provides. Using these
covariates provides additional information about each read, allowing
@@ -67,21 +75,40 @@ it resolves the example above with ease.
The name **Quiver** reflects a consensus-calling algorithm that is
`QV-aware`.
-How do I run Quiver?
---------------------
+We use the lowercase "quiver" to denote the quiver *tool* in GenomicConsensus,
+which applies the Quiver algorithm to mapped reads to derive sequence
+consensus and variants.
+Quiver is described in detail in the supplementary material to the
+`HGAP paper`_.
+
+
+What is Arrow?
+--------------
+Arrow is a newer model intended to supersede Quiver in the near future.
+The key differences from Quiver are that it uses an HMM instead
+of a CRF, it computes true likelihoods, and it uses a smaller set of
+covariates. We expect a whitepaper on Arrow to be available soon.
+
+We use the lowercase "arrow" to denote the arrow *tool*, which applies
+the Arrow algorithm to mapped reads to derive sequence
+consensus and variants.
+
+
+How do I run `quiver`/`arrow`?
+------------------------------
For general instructions on installing and running, see the
-HowToQuiver_ document.
+HowTo_ document.
-What does Quiver put in its output files?
+What is the output from `quiver`/`arrow`?
-----------------------------------------
-There are three output files from Quiver:
+There are three output files from the GenomicConsensus tools:
-- A consensus *FASTA* file containing the consensus sequence
-- A consensus *FASTQ* file containing the consensus sequence with quality annotations
-- A variants *GFF* file containing a filtered, annotated list of variants identified
+1. A consensus *FASTA* file containing the consensus sequence
+2. A consensus *FASTQ* file containing the consensus sequence with quality annotations
+3. A variants *GFF* file containing a filtered, annotated list of variants identified
It is important to note that the variants included in the output
variants GFF file are *filtered* by coverage and quality, so not all
@@ -89,18 +116,17 @@ variants that are apparent in comparing the reference to the consensus
FASTA output will correspond to variants in the output variants GFF
file.
-To enable all output files, the following can be run (for example):
-
- % quiver -j16 aligned_reads.cmp.h5 -r ref.fa \
- -o consensus.fa \
- -o consensus.fq \
- -o variants.gff
+To enable all output files, the following can be run (for example)::
+
+ % quiver -j16 aligned_reads.cmp.h5 -r ref.fa \
+ -o consensus.fa \
+ -o consensus.fq \
+ -o variants.gff
The extension is used to determine the output file format.
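+
+If VCF or BED output is needed, the package also ships ``gffToVcf``
+and ``gffToBed`` converters that operate on the variants GFF after the
+fact. The invocation below is a sketch; consult each tool's
+``--help`` for its exact options::
+
+  % gffToVcf variants.gff > variants.vcf
+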
-What does it mean that Quiver's consensus is *de novo*?
+What does it mean that `quiver` consensus is *de novo*?
-------------------------------------------------------
Quiver's consensus is *de novo* in the sense that the reference and the reference
alignment are not used to inform the consensus output. Only the reads
@@ -114,8 +140,8 @@ consensus. One can set ``--noEvidenceConsensusCall=nocall`` to
avoid using the reference even in zero coverage regions.
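+
+For example, a polishing run that leaves zero-coverage regions
+uncalled instead of copying the reference could look like this (file
+names are hypothetical)::
+
+  % quiver --noEvidenceConsensusCall=nocall -j8 aligned_reads.bam \
+           -r draft.fasta -o polished.fasta
+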
-What is Quiver's accuracy?
---------------------------
+What is the expected `quiver` accuracy?
+---------------------------------------
Quiver's expected accuracy is a function of coverage and chemistry.
The C2 chemistry (no longer available), P6-C4 and P4-C2 chemistries
provide the most accuracy. Nominal consensus accuracy levels are as
@@ -146,23 +172,29 @@ of 99.999%. These accuracy expectations are based on routine
validations performed on multiple bacterial genomes before each
chemistry release.
-What are the residual errors after applying Quiver?
----------------------------------------------------
+
+What is the expected accuracy from `arrow`?
+-------------------------------------------
+`arrow` achieves similar accuracy to `quiver`. Numbers will be published soon.
+
+
+What are the residual errors after applying `quiver`?
+-----------------------------------------------------
If there are errors remaining after applying Quiver, they will almost
invariably be homopolymer run-length errors (insertions or deletions).
-Does Quiver need to know what sequencing chemistry was used?
-----------------------------------------------------------
+Does `quiver`/`arrow` need to know what sequencing chemistry was used?
+----------------------------------------------------------------------
At present, the Quiver model is trained per-chemistry, so it is very
important that Quiver knows the sequencing chemistries used.
-If SMRT Analysis software was used to build the `cmp.h5` file, the
+If SMRT Analysis software was used to build the `cmp.h5` or BAM input file, the
file will be loaded with information about the sequencing
-chemistry used for each SMRT Cell, and Quiver will automatically
+chemistry used for each SMRT Cell, and GenomicConsensus will automatically
identify the right parameters to use.
If custom software was used to build the `cmp.h5`, or an
@@ -174,21 +206,28 @@ chemistry or model must be explicitly entered. For example::
-Can a mix of chemistries be used in a cmp.h5 file for Quiver?
------------------------------------------------------------
+Can a mix of chemistries be used in a cmp.h5 file for quiver/arrow?
+-------------------------------------------------------------------
-Yes! Quiver automatically sees the chemistry *per-SMRT Cell*, so it
+Yes! The GenomicConsensus tools automatically see the chemistry *per-SMRT Cell*, so they
can figure out the right parameters for each read and model them
appropriately.
-Chemistry mixtures of P6-C4, P4-C2, P5-C3, and C2 are supported. If
-other chemistries are mixed in a `cmp.h5`, Quiver will give undefined
-results. However, Quiver can still be used on any `cmp.h5` file
-containing sequencing reads from a single chemistry.
+
+What chemistries and chemistry mixes are supported?
+---------------------------------------------------
+
+
+For Quiver: all PacBio RS chemistries are supported. Chemistry
+mixtures of P6-C4, P4-C2, P5-C3, and C2 are supported.
+
+For Arrow: the RS chemistry P6-C4, and all PacBio Sequel chemistries
+are supported. Mixes of these chemistries are supported.
+
-What are the QVs that Quiver uses?
-------------------------------------
+What are the QVs that the Quiver model uses?
+--------------------------------------------
Quiver uses additional QV tracks provided by the basecaller.
These QVs may be looked at as little breadcrumbs that are left behind by
the basecaller to help identify positions where it was likely that
@@ -212,20 +251,20 @@ lacking some of these tracks, Quiver will still run, though it will
issue a warning that its performance will be suboptimal.
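+
+For reference, the full set of pulse/QV tracks that the bundled
+SMRTpipe params template (see ``doc/params-template.xml``) loads for
+Quiver is::
+
+  DeletionQV,IPD,InsertionQV,PulseWidth,QualityValue,MergeQV,SubstitutionQV,DeletionTag
+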
-Why is Quiver making errors in some region?
--------------------------------------------
-The most likely cause for *true* errors made by Quiver is that the
+Why is `quiver`/`arrow` making errors in some region?
+-----------------------------------------------------
+The most likely cause for *true* errors made by these tools is that the
coverage in the region was low. If there is 5x coverage over a
1000-base region, then 10 errors in that region can be expected.
It is important to understand that the effective coverage available to
-Quiver is not the full coverage apparent in plots---Quiver and
-Plurality both filter out ambiguously mapped reads by default. The
+`quiver`/`arrow` is not the full coverage apparent in plots---the tools
+filter out ambiguously mapped reads by default. The
remaining coverage after filtering is called the *effective coverage*.
See the next section for discussion of `MapQV`.
If you have verified that there is high effective coverage in the region
-in question, it is highly possible---given the high accuracy Quiver
+in question, it is highly possible---given the high accuracy quiver and arrow
can achieve---that the apparent errors actually
reflect true sequence variants. Inspect the FASTQ output file to
ensure that the region was called at high confidence; if an erroneous
@@ -271,13 +310,13 @@ calls. Quiver and Plurality both filter out aligned reads with a
MapQV below 20 (by default), so as not to call a variant using data of
uncertain genomic origin.
-This can be problematic if using Quiver to get a consensus
+This can be problematic if using quiver/arrow to get a consensus
sequence. If the genome of interest contains long (relative to the library
insert size) highly-similar repeats, the effective coverage (after
`MapQV` filtering) may be reduced in the repeat regions---these are termed
`MapQV` dropouts. If the coverage is sufficiently reduced in
-these regions, Quiver will not call consensus in these regions---see
-`What does Quiver do for genomic regions with no effective coverage?`_.
+these regions, quiver/arrow will not call consensus in these regions---see
+`What do quiver/arrow do for genomic regions with no effective coverage?`_.
If you want to use ambiguously mapped reads in computing a consensus
for a de novo assembly, the `MapQV` filter can be turned off entirely.
@@ -292,14 +331,14 @@ How can the `MapQV` filter be turned off and when should it be?
--------------------------------------------------------------
The `MapQV` filter can be disabled using the flag
``--mapQvThreshold=0`` (shorthand: ``-m=0``). If running a
-Quiver job via SMRT Portal, this can be done by unchecking the "Use
+quiver/arrow job via SMRT Portal, this can be done by unchecking the "Use
only unambiguously mapped reads" option. Consider this in
de novo assembly projects, but it is not recommended for variant
calling applications.
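+
+For example, to polish a de novo assembly with the `MapQV` filter
+disabled (file names are hypothetical)::
+
+  % quiver -j8 --mapQvThreshold=0 aligned_reads.bam \
+           -r draft_assembly.fasta -o polished.fasta
+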
-How can variant calls made by Quiver be inspected or validated?
---------------------------------------------------------------
+How can variant calls made by quiver/arrow be inspected or validated?
+---------------------------------------------------------------------
When in doubt, it is easiest to inspect the region in a tool like
SMRT View, which enables you to view the reads aligned to the region.
Deletions and substitutions should be fairly easy to spot; to view
@@ -307,10 +346,10 @@ insertions, right-click on the reference base and select "View
Insertions Before...".
-What are the filtering parameters that Quiver uses?
----------------------------------------------------
+What are the filtering parameters that quiver/arrow use?
+--------------------------------------------------------
-Quiver limits read coverage, filters reads by `MapQV`, and filters
+The available options limit read coverage, filter reads by `MapQV`, and filter
variants by quality and coverage.
- The overall read coverage used to call consensus in every window is
@@ -325,24 +364,24 @@ variants by quality and coverage.
What happens when the sample is a mixture, or diploid?
-----------------------------------------------------
-At present, Quiver assumes a haploid sample, and the behavior of
-*Quiver* on sample mixtures or diploid/polyploid samples is
+At present, quiver/arrow assume a haploid sample, and their behavior
+on sample mixtures or diploid/polyploid samples is
*undefined*. The program will not crash, but the output results are
not guaranteed to accord with any one of the haplotypes in the sample,
as opposed to a potential patchwork.
-Why would I want to *iterate* the mapping+Quiver process?
----------------------------------------------------------
-Some customers using Quiver for polishing highly repetitive genomes
-have found that if they take the consensus FASTA output of Quiver, use
+Why would I want to *iterate* the mapping+(quiver/arrow) process?
+-----------------------------------------------------------------
+Some customers using quiver for polishing highly repetitive genomes
+have found that if they take the consensus FASTA output of quiver, use
it as a new reference, and then perform mapping and Quiver again to
get a new consensus, they get improved results from the second round
-of Quiver.
+of quiver.
This can be explained by noting that the output of the first round of
-Quiver is more accurate than the initial draft consensus output by the
-assembler, so the second round's mapping to the Quiver consensus can
+quiver is more accurate than the initial draft consensus output by the
+assembler, so the second round's mapping to the quiver consensus can
be more sensitive in mapping reads from repetitive regions. This can
then result in improved consensus in those repetitive regions, because
the reads have been assigned more correctly to their true genomic
@@ -351,7 +390,7 @@ of reads around from one round's mapping to the next might alter
borderline (low confidence) consensus calls even away from repetitive
regions.
-We recommend the (mapping+Quiver) iteration for customers polishing
+We recommend the (mapping+quiver) iteration for customers polishing
repetitive genomes, and it could also prove useful for resequencing
applications. However we caution that this is very much an
*exploratory* procedure and we make no guarantees about its
@@ -360,9 +399,9 @@ when the procedure is iterated, and the procedure is *not* guaranteed
to be convergent.
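+
+A minimal sketch of one such round, assuming ``pbalign`` is used for
+the mapping step (file names are hypothetical)::
+
+  % pbalign subreads.bam draft.fasta aligned_reads.bam
+  % quiver -j8 aligned_reads.bam -r draft.fasta -o polished.fasta
+  % # repeat, using polished.fasta as the new draft
+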
-Is iterating the (mapping+Quiver) process a convergent procedure?
------------------------------------------------------------------
-We have seen many examples where (mapping+Quiver), repeated many
+Is iterating the (mapping+quiver/arrow) process a convergent procedure?
+-----------------------------------------------------------------------
+We have seen many examples where (mapping+quiver), repeated many
times, is evidently *not* a convergent procedure. For example, a
variant call may be present in iteration n, absent in n+1, and then
present again in n+2. It is possible for subtle changes in mapping to
@@ -373,4 +412,5 @@ calls.
-.. _HowToQuiver: https://github.com/PacificBiosciences/GenomicConsensus/blob/master/doc/HowToQuiver.rst
+.. _HowTo: ./HowTo.rst
+.. _`HGAP paper`: http://www.nature.com/nmeth/journal/v10/n6/full/nmeth.2474.html
diff --git a/doc/HowTo.rst b/doc/HowTo.rst
new file mode 100644
index 0000000..538f707
--- /dev/null
+++ b/doc/HowTo.rst
@@ -0,0 +1,128 @@
+
+How to install and use GenomicConsensus
+=======================================
+
+**We recommend that you obtain GenomicConsensus by installing the most
+recent version of SMRTanalysis. Other means of installation are not
+officially supported.**
+
+
+Basic running instructions
+--------------------------
+
+Basic usage---using 8 CPUs to compute consensus of mapped reads and
+variants relative to a reference---is as follows::
+
+ % quiver -j8 aligned_reads{.cmp.h5, .bam, .fofn, or .xml} \
+ > -r reference{.fasta or .xml} -o variants.gff \
+ > -o consensus.fasta -o consensus.fastq
+
+``quiver`` is a shortcut for ``variantCaller --algorithm=quiver``.
+Naturally, to use arrow you could use the ``arrow`` shortcut or
+``variantCaller --algorithm=arrow``.
+
+In this example we perform haploid consensus and variant calling on
+the mapped reads in ``aligned_reads.bam``, which was aligned to
+``reference.fasta``. The ``reference.fasta`` is only used for
+designating variant calls, not for computing the consensus. The
+consensus quality score for every position can be found in the output
+FASTQ file.
+
+*Note that SMRTanalysis 2.3 does not support "dataset" input (FOFN
+or XML files); those who need this feature should wait for the forthcoming
+release of SMRTanalysis 3.0 or build from GitHub sources.*
+
+
+Running a large-scale resequencing/polishing job in SMRTanalysis 2.3
+--------------------------------------------------------------------
+
+To run a large-scale resequencing job (>50 megabase genome @ 50x
+coverage, nominally), you want to spread the computation load across
+multiple nodes in your computing cluster.
+
+The `smrtpipe` workflow engine in SMRTanalysis 2.3 provides a
+convenient workflow automating this---it will automatically spread the
+load for both mapping and quiver jobs among your available cluster
+nodes. This is accessible via the SMRTportal UI; the simplest way to
+set up and run these workflows is via that UI. Nonetheless, we include
+command-line instructions for completeness.
+
+If you have to run the `smrtpipe` workflow manually from the command
+line, a recipe is as follows:
+
+1. Make sure the reference you will align and compare against is
+ present in a SMRTportal "reference repository". Even if you
+ don't want to use SMRTportal, you need to build/import the
+ reference appropriately, and the simplest way to do that is
+ via SMRTportal. If you don't have a SMRTportal instance,
+ you can use the ``referenceUploader`` command to prepare your
+ reference repository.
+
+2. Prepare an "input.fofn" file listing, one-per-line, each "bax.h5"
+   file in your input data set (an example "input.fofn" appears after
+   this list).
+
+3. Convert the "input.fofn" to an "input.xml" file that SMRTpipe can
+ understand::
+
+ $ fofnToSmrtpipeInput.py input.fofn > input.xml
+
+4. Prepare your "params.xml" file. Here is a `params.xml template`_
+ you can use; you should just need to edit the reference path.
+
+5. Activate your SMRTanalysis environment, and invoke smrtpipe::
+
+ $ source <SMRT Analysis>/etc/setup.sh
+ $ smrtpipe.py --distribute --params=params.xml xml:input.xml
+
+6. After successful execution is complete, the results should be
+ available as `data/consensus.fast[aq].gz` and
+ `data/variants.gff.gz`, etc.
+
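+As an illustration for step 2, an "input.fofn" is simply a plain-text
+file with one "bax.h5" path per line (the paths below are
+hypothetical)::
+
+    /data/run1/movie1.1.bax.h5
+    /data/run1/movie1.2.bax.h5
+    /data/run1/movie1.3.bax.h5
+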
+Please consult the `SMRTpipe reference manual`_ for further information.
+
+*Note that resequencing (mapping reads against a reference genome and
+then calling consensus and identifying variants) and polishing
+(mapping reads against a draft assembly and then taking the consensus
+output as the final, polished, assembly) are the same algorithmic
+operation; the only effective difference is that the "variants.gff"
+output is not biologically meaningful in the polishing case---it just
+records the edits that were made to the draft to produce the polished
+assembly.*
+
+Running a large-scale quiver/arrow job in SMRTanalysis 3.0+
+-----------------------------------------------------------
+
+(Forthcoming)
+
+
+Building bleeding-edge code (unsupported)
+-----------------------------------------
+
+If you need to access the latest code for some reason, a
+convenient way to build it is to use PacBio's pitchfork_ build
+system, which will take care of all third party dependencies for you.
+Here's a recipe::
+
+    git clone git@github.com:PacificBiosciences/pitchfork.git
+ cd pitchfork
+ make GenomicConsensus # may take some time, as it builds dependencies...
+
+Now, with GenomicConsensus built, you can use it via::
+
+ bash --init-file deployment/setup-env.sh # Puts you in a subshell where your build is available
+ quiver --help # now you have quiver, arrow, etc. available
+
+If you encounter build issues using `pitchfork`, please report the
+issues there. Note that you can deploy PacBio software to a location
+of your choice using pitchfork.
+
+
+Further questions?
+------------------
+
+Please consult the `FAQ document`_.
+
+.. _`FAQ document`: ./FAQ.rst
+.. _pitchfork : https://github.com/PacificBiosciences/pitchfork
+.. _`params.xml template`: ./params-template.xml
+.. _`SMRTpipe reference manual`: http://www.pacb.com/wp-content/uploads/2015/09/SMRT-Pipe-Reference-Guide.pdf
diff --git a/doc/HowToQuiver.rst b/doc/HowToQuiver.rst
deleted file mode 100644
index 136c3b9..0000000
--- a/doc/HowToQuiver.rst
+++ /dev/null
@@ -1,161 +0,0 @@
-
-How to install and use Quiver
-=============================
-
-Quiver is bundled in SMRTanalysis version 1.4 and later. The easiest
-way to get Quiver is to install the most recent version of SMRTanalysis.
-
-If you want to install Quiver as a standalone package from the latest
-bleeding-edge code, follow the instructions below.
-
-*Note: please install this software on an isolated machine that does
-not have SMRTanalysis installed. Older versions of SMRTanalysis
-pollute the ``PYTHONPATH``, which has the undesirable effect of
-overriding ``virtualenv``-installed modules.*
-
-Background
-----------
-**Quiver** is an algorithm for calling highly accurate consensus from
-multiple PacBio reads, using a pair-HMM exploiting both the basecalls
-and QV metrics to infer the true underlying DNA sequence.
-
-Quiver is available through the ``quiver`` script from the
-``GenomicConsensus`` package. To use Quiver, the following PacBio
-software is required.
-
-- ``GenomicConsensus``, containing ``quiver``
-- ``ConsensusCore``, a C++ library containing the core computational
- routines for Quiver
-- ``pbcore``, a package providing access to PacBio data files
-
-
-Required libraries and tools
-----------------------------
-To install the PacBio software, the following are required:
-
-- Boost >= 1.4.7 (standard C++ libraries)
-- SWIG >= 2.0.7 (library wrapper generator)
-- Python 2.7.3
-- virtualenv (builds isolated Python environments)
-
-If you are within PacBio, these requirements are already installed
-within the cluster environment.
-
-Otherwise, you will need to install them yourself. The automatic
-installation script requires that the ``swig`` executable is in your
-UNIX ``$PATH`` and that your boost installation can be found under
-``/usr/include`` or ``/usr/local``.
-
-
-Data file requirements
-----------------------
-
-To make the most accurate consensus calls possible, Quiver makes use
-of a battery of quality value metrics calculated by the basecaller.
-If you are using a SMRTportal installation verision 1.4 or later, then
-SMRTportal will load all the information Quiver needs, so you
-can skip the rest of this section.
-
-In SMRTportal versions 1.3.3 and prior, by default only a subset of
-these quality values are included in the ``.cmp.h5`` files produced by
-SMRTanalysis. To get a ``.cmp.h5`` with all the QVs loaded, you will
-need to use the ``RS_Mapping_QVs`` protocol to create a ``cmp.h5``
-file for Quiver.
-
-If you are using an older version than SMRTportal/SMRTanalysis 1.3.3,
-please upgrade.
-
-
-Installation instructions
--------------------------
-
-Step 1: Set up your Python environment
-``````````````````````````````````````
-I recommend using a Python *virtualenv* to isolate your sandbox.
-
-To set up a new virtualenv, do ::
-
- $ cd; virtualenv -p python2.7 --no-site-packages VE-QUIVER
-
-and activate the virtualenv using ::
-
- $ source ~/VE-QUIVER/bin/activate
-
-
-Step 2: Install PacBio libraries
-````````````````````````````````
-To install the PacBio software, execute ::
-
- $ pip install pbcore
-
- $ git clone https://github.com/PacificBiosciences/ConsensusCore
- $ cd ConsensusCore; python setup.py install --swig=$SWIG --boost=$BOOST
- $ pip install git+https://github.com/PacificBiosciences/GenomicConsensus
-
-where you replace ``$SWIG`` with the path to your ``swig`` executable
-and ``$BOOST`` with the path to your boost install (the top level
-directory). (Note that if SWIG is in your ``$PATH`` and boost is in
-``/usr/local`` or ``/usr/include/``, you do not need to specify these
-flags on the command line---``setup.py`` will find them).
-
-
-Step 3: Run Quiver
-``````````````````
-Those who wish to call consensus on a resequencing job can simply use
-the ``quiver`` script that has been installed in your
-virtualenv (from `GenomicConsensus`).
-
-For example, ::
-
- $ quiver -j8 aligned_reads.bam \
- > -r path/to/lambda.fasta \
- > -o variants.gff -o consensus.fasta
-
-will use 8 CPUs to run Quiver on ``aligned_reads.bam``, outputting
-the consensus sequence and variants.
-
-Note that if you are using a cmp.h5 file and have not used the `RS_Mapping_QVs`
-protocol to generate that file---or if the source bas.h5 file was generated by
-pre-1.3.1 instrument software---the cmp.h5 will not contain the full battery of
-QV metrics required for optimal Quiver accuracy. The command will still work,
-but it will give a warning that its accuracy will be suboptimal.
-
-Step 3.5: Run Quiver on Multiple Input Files
-````````````````````````````````````````````
-Multiple alignment files in a FOFN (File of File Names) can be quivered against
-a single reference (`GenomicConsensus` >= 1.1.0).
-
-An example input FOFN::
-
- $ cat aligned_reads.fofn
- /path/to/reads1.bam
- /path/to/reads2.bam
-
-can be used instead of a reads file::
-
- $ quiver -j8 aligned_reads.fofn \
- > -r path/to/lambda.fasta \
- > -o variants.gff -o consensus.fasta
-
-Quiver can also be used with DataSet XML files. See pbcore for details on
-generating new DataSet XML files for your alignment files.
-
-Step 4: Highly-accurate assembly consensus
-``````````````````````````````````````````
-Quiver enables consensus accuracies on genome assemblies at accuracies
-approaching or even exceeding Q60 (one error per million bases). If
-you use the HGAP assembly protocol in SMRTportal 2.0 or later, Quiver
-runs automatically as the final "assembly polishing" step.
-
-
-Resources
----------
-Here is an `FAQ document`_ to address common issues.
-
-For a technical summary of some of the details of how Quiver works, I
-recommend reading the supplementary material of our 2013 *Nature
-Methods* `HGAP paper`_
-
-
-.. _`FAQ document`: https://github.com/PacificBiosciences/GenomicConsensus/blob/master/doc/QuiverFAQ.rst
-.. _`HGAP paper`: http://www.nature.com/nmeth/journal/v10/n6/full/nmeth.2474.html
diff --git a/doc/VariantCallerFunctionalSpecification.rst b/doc/VariantCallerFunctionalSpecification.rst
index d983b2e..07f6148 100644
--- a/doc/VariantCallerFunctionalSpecification.rst
+++ b/doc/VariantCallerFunctionalSpecification.rst
@@ -138,9 +138,10 @@ reports deviations from the reference as potential variants.
developed for CCS. Quiver leverages the quality values (QVs) provided by
upstream processing tools, which provide insight into whether
insertions/deletions/substitutions were deemed likely at a given read
-position. Use of **quiver** requires the ``ConsensusCore`` library as well as
-trained parameter set, which will be loaded from a standard location (TBD).
-Quiver can be thought of as a QV-aware local-realignment procedure.
+position. Use of **quiver** requires the ``ConsensusCore`` and ``ConsensusCore2``
+libraries as well as a trained parameter set, which will be loaded from a
+standard location (TBD). Arrow and Quiver can be thought of as
+local-realignment procedures (QV-aware in the case of Quiver).
Both algorithms are expected to converge to *zero* errors (miscalled variants)
as coverage increases; however **quiver** should converge much faster (i.e.,
The ``GenomicConsensus`` module has three essential dependencies:
1. **pbcore**, the PacBio Python bioinformatics library
2. **ConsensusCore**, a C++ library with SWIG bindings that provides access to
the same algorithms used in circular consensus sequencing.
+3. **ConsensusCore2**, a C++ library with SWIG bindings that provides the
+   newer Arrow algorithm.
All of these modules are easily installed using their ``setup.py`` scripts,
which is the canonical means of installing Python packages.
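+
+As an illustration, a from-source installation might proceed roughly
+as follows (a sketch, assuming the repositories are already cloned and
+prerequisites such as Boost and SWIG are available)::
+
+  $ pip install pbcore
+  $ cd ConsensusCore && python setup.py install
+  $ cd ../GenomicConsensus && python setup.py install
+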
diff --git a/doc/params-template.xml b/doc/params-template.xml
new file mode 100644
index 0000000..9bdef03
--- /dev/null
+++ b/doc/params-template.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<smrtpipeSettings>
+ <protocol version="1.3.0">
+ <param name="reference">
+    <!-- Specify the path of the reference genome. Replace the path below.-->
+ <value>/mnt/secondary-siv/references/ecoli_split_1000</value>
+ </param>
+ </protocol>
+
+ <module id="P_Fetch" />
+
+ <module id="P_Filter">
+ <!-- Filter reads and subreads based on read length and quality value. -->
+ <param name="minLength"> <value>50</value> </param>
+ <param name="minSubReadLength"> <value>50</value> </param>
+ <param name="readScore"> <value>0.75</value> </param>
+ </module>
+
+ <module id="P_FilterReports"/>
+
+ <module id="P_Mapping">
+ <param name="maxHits"> <value>10 </value> </param>
+ <param name="maxDivergence"> <value>30 </value> </param>
+ <param name="minAnchorSize"> <value>12 </value> </param>
+ <param name="samBam"> <value>True</value> </param>
+ <param name="gff2Bed"> <value>True</value> </param>
+ <param name="placeRepeatsRandomly">
+ <value>True</value>
+ </param>
+ <param name="pbalign_opts">
+ <value>--seed=1 --minAccuracy=0.75 --minLength=50 --algorithmOptions='-useQuality' </value>
+ </param>
+ <param name="pulseMetrics">
+ <value>DeletionQV,IPD,InsertionQV,PulseWidth,QualityValue,MergeQV,SubstitutionQV,DeletionTag</value>
+ </param>
+ <param name="loadPulsesOpts"> <value>byread</value> </param>
+ </module>
+
+ <module id="P_MappingReports"/>
+
+ <module id="P_GenomicConsensus">
+ <param name="algorithm"> <value>quiver</value> </param>
+ <param name="outputConsensus"> <value>True </value> </param>
+ <param name="makeVcf"> <value>True </value> </param>
+ <param name="makeBed"> <value>True </value> </param>
+ <param name="enableMapQVFilter"> <value>True </value> </param>
+ </module>
+
+ <module id="P_ConsensusReports"/>
+ <!-- Generate reports for module P_GenomicConsensus. -->
+</smrtpipeSettings>
diff --git a/setup.py b/setup.py
index 28a734b..3e15261 100755
--- a/setup.py
+++ b/setup.py
@@ -20,16 +20,19 @@ setup(
'bin/gffToVcf',
'bin/gffToBed',
'bin/plurality',
- 'bin/quiver'],
+ 'bin/poa',
+ 'bin/quiver',
+ 'bin/arrow'],
packages = find_packages(),
package_data={'GenomicConsensus.quiver': ['resources/*/GenomicConsensus/*.ini']},
include_package_data=True,
zip_safe = False,
install_requires=[
- 'pbcore >= 1.2.2',
- 'pbcommand >= 0.2.0',
+ 'pbcore >= 1.2.9',
+ 'pbcommand >= 0.3.20',
'numpy >= 1.6.0',
'h5py >= 2.0.1',
- 'ConsensusCore >= 1.0.1',
+ 'ConsensusCore >= 1.0.1'
+ # , 'ConsensusCore2 >= 0.9',
]
)
diff --git a/tests/cram/arrow-all4mer.t b/tests/cram/arrow-all4mer.t
new file mode 100644
index 0000000..6680734
--- /dev/null
+++ b/tests/cram/arrow-all4mer.t
@@ -0,0 +1,38 @@
+Bite-sized arrow test using an All4Mers template!
+
+ $ export DATA=$TESTDIR/../data
+ $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam
+ $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa
+
+Run arrow.
+
+ $ arrow $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq
+
+No variants!
+
+ $ egrep -v '^#' v.gff | cat
+
+Perfect consensus, no no-calls.
+
+ $ cat css.fa
+ >All4mer.V2.01_Insert|arrow
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+
+ $ fold -60 css.fq
+ @All4mer.V2.01_Insert|arrow
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+ +
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~~~~~~~~~~~~~~~~~~~~
+
diff --git a/tests/cram/bad_input.t b/tests/cram/bad_input.t
index 58fdbe5..30d59e9 100644
--- a/tests/cram/bad_input.t
+++ b/tests/cram/bad_input.t
@@ -3,3 +3,9 @@ Test for sane behavior in the presence of bogus arguments.
$ variantCaller fake.alignmentset.xml -r fake.referenceset.xml -o test.fasta 2>&1 | tail -1
variantCaller: error: Input file */fake.alignmentset.xml not found. (glob)
+
+Test that it doesn't crash when one BAM file in an otherwise valid AlignmentSet is empty.
+
+ $ DATA=$TESTDIR/../data/sanity
+ $ REF="`python -c 'import pbcore.data ; print pbcore.data.getLambdaFasta()'`"
+ $ arrow --reference $REF -o contig.fasta $DATA/mixed.alignmentset.xml
diff --git a/tests/cram/best-all4mer.t b/tests/cram/best-all4mer.t
new file mode 100644
index 0000000..ef116c3
--- /dev/null
+++ b/tests/cram/best-all4mer.t
@@ -0,0 +1,23 @@
+Bite-sized "best"-algorithm test using an All4Mers template!
+
+ $ export DATA=$TESTDIR/../data
+ $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam
+ $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa
+
+Run variantCaller with the "best" algorithm selection (which resolves to quiver here).
+
+ $ variantCaller --algorithm=best $INPUT -r $REFERENCE -o v.gff -o css.fa
+
+No variants!
+
+ $ egrep -v '^#' v.gff | cat
+
+Perfect consensus, no no-calls
+
+ $ cat css.fa
+ >All4mer.V2.01_Insert|quiver
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
diff --git a/tests/cram/internal/alignment_summary_scaling.t b/tests/cram/internal/alignment_summary_scaling.t
new file mode 100644
index 0000000..e20e454
--- /dev/null
+++ b/tests/cram/internal/alignment_summary_scaling.t
@@ -0,0 +1,25 @@
+
+Test performance of summarizeConsensus with large numbers of variants.
+
+ $ TESTDATA="/pbi/dept/secondary/siv/testdata/pbreports-unittest/data/summarizeConsensus"
+
+First test has 48000 regions total but hundreds of small contigs, with more
+than 5 million variants. This should not take more than 5 minutes or so
+unless an O(N^2) loop is used.
+
+ $ VARIANTS=$TESTDATA/variants.gff
+ $ SUMMARY=$TESTDATA/alignment_summary.gff
+ $ GFFREF=$TESTDATA/alignment_summary_variants.gff
+ $ OUTPUT=alignment_summary_variants_test.gff
+ $ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY
+ $ diff -I "##source-commandline" $OUTPUT $GFFREF
+
+Second test has 20000 regions in a single contig, and 10000 variants. This
+will also take several minutes.
+
+ $ VARIANTS=$TESTDATA/variants_big_chr.gff
+ $ SUMMARY=$TESTDATA/alignment_summary_big_chr.gff
+ $ GFFREF=$TESTDATA/alignment_summary_variants_big_chr.gff
+ $ OUTPUT=alignment_summary_variants_big_chr_test.gff
+ $ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY
+ $ diff -I "##source-commandline" $OUTPUT $GFFREF
diff --git a/tests/cram/internal/arrow-staph.t b/tests/cram/internal/arrow-staph.t
new file mode 100644
index 0000000..99eb11c
--- /dev/null
+++ b/tests/cram/internal/arrow-staph.t
@@ -0,0 +1,32 @@
+Compare quiver vs. arrow on a high SNR Staph job.
+
+ $ export INPUT=/mnt/secondary/Share/Quiver/TestData/staphHighSnr/aligned_subreads.fofn
+ $ export REFERENCE=/mnt/secondary/Share/Quiver/TestData/staph/S_aureus_USA300_TCH1516.fasta
+ $ export MASK=/mnt/secondary/Share/Quiver/GenomeMasks/S_aureus_USA300_TCH1516-mask.gff
+
+ $ quiver -j${JOBS-8} $INPUT -r $REFERENCE -o quiver-variants.gff -o quiver-css.fasta
+ $ arrow -j${JOBS-8} $INPUT -r $REFERENCE -o arrow-variants.gff -o arrow-css.fasta
+
+Quiver does a good job here---no errors.
+
+ $ gffsubtract.pl quiver-variants.gff $MASK | grep -v "#" | sed 's/\t/ /g'
+
+ $ fastacomposition quiver-css.fasta
+ quiver-css.fasta A 960233 C 470725 G 470271 T 971458
+
+Arrow, on the other hand, makes a number of errors.
+
+ $ gffsubtract.pl arrow-variants.gff $MASK | grep -v "#" | sed 's/\t/ /g'
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 42450 42450 . . . reference=A;variantSeq=.;coverage=100;confidence=68
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 442861 442861 . . . reference=T;variantSeq=.;coverage=100;confidence=64
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 562107 562107 . . . reference=T;variantSeq=.;coverage=100;confidence=52
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 690814 690814 . . . reference=T;variantSeq=.;coverage=100;confidence=46
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1041877 1041877 . . . reference=A;variantSeq=.;coverage=100;confidence=45
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1801427 1801427 . . . reference=T;variantSeq=.;coverage=100;confidence=51
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1840635 1840635 . . . reference=A;variantSeq=.;coverage=100;confidence=88
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1850379 1850379 . . . reference=T;variantSeq=.;coverage=100;confidence=40
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2147449 2147449 . . . reference=T;variantSeq=.;coverage=100;confidence=93
+ Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2231043 2231043 . . . reference=A;variantSeq=.;coverage=100;confidence=80
+
+ $ fastacomposition arrow-css.fasta
+ arrow-css.fasta A 960033 C 470641 G 470190 T 971286 a 178 c 84 g 80 t 158
diff --git a/tests/cram/internal/quiver-compatibility.t b/tests/cram/internal/quiver-compatibility.t
index 8e9ac39..7cb847c 100644
--- a/tests/cram/internal/quiver-compatibility.t
+++ b/tests/cram/internal/quiver-compatibility.t
@@ -7,7 +7,6 @@ First make sure we abort once we recognize the tiny fluidigm file is CCS data.
$ export INPUT=$DATA/fluidigm_amplicons/040500.cmp.h5
$ export REFERENCE=$DATA/fluidigm_amplicons/Fluidigm_human_amplicons.fasta
$ quiver -r $REFERENCE -o variants.gff $INPUT 2>1
- Failure: The Quiver algorithm requires a cmp.h5 file containing standard (non-CCS) reads.
[255]
@@ -16,7 +15,7 @@ Tiny lambda file. Make sure it recognizes this cmp.h5 has an incomplete set of
$ export INPUT=/mnt/secondary/Share/Quiver/TestData/tinyLambda/aligned_reads_1.cmp.h5
$ export REFERENCE=/mnt/secondary/Share/Quiver/TestData/tinyLambda/lambdaNEB.fa
$ quiver -p C2.AllQVsModel -r $REFERENCE -o variants.gff $INPUT
- Failure: Selected Quiver parameter set is incompatible with this cmp.h5 file due to missing data tracks.
+ Failure: Selected Quiver parameter set is incompatible with this alignment file due to missing data tracks.
[255]
diff --git a/tests/cram/internal/quiver-ecoli.t b/tests/cram/internal/quiver-ecoli.t
index 91fcd71..3bebe1d 100644
--- a/tests/cram/internal/quiver-ecoli.t
+++ b/tests/cram/internal/quiver-ecoli.t
@@ -31,7 +31,7 @@ since I built the new reference.
##sequence-region ecoliK12_pbi_March2013 1 4642522
ecoliK12_pbi_March2013 . deletion 85 85 . . . reference=G;variantSeq=.;coverage=53;confidence=48
ecoliK12_pbi_March2013 . deletion 219 219 . . . reference=A;variantSeq=.;coverage=58;confidence=47
- ecoliK12_pbi_March2013 . insertion 1536 1536 . . . reference=.;variantSeq=C;coverage=91;confidence=50
+ ecoliK12_pbi_March2013 . insertion 1536 1536 . . . reference=.;variantSeq=C;coverage=91;confidence=47
No no-call windows.
diff --git a/tests/cram/internal/quiver-eichler-bac.t b/tests/cram/internal/quiver-eichler-bac.t
index 17a0a4d..eee21b5 100644
--- a/tests/cram/internal/quiver-eichler-bac.t
+++ b/tests/cram/internal/quiver-eichler-bac.t
@@ -20,7 +20,7 @@ MuMMer at the end are compared to the Sanger reference.
CH17-157L1 . deletion 805 805 . . . reference=T;variantSeq=.;coverage=100;confidence=47
CH17-157L1 . deletion 26174 26175 . . . reference=AC;variantSeq=.;coverage=100;confidence=48
CH17-157L1 . deletion 93356 93357 . . . reference=CG;variantSeq=.;coverage=100;confidence=49
- CH17-157L1 . insertion 230679 230679 . . . reference=.;variantSeq=A;coverage=100;confidence=49
+ CH17-157L1 . insertion 230679 230679 . . . reference=.;variantSeq=A;coverage=100;confidence=48
CH17-157L1 . insertion 230681 230681 . . . reference=.;variantSeq=CA;coverage=100;confidence=48
CH17-157L1 . insertion 230684 230684 . . . reference=.;variantSeq=C;coverage=100;confidence=48
diff --git a/tests/cram/internal/quiver-fluidigm-amplicons.t b/tests/cram/internal/quiver-fluidigm-amplicons.t
index 53c84d1..0ec1ea1 100644
--- a/tests/cram/internal/quiver-fluidigm-amplicons.t
+++ b/tests/cram/internal/quiver-fluidigm-amplicons.t
@@ -3,4 +3,4 @@
$ export REFERENCE=/mnt/secondary/Share/Quiver/TestData/fluidigmAmplicons/MET_EGFR_Full_Genes.fasta
$ quiver --noEvidenceConsensusCall=nocall -j${JOBS-8} $INPUT -r $REFERENCE \
> -o variants.gff -o css.fasta
- [WARNING] This .cmp.h5 file lacks some of the QV data tracks that are required for optimal performance of the Quiver algorithm. For optimal results use the ResequencingQVs workflow in SMRTPortal with bas.h5 files from an instrument using software version 1.3.1 or later, or the --forQuiver option to pbalign.
+ [WARNING] This alignment file file lacks some of the QV data tracks that are required for optimal performance of the Quiver algorithm. For optimal results use the ResequencingQVs workflow in SMRTPortal with bas.h5 files from an instrument using software version 1.3.1 or later, or the --forQuiver option to pbalign.
diff --git a/tests/cram/internal/quiver-mruber.t b/tests/cram/internal/quiver-mruber.t
index 8a14e6c..640ff7a 100644
--- a/tests/cram/internal/quiver-mruber.t
+++ b/tests/cram/internal/quiver-mruber.t
@@ -7,13 +7,13 @@ Inspect the variant calls.
$ grep -v "#" variants.gff | sed 's/\t/ /g'
M.ruber . substitution 357364 357364 . . . reference=C;variantSeq=T;coverage=100;confidence=47
- M.ruber . insertion 640716 640716 . . . reference=.;variantSeq=C;coverage=100;confidence=49
- M.ruber . insertion 1320669 1320669 . . . reference=.;variantSeq=C;coverage=100;confidence=49
+ M.ruber . insertion 640716 640716 . . . reference=.;variantSeq=C;coverage=100;confidence=48
+ M.ruber . insertion 1320669 1320669 . . . reference=.;variantSeq=C;coverage=100;confidence=48
M.ruber . deletion 1878514 1878953 . . . reference=AGGGCGTACTTCTTTTCGGGTGCAGATGCGTAGGCATCGTAGTTGAACAGGGTTTTGACCGCCATTGAGCACTCCTTTTACGGTTCCACAATGAGTTTGCTGATCATGTTGGCGTGGCCGATGCCGCAGTATTCGTTGCAGATGATGGGATACTCACCGGGTTTGCTGAAGGTGTAGCTGACCTTGGCAATTTCCCCCGGTATCACCTGTACGTTGATGTTGGTGTTGTGTACGTGGAAGCTGTGCTGCACATCGGGTGAGGTGATATAGAAGGTTACCTTCCTGCCCACCTTGAACCGCATCTCCGCTGGCAGGTAGCCAAAGGCAAAGGCCTGCACATAGGCCACGTACTCGTTGCCGACCTGCTCAACCCGTGGGTTGGCAAAGTCTCCCTCGGTGCGCACCTTGGTGGCGTCGATGCGGCCTGCCCCCACCGGGT [...]
- M.ruber . insertion 1987969 1987969 . . . reference=.;variantSeq=G;coverage=100;confidence=49
- M.ruber . insertion 2010700 2010700 . . . reference=.;variantSeq=T;coverage=100;confidence=48
- M.ruber . insertion 2070035 2070035 . . . reference=.;variantSeq=A;coverage=100;confidence=49
- M.ruber . insertion 2827713 2827713 . . . reference=.;variantSeq=T;coverage=100;confidence=49
+ M.ruber . insertion 1987969 1987969 . . . reference=.;variantSeq=G;coverage=100;confidence=48
+ M.ruber . insertion 2010700 2010700 . . . reference=.;variantSeq=T;coverage=100;confidence=47
+ M.ruber . insertion 2070035 2070035 . . . reference=.;variantSeq=A;coverage=100;confidence=47
+ M.ruber . insertion 2827713 2827713 . . . reference=.;variantSeq=T;coverage=100;confidence=48
M.ruber . deletion 2841287 2841301 . . . reference=AAGCACGCCGAGGGA;variantSeq=.;coverage=100;confidence=49
diff --git a/tests/cram/internal/quiver-tinyLambda-coverage-islands.t b/tests/cram/internal/quiver-tinyLambda-coverage-islands.t
index 97c34a0..c7a8cd7 100644
--- a/tests/cram/internal/quiver-tinyLambda-coverage-islands.t
+++ b/tests/cram/internal/quiver-tinyLambda-coverage-islands.t
@@ -15,9 +15,7 @@ These variant calls actually look reasonable given the reads, but the
confidences are too high. Fix this.
$ grep -v '#' variants.gff
- lambda_NEB3011\t.\tinsertion\t24781\t24781\t.\t.\t.\treference=.;variantSeq=T;coverage=6;confidence=57 (esc)
lambda_NEB3011\t.\tdeletion\t24878\t24878\t.\t.\t.\treference=A;variantSeq=.;coverage=16;confidence=43 (esc)
- lambda_NEB3011\t.\tinsertion\t30882\t30882\t.\t.\t.\treference=.;variantSeq=C;coverage=5;confidence=44 (esc)
$ fastacomposition css.fa
css.fa A 282 C 266 G 305 N 47361 T 281
diff --git a/tests/cram/plurality-all4mer.t b/tests/cram/plurality-all4mer.t
new file mode 100644
index 0000000..b5953de
--- /dev/null
+++ b/tests/cram/plurality-all4mer.t
@@ -0,0 +1,37 @@
+Bite-sized plurality test using an All4Mers template!
+
+ $ export DATA=$TESTDIR/../data
+ $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam
+ $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa
+
+Run plurality.
+
+ $ plurality $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq
+
+No variants!
+
+ $ egrep -v '^#' v.gff | cat
+
+Perfect consensus, no no-calls.
+
+ $ cat css.fa
+ >All4mer.V2.01_Insert|plurality
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+
+ $ fold -60 css.fq
+ @All4mer.V2.01_Insert|plurality
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+ +
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+ IIIIIIIIIIIIIIIIIIII
diff --git a/tests/cram/plurality-hcv.t b/tests/cram/plurality-hcv.t
deleted file mode 100644
index afc0f3e..0000000
--- a/tests/cram/plurality-hcv.t
+++ /dev/null
@@ -1,55 +0,0 @@
-
-Run plurality on the small example file, and make sure the GFF and
-CSV output is correct.
-
-
- $ export DATA=$TESTDIR/../data
- $ export INPUT=$DATA/hcv/aligned_reads.cmp.h5
- $ export REFERENCE=$DATA/hcv/HCV_Ref_For_187140.fasta
- $ variantCaller --algorithm=plurality -q 10 -r $REFERENCE -o variants.gff -o consensus.fq $INPUT
-
-I like to show the head of the output files inline here so that glaringly obvious changes will
-pop right out, but I verify that the files are exactly correct by looking at the md5 sums.
-
-First, the variants.gff:
-
- $ cat variants.gff
- ##gff-version 3
- ##pacbio-variant-version 2.1
- ##date * (glob)
- ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.12
- ##source GenomicConsensus * (glob)
- ##source-commandline * (glob)
- ##source-alignment-file * (glob)
- ##source-reference-file * (glob)
- ##sequence-region 5primeEnd 1 156
- ##sequence-region 3primeEnd 1 386
-
-
-Examine consensus output. This is identical to the reference
-
- $ fold -60 consensus.fq
- @5primeEnd|plurality
- GGAACCGGTGAGTACACCGGAATTGCCAGGACGACCGGGTCCTTTCGTGGATAAACCCGC
- TCAATGCCTGGAGATTTGGGCGTGCCCCCGCAAGACTGCTAGCCGAGTAGTGTTGGGTCG
- CGAAAGGCCTTGTGGTACTGCCTGATAGGGTGCTTG
- +
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- @3primeEnd|plurality
- TACCTGGTCATAGCCTCCGTGAAGGCTCTCAGGCTCGCTGCATCCTCCGGGACTCCCTGA
- CTTTCACAGATAACGACTAAGTCGTCGCCACACACGAGCATGGTGCAGTCCTGGAGCCCA
- GCGGCTCGACAGGCTGCTTTGGCCTTGATGTAGCAGGTGAGGGTGTTACCACAGCTGGTC
- GTCAGTACGCCGCTCGCGCGGCACCTGCGATAGCCGCAGTTTTCCCCCCTTGAATTAGTA
- AGAGGGCCCCCGACATAGAGCCTCTCGGTGAGGGACTTGATGGCCACGCGGGCTTGGGGG
- TCCAGGTCACAACATTGGTAAATTGCCTCCTCTGTACGGATATCGCTCTCAGTGACTGTG
- GAGTCAAAGCAGCGGGTATCATACGA
- +
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
- IIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/tests/cram/plurality-pbcore-lambda.t b/tests/cram/plurality-pbcore-lambda.t
deleted file mode 100644
index b0e4593..0000000
--- a/tests/cram/plurality-pbcore-lambda.t
+++ /dev/null
@@ -1,17 +0,0 @@
-
-Run plurality on the lambda file in pbcore.
-
- $ export CMPH5=`python -c "from pbcore import data as D; print D.getBamAndCmpH5()[1]"`
-# $ export BAM=`python -c "from pbcore import data as D; print D.getBamAndCmpH5()[0]"`
- $ export REF=`python -c "from pbcore import data as D; print D.getLambdaFasta()"`
-
- $ plurality $CMPH5 -r $REF -o css-cmph5.fa -o v-cmph5.gff
- $ cat v-cmph5.gff | grep -v "#" | sed 's/ / /g'
- lambda_NEB3011 . deletion 4945 4945 . . . reference=C;variantSeq=.;frequency=4;coverage=7;confidence=40
-
-# $ plurality $BAM -r $REF -o css-bam.fa -o v-bam.gff
-# [WARNING] 'fancyChunking' not yet available for BAM, disabling
-# $ cat v-bam.gff | grep -v "#" | sed 's/ / /g'
-# lambda_NEB3011 . deletion 4945 4945 . . . reference=C;variantSeq=.;frequency=4;coverage=7;confidence=40
-
-# $ diff css-cmph5.fa css-bam.fa
diff --git a/tests/cram/poa-all4mer.t b/tests/cram/poa-all4mer.t
new file mode 100644
index 0000000..8eb55ff
--- /dev/null
+++ b/tests/cram/poa-all4mer.t
@@ -0,0 +1,37 @@
+Bite-sized poa test using an All4Mers template!
+
+ $ export DATA=$TESTDIR/../data
+ $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam
+ $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa
+
+Run poa.
+
+ $ poa $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq
+
+No variants!
+
+ $ egrep -v '^#' v.gff | cat
+
+Perfect consensus, no no-calls.
+
+ $ cat css.fa
+ >All4mer.V2.01_Insert|poa
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+
+ $ fold -60 css.fq
+ @All4mer.V2.01_Insert|poa
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+ +
+ 555555555555555555555555555555555555555555555555555555555555
+ 555555555555555555555555555555555555555555555555555555555555
+ 555555555555555555555555555555555555555555555555555555555555
+ 555555555555555555555555555555555555555555555555555555555555
+ 55555555555555555555
diff --git a/tests/cram/quiver-all4mer.t b/tests/cram/quiver-all4mer.t
new file mode 100644
index 0000000..d33ae79
--- /dev/null
+++ b/tests/cram/quiver-all4mer.t
@@ -0,0 +1,37 @@
+Bite-sized quiver test using an All4Mers template!
+
+ $ export DATA=$TESTDIR/../data
+ $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam
+ $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa
+
+Run quiver.
+
+ $ quiver $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq
+
+No variants!
+
+ $ egrep -v '^#' v.gff | cat
+
+Perfect consensus, no no-calls.
+
+ $ cat css.fa
+ >All4mer.V2.01_Insert|quiver
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+
+ $ fold -60 css.fq
+ @All4mer.V2.01_Insert|quiver
+ CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTT
+ TCCTGTCCACCTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACA
+ AAATGGAAGCCGGGGCCTTAAACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAAC
+ TCTCCCAGTCAATATACCAACACATCGTGGGACGGATTGCAGAGCGAATCTATCCGCGCT
+ CGCATAATTTAGTGTTGATC
+ +
+ "PPQQPQQPRPQRSRQRRRRPQQQRSPRRRQQRPRQQPRRSRPSQQRQQQTQRRPQPQQS
+ SRQQRRQTRQRRSRRQQRRTSRRRRRQQSQQQPRTRRQQRQQRSRRRSPSQSSRQQQPRR
+ SRRRRSRSSRSRRTRRSSRPQTQQRQQQRRRSQUQQQRQQQURTSQRQQPRTQSRQQQQR
+ RQSRTRPPRQQRPRQRRTRRQSRQQRQQPSQQQPRRRTRQQPRQQRSRRRSQQQQRQRQQ
+ RPRQQRRSQRQRQQSSSQQQ
diff --git a/tests/cram/quiver-hcv.t b/tests/cram/quiver-hcv.t
deleted file mode 100644
index 7232429..0000000
--- a/tests/cram/quiver-hcv.t
+++ /dev/null
@@ -1,66 +0,0 @@
-
-Bite-sized Quiver test using the HCV dataset
-
- $ export DATA=$TESTDIR/../data
- $ export INPUT=$DATA/hcv/aligned_reads.cmp.h5
- $ export REFERENCE=$DATA/hcv/HCV_Ref_For_187140.fasta
-
-Quiver actually makes one error here, which is kind of disappointing,
-but this data is from a really ancient instrument-software version, so
-I'm not all that surprised.
-
- $ quiver -p unknown -x0 -q0 $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq
-
- $ cat v.gff
- ##gff-version 3
- ##pacbio-variant-version 2.1
- ##date * (glob)
- ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.12
- ##source GenomicConsensus * (glob)
- ##source-commandline * (glob)
- ##source-alignment-file * (glob)
- ##source-reference-file * (glob)
- ##sequence-region 5primeEnd 1 156
- ##sequence-region 3primeEnd 1 386
- 3primeEnd\t.\tdeletion\t296\t296\t.\t.\t.\treference=G;variantSeq=.;coverage=92;confidence=4 (esc)
-
-
- $ cat css.fa
- >5primeEnd|quiver
- GGAACCGGTGAGTACACCGGAATTGCCAGGACGACCGGGTCCTTTCGTGGATAAACCCGC
- TCAATGCCTGGAGATTTGGGCGTGCCCCCGCAAGACTGCTAGCCGAGTAGTGTTGGGTCG
- CGAAAGGCCTTGTGGTACTGCCTGATAGGGTGCTTG
- >3primeEnd|quiver
- TACCTGGTCATAGCCTCCGTGAAGGCTCTCAGGCTCGCTGCATCCTCCGGGACTCCCTGA
- CTTTCACAGATAACGACTAAGTCGTCGCCACACACGAGCATGGTGCAGTCCTGGAGCCCA
- GCGGCTCGACAGGCTGCTTTGGCCTTGATGTAGCAGGTGAGGGTGTTACCACAGCTGGTC
- GTCAGTACGCCGCTCGCGCGGCACCTGCGATAGCCGCAGTTTTCCCCCCTTGAATTAGTA
- AGAGGGCCCCCGACATAGAGCCTCTCGGTGAGGGACTTGATGGCCACGCGGGCTTGGGGT
- CCAGGTCACAACATTGGTAAATTGCCTCCTCTGTACGGATATCGCTCTCAGTGACTGTGG
- AGTCAAAGCAGCGGGTATCATACGA
-
- $ fold -60 css.fq
- @5primeEnd|quiver
- GGAACCGGTGAGTACACCGGAATTGCCAGGACGACCGGGTCCTTTCGTGGATAAACCCGC
- TCAATGCCTGGAGATTTGGGCGTGCCCCCGCAAGACTGCTAGCCGAGTAGTGTTGGGTCG
- CGAAAGGCCTTGTGGTACTGCCTGATAGGGTGCTTG
- +
- "RPRQSQRPQQPQPQPQQOQQQPQQQQQQRPRQQPPQRQPQRRRRQPPQRQPPQQPPQQQ
- QQPRPQPRQPQPQQQRRPRQQQPQPRQQRRQPRPPPQQQPQRQQQPQPPQQQQQPQQQRQ
- PQQRQPRPRPRQQQRQQQQQPRQQQPQQRRQQQRRQ
- @3primeEnd|quiver
- TACCTGGTCATAGCCTCCGTGAAGGCTCTCAGGCTCGCTGCATCCTCCGGGACTCCCTGA
- CTTTCACAGATAACGACTAAGTCGTCGCCACACACGAGCATGGTGCAGTCCTGGAGCCCA
- GCGGCTCGACAGGCTGCTTTGGCCTTGATGTAGCAGGTGAGGGTGTTACCACAGCTGGTC
- GTCAGTACGCCGCTCGCGCGGCACCTGCGATAGCCGCAGTTTTCCCCCCTTGAATTAGTA
- AGAGGGCCCCCGACATAGAGCCTCTCGGTGAGGGACTTGATGGCCACGCGGGCTTGGGGT
- CCAGGTCACAACATTGGTAAATTGCCTCCTCTGTACGGATATCGCTCTCAGTGACTGTGG
- AGTCAAAGCAGCGGGTATCATACGA
- +
- "QQRQQQPQPQPQPRQQRPQPPRPQQQQRQPQRQQQQQPQQQQQRQRQRQQPQQPRRPQP
- QPQQQQQPQQQPSQQQPQQQPRQQPQQQRQPQQQQQQPQPQQQQQQRQPPRPRRQRRRQQ
- QQPRPQQRRQQRRQQQQPQRQRPQPRQQPQQPPQRQRQQRPRQPPQQQQQQPPQRQQQQQ
- QPQQQQRPQRRQPQQQPQQPPPQPQPQQQPQQQQQPQQQQQQR5QPQQQRQPPQPRPQQQ
- QRQRSRJQQSPQQPPPPQPQPQQQPQQRQQPRQRQQQQQQRPQQQQPQQQRQPRQ%RQQQ
- PQPQRQQQRPQQQQRPQRQRQPQQQQQPRPQQQQRPPQQQPQQQQOQQQPQQQPQPQRRR
- PPQPPPRPQPRQQQPPQQPQPRQQQ
diff --git a/tests/cram/version.t b/tests/cram/version.t
index 7233c86..7df81c8 100644
--- a/tests/cram/version.t
+++ b/tests/cram/version.t
@@ -2,7 +2,7 @@ This actually failed once because of a missing import, so we might as
well test it.
$ variantCaller --version
- 1.1.0
+ 2.0.0
This will break if the parser setup is messed up.
diff --git a/tests/data/all4mer/All4mer.V2.01_Insert.fa b/tests/data/all4mer/All4mer.V2.01_Insert.fa
new file mode 100644
index 0000000..2f3139d
--- /dev/null
+++ b/tests/data/all4mer/All4mer.V2.01_Insert.fa
@@ -0,0 +1,5 @@
+>All4mer.V2.01_Insert
+CATCAGGTAAGAAAGTACGATGCTACAGCTTGTGACTGGTGCGGCACTTTTGGCTGAGTTTCCTGTCCAC
+CTCATGTATTCTGCCCTAACGTCGGTCTTCACGCCATTACTAGACCGACAAAATGGAAGCCGGGGCCTTA
+AACCCCGTTCGAGGCGTAGCAAGGAGATAGGGTTATGAACTCTCCCAGTCAATATACCAACACATCGTGG
+GACGGATTGCAGAGCGAATCTATCCGCGCTCGCATAATTTAGTGTTGATC
diff --git a/tests/data/all4mer/All4mer.V2.01_Insert.fa.fai b/tests/data/all4mer/All4mer.V2.01_Insert.fa.fai
new file mode 100644
index 0000000..454afd7
--- /dev/null
+++ b/tests/data/all4mer/All4mer.V2.01_Insert.fa.fai
@@ -0,0 +1 @@
+All4mer.V2.01_Insert 260 22 70 71
diff --git a/tests/data/all4mer/README b/tests/data/all4mer/README
new file mode 100644
index 0000000..7f43825
--- /dev/null
+++ b/tests/data/all4mer/README
@@ -0,0 +1,5 @@
+How I made this lil' file (next time write a Makefile!)
+
+ $ baxSieve --whitelist hole-numbers.txt /mnt/secondary/Share/VariantCalling/CCS/P6-C4.EnzymologyTemplates/2770691/0001/Analysis_Results/m141008_060349_42194_c100704972550000001823137703241586_s1_p0.1.bax.h5 out.bas.h5
+ $ bax2bam out.bas.h5 -o out
+ $ pbalign out.subreads.bam All4mer.V2.01_Insert.fa out.aligned_subreads.bam
diff --git a/tests/data/all4mer/hole-numbers.txt b/tests/data/all4mer/hole-numbers.txt
new file mode 100644
index 0000000..cb0429a
--- /dev/null
+++ b/tests/data/all4mer/hole-numbers.txt
@@ -0,0 +1,14 @@
+14
+28
+29
+32
+47
+50
+56
+59
+60
+63
+72
+88
+90
+92
diff --git a/tests/data/all4mer/out.aligned_subreads.bam b/tests/data/all4mer/out.aligned_subreads.bam
new file mode 100644
index 0000000..e52680a
Binary files /dev/null and b/tests/data/all4mer/out.aligned_subreads.bam differ
diff --git a/tests/data/all4mer/out.aligned_subreads.bam.pbi b/tests/data/all4mer/out.aligned_subreads.bam.pbi
new file mode 100644
index 0000000..d961493
Binary files /dev/null and b/tests/data/all4mer/out.aligned_subreads.bam.pbi differ
diff --git a/tests/data/sanity/empty.subreads.bam b/tests/data/sanity/empty.subreads.bam
new file mode 100644
index 0000000..18f0531
Binary files /dev/null and b/tests/data/sanity/empty.subreads.bam differ
diff --git a/tests/data/sanity/empty.subreads.bam.pbi b/tests/data/sanity/empty.subreads.bam.pbi
new file mode 100644
index 0000000..e398d79
Binary files /dev/null and b/tests/data/sanity/empty.subreads.bam.pbi differ
diff --git a/tests/data/sanity/mapped.subreads.bam b/tests/data/sanity/mapped.subreads.bam
new file mode 100644
index 0000000..952f90c
Binary files /dev/null and b/tests/data/sanity/mapped.subreads.bam differ
diff --git a/tests/data/sanity/mapped.subreads.bam.pbi b/tests/data/sanity/mapped.subreads.bam.pbi
new file mode 100644
index 0000000..9487243
Binary files /dev/null and b/tests/data/sanity/mapped.subreads.bam.pbi differ
diff --git a/tests/data/sanity/mixed.alignmentset.xml b/tests/data/sanity/mixed.alignmentset.xml
new file mode 100644
index 0000000..d14f1bd
--- /dev/null
+++ b/tests/data/sanity/mixed.alignmentset.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<AlignmentSet CreatedAt="2016-01-22T09:46:49" MetaType="PacBio.DataSet.AlignmentSet" Name="pacbio_dataset_alignmentset-160122_174649911" Tags="" TimeStampedName="pacbio_dataset_alignmentset-160122_174649911" UniqueId="fa6fa3e1-288c-67ca-aea6-80807f4a18f2" Version="3.0.1" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosci [...]
+ <pbbase:ExternalResources>
+ <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="empty.subreads.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-160122_174649910" UniqueId="419c15b3-f8d6-4f8c-8fd4-9fee9062f178">
+ <pbbase:FileIndices>
+ <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="empty.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-160122_174649910" UniqueId="6baa3899-8214-49c0-bb5a-cd583a4a7038"/>
+ </pbbase:FileIndices>
+ </pbbase:ExternalResource>
+ <pbbase:ExternalResource MetaType="PacBio.AlignmentFile.AlignmentBamFile" ResourceId="mapped.subreads.bam" TimeStampedName="pacbio_alignmentfile_alignmentbamfile-160122_174649910" UniqueId="6407de5f-ace6-4fb5-92f6-edfa1beaf29d">
+ <pbbase:FileIndices>
+ <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="mapped.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-160122_174649910" UniqueId="393c8765-532d-4be2-8d33-3a255b4b66a7"/>
+ </pbbase:FileIndices>
+ </pbbase:ExternalResource>
+ </pbbase:ExternalResources>
+ <DataSetMetadata>
+ <TotalLength>1506</TotalLength>
+ <NumRecords>1</NumRecords>
+ </DataSetMetadata>
+</AlignmentSet>
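A quick sanity check for a mixed dataset like this one is to open it with pbcore (already a dependency here); a sketch, assuming pbcore's dataset API and that the BAM/.pbi files sit next to the XML, since the ResourceIds are relative paths:

    from pbcore.io import openDataSet

    ds = openDataSet("tests/data/sanity/mixed.alignmentset.xml")
    # Should echo the DataSetMetadata above: 1 record, 1506 bp.
    print("%d records, %d bp" % (ds.numRecords, ds.totalLength))
    for res in ds.externalResources:   # the empty and the mapped BAM
        print(res.resourceId)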
diff --git a/tests/unit/AlignmentHitStubs.py b/tests/unit/AlignmentHitStubs.py
index 4983c9b..9d551b0 100644
--- a/tests/unit/AlignmentHitStubs.py
+++ b/tests/unit/AlignmentHitStubs.py
@@ -20,9 +20,9 @@ def ungappedPulseArray(a):
else:
raise Exception, "Invalid pulse array type"
-def _makePulseFeatureAccessor(featureName):
+def _makeBaseFeatureAccessor(featureName):
def f(self, aligned=True, orientation="native"):
- return self.pulseFeature(featureName, aligned=aligned, orientation=orientation)
+ return self.baseFeature(featureName, aligned=aligned, orientation=orientation)
return f
class AlignmentHitStub(object):
@@ -40,9 +40,9 @@ class AlignmentHitStub(object):
self._reference = np.fromstring(nativeReference, dtype="S1")
self._read = np.fromstring(read, dtype="S1")
- self._pulseFeatures = {}
+ self._baseFeatures = {}
for featureName, feature in kwargs.iteritems():
- self._pulseFeatures[featureName] = feature
+ self._baseFeatures[featureName] = feature
def read(self, aligned=True, orientation="native"):
val = self._read
@@ -73,12 +73,12 @@ class AlignmentHitStub(object):
def spansReferencePosition(self, refPos):
return self.referenceStart <= refPos < self.referenceEnd
- def pulseFeature(self, featureName, aligned=True, orientation="native"):
- data = self._pulseFeatures[featureName]
+ def baseFeature(self, featureName, aligned=True, orientation="native"):
+ data = self._baseFeatures[featureName]
if orientation == "genomic" and self.reverseStrand:
data = data[::-1]
if not aligned:
- data = ungappedPulseArray(data)
+ data = ungappedBaseArray(data)
return data
def clippedTo(self, refStart, refEnd):
@@ -86,14 +86,14 @@ class AlignmentHitStub(object):
assert refStart <= self.referenceStart <= self.referenceEnd <= refEnd
return self
- IPD = _makePulseFeatureAccessor("IPD")
- PulseWidth = _makePulseFeatureAccessor("PulseWidth")
- QualityValue = _makePulseFeatureAccessor("QualityValue")
- InsertionQV = _makePulseFeatureAccessor("InsertionQV")
- DeletionQV = _makePulseFeatureAccessor("DeletionQV")
- DeletionTag = _makePulseFeatureAccessor("DeletionTag")
- MergeQV = _makePulseFeatureAccessor("MergeQV")
- SubstitutionQV = _makePulseFeatureAccessor("SubstitutionQV")
+ IPD = _makeBaseFeatureAccessor("IPD")
+ PulseWidth = _makeBaseFeatureAccessor("PulseWidth")
+ QualityValue = _makeBaseFeatureAccessor("QualityValue")
+ InsertionQV = _makeBaseFeatureAccessor("InsertionQV")
+ DeletionQV = _makeBaseFeatureAccessor("DeletionQV")
+ DeletionTag = _makeBaseFeatureAccessor("DeletionTag")
+ MergeQV = _makeBaseFeatureAccessor("MergeQV")
+ SubstitutionQV = _makeBaseFeatureAccessor("SubstitutionQV")
def __repr__(self):
return "Stub 0x%x" % id(self)
@@ -107,7 +107,7 @@ FORWARD, REVERSE = False, True
def _(s):
"""
Decode an ASCII-art representation of
- a pulse feature.
+ a base feature.
Spec: string -> np.array(dtype=float32)
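The pulse-to-base rename above runs through _makeBaseFeatureAccessor, the stub's accessor-factory idiom: a closure manufactures one method per feature name, so the class body stays a flat table of features. A self-contained illustration of the same pattern (hypothetical Stub class, not the test code itself):

    import numpy as np

    def _makeBaseFeatureAccessor(featureName):
        # Bind featureName into a method that defers to baseFeature().
        def f(self, aligned=True, orientation="native"):
            return self.baseFeature(featureName, aligned=aligned,
                                    orientation=orientation)
        return f

    class Stub(object):
        def __init__(self, **features):
            self._baseFeatures = dict(features)
        def baseFeature(self, name, aligned=True, orientation="native"):
            return self._baseFeatures[name]
        IPD = _makeBaseFeatureAccessor("IPD")
        DeletionQV = _makeBaseFeatureAccessor("DeletionQV")

    s = Stub(IPD=np.arange(3, dtype=np.float32), DeletionQV=np.zeros(3))
    print(s.IPD())   # the array registered under "IPD"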
diff --git a/tests/unit/test_algorithm_selection.py b/tests/unit/test_algorithm_selection.py
new file mode 100644
index 0000000..65e8e89
--- /dev/null
+++ b/tests/unit/test_algorithm_selection.py
@@ -0,0 +1,14 @@
+
+from nose.tools import assert_equal as EQ
+from GenomicConsensus.algorithmSelection import bestAlgorithm_
+
+
+def test_algorithm_selection():
+ EQ("quiver", bestAlgorithm_(["P6-C4"]))
+ EQ("quiver", bestAlgorithm_(["P6-C4", "P5-C3"]))
+ EQ("arrow", bestAlgorithm_(["S/P1-C1/beta"]))
+ EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1/beta"]))
+ EQ(None, bestAlgorithm_(["P6-C4", "unknown"]))
+ EQ("arrow", bestAlgorithm_(["S/P1-C1"]))
+ EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1"]))
+ EQ("arrow", bestAlgorithm_(["P5-C3", "S/P1-C1"])) # (Arrow pres. no training for P5. But it will tell us that)
diff --git a/tests/unit/test_quiver.py b/tests/unit/test_quiver.py
index 9cb3bbf..3925780 100644
--- a/tests/unit/test_quiver.py
+++ b/tests/unit/test_quiver.py
@@ -22,6 +22,25 @@ class TestVariantsFromAlignment(object):
vs = utils.variantsFromAlignment(a, (1, 1000, 2000))
assert_equal([ Variant(1, 1002, 1004, "TT", "GG") ], vs)
+ def testVariantsFromAlignment4(self):
+ a = PairwiseAlignment("GA-TACA", "GATTACA")
+ qvs = [0, 0, 1, 0, 0, 0, 0]
+ vs = utils.variantsFromAlignment(a, (1, 1000, 2000), qvs)
+ assert_equal([ Variant(1, 1002, 1002, "", "T", confidence=1) ], vs)
+
+ def testVariantsFromAlignment5(self):
+ a = PairwiseAlignment("-ATTACA", "GATTACA")
+ qvs = [1, 0, 0, 0, 0, 0, 0]
+ vs = utils.variantsFromAlignment(a, (1, 1000, 2000), qvs)
+ assert_equal([ Variant(1, 1000, 1000, "", "G", confidence=1) ], vs)
+
+ def testVariantsFromAlignment6(self):
+ a = PairwiseAlignment("GATTAC-", "GATTACA")
+ qvs = [0, 0, 0, 0, 0, 0, 1]
+ vs = utils.variantsFromAlignment(a, (1, 1000, 2000), qvs)
+ assert_equal([ Variant(1, 1006, 1006, "", "A", confidence=1) ], vs)
+
+
def testNoCallBasesInReference1(self):
a = PairwiseAlignment("GATTNGATT", "GAGGATATT")
vs = utils.variantsFromAlignment(a, (1, 1000, 2000))
@@ -43,3 +62,32 @@ class TestVariantsFromAlignment(object):
vs = utils.variantsFromAlignment(a, (1, 1000, 2000))
assert_equal([ Variant(1, 1002, 1003, "T", "G"),
Variant(1, 1005, 1006, "C", "G") ], vs)
+
+
+ # Deletions expose that the way we calculate confidences for
+ # deletion variants is *broken*. In the current setup, we have no
+ # choice but to pull a confidence out of thin air, or pull it from
+ # a nearby base---both incorrect! The *right* way to do it is to
+ # properly compare the two hypotheses, which is possible since we
+ # retain the mss/ai.
+
+ # ... here, for now, we are primarily just testing that we don't
+ # crash.
+
+ def testDeletionConfidence1(self):
+ a = PairwiseAlignment("GATTACA", "GATTAC-")
+ qvs = [0, 0, 0, 0, 0, 1]
+ vs = utils.variantsFromAlignment(a, (1, 1000, 2000), qvs)
+ assert_equal([ Variant(1, 1006, 1007, "A", "", confidence=1) ], vs)
+
+ def testDeletionConfidence2(self):
+ a = PairwiseAlignment("GATTACA", "-ATTACA")
+ qvs = [1, 0, 0, 0, 0, 0]
+ vs = utils.variantsFromAlignment(a, (1, 1000, 2000), qvs)
+ assert_equal([ Variant(1, 1000, 1001, "G", "", confidence=1) ], vs)
+
+ def testDeletionConfidence3(self):
+ a = PairwiseAlignment("AT", "--")
+ qvs = []
+ vs = utils.variantsFromAlignment(a, (1, 1000, 1002), qvs)
+ assert_equal([ Variant(1, 1000, 1002, "AT", "") ], vs)
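The stop-gap those comments describe (borrowing the confidence of a neighboring base when the deleted position has no QV of its own) looks roughly like this hypothetical helper, which is not the code under test:

    def deletionConfidence(qvs, readPos):
        # No read bases left to borrow from (whole-window deletion).
        if not qvs:
            return None
        # Clamp to the nearest surviving read position.
        return qvs[min(readPos, len(qvs) - 1)]

    print(deletionConfidence([0, 0, 0, 0, 0, 1], 6))  # 1, cf. testDeletionConfidence1
    print(deletionConfidence([], 0))                  # None, cf. testDeletionConfidence3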
diff --git a/tests/unit/test_summarize_consensus.py b/tests/unit/test_summarize_consensus.py
new file mode 100644
index 0000000..b1d2718
--- /dev/null
+++ b/tests/unit/test_summarize_consensus.py
@@ -0,0 +1,78 @@
+
+"""
+Test summarizeConsensus with synthetic inputs, which should include all
+boundary conditions.
+"""
+
+import subprocess
+import tempfile
+import unittest
+import os
+
+
+VARIANTS = """\
+chr1\t.\tsubstitution\t100\t100\t.\t.\t.\treference=C;variantSeq=A;coverage=38;confidence=48
+chr1\t.\tsubstitution\t5001\t5001\t.\t.\t.\treference=C;variantSeq=A;coverage=38;confidence=48
+chr1\t.\tsubstitution\t10000\t10000\t.\t.\t.\treference=A;variantSeq=G;coverage=38;confidence=48
+chr1\t.\tinsertion\t12000\t12000\t.\t.\t.\treference=.;variantSeq=G;coverage=38;confidence=48
+chr1\t.\tdeletion\t15001\t15001\t.\t.\t.\treference=T;variantSeq=.;coverage=38;confidence=49
+chr2\t.\tdeletion\t20000\t20000\t.\t.\t.\treference=T;variantSeq=.;coverage=38;confidence=49
+chr2\t.\tsubstitution\t23469\t23469\t.\t.\t.\treference=T;variantSeq=A;coverage=38;confidence=47
+chr3\t.\tinsertion\t0\t0\t.\t.\t.\treference=.;variantSeq=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA;coverage=23;confidence=93
+chr3\t.\tsubstitution\t3469\t3469\t.\t.\t.\treference=C;variantSeq=A;coverage=32;confidence=40"""
+
+SUMMARY = """\
+chr1\t.\tregion\t1\t5000\t0.00\t+\t.\tcov=4,23,28;cov2=20.162,5.851;gaps=0,0
+chr1\t.\tregion\t5001\t10000\t0.00\t+\t.\tcov=18,24,29;cov2=24.009,2.316;gaps=0,0
+chr1\t.\tregion\t10001\t15000\t0.00\t+\t.\tcov=17,25,29;cov2=24.182,2.596;gaps=0,0
+chr1\t.\tregion\t15001\t20000\t0.00\t+\t.\tcov=20,34,49;cov2=33.714,7.143;gaps=0,0
+chr2\t.\tregion\t1\t5000\t0.00\t+\t.\tcov=18,49,80;cov2=49.018,10.125;gaps=0,0
+chr2\t.\tregion\t10000\t23469\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1,24
+chr3\t.\tregion\t1\t7000\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1,24"""
+
+EXPECTED = """\
+##source GenomicConsensus 2.0.0
+##pacbio-alignment-summary-version 0.6
+##source-commandline this line will be skipped in the comparison
+chr1\t.\tregion\t1\t5000\t0.00\t+\t.\tcov=4,23,28;cov2=20.162,5.851;gaps=0,0;cQv=20,20,20;del=0;ins=0;sub=1
+chr1\t.\tregion\t5001\t10000\t0.00\t+\t.\tcov=18,24,29;cov2=24.009,2.316;gaps=0,0;cQv=20,20,20;del=0;ins=0;sub=2
+chr1\t.\tregion\t10001\t15000\t0.00\t+\t.\tcov=17,25,29;cov2=24.182,2.596;gaps=0,0;cQv=20,20,20;del=0;ins=1;sub=0
+chr1\t.\tregion\t15001\t20000\t0.00\t+\t.\tcov=20,34,49;cov2=33.714,7.143;gaps=0,0;cQv=20,20,20;del=1;ins=0;sub=0
+chr2\t.\tregion\t1\t5000\t0.00\t+\t.\tcov=18,49,80;cov2=49.018,10.125;gaps=0,0;cQv=20,20,20;del=0;ins=0;sub=0
+chr2\t.\tregion\t10000\t23469\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1,24;cQv=20,20,20;del=1;ins=0;sub=1
+chr3\t.\tregion\t1\t7000\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1,24;cQv=20,20,20;del=0;ins=33;sub=1"""
+
+class TestSummarizeConsensus(unittest.TestCase):
+
+ def setUp(self):
+ self.variants_gff = tempfile.NamedTemporaryFile(suffix=".gff").name
+ self.summary_gff = tempfile.NamedTemporaryFile(suffix=".gff").name
+ with open(self.variants_gff, "w") as gff:
+ gff.write(VARIANTS)
+ with open(self.summary_gff, "w") as gff:
+ gff.write(SUMMARY)
+
+ def tearDown(self):
+ os.remove(self.variants_gff)
+ os.remove(self.summary_gff)
+
+ def test_integration(self):
+ gff_out = tempfile.NamedTemporaryFile(suffix=".gff").name
+ args = [
+ "summarizeConsensus",
+ "--variants", self.variants_gff,
+ "--output", gff_out,
+ self.summary_gff
+ ]
+ self.assertEqual(subprocess.call(args), 0)
+ with open(gff_out) as gff:
+ lines = gff.read().splitlines()
+ expected_lines = EXPECTED.splitlines()
+ self.assertEqual(len(lines), len(expected_lines))
+ for a, b in zip(lines, expected_lines):
+ if not a.startswith("##source-commandline"):
+ self.assertEqual(a, b)
+
+
+if __name__ == "__main__":
+ unittest.main()
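The EXPECTED block encodes the enrichment rule: each summary region picks up del/ins/sub tallies from the variants landing in it, with insertions counted per inserted base (chr3's ins=33 comes from its single 33-base insertion, a position-0 boundary case folded into the first region). A sketch of that bookkeeping, not the summarizeConsensus implementation, assuming the GFF columns shown above:

    def countsForRegion(seqid, start, end, variantRows):
        counts = {"del": 0, "ins": 0, "sub": 0}
        for row in variantRows:
            f = row.split("\t")
            pos = int(f[3])
            # Position-0 insertions fold into the first region (boundary case).
            inRegion = (start <= pos <= end) or (pos == 0 and start == 1)
            if f[0] != seqid or not inRegion:
                continue
            attrs = dict(kv.split("=") for kv in f[8].split(";"))
            if f[2] == "insertion":
                counts["ins"] += len(attrs["variantSeq"])
            elif f[2] == "deletion":
                counts["del"] += len(attrs["reference"])
            else:
                counts["sub"] += 1
        return counts

    print(countsForRegion("chr3", 1, 7000, VARIANTS.splitlines()))
    # {'del': 0, 'ins': 33, 'sub': 1}, matching the last EXPECTED row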
diff --git a/tests/unit/test_tool_contract.py b/tests/unit/test_tool_contract.py
index 6764dd1..357ada3 100644
--- a/tests/unit/test_tool_contract.py
+++ b/tests/unit/test_tool_contract.py
@@ -3,6 +3,7 @@ import unittest
import os.path
from pbcore.io import openDataSet, ContigSet
+import pbcore.data
import pbcommand.testkit
# XXX local data directory, absolutely required
@@ -12,16 +13,15 @@ assert os.path.isdir(DATA_DIR)
# optional (but required for TestSummarizeConsensus)
DATA_DIR_2 = "/mnt/secondary/Share/Quiver/TestData/tinyLambda/"
-@unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
- "Missing /pbi/dept/secondary/siv/testdata")
-class TestQuiver(pbcommand.testkit.PbTestApp):
+class TestVariantCaller(pbcommand.testkit.PbTestApp):
DRIVER_BASE = "variantCaller "
DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract "
REQUIRES_PBCORE = True
INPUT_FILES = [
- "/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/2372215/0007_tiny/Alignment_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.alignmentset.xml",
- "/pbi/dept/secondary/siv/references/lambdaNEB/sequence/lambdaNEB.fasta"
+ pbcore.data.getBamAndCmpH5()[0],
+# "/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/2372215/0007_tiny/Alignment_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.alignmentset.xml",
+ pbcore.data.getLambdaFasta()
]
TASK_OPTIONS = {
"genomic_consensus.task_options.min_coverage": 0,
@@ -36,6 +36,12 @@ class TestQuiver(pbcommand.testkit.PbTestApp):
self.assertTrue(isinstance(ds, ContigSet))
+class TestVariantCallerArrow(TestVariantCaller):
+ TASK_OPTIONS = {
+ "genomic_consensus.task_options.algorithm": "arrow",
+ }
+
+
class TestGffToBed(pbcommand.testkit.PbTestApp):
DRIVER_BASE = "gffToBed "
DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
@@ -45,7 +51,6 @@ class TestGffToBed(pbcommand.testkit.PbTestApp):
os.path.join(DATA_DIR, "converters", "variants.gff.gz"),
]
TASK_OPTIONS = {
- "genomic_consensus.task_options.gff2bed_purpose": "variants",
"genomic_consensus.task_options.track_name": "None",
"genomic_consensus.task_options.track_description": "None",
"genomic_consensus.task_options.use_score": 0,
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbgenomicconsensus.git