[med-svn] [pbgenomicconsensus] 01/05: Imported Upstream version 2.1.0
Afif Elghraoui
afif@moszumanska.debian.org
Sat Jan 21 21:59:11 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository pbgenomicconsensus.
commit db3cde2805ef6dbe0e35728aece4924f583061b0
Author: Afif Elghraoui <afif@debian.org>
Date: Sat Jul 23 15:54:41 2016 -0700
Imported Upstream version 2.1.0
---
CHANGELOG | 6 +
GenomicConsensus/__init__.py | 2 +-
GenomicConsensus/arrow/arrow.py | 52 ++++++--
GenomicConsensus/arrow/evidence.py | 161 +++++++++++++-----------
GenomicConsensus/arrow/model.py | 49 ++++----
GenomicConsensus/arrow/utils.py | 77 ++++++++----
GenomicConsensus/main.py | 11 +-
GenomicConsensus/options.py | 10 +-
circle.yml | 4 +
doc/HowTo.rst | 8 +-
tests/cram/bad_input.t | 2 +-
tests/cram/extra/arrow-evidence.t | 28 +++++
tests/cram/internal/alignment_summary_scaling.t | 4 +-
tests/cram/internal/arrow-staph.t | 14 +--
tests/cram/version.t | 2 +-
tests/unit/test_algorithm_selection.py | 4 +-
tests/unit/test_summarize_consensus.py | 2 +-
tests/unit/test_tool_contract.py | 72 +++++------
18 files changed, 315 insertions(+), 193 deletions(-)
diff --git a/CHANGELOG b/CHANGELOG
index 1e0cfea..eea0fd8 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+Version 2.1.0
+ * Major fixes for arrow
+ * Documentation for arrow
+ * Update for substantially-refactored ConsensusCore2
+ * --dumpEvidence support for Arrow
+
Version 2.0.0
* Working support for Arrow and POA-only consensus models
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/__init__.py
index 0e66fb7..68ac61b 100644
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/__init__.py
@@ -30,4 +30,4 @@
# Author: David Alexander
-__VERSION__ = "2.0.0"
+__VERSION__ = "2.1.0"
diff --git a/GenomicConsensus/arrow/arrow.py b/GenomicConsensus/arrow/arrow.py
index ced2e25..5016ddf 100755
--- a/GenomicConsensus/arrow/arrow.py
+++ b/GenomicConsensus/arrow/arrow.py
@@ -30,7 +30,7 @@
# Authors: David Alexander, Lance Hepler
-import logging
+import logging, os.path
import ConsensusCore2 as cc, numpy as np
from .. import reference
@@ -41,8 +41,9 @@ from ..ResultCollector import ResultCollectorProcess, ResultCollectorThread
from GenomicConsensus.consensus import Consensus, ArrowConsensus, join
from GenomicConsensus.windows import kSpannedIntervals, holes, subWindow
from GenomicConsensus.variants import filterVariants, annotateVariants
-from GenomicConsensus.arrow.evidence import dumpEvidence
+from GenomicConsensus.arrow.evidence import ArrowEvidence
from GenomicConsensus.arrow import diploid
+from GenomicConsensus.utils import die
import GenomicConsensus.arrow.model as M
import GenomicConsensus.arrow.utils as U
@@ -68,7 +69,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
alnHits = U.readsInWindow(alnFile, refWindow,
depthLimit=20000,
minMapQV=arrowConfig.minMapQV,
- strategy="longest",
+ strategy="long-and-strand-balanced",
stratum=options.readStratum,
barcode=options.barcode)
starts = np.fromiter((hit.tStart for hit in alnHits), np.int)
@@ -98,7 +99,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
alns = U.readsInWindow(alnFile, subWin,
depthLimit=depthLimit,
minMapQV=arrowConfig.minMapQV,
- strategy="longest",
+ strategy="long-and-strand-balanced",
stratum=options.readStratum,
barcode=options.barcode)
clippedAlns_ = [ aln.clippedTo(*interval) for aln in alns ]
@@ -133,14 +134,26 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
variants += filteredVars
# Dump?
- shouldDumpEvidence = \
+ maybeDumpEvidence = \
((options.dumpEvidence == "all") or
+ (options.dumpEvidence == "outliers") or
(options.dumpEvidence == "variants") and (len(variants) > 0))
- if shouldDumpEvidence:
- logging.info("Arrow does not yet support --dumpEvidence")
-# dumpEvidence(options.evidenceDirectory,
-# subWin, windowRefSeq,
-# clippedAlns, css)
+ if maybeDumpEvidence:
+ refId, refStart, refEnd = subWin
+ refName = reference.idToName(refId)
+ windowDirectory = os.path.join(
+ options.evidenceDirectory,
+ refName,
+ "%d-%d" % (refStart, refEnd))
+ ev = ArrowEvidence.fromConsensus(css)
+ if options.dumpEvidence != "outliers":
+ ev.save(windowDirectory)
+ elif (np.max(np.abs(ev.delta)) > 20):
+ # Mathematically I don't think we should be seeing
+ # deltas > 6 in magnitude, but let's just restrict
+ # attention to truly bonkers outliers.
+ ev.save(windowDirectory)
+
else:
css = ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
subWin, intRefSeq)
@@ -238,12 +251,29 @@ def configure(options, alnFile):
if options.diploid:
logging.warn("Diploid analysis not yet supported under Arrow model.")
+ # test available chemistries
+ supp = set(cc.SupportedChemistries())
+ logging.info("Found consensus models for: ({0})".format(", ".join(sorted(supp))))
+
+ used = set(alnFile.sequencingChemistry)
+ if options.parametersSpec != "auto":
+ used = set([options.parametersSpec])
+
+ unsupp = used - supp
+ if unsupp:
+ die("Arrow: unsupported chemistries found: ({0})".format(", ".join(sorted(unsupp))))
+
+ logging.info("Using consensus models for: ({0})".format(", ".join(sorted(used))))
+
return M.ArrowConfig(minMapQV=options.minMapQV,
noEvidenceConsensus=options.noEvidenceConsensusCall,
computeConfidence=(not options.fastMode),
minReadScore=options.minReadScore,
minHqRegionSnr=options.minHqRegionSnr,
- minZScore=options.minZScore)
+ minZScore=options.minZScore,
+ minAccuracy=options.minAccuracy,
+ chemistryOverride=(None if options.parametersSpec == "auto"
+ else options.parametersSpec))
def slaveFactories(threaded):
# By default we use slave processes. The tuple ordering is important.
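A note on the new dump logic above: ArrowEvidence.delta (introduced below in
evidence.py) is the per-read, per-mutation score minus that read's baseline
score, and the new "outliers" mode saves a window only when some delta exceeds
20 in magnitude. A minimal sketch of that test, using hypothetical arrays:

    import numpy as np

    baselineScores = np.array([-10.0, -12.5, -9.8])   # one LL per read
    scores = np.array([[-11.0, -35.0],                # reads x mutations
                       [-12.0, -13.0],
                       [-10.2,  -9.9]])
    delta = scores - baselineScores[:, np.newaxis]    # same as ArrowEvidence.delta
    if np.max(np.abs(delta)) > 20:
        print("outlier window -- evidence would be saved")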
diff --git a/GenomicConsensus/arrow/evidence.py b/GenomicConsensus/arrow/evidence.py
index 450f9c5..03d008f 100755
--- a/GenomicConsensus/arrow/evidence.py
+++ b/GenomicConsensus/arrow/evidence.py
@@ -30,8 +30,7 @@
# Authors: David Alexander, Lance Hepler
-__all__ = [ "dumpEvidence",
- "ArrowEvidence" ]
+__all__ = [ "ArrowEvidence" ]
import h5py, logging, os.path, numpy as np
from collections import namedtuple
@@ -41,60 +40,7 @@ from pbcore.io import FastaReader, FastaWriter
from .utils import scoreMatrix
from .. import reference
-def dumpEvidence(evidenceDumpBaseDirectory,
- refWindow, refSequence, alns,
- arrowConsensus):
- # Format of evidence dump:
- # evidence_dump/
- # ref000001/
- # 0-1005/
- # reference.fa
- # reads.fa
- # consensus.fa
- # arrow-scores.h5
- # 995-2005/
- # ...
- join = os.path.join
- refId, refStart, refEnd = refWindow
- refName = reference.idToName(refId)
- windowDirectory = join(evidenceDumpBaseDirectory,
- refName,
- "%d-%d" % (refStart, refEnd))
- logging.info("Dumping evidence to %s" % (windowDirectory,))
-
- if os.path.exists(windowDirectory):
- raise Exception, "Evidence dump does not expect directory %s to exist." % windowDirectory
- os.makedirs(windowDirectory)
- refFasta = FastaWriter(join(windowDirectory, "reference.fa"))
- readsFasta = FastaWriter(join(windowDirectory, "reads.fa"))
- consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
-
- windowName = refName + (":%d-%d" % (refStart, refEnd))
- refFasta.writeRecord(windowName, refSequence)
- refFasta.close()
-
- consensusFasta.writeRecord(windowName + "|arrow", arrowConsensus.sequence)
- consensusFasta.close()
-
- rowNames, columnNames, baselineScores, scores = scoreMatrix(arrowConsensus.ai)
- arrowScoreFile = h5py.File(join(windowDirectory, "arrow-scores.h5"))
- arrowScoreFile.create_dataset("Scores", data=scores)
- vlen_str = h5py.special_dtype(vlen=str)
- arrowScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
- arrowScoreFile.create_dataset("ColumnNames", data=columnNames, dtype=vlen_str)
- arrowScoreFile.create_dataset("BaselineScores", data=baselineScores)
- arrowScoreFile.close()
- for aln in alns:
- readsFasta.writeRecord(str(aln.rowNumber),
- aln.read(orientation="genomic", aligned=False))
- readsFasta.close()
-
-
class ArrowEvidence(object):
- """
- An experimental reader class for arrow evidence dumps produced by
- arrow --dumpEvidence
- """
Mutation = namedtuple("Mutation", ("Position", "Type", "FromBase", "ToBase"))
@@ -105,9 +51,9 @@ class ArrowEvidence(object):
type, fromBase, _, toBase = fields[1:]
return ArrowEvidence.Mutation(pos, type, fromBase, toBase)
- def __init__(self, path, refStart, consensus, rowNames, colNames, baselineScores, scores):
- self.path = path
- self.refStart = refStart
+ def __init__(self, refWindow, consensus, rowNames, colNames, baselineScores, scores):
+ assert isinstance(consensus, str)
+ self.refWindow = refWindow # tuple(str, int, int)
self.consensus = consensus
self.rowNames = rowNames
self.colNames = colNames
@@ -115,6 +61,27 @@ class ArrowEvidence(object):
self.scores = scores
self.muts = map(ArrowEvidence._parseMutName, self.colNames)
+ @staticmethod
+ def fromConsensus(css):
+ rowNames, colNames, baselineScores, scores = scoreMatrix(css.ai)
+ return ArrowEvidence(css.refWindow,
+ css.sequence,
+ rowNames,
+ colNames,
+ baselineScores,
+ scores)
+ @property
+ def refName(self):
+ return self.refWindow[0]
+
+ @property
+ def refStart(self):
+ return self.refWindow[1]
+
+ @property
+ def refEnd(self):
+ return self.refWindow[2]
+
@property
def positions(self):
return [ mut.Position for mut in self.muts ]
@@ -124,33 +91,75 @@ class ArrowEvidence(object):
return sorted(list(set(self.positions)))
@property
- def totalScores(self):
- return self.baselineScores[:, np.newaxis] + self.scores
+ def delta(self):
+ return self.scores - self.baselineScores[:, np.newaxis]
@staticmethod
- def load(path):
- if path.endswith("/"): path = path[:-1]
-
- refWin_ = path.split("/")[-1].split("-")
- refStart = int(refWin_[0])
-
- with FastaReader(path + "/consensus.fa") as fr:
+ def load(dir):
+ """
+ Load an ArrowEvidence from a directory
+ """
+ if dir.endswith("/"): dir = dir[:-1]
+ refStart, refEnd = map(int, dir.split("/")[-1].split("-"))
+ refName = dir.split("/")[-2]
+ refWindow = (refName, refStart, refEnd)
+ with FastaReader(dir + "/consensus.fa") as fr:
consensus = next(iter(fr)).sequence
-
- with h5py.File(path + "/arrow-scores.h5", "r") as f:
+ with h5py.File(dir + "/arrow-scores.h5", "r") as f:
scores = f["Scores"].value
baselineScores = f["BaselineScores"].value
colNames = f["ColumnNames"].value
rowNames = f["RowNames"].value
- return ArrowEvidence(path, refStart, consensus,
+ return ArrowEvidence(refWindow, consensus,
rowNames, colNames,
baselineScores, scores)
+ def save(self, dir):
+ """
+ Save this ArrowEvidence to a directory. The directory will be
+ *created* by this method.
+
+ Format of evidence dump:
+ evidence_dump/
+ ref000001/
+ 0-1005/
+ consensus.fa
+ arrow-scores.h5
+ 995-2005/
+ ...
+ """
+ logging.info("Dumping evidence to %s" % (dir,))
+ join = os.path.join
+ if os.path.exists(dir):
+ raise Exception, "Evidence dump does not expect directory %s to exist." % dir
+ os.makedirs(dir)
+ #refFasta = FastaWriter(join(dir, "reference.fa"))
+ #readsFasta = FastaWriter(join(dir, "reads.fa"))
+ consensusFasta = FastaWriter(join(dir, "consensus.fa"))
+ windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
+ #refFasta.writeRecord(windowName, self.refSequence)
+ #refFasta.close()
+
+ consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
+ consensusFasta.close()
+
+ arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"))
+ arrowScoreFile.create_dataset("Scores", data=self.scores)
+ vlen_str = h5py.special_dtype(vlen=str)
+ arrowScoreFile.create_dataset("RowNames", data=self.rowNames, dtype=vlen_str)
+ arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str)
+ arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores)
+ arrowScoreFile.close()
+ # for aln in alns:
+ # readsFasta.writeRecord(str(aln.rowNumber),
+ # aln.read(orientation="genomic", aligned=False))
+ # readsFasta.close()
+
+
def forPosition(self, pos):
posStart = bisect_left(self.positions, pos)
posEnd = bisect_right(self.positions, pos)
- return ArrowEvidence(self.path,
- self.refStart,
+ return ArrowEvidence(self.refWindow,
self.consensus,
self.rowNames,
self.colNames[posStart:posEnd],
@@ -160,8 +169,7 @@ class ArrowEvidence(object):
def justSubstitutions(self):
colMask = np.array(map(lambda s: ("Sub" in s), self.colNames))
- return ArrowEvidence(self.path,
- self.refStart,
+ return ArrowEvidence(self.refWindow,
self.consensus,
self.rowNames,
self.colNames[colMask],
@@ -169,5 +177,6 @@ class ArrowEvidence(object):
self.scores[:, colMask])
def rowNumbers(self):
- with FastaReader(self.path + "/reads.fa") as fr:
- return [ int(ctg.name) for ctg in fr ]
+ # with FastaReader(self.dir + "/reads.fa") as fr:
+ # return [ int(ctg.name) for ctg in fr ]
+ raise NotImplementedError
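The reader side stays simple: ArrowEvidence.load rebuilds the reference window
from the directory layout (reference name from the parent directory, start-end
from the leaf) and reads consensus.fa plus arrow-scores.h5. A usage sketch
against a hypothetical dump directory:

    from GenomicConsensus.arrow.evidence import ArrowEvidence

    ev = ArrowEvidence.load("evidence_dump/ref000001/0-1005")
    print("window: %s:%d-%d" % (ev.refName, ev.refStart, ev.refEnd))
    print(ev.delta.shape)   # (number of reads, number of candidate mutations)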
diff --git a/GenomicConsensus/arrow/model.py b/GenomicConsensus/arrow/model.py
index 2138c65..a1c8f1d 100755
--- a/GenomicConsensus/arrow/model.py
+++ b/GenomicConsensus/arrow/model.py
@@ -64,7 +64,9 @@ class ArrowConfig(object):
readStumpinessThreshold=0.1,
minReadScore=0.75,
minHqRegionSnr=3.75,
- minZScore=-3.5):
+ minZScore=-3.5,
+ minAccuracy=0.82,
+ chemistryOverride=None):
self.minMapQV = minMapQV
self.minPoaCoverage = minPoaCoverage
@@ -78,32 +80,37 @@ class ArrowConfig(object):
self.minReadScore = minReadScore
self.minHqRegionSnr = minHqRegionSnr
self.minZScore = minZScore
+ self.minAccuracy = minAccuracy
+ self.chemistryOverride = chemistryOverride
- @staticmethod
- def extractFeatures(aln):
- """
- Extract the data in a cmp.h5 alignment record into a
- native-orientation gapless string.
- """
- if isinstance(aln, CmpH5Alignment):
- die("Arrow does not support CmpH5 files!")
- else:
- return aln.read(aligned=False, orientation="native")
-
- @staticmethod
- def extractMappedRead(aln, windowStart):
+ def extractMappedRead(self, aln, windowStart):
"""
Given a clipped alignment, convert its coordinates into template
space (starts with 0), bundle it up with its features as a
MappedRead.
"""
+ if isinstance(aln, CmpH5Alignment):
+ die("Arrow does not support CmpH5 files!")
+
assert aln.referenceSpan > 0
+
+ def baseFeature(featureName):
+ if aln.reader.hasBaseFeature(featureName):
+ rawFeature = aln.baseFeature(featureName, aligned=False, orientation="native")
+ return rawFeature.clip(0,255).astype(np.uint8)
+ else:
+ return np.zeros((aln.qLen,), dtype=np.uint8)
+
name = aln.readName
chemistry = aln.sequencingChemistry
- strand = cc.StrandEnum_REVERSE if aln.isReverseStrand else cc.StrandEnum_FORWARD
- read = cc.Read(name, ArrowConfig.extractFeatures(aln), chemistry)
- return (cc.MappedRead(read,
- strand,
- int(aln.referenceStart - windowStart),
- int(aln.referenceEnd - windowStart)),
- cc.SNR(aln.hqRegionSnr))
+ strand = cc.StrandType_REVERSE if aln.isReverseStrand else cc.StrandType_FORWARD
+ read = cc.Read(name,
+ aln.read(aligned=False, orientation="native"),
+ cc.Uint8Vector(baseFeature("Ipd").tolist()),
+ cc.Uint8Vector(baseFeature("PulseWidth").tolist()),
+ cc.SNR(aln.hqRegionSnr),
+ chemistry if self.chemistryOverride is None else self.chemistryOverride)
+ return cc.MappedRead(read,
+ strand,
+ int(aln.referenceStart - windowStart),
+ int(aln.referenceEnd - windowStart))
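The baseFeature helper above quantizes per-base frame counts (IPD, PulseWidth)
into the 8-bit range the new cc.Read constructor expects, substituting a zero
vector when a read lacks the feature. A small sketch of that clipping, with
made-up values:

    import numpy as np

    rawFeature = np.array([3, 17, 400, 92])             # e.g. IPD frame counts
    clipped = rawFeature.clip(0, 255).astype(np.uint8)   # -> [  3  17 255  92]
    missing = np.zeros((4,), dtype=np.uint8)             # fallback when absent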
diff --git a/GenomicConsensus/arrow/utils.py b/GenomicConsensus/arrow/utils.py
index c8bbe8e..8277fd9 100755
--- a/GenomicConsensus/arrow/utils.py
+++ b/GenomicConsensus/arrow/utils.py
@@ -132,8 +132,8 @@ def refineConsensus(ai, arrowConfig):
cfg = cc.PolishConfig(arrowConfig.maxIterations,
arrowConfig.mutationSeparation,
arrowConfig.mutationNeighborhood)
- isConverged, nTested, nApplied = cc.Polish(ai, cfg)
- return str(ai), isConverged
+ polishResult = cc.Polish(ai, cfg)
+ return str(ai), polishResult.hasConverged
def consensusConfidence(ai, positions=None):
"""
@@ -142,7 +142,7 @@ def consensusConfidence(ai, positions=None):
omitted, confidence values are returned for all positions in the
consensus (str(ai)).
"""
- return np.array(np.clip(cc.ConsensusQVs(ai), 0, 93), dtype=np.uint8)
+ return np.array(np.clip(cc.ConsensusQualities(ai), 0, 93), dtype=np.uint8)
def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
"""
@@ -233,9 +233,9 @@ def _shortMutationDescription(mut, tpl):
"""
_type = _typeMap[mut.Type]
_pos = mut.Start()
- _oldBase = "." if mut.Type() == cc.MutationType_INSERTION \
+ _oldBase = "." if mut.Type == cc.MutationType_INSERTION \
else tpl[_pos]
- _newBase = "." if mut.Type() == cc.MutationType_DELETION \
+ _newBase = "." if mut.Type == cc.MutationType_DELETION \
else mut.Base
return "%d %s %s > %s" % (_pos, _type, _oldBase, _newBase)
@@ -253,16 +253,17 @@ def scoreMatrix(ai):
"""
css = str(ai)
allMutations = sorted(allSingleBaseMutations(css))
- shape = (ai.NumReads(), len(allMutations))
+ readNames = list(ai.ReadNames())
+ numReads = len(readNames)
+ shape = (numReads, len(allMutations))
scoreMatrix = np.zeros(shape)
for j, mut in enumerate(allMutations):
- mutScores = ai.ReadLLs(mut)
+ mutScores = ai.LLs(mut)
scoreMatrix[:, j] = mutScores
- baselineScores = np.array(ai.ReadLLs())
- rowNames = [ ai.Read(i).Name
- for i in xrange(ai.NumReads()) ]
+ baselineScores = np.array(ai.LLs())
columnNames = [ _shortMutationDescription(mut, css)
for mut in allMutations ]
+ rowNames = readNames
return (rowNames, columnNames, baselineScores, scoreMatrix)
@@ -309,6 +310,24 @@ def filterAlns(refWindow, alns, arrowConfig):
a.readScore >= arrowConfig.minReadScore ]
+def sufficientlyAccurate(mappedRead, poaCss, minAccuracy):
+ if minAccuracy <= 0.0:
+ return True
+ s, e = mappedRead.TemplateStart, mappedRead.TemplateEnd
+ tpl = poaCss[s:e]
+ if mappedRead.Strand == cc.StrandType_FORWARD:
+ pass
+ elif mappedRead.Strand == cc.StrandType_REVERSE:
+ tpl = reverseComplement(tpl)
+ else:
+ return False
+ aln = cc.AlignLinear(tpl, mappedRead.Seq)
+ nErrors = sum(1 for t in aln.Transcript() if t != 'M')
+ tlen = len(tpl)
+ acc = 1.0 - 1.0 * min(nErrors, tlen) / tlen
+ return acc >= minAccuracy
+
+
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
"""
Call consensus on this interval---without subdividing the interval
@@ -341,29 +360,38 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
# coordinates relative to the POA consensus
mappedReads = [ arrowConfig.extractMappedRead(aln, refStart) for aln in alns ]
queryPositions = cc.TargetToQueryPositions(ga)
- mappedReads = [ (lifted(queryPositions, mr), snr) for (mr, snr) in mappedReads ]
+ mappedReads = [ lifted(queryPositions, mr) for mr in mappedReads ]
# Load the mapped reads into the mutation scorer, and iterate
# until convergence.
ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
coverage = 0
- for (mr, snr) in mappedReads:
+ for mr in mappedReads:
if (mr.TemplateEnd <= mr.TemplateStart or
mr.TemplateEnd - mr.TemplateStart < 2 or
mr.Length() < 2):
continue
- coverage += 1 if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS else 0
-
- # TODO(lhepler, dalexander): propagate coverage around somehow
+ if not sufficientlyAccurate(mr, poaCss, arrowConfig.minAccuracy):
+ tpl = poaCss[mr.TemplateStart:mr.TemplateEnd]
+ if mr.Strand == cc.StrandType_FORWARD:
+ pass
+ elif mr.Strand == cc.StrandType_REVERSE:
+ tpl = reverseComplement(tpl)
+ else:
+ tpl = "INACTIVE/UNMAPPED"
+ logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq))
+ continue
+ if ai.AddRead(mr) == cc.State_VALID:
+ coverage += 1
# Iterate until convergence
- try:
- if coverage < arrowConfig.minPoaCoverage:
- logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
- return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
- refWindow, refSequence)
- _, converged = refineConsensus(ai, arrowConfig)
- assert converged, "Arrow did not converge to MLE"
+ if coverage < arrowConfig.minPoaCoverage:
+ logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
+ return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
+ refWindow, refSequence)
+ _, converged = refineConsensus(ai, arrowConfig)
+
+ if converged:
arrowCss = str(ai)
if arrowConfig.computeConfidence:
confidence = consensusConfidence(ai)
@@ -373,9 +401,8 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
arrowCss,
confidence,
ai)
- except:
- traceback = ''.join(format_exception(*sys.exc_info()))
- logging.info("%s: %s" % (refWindow, traceback))
+ else:
+ logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
refWindow, refSequence)
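The minAccuracy gate above counts every non-match column of the read-to-POA
alignment transcript as an error and keeps a read only if its windowed accuracy
clears the threshold. A sketch with a hypothetical transcript (upstream obtains
it from cc.AlignLinear(...).Transcript()):

    transcript = "MMMMIMMMDMMMM"   # 'M' = match; 'I'/'D' etc. count as errors
    tlen = 12                      # length of the POA template slice
    nErrors = sum(1 for t in transcript if t != 'M')
    acc = 1.0 - 1.0 * min(nErrors, tlen) / tlen   # 0.833...
    assert acc >= 0.82             # passes the default minAccuracy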
diff --git a/GenomicConsensus/main.py b/GenomicConsensus/main.py
index 4c5ca14..0bbf436 100644
--- a/GenomicConsensus/main.py
+++ b/GenomicConsensus/main.py
@@ -78,7 +78,7 @@ class ToolRunner(object):
options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp")
logging.info("Created temporary directory %s" % (options.temporaryDirectory,) )
- def _algorithmByName(self, name, cmpH5):
+ def _algorithmByName(self, name, peekFile):
if name == "plurality":
from GenomicConsensus.plurality import plurality
algo = plurality
@@ -88,14 +88,19 @@ class ToolRunner(object):
elif name == "arrow":
from GenomicConsensus.arrow import arrow
algo = arrow
+ # All arrow models require PW except P6 and the first S/P1-C1
+ if set(peekFile.sequencingChemistry) - set(["P6-C4", "S/P1-C1/beta"]):
+ if (not peekFile.hasBaseFeature("Ipd") or
+ not peekFile.hasBaseFeature("PulseWidth")):
+ die("Model requires missing base feature: IPD or PulseWidth")
elif name == "poa":
from GenomicConsensus.poa import poa
algo = poa
elif name == "best":
logging.info("Identifying best algorithm based on input data")
from GenomicConsensus import algorithmSelection
- algoName = algorithmSelection.bestAlgorithm(cmpH5.sequencingChemistry)
- return self._algorithmByName(algoName, cmpH5)
+ algoName = algorithmSelection.bestAlgorithm(peekFile.sequencingChemistry)
+ return self._algorithmByName(algoName, peekFile)
else:
die("Failure: unrecognized algorithm %s" % name)
isOK, msg = algo.availability
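The new guard encodes the rule in the comment above: only P6-C4 and the beta
S/P1-C1 models can run without pulse features; any other chemistry requires
both Ipd and PulseWidth in the input file. A sketch of the check, with
hypothetical file contents:

    NO_PW_CHEMISTRIES = set(["P6-C4", "S/P1-C1/beta"])
    chemistries = set(["S/P1-C1.1"])   # stand-in for peekFile.sequencingChemistry
    present = set(["Ipd"])             # base features actually in the file
    if chemistries - NO_PW_CHEMISTRIES and \
            not set(["Ipd", "PulseWidth"]) <= present:
        print("Model requires missing base feature: IPD or PulseWidth")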
diff --git a/GenomicConsensus/options.py b/GenomicConsensus/options.py
index 7303e89..127d1c9 100644
--- a/GenomicConsensus/options.py
+++ b/GenomicConsensus/options.py
@@ -90,6 +90,7 @@ class Constants(object):
DEFAULT_MIN_READSCORE = 0.65
DEFAULT_MIN_HQREGIONSNR = 3.75
DEFAULT_MIN_ZSCORE = -3.5
+ DEFAULT_MIN_ACCURACY = 0.82
def get_parser():
"""
@@ -300,6 +301,13 @@ def add_options_to_argument_parser(parser):
type=float,
default=Constants.DEFAULT_MIN_ZSCORE,
help="The minimum acceptable z-score for reads that will be used for analysis (arrow-only).")
+ readSelection.add_argument(
+ "--minAccuracy",
+ action="store",
+ dest="minAccuracy",
+ type=float,
+ default=Constants.DEFAULT_MIN_ACCURACY,
+ help="The minimum acceptable window-global alignment accuracy for reads that will be used for the analysis (arrow-only).")
algorithm = parser.add_argument_group("Algorithm and parameter settings")
algorithm.add_argument(
@@ -349,7 +357,7 @@ def add_options_to_argument_parser(parser):
nargs="?",
default=None,
const="variants",
- choices=["variants", "all"])
+ choices=["variants", "all", "outliers"])
debugging.add_argument(
"--evidenceDirectory",
default="evidence_dump")
diff --git a/circle.yml b/circle.yml
index b1db9e5..f1e3793 100644
--- a/circle.yml
+++ b/circle.yml
@@ -7,12 +7,15 @@ dependencies:
- sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
- sudo apt-get update
- sudo apt-get install g++-4.8
+ - curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
+ - sudo apt-get install git-lfs=1.1.0
- if [ ! -d _deps ] ; then mkdir _deps ; fi # Create a directory for dependencies. These are static; cache them.
- pushd _deps ; if [ ! -d cmake-3.3.0-Linux-x86_64 ] ; then wget --no-check-certificate http://www.cmake.org/files/v3.3/cmake-3.3.0-Linux-x86_64.tar.gz ; tar xzf cmake-3.3.0-Linux-x86_64.tar.gz ; fi
- pushd _deps ; if [ ! -d boost_1_58_0 ] ; then wget http://downloads.sourceforge.net/project/boost/boost/1.58.0/boost_1_58_0.tar.bz2 ; tar xjf boost_1_58_0.tar.bz2 ; fi
- pushd _deps ; if [ ! -f swig-3.0.8/bin/swig ] ; then rm -fr swig-3.0.8* ; mkdir dl ; pushd dl ; wget http://downloads.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz ; tar xzf swig-3.0.8.tar.gz ; pushd swig-3.0.8 ; ./configure --prefix $(readlink -f ../../swig-3.0.8) ; make ; make install ; fi
- pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore.git
- pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore2.git
+ - pushd _deps ; git clone https://github.com/PacificBiosciences/PacBioTestData.git
- pip install --upgrade pip
- pip install numpy
- pip install cython
@@ -24,6 +27,7 @@ dependencies:
override:
- pushd _deps/ConsensusCore ; python setup.py install --boost=$(readlink -f ../boost_1_58_0) --swig=$(readlink -f ../swig-3.0.8/bin/swig)
- pushd _deps/ConsensusCore2 ; CC=gcc-4.8 CXX=g++-4.8 CMAKE_COMMAND=$(readlink -f ../cmake-3.3.0-Linux-x86_64/bin/cmake) Boost_INCLUDE_DIRS=$(readlink -f ../boost_1_58_0) SWIG_COMMAND=$(readlink -f ../swig-3.0.8/bin/swig) pip install --verbose --upgrade --no-deps .
+ - pushd _deps/PacBioTestData ; git lfs pull && make python
test:
pre:
- pip install --verbose .
diff --git a/doc/HowTo.rst b/doc/HowTo.rst
index 538f707..0466d52 100644
--- a/doc/HowTo.rst
+++ b/doc/HowTo.rst
@@ -36,9 +36,14 @@ release of SMRTanalysis 3.0 or build from GitHub sources.*
Running a large-scale resequencing/polishing job in SMRTanalysis 2.3
--------------------------------------------------------------------
+We do not recommend attempting to construct a single giant cmp.h5 file and
+then processing it on a single node. This is inefficient, and users attempting
+it have run into many problems with the instability of the HDF5 library (which
+PacBio is moving away from in favor of BAM_).
+
To run a large-scale resequencing job (>50 megabase genome @ 50x
coverage, nominally), you want to spread the computation load across
-multiple nodes in your computing cluster.
+multiple nodes in your computing cluster.
The `smrtpipe` workflow engine in SMRTanalysis 2.3 provides a
convenient workflow automating this---it will automatically spread the
@@ -126,3 +131,4 @@ Please consult the `FAQ document`_.
.. _pitchfork : https://github.com/PacificBiosciences/pitchfork
.. _`params.xml template`: ./params-template.xml
.. _`SMRTpipe reference manual`: http://www.pacb.com/wp-content/uploads/2015/09/SMRT-Pipe-Reference-Guide.pdf
+.. _`BAM`: http://pacbiofileformats.readthedocs.io/en/3.0/BAM.html
diff --git a/tests/cram/bad_input.t b/tests/cram/bad_input.t
index 30d59e9..74e49bb 100644
--- a/tests/cram/bad_input.t
+++ b/tests/cram/bad_input.t
@@ -7,5 +7,5 @@ Test for sane behavior in the presence of bogus arguments.
Test that it doesn't crash when one BAM file in an otherwise valid AlignmentSet is empty.
$ DATA=$TESTDIR/../data/sanity
- $ REF="`python -c 'import pbcore.data ; print pbcore.data.getLambdaFasta()'`"
+ $ REF="`pbdata get lambda-fasta`"
$ arrow --reference $REF -o contig.fasta $DATA/mixed.alignmentset.xml
diff --git a/tests/cram/extra/arrow-evidence.t b/tests/cram/extra/arrow-evidence.t
new file mode 100644
index 0000000..3c42cd1
--- /dev/null
+++ b/tests/cram/extra/arrow-evidence.t
@@ -0,0 +1,28 @@
+Get the arrow evidence dump and make sure it can be grokked.
+
+ $ export DATA=$TESTDIR/../../data
+ $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam
+ $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa
+
+Run arrow w/ evidence dump
+
+ $ arrow --dumpEvidence=all $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq
+
+Inspect the output...
+
+ $ find evidence_dump | sort
+ evidence_dump
+ evidence_dump/All4mer.V2.01_Insert
+ evidence_dump/All4mer.V2.01_Insert/0-260
+ evidence_dump/All4mer.V2.01_Insert/0-260/arrow-scores.h5
+ evidence_dump/All4mer.V2.01_Insert/0-260/consensus.fa
+
+
+Try to load it up using the API...
+
+ $ python << EOF
+ > from GenomicConsensus.arrow.evidence import ArrowEvidence
+ > ev = ArrowEvidence.load("evidence_dump/All4mer.V2.01_Insert/0-260")
+ > assert 8*len(ev.consensus)==len(ev.colNames)==2080
+ > assert ev.delta.shape == (len(ev.rowNames), len(ev.colNames))==(95,2080)
+ > EOF
diff --git a/tests/cram/internal/alignment_summary_scaling.t b/tests/cram/internal/alignment_summary_scaling.t
index e20e454..cbc3a1a 100644
--- a/tests/cram/internal/alignment_summary_scaling.t
+++ b/tests/cram/internal/alignment_summary_scaling.t
@@ -12,7 +12,7 @@ unless an O(N^2) loop is used.
$ GFFREF=$TESTDATA/alignment_summary_variants.gff
$ OUTPUT=alignment_summary_variants_test.gff
$ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY
- $ diff -I "##source-commandline" $OUTPUT $GFFREF
+ $ diff -I "##source" $OUTPUT $GFFREF
Second test has 20000 regions in a single contig, and 10000 variants. This
will also take several minutes.
@@ -22,4 +22,4 @@ will also take several minutes.
$ GFFREF=$TESTDATA/alignment_summary_variants_big_chr.gff
$ OUTPUT=alignment_summary_variants_big_chr_test.gff
$ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY
- $ diff -I "##source-commandline" $OUTPUT $GFFREF
+ $ diff -I "##source" $OUTPUT $GFFREF
diff --git a/tests/cram/internal/arrow-staph.t b/tests/cram/internal/arrow-staph.t
index 99eb11c..433675d 100644
--- a/tests/cram/internal/arrow-staph.t
+++ b/tests/cram/internal/arrow-staph.t
@@ -14,19 +14,9 @@ Quiver does a good job here---no errors.
$ fastacomposition quiver-css.fasta
quiver-css.fasta A 960233 C 470725 G 470271 T 971458
-Arrow, on the other hand, makes a number of errors.
+Arrow, since the SNR capping fix, also gets no errors.
$ gffsubtract.pl arrow-variants.gff $MASK | grep -v "#" | sed 's/\t/ /g'
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 42450 42450 . . . reference=A;variantSeq=.;coverage=100;confidence=68
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 442861 442861 . . . reference=T;variantSeq=.;coverage=100;confidence=64
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 562107 562107 . . . reference=T;variantSeq=.;coverage=100;confidence=52
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 690814 690814 . . . reference=T;variantSeq=.;coverage=100;confidence=46
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1041877 1041877 . . . reference=A;variantSeq=.;coverage=100;confidence=45
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1801427 1801427 . . . reference=T;variantSeq=.;coverage=100;confidence=51
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1840635 1840635 . . . reference=A;variantSeq=.;coverage=100;confidence=88
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1850379 1850379 . . . reference=T;variantSeq=.;coverage=100;confidence=40
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2147449 2147449 . . . reference=T;variantSeq=.;coverage=100;confidence=93
- Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2231043 2231043 . . . reference=A;variantSeq=.;coverage=100;confidence=80
$ fastacomposition arrow-css.fasta
- arrow-css.fasta A 960033 C 470641 G 470190 T 971286 a 178 c 84 g 80 t 158
+ arrow-css.fasta A 960232 C 470725 G 470271 T 971457
diff --git a/tests/cram/version.t b/tests/cram/version.t
index 7df81c8..2ec1942 100644
--- a/tests/cram/version.t
+++ b/tests/cram/version.t
@@ -2,7 +2,7 @@ This actually failed once because of a missing import, so we might as
well test it.
$ variantCaller --version
- 2.0.0
+ 2.1.0
This will break if the parser setup is messed up.
diff --git a/tests/unit/test_algorithm_selection.py b/tests/unit/test_algorithm_selection.py
index 65e8e89..8ffaf6b 100644
--- a/tests/unit/test_algorithm_selection.py
+++ b/tests/unit/test_algorithm_selection.py
@@ -10,5 +10,5 @@ def test_algorithm_selection():
EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1/beta"]))
EQ(None, bestAlgorithm_(["P6-C4", "unknown"]))
EQ("arrow", bestAlgorithm_(["S/P1-C1"]))
- EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1"]))
- EQ("arrow", bestAlgorithm_(["P5-C3", "S/P1-C1"])) # (Arrow pres. no training for P5. But it will tell us that)
+ EQ("arrow", bestAlgorithm_(["P6-C4", "S/P1-C1.1"]))
+ EQ("arrow", bestAlgorithm_(["P5-C3", "S/P1-C1.1"])) # (Arrow pres. no training for P5. But it will tell us that)
diff --git a/tests/unit/test_summarize_consensus.py b/tests/unit/test_summarize_consensus.py
index b1d2718..e335120 100644
--- a/tests/unit/test_summarize_consensus.py
+++ b/tests/unit/test_summarize_consensus.py
@@ -31,7 +31,7 @@ chr2\t.\tregion\t10000\t23469\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1
chr3\t.\tregion\t1\t7000\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1,24"""
EXPECTED = """\
-##source GenomicConsensus 2.0.0
+##source GenomicConsensus 2.1.0
##pacbio-alignment-summary-version 0.6
##source-commandline this line will be skipped in the comparison
chr1\t.\tregion\t1\t5000\t0.00\t+\t.\tcov=4,23,28;cov2=20.162,5.851;gaps=0,0;cQv=20,20,20;del=0;ins=0;sub=1
diff --git a/tests/unit/test_tool_contract.py b/tests/unit/test_tool_contract.py
index 357ada3..33de349 100644
--- a/tests/unit/test_tool_contract.py
+++ b/tests/unit/test_tool_contract.py
@@ -3,43 +3,46 @@ import unittest
import os.path
from pbcore.io import openDataSet, ContigSet
-import pbcore.data
import pbcommand.testkit
-# XXX local data directory, absolutely required
+import pbtestdata
+
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
assert os.path.isdir(DATA_DIR)
-# optional (but required for TestSummarizeConsensus)
-DATA_DIR_2 = "/mnt/secondary/Share/Quiver/TestData/tinyLambda/"
-
-class TestVariantCaller(pbcommand.testkit.PbTestApp):
- DRIVER_BASE = "variantCaller "
- DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
- DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract "
- REQUIRES_PBCORE = True
- INPUT_FILES = [
- pbcore.data.getBamAndCmpH5()[0],
-# "/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/2372215/0007_tiny/Alignment_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.alignmentset.xml",
- pbcore.data.getLambdaFasta()
- ]
- TASK_OPTIONS = {
- "genomic_consensus.task_options.min_coverage": 0,
- "genomic_consensus.task_options.min_confidence": 0,
- "genomic_consensus.task_options.algorithm": "quiver",
- "genomic_consensus.task_options.diploid": False,
- }
-
- def run_after(self, rtc, output_dir):
- contigs_file = rtc.task.output_files[1]
- with openDataSet(contigs_file, strict=True) as ds:
- self.assertTrue(isinstance(ds, ContigSet))
-
-
-class TestVariantCallerArrow(TestVariantCaller):
- TASK_OPTIONS = {
- "genomic_consensus.task_options.algorithm": "arrow",
- }
+# These tests seem to cause some logging failure at shutdown;
+# disabling them pending upstream fix. See:
+# https://bugzilla.nanofluidics.com/show_bug.cgi?id=33699
+
+# class TestVariantCaller(pbcommand.testkit.PbTestApp):
+# DRIVER_BASE = "variantCaller "
+# DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
+# DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract "
+# REQUIRES_PBCORE = True
+# INPUT_FILES = [
+# pbtestdata.get_file("aligned-xml"), pbtestdata.get_file("lambdaNEB")
+# ]
+# TASK_OPTIONS = {
+# "genomic_consensus.task_options.min_coverage": 0,
+# "genomic_consensus.task_options.min_confidence": 0,
+# "genomic_consensus.task_options.algorithm": "quiver",
+# "genomic_consensus.task_options.diploid": False,
+# }
+
+# def test_run_e2e(self):
+# import ipdb; ipdb.set_trace()
+# super(TestVariantCaller, self).test_run_e2e()
+
+# def run_after(self, rtc, output_dir):
+# contigs_file = rtc.task.output_files[1]
+# with openDataSet(contigs_file, strict=True) as ds:
+# self.assertTrue(isinstance(ds, ContigSet))
+
+
+# class TestVariantCallerArrow(TestVariantCaller):
+# TASK_OPTIONS = {
+# "genomic_consensus.task_options.algorithm": "arrow",
+# }
class TestGffToBed(pbcommand.testkit.PbTestApp):
@@ -70,15 +73,14 @@ class TestGffToVcf(pbcommand.testkit.PbTestApp):
}
-@unittest.skipUnless(os.path.isdir(DATA_DIR_2), "Missing %s" % DATA_DIR_2)
class TestSummarizeConsensus(pbcommand.testkit.PbTestApp):
DRIVER_BASE = "summarizeConsensus"
DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract "
REQUIRES_PBCORE = True
INPUT_FILES = [
- os.path.join(DATA_DIR_2, "alignment_summary.gff"),
- os.path.join(DATA_DIR_2, "variants.gff.gz")
+ pbtestdata.get_file("alignment-summary-gff"),
+ pbtestdata.get_file("variants-gff")
]
TASK_OPTIONS = {}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbgenomicconsensus.git