[med-svn] [pbgenomicconsensus] 01/05: Imported Upstream version 2.1.0

Afif Elghraoui afif at moszumanska.debian.org
Sat Jan 21 21:59:11 UTC 2017


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository pbgenomicconsensus.

commit db3cde2805ef6dbe0e35728aece4924f583061b0
Author: Afif Elghraoui <afif at debian.org>
Date:   Sat Jul 23 15:54:41 2016 -0700

    Imported Upstream version 2.1.0
---
 CHANGELOG                                       |   6 +
 GenomicConsensus/__init__.py                    |   2 +-
 GenomicConsensus/arrow/arrow.py                 |  52 ++++++--
 GenomicConsensus/arrow/evidence.py              | 161 +++++++++++++-----------
 GenomicConsensus/arrow/model.py                 |  49 ++++----
 GenomicConsensus/arrow/utils.py                 |  77 ++++++++----
 GenomicConsensus/main.py                        |  11 +-
 GenomicConsensus/options.py                     |  10 +-
 circle.yml                                      |   4 +
 doc/HowTo.rst                                   |   8 +-
 tests/cram/bad_input.t                          |   2 +-
 tests/cram/extra/arrow-evidence.t               |  28 +++++
 tests/cram/internal/alignment_summary_scaling.t |   4 +-
 tests/cram/internal/arrow-staph.t               |  14 +--
 tests/cram/version.t                            |   2 +-
 tests/unit/test_algorithm_selection.py          |   4 +-
 tests/unit/test_summarize_consensus.py          |   2 +-
 tests/unit/test_tool_contract.py                |  72 +++++------
 18 files changed, 315 insertions(+), 193 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 1e0cfea..eea0fd8 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+Version 2.1.0
+  * Major fixes for arrow
+  * Documentation for arrow
+  * Update for substantially refactored ConsensusCore2
+  * --dumpEvidence support for Arrow
+
 Version 2.0.0
   * Working support for Arrow and POA-only consensus models
 
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/__init__.py
index 0e66fb7..68ac61b 100644
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/__init__.py
@@ -30,4 +30,4 @@
 
 # Author: David Alexander
 
-__VERSION__ = "2.0.0"
+__VERSION__ = "2.1.0"
diff --git a/GenomicConsensus/arrow/arrow.py b/GenomicConsensus/arrow/arrow.py
index ced2e25..5016ddf 100755
--- a/GenomicConsensus/arrow/arrow.py
+++ b/GenomicConsensus/arrow/arrow.py
@@ -30,7 +30,7 @@
 
 # Authors: David Alexander, Lance Hepler
 
-import logging
+import logging, os.path
 import ConsensusCore2 as cc, numpy as np
 
 from .. import reference
@@ -41,8 +41,9 @@ from ..ResultCollector import ResultCollectorProcess, ResultCollectorThread
 from GenomicConsensus.consensus import Consensus, ArrowConsensus, join
 from GenomicConsensus.windows import kSpannedIntervals, holes, subWindow
 from GenomicConsensus.variants import filterVariants, annotateVariants
-from GenomicConsensus.arrow.evidence import dumpEvidence
+from GenomicConsensus.arrow.evidence import ArrowEvidence
 from GenomicConsensus.arrow import diploid
+from GenomicConsensus.utils import die
 
 import GenomicConsensus.arrow.model as M
 import GenomicConsensus.arrow.utils as U
@@ -68,7 +69,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
         alnHits = U.readsInWindow(alnFile, refWindow,
                                   depthLimit=20000,
                                   minMapQV=arrowConfig.minMapQV,
-                                  strategy="longest",
+                                  strategy="long-and-strand-balanced",
                                   stratum=options.readStratum,
                                   barcode=options.barcode)
         starts = np.fromiter((hit.tStart for hit in alnHits), np.int)
@@ -98,7 +99,7 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
         alns = U.readsInWindow(alnFile, subWin,
                                depthLimit=depthLimit,
                                minMapQV=arrowConfig.minMapQV,
-                               strategy="longest",
+                               strategy="long-and-strand-balanced",
                                stratum=options.readStratum,
                                barcode=options.barcode)
         clippedAlns_ = [ aln.clippedTo(*interval) for aln in alns ]
@@ -133,14 +134,26 @@ def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
             variants += filteredVars
 
             # Dump?
-            shouldDumpEvidence = \
+            maybeDumpEvidence = \
                 ((options.dumpEvidence == "all") or
+                 (options.dumpEvidence == "outliers") or
                  (options.dumpEvidence == "variants") and (len(variants) > 0))
-            if shouldDumpEvidence:
-                logging.info("Arrow does not yet support --dumpEvidence")
-#                 dumpEvidence(options.evidenceDirectory,
-#                              subWin, windowRefSeq,
-#                              clippedAlns, css)
+            if maybeDumpEvidence:
+                refId, refStart, refEnd = subWin
+                refName = reference.idToName(refId)
+                windowDirectory = os.path.join(
+                    options.evidenceDirectory,
+                    refName,
+                    "%d-%d" % (refStart, refEnd))
+                ev = ArrowEvidence.fromConsensus(css)
+                if options.dumpEvidence != "outliers":
+                    ev.save(windowDirectory)
+                elif (np.max(np.abs(ev.delta)) > 20):
+                    # Mathematically I don't think we should be seeing
+                    # deltas > 6 in magnitude, but let's just restrict
+                    # attention to truly bonkers outliers.
+                    ev.save(windowDirectory)
+
         else:
             css = ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                                  subWin, intRefSeq)
@@ -238,12 +251,29 @@ def configure(options, alnFile):
     if options.diploid:
         logging.warn("Diploid analysis not yet supported under Arrow model.")
 
+    # test available chemistries
+    supp = set(cc.SupportedChemistries())
+    logging.info("Found consensus models for: ({0})".format(", ".join(sorted(supp))))
+
+    used = set(alnFile.sequencingChemistry)
+    if options.parametersSpec != "auto":
+        used = set([options.parametersSpec])
+
+    unsupp = used - supp
+    if unsupp:
+        die("Arrow: unsupported chemistries found: ({0})".format(", ".join(sorted(unsupp))))
+
+    logging.info("Using consensus models for: ({0})".format(", ".join(sorted(used))))
+
     return M.ArrowConfig(minMapQV=options.minMapQV,
                          noEvidenceConsensus=options.noEvidenceConsensusCall,
                          computeConfidence=(not options.fastMode),
                          minReadScore=options.minReadScore,
                          minHqRegionSnr=options.minHqRegionSnr,
-                         minZScore=options.minZScore)
+                         minZScore=options.minZScore,
+                         minAccuracy=options.minAccuracy,
+                         chemistryOverride=(None if options.parametersSpec == "auto"
+                                            else options.parametersSpec))
 
 def slaveFactories(threaded):
     # By default we use slave processes. The tuple ordering is important.
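
The chemistry-compatibility check added to configure() above is plain set
arithmetic over chemistry names. A minimal standalone sketch of the same
logic (the chemistry names below are purely illustrative, not a statement
of what ConsensusCore2 actually supports):

    supp = set(["P6-C4", "S/P1-C1.1"])  # stand-in for cc.SupportedChemistries()
    used = set(["P6-C4", "S/P2-C2"])    # stand-in for alnFile.sequencingChemistry
    unsupp = used - supp                # anything used but not supported
    if unsupp:
        print "Arrow: unsupported chemistries found: (%s)" % ", ".join(sorted(unsupp))
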
diff --git a/GenomicConsensus/arrow/evidence.py b/GenomicConsensus/arrow/evidence.py
index 450f9c5..03d008f 100755
--- a/GenomicConsensus/arrow/evidence.py
+++ b/GenomicConsensus/arrow/evidence.py
@@ -30,8 +30,7 @@
 
 # Authors: David Alexander, Lance Hepler
 
-__all__ = [ "dumpEvidence",
-            "ArrowEvidence" ]
+__all__ = [ "ArrowEvidence" ]
 
 import h5py, logging, os.path, numpy as np
 from collections import namedtuple
@@ -41,60 +40,7 @@ from pbcore.io import FastaReader, FastaWriter
 from .utils import scoreMatrix
 from .. import reference
 
-def dumpEvidence(evidenceDumpBaseDirectory,
-                 refWindow, refSequence, alns,
-                 arrowConsensus):
-    # Format of evidence dump:
-    # evidence_dump/
-    #   ref000001/
-    #     0-1005/
-    #       reference.fa
-    #       reads.fa
-    #       consensus.fa
-    #       arrow-scores.h5
-    #     995-2005/
-    #       ...
-    join = os.path.join
-    refId, refStart, refEnd = refWindow
-    refName = reference.idToName(refId)
-    windowDirectory = join(evidenceDumpBaseDirectory,
-                           refName,
-                           "%d-%d" % (refStart, refEnd))
-    logging.info("Dumping evidence to %s" % (windowDirectory,))
-
-    if os.path.exists(windowDirectory):
-        raise Exception, "Evidence dump does not expect directory %s to exist." % windowDirectory
-    os.makedirs(windowDirectory)
-    refFasta       = FastaWriter(join(windowDirectory, "reference.fa"))
-    readsFasta     = FastaWriter(join(windowDirectory, "reads.fa"))
-    consensusFasta = FastaWriter(join(windowDirectory, "consensus.fa"))
-
-    windowName = refName + (":%d-%d" % (refStart, refEnd))
-    refFasta.writeRecord(windowName, refSequence)
-    refFasta.close()
-
-    consensusFasta.writeRecord(windowName + "|arrow", arrowConsensus.sequence)
-    consensusFasta.close()
-
-    rowNames, columnNames, baselineScores, scores = scoreMatrix(arrowConsensus.ai)
-    arrowScoreFile = h5py.File(join(windowDirectory, "arrow-scores.h5"))
-    arrowScoreFile.create_dataset("Scores", data=scores)
-    vlen_str = h5py.special_dtype(vlen=str)
-    arrowScoreFile.create_dataset("RowNames", data=rowNames, dtype=vlen_str)
-    arrowScoreFile.create_dataset("ColumnNames", data=columnNames, dtype=vlen_str)
-    arrowScoreFile.create_dataset("BaselineScores", data=baselineScores)
-    arrowScoreFile.close()
-    for aln in alns:
-        readsFasta.writeRecord(str(aln.rowNumber),
-                               aln.read(orientation="genomic", aligned=False))
-    readsFasta.close()
-
-
 class ArrowEvidence(object):
-    """
-    An experimental reader class for arrow evidence dumps produced by
-    arrow --dumpEvidence
-    """
 
     Mutation = namedtuple("Mutation", ("Position", "Type", "FromBase", "ToBase"))
 
@@ -105,9 +51,9 @@ class ArrowEvidence(object):
         type, fromBase, _, toBase = fields[1:]
         return ArrowEvidence.Mutation(pos, type, fromBase, toBase)
 
-    def __init__(self, path, refStart, consensus, rowNames, colNames, baselineScores, scores):
-        self.path           = path
-        self.refStart       = refStart
+    def __init__(self, refWindow, consensus, rowNames, colNames, baselineScores, scores):
+        assert isinstance(consensus, str)
+        self.refWindow      = refWindow # tuple(str, int, int)
         self.consensus      = consensus
         self.rowNames       = rowNames
         self.colNames       = colNames
@@ -115,6 +61,27 @@ class ArrowEvidence(object):
         self.scores         = scores
         self.muts           = map(ArrowEvidence._parseMutName, self.colNames)
 
+    @staticmethod
+    def fromConsensus(css):
+        rowNames, colNames, baselineScores, scores = scoreMatrix(css.ai)
+        return ArrowEvidence(css.refWindow,
+                             css.sequence,
+                             rowNames,
+                             colNames,
+                             baselineScores,
+                             scores)
+    @property
+    def refName(self):
+        return self.refWindow[0]
+
+    @property
+    def refStart(self):
+        return self.refWindow[1]
+
+    @property
+    def refEnd(self):
+        return self.refWindow[2]
+
     @property
     def positions(self):
         return  [ mut.Position for mut in self.muts ]
@@ -124,33 +91,75 @@ class ArrowEvidence(object):
         return sorted(list(set(self.positions)))
 
     @property
-    def totalScores(self):
-        return self.baselineScores[:, np.newaxis] + self.scores
+    def delta(self):
+        return self.scores - self.baselineScores[:, np.newaxis]
 
     @staticmethod
-    def load(path):
-        if path.endswith("/"): path = path[:-1]
-
-        refWin_ = path.split("/")[-1].split("-")
-        refStart = int(refWin_[0])
-
-        with FastaReader(path + "/consensus.fa") as fr:
+    def load(dir):
+        """
+        Load an ArrowEvidence from a directory
+        """
+        if dir.endswith("/"): dir = dir[:-1]
+        refStart, refEnd = map(int, dir.split("/")[-1].split("-"))
+        refName = dir.split("/")[-2]
+        refWindow = (refName, refStart, refEnd)
+        with FastaReader(dir + "/consensus.fa") as fr:
             consensus = next(iter(fr)).sequence
-
-        with h5py.File(path + "/arrow-scores.h5", "r") as f:
+        with h5py.File(dir + "/arrow-scores.h5", "r") as f:
             scores   = f["Scores"].value
             baselineScores = f["BaselineScores"].value
             colNames = f["ColumnNames"].value
             rowNames = f["RowNames"].value
-            return ArrowEvidence(path, refStart, consensus,
+            return ArrowEvidence(refWindow, consensus,
                                  rowNames, colNames,
                                  baselineScores, scores)
 
+    def save(self, dir):
+        """
+        Save this ArrowEvidence to a directory.  The directory will be
+        *created* by this method.
+
+        Format of evidence dump:
+        evidence_dump/
+          ref000001/
+            0-1005/
+              consensus.fa
+              arrow-scores.h5
+            995-2005/
+            ...
+        """
+        logging.info("Dumping evidence to %s" % (dir,))
+        join = os.path.join
+        if os.path.exists(dir):
+            raise Exception, "Evidence dump does not expect directory %s to exist." % dir
+        os.makedirs(dir)
+        #refFasta       = FastaWriter(join(dir, "reference.fa"))
+        #readsFasta     = FastaWriter(join(dir, "reads.fa"))
+        consensusFasta = FastaWriter(join(dir, "consensus.fa"))
+        windowName = self.refName + (":%d-%d" % (self.refStart, self.refEnd))
+        #refFasta.writeRecord(windowName, self.refSequence)
+        #refFasta.close()
+
+        consensusFasta.writeRecord(windowName + "|arrow", self.consensus)
+        consensusFasta.close()
+
+        arrowScoreFile = h5py.File(join(dir, "arrow-scores.h5"))
+        arrowScoreFile.create_dataset("Scores", data=self.scores)
+        vlen_str = h5py.special_dtype(vlen=str)
+        arrowScoreFile.create_dataset("RowNames", data=self.rowNames, dtype=vlen_str)
+        arrowScoreFile.create_dataset("ColumnNames", data=self.colNames, dtype=vlen_str)
+        arrowScoreFile.create_dataset("BaselineScores", data=self.baselineScores)
+        arrowScoreFile.close()
+        # for aln in alns:
+        #     readsFasta.writeRecord(str(aln.rowNumber),
+        #                            aln.read(orientation="genomic", aligned=False))
+        # readsFasta.close()
+
+
     def forPosition(self, pos):
         posStart = bisect_left(self.positions, pos)
         posEnd   = bisect_right(self.positions, pos)
-        return ArrowEvidence(self.path,
-                             self.refStart,
+        return ArrowEvidence(self.refWindow,
                              self.consensus,
                              self.rowNames,
                              self.colNames[posStart:posEnd],
@@ -160,8 +169,7 @@ class ArrowEvidence(object):
 
     def justSubstitutions(self):
         colMask = np.array(map(lambda s: ("Sub" in s), self.colNames))
-        return ArrowEvidence(self.path,
-                             self.refStart,
+        return ArrowEvidence(self.refWindow,
                              self.consensus,
                              self.rowNames,
                              self.colNames[colMask],
@@ -169,5 +177,6 @@ class ArrowEvidence(object):
                              self.scores[:, colMask])
 
     def rowNumbers(self):
-        with FastaReader(self.path + "/reads.fa") as fr:
-            return [ int(ctg.name) for ctg in fr ]
+        # with FastaReader(self.dir + "/reads.fa") as fr:
+        #     return [ int(ctg.name) for ctg in fr ]
+        raise NotImplementedError
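
For anyone inspecting the new evidence dumps by hand, the reworked
ArrowEvidence class can be driven directly from Python. A minimal sketch,
assuming a dump produced with --dumpEvidence (the window path matches the
cram test added later in this commit):

    import numpy as np
    from GenomicConsensus.arrow.evidence import ArrowEvidence

    # Load one window of a dump (layout: <refName>/<refStart>-<refEnd>/).
    ev = ArrowEvidence.load("evidence_dump/All4mer.V2.01_Insert/0-260")

    # delta has shape (nReads, nMutations): per-read score change for each
    # candidate mutation relative to the current consensus.
    assert ev.delta.shape == (len(ev.rowNames), len(ev.colNames))

    # Reads that would trip the |delta| > 20 "outliers" cutoff in arrow.py.
    outliers = [ name for name, row in zip(ev.rowNames, ev.delta)
                 if np.max(np.abs(row)) > 20 ]
    print outliers
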
diff --git a/GenomicConsensus/arrow/model.py b/GenomicConsensus/arrow/model.py
index 2138c65..a1c8f1d 100755
--- a/GenomicConsensus/arrow/model.py
+++ b/GenomicConsensus/arrow/model.py
@@ -64,7 +64,9 @@ class ArrowConfig(object):
                  readStumpinessThreshold=0.1,
                  minReadScore=0.75,
                  minHqRegionSnr=3.75,
-                 minZScore=-3.5):
+                 minZScore=-3.5,
+                 minAccuracy=0.82,
+                 chemistryOverride=None):
 
         self.minMapQV                   = minMapQV
         self.minPoaCoverage             = minPoaCoverage
@@ -78,32 +80,37 @@ class ArrowConfig(object):
         self.minReadScore               = minReadScore
         self.minHqRegionSnr             = minHqRegionSnr
         self.minZScore                  = minZScore
+        self.minAccuracy                = minAccuracy
+        self.chemistryOverride          = chemistryOverride
 
-    @staticmethod
-    def extractFeatures(aln):
-        """
-        Extract the data in a cmp.h5 alignment record into a
-        native-orientation gapless string.
-        """
-        if isinstance(aln, CmpH5Alignment):
-            die("Arrow does not support CmpH5 files!")
-        else:
-            return aln.read(aligned=False, orientation="native")
-
-    @staticmethod
-    def extractMappedRead(aln, windowStart):
+    def extractMappedRead(self, aln, windowStart):
         """
         Given a clipped alignment, convert its coordinates into template
         space (starts with 0), bundle it up with its features as a
         MappedRead.
         """
+        if isinstance(aln, CmpH5Alignment):
+            die("Arrow does not support CmpH5 files!")
+
         assert aln.referenceSpan > 0
+
+        def baseFeature(featureName):
+            if aln.reader.hasBaseFeature(featureName):
+                rawFeature = aln.baseFeature(featureName, aligned=False, orientation="native")
+                return rawFeature.clip(0,255).astype(np.uint8)
+            else:
+                return np.zeros((aln.qLen,), dtype=np.uint8)
+
         name = aln.readName
         chemistry = aln.sequencingChemistry
-        strand = cc.StrandEnum_REVERSE if aln.isReverseStrand else cc.StrandEnum_FORWARD
-        read = cc.Read(name, ArrowConfig.extractFeatures(aln), chemistry)
-        return (cc.MappedRead(read,
-                              strand,
-                              int(aln.referenceStart - windowStart),
-                              int(aln.referenceEnd   - windowStart)),
-                cc.SNR(aln.hqRegionSnr))
+        strand = cc.StrandType_REVERSE if aln.isReverseStrand else cc.StrandType_FORWARD
+        read = cc.Read(name,
+                       aln.read(aligned=False, orientation="native"),
+                       cc.Uint8Vector(baseFeature("Ipd").tolist()),
+                       cc.Uint8Vector(baseFeature("PulseWidth").tolist()),
+                       cc.SNR(aln.hqRegionSnr),
+                       chemistry if self.chemistryOverride is None else self.chemistryOverride)
+        return cc.MappedRead(read,
+                             strand,
+                             int(aln.referenceStart - windowStart),
+                             int(aln.referenceEnd   - windowStart))
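
The baseFeature() helper above clips raw per-base kinetic features (IPD,
PulseWidth) into the uint8 range that the model consumes. In isolation the
transformation is just the following (the array values are made up for
illustration):

    import numpy as np
    rawFeature = np.array([0, 7, 300, 42, 1000])    # made-up IPD frame counts
    print rawFeature.clip(0, 255).astype(np.uint8)  # [  0   7 255  42 255]
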
diff --git a/GenomicConsensus/arrow/utils.py b/GenomicConsensus/arrow/utils.py
index c8bbe8e..8277fd9 100755
--- a/GenomicConsensus/arrow/utils.py
+++ b/GenomicConsensus/arrow/utils.py
@@ -132,8 +132,8 @@ def refineConsensus(ai, arrowConfig):
     cfg = cc.PolishConfig(arrowConfig.maxIterations,
                           arrowConfig.mutationSeparation,
                           arrowConfig.mutationNeighborhood)
-    isConverged, nTested, nApplied = cc.Polish(ai, cfg)
-    return str(ai), isConverged
+    polishResult = cc.Polish(ai, cfg)
+    return str(ai), polishResult.hasConverged
 
 def consensusConfidence(ai, positions=None):
     """
@@ -142,7 +142,7 @@ def consensusConfidence(ai, positions=None):
     omitted, confidence values are returned for all positions in the
     consensus (str(ai)).
     """
-    return np.array(np.clip(cc.ConsensusQVs(ai), 0, 93), dtype=np.uint8)
+    return np.array(np.clip(cc.ConsensusQualities(ai), 0, 93), dtype=np.uint8)
 
 def variantsFromAlignment(a, refWindow, cssQvInWindow=None, siteCoverage=None):
     """
@@ -233,9 +233,9 @@ def _shortMutationDescription(mut, tpl):
     """
     _type = _typeMap[mut.Type]
     _pos = mut.Start()
-    _oldBase = "." if mut.Type() == cc.MutationType_INSERTION \
+    _oldBase = "." if mut.Type == cc.MutationType_INSERTION \
                else tpl[_pos]
-    _newBase = "." if mut.Type() == cc.MutationType_DELETION \
+    _newBase = "." if mut.Type == cc.MutationType_DELETION \
                else mut.Base
     return "%d %s %s > %s" % (_pos, _type, _oldBase, _newBase)
 
@@ -253,16 +253,17 @@ def scoreMatrix(ai):
     """
     css = str(ai)
     allMutations = sorted(allSingleBaseMutations(css))
-    shape = (ai.NumReads(), len(allMutations))
+    readNames = list(ai.ReadNames())
+    numReads = len(readNames)
+    shape = (numReads, len(allMutations))
     scoreMatrix = np.zeros(shape)
     for j, mut in enumerate(allMutations):
-        mutScores = ai.ReadLLs(mut)
+        mutScores = ai.LLs(mut)
         scoreMatrix[:, j] = mutScores
-    baselineScores =  np.array(ai.ReadLLs())
-    rowNames = [ ai.Read(i).Name
-                 for i in xrange(ai.NumReads()) ]
+    baselineScores =  np.array(ai.LLs())
     columnNames = [ _shortMutationDescription(mut, css)
                     for mut in allMutations ]
+    rowNames = readNames
     return (rowNames, columnNames, baselineScores, scoreMatrix)
 
 
@@ -309,6 +310,24 @@ def filterAlns(refWindow, alns, arrowConfig):
                 a.readScore >= arrowConfig.minReadScore ]
 
 
+def sufficientlyAccurate(mappedRead, poaCss, minAccuracy):
+    if minAccuracy <= 0.0:
+        return True
+    s, e = mappedRead.TemplateStart, mappedRead.TemplateEnd
+    tpl = poaCss[s:e]
+    if mappedRead.Strand == cc.StrandType_FORWARD:
+        pass
+    elif mappedRead.Strand == cc.StrandType_REVERSE:
+        tpl = reverseComplement(tpl)
+    else:
+        return False
+    aln = cc.AlignLinear(tpl, mappedRead.Seq)
+    nErrors = sum(1 for t in aln.Transcript() if t != 'M')
+    tlen = len(tpl)
+    acc = 1.0 - 1.0 * min(nErrors, tlen) / tlen
+    return acc >= minAccuracy
+
+
 def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
     """
     Call consensus on this interval---without subdividing the interval
@@ -341,29 +360,38 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
     # coordinates relative to the POA consensus
     mappedReads = [ arrowConfig.extractMappedRead(aln, refStart) for aln in alns ]
     queryPositions = cc.TargetToQueryPositions(ga)
-    mappedReads = [ (lifted(queryPositions, mr), snr) for (mr, snr) in mappedReads ]
+    mappedReads = [ lifted(queryPositions, mr) for mr in mappedReads ]
 
     # Load the mapped reads into the mutation scorer, and iterate
     # until convergence.
     ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
     coverage = 0
-    for (mr, snr) in mappedReads:
+    for mr in mappedReads:
         if (mr.TemplateEnd <= mr.TemplateStart or
             mr.TemplateEnd - mr.TemplateStart < 2 or
             mr.Length() < 2):
             continue
-        coverage += 1 if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS else 0
-
-    # TODO(lhepler, dalexander): propagate coverage around somehow
+        if not sufficientlyAccurate(mr, poaCss, arrowConfig.minAccuracy):
+            tpl = poaCss[mr.TemplateStart:mr.TemplateEnd]
+            if mr.Strand == cc.StrandType_FORWARD:
+                pass
+            elif mr.Strand == cc.StrandType_REVERSE:
+                tpl = reverseComplement(tpl)
+            else:
+                tpl = "INACTIVE/UNMAPPED"
+            logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq))
+            continue
+        if ai.AddRead(mr) == cc.State_VALID:
+            coverage += 1
 
     # Iterate until convergence
-    try:
-        if coverage < arrowConfig.minPoaCoverage:
-            logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
-            return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
-                                                  refWindow, refSequence)
-        _, converged = refineConsensus(ai, arrowConfig)
-        assert converged, "Arrow did not converge to MLE"
+    if coverage < arrowConfig.minPoaCoverage:
+        logging.info("%s: Inadequate coverage to call consensus" % (refWindow,))
+        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
+                                              refWindow, refSequence)
+    _, converged = refineConsensus(ai, arrowConfig)
+
+    if converged:
         arrowCss = str(ai)
         if arrowConfig.computeConfidence:
             confidence = consensusConfidence(ai)
@@ -373,9 +401,8 @@ def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
                               arrowCss,
                               confidence,
                               ai)
-    except:
-        traceback = ''.join(format_exception(*sys.exc_info()))
-        logging.info("%s: %s" % (refWindow, traceback))
+    else:
+        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
         return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                               refWindow, refSequence)
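
A note on the new sufficientlyAccurate() filter: it counts every non-match
column of a linear alignment transcript as an error and compares the
resulting accuracy against --minAccuracy. A self-contained toy run of the
same arithmetic (the transcript string is made up; the real one comes from
cc.AlignLinear):

    # 'M' = match; anything else (mismatch or indel) counts as an error.
    transcript = "MMMMRMMDMM"
    tlen = 10                                         # template length
    nErrors = sum(1 for t in transcript if t != 'M')  # -> 2
    acc = 1.0 - 1.0 * min(nErrors, tlen) / tlen       # -> 0.8
    print acc >= 0.82  # False: dropped under the default --minAccuracy=0.82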
 
diff --git a/GenomicConsensus/main.py b/GenomicConsensus/main.py
index 4c5ca14..0bbf436 100644
--- a/GenomicConsensus/main.py
+++ b/GenomicConsensus/main.py
@@ -78,7 +78,7 @@ class ToolRunner(object):
         options.temporaryDirectory = tempfile.mkdtemp(prefix="GenomicConsensus-", dir="/tmp")
         logging.info("Created temporary directory %s" % (options.temporaryDirectory,) )
 
-    def _algorithmByName(self, name, cmpH5):
+    def _algorithmByName(self, name, peekFile):
         if name == "plurality":
             from GenomicConsensus.plurality import plurality
             algo = plurality
@@ -88,14 +88,19 @@ class ToolRunner(object):
         elif name == "arrow":
             from GenomicConsensus.arrow import arrow
             algo = arrow
+            # All arrow models require PW except P6 and the first S/P1-C1
+            if set(peekFile.sequencingChemistry) - set(["P6-C4", "S/P1-C1/beta"]):
+                if (not peekFile.hasBaseFeature("Ipd") or
+                    not peekFile.hasBaseFeature("PulseWidth")):
+                    die("Model requires missing base feature: IPD or PulseWidth")
         elif name == "poa":
             from GenomicConsensus.poa import poa
             algo = poa
         elif name == "best":
             logging.info("Identifying best algorithm based on input data")
             from GenomicConsensus import algorithmSelection
-            algoName = algorithmSelection.bestAlgorithm(cmpH5.sequencingChemistry)
-            return self._algorithmByName(algoName, cmpH5)
+            algoName = algorithmSelection.bestAlgorithm(peekFile.sequencingChemistry)
+            return self._algorithmByName(algoName, peekFile)
         else:
             die("Failure: unrecognized algorithm %s" % name)
         isOK, msg = algo.availability
diff --git a/GenomicConsensus/options.py b/GenomicConsensus/options.py
index 7303e89..127d1c9 100644
--- a/GenomicConsensus/options.py
+++ b/GenomicConsensus/options.py
@@ -90,6 +90,7 @@ class Constants(object):
     DEFAULT_MIN_READSCORE = 0.65
     DEFAULT_MIN_HQREGIONSNR = 3.75
     DEFAULT_MIN_ZSCORE = -3.5
+    DEFAULT_MIN_ACCURACY = 0.82
 
 def get_parser():
     """
@@ -300,6 +301,13 @@ def add_options_to_argument_parser(parser):
         type=float,
         default=Constants.DEFAULT_MIN_ZSCORE,
         help="The minimum acceptable z-score for reads that will be used for analysis (arrow-only).")
+    readSelection.add_argument(
+        "--minAccuracy",
+        action="store",
+        dest="minAccuracy",
+        type=float,
+        default=Constants.DEFAULT_MIN_ACCURACY,
+        help="The minimum acceptable window-global alignment accuracy for reads that will be used for the analysis (arrow-only).")
 
     algorithm = parser.add_argument_group("Algorithm and parameter settings")
     algorithm.add_argument(
@@ -349,7 +357,7 @@ def add_options_to_argument_parser(parser):
         nargs="?",
         default=None,
         const="variants",
-        choices=["variants", "all"])
+        choices=["variants", "all", "outliers"])
     debugging.add_argument(
         "--evidenceDirectory",
         default="evidence_dump")
diff --git a/circle.yml b/circle.yml
index b1db9e5..f1e3793 100644
--- a/circle.yml
+++ b/circle.yml
@@ -7,12 +7,15 @@ dependencies:
         - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
         - sudo apt-get update
         - sudo apt-get install g++-4.8
+        - curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
+        - sudo apt-get install git-lfs=1.1.0
         - if [ ! -d _deps ] ; then mkdir _deps ; fi  # Create a directory for dependencies, These are static, cache them.
         - pushd _deps ; if [ ! -d cmake-3.3.0-Linux-x86_64 ] ; then wget --no-check-certificate http://www.cmake.org/files/v3.3/cmake-3.3.0-Linux-x86_64.tar.gz ; tar xzf cmake-3.3.0-Linux-x86_64.tar.gz ; fi
         - pushd _deps ; if [ ! -d boost_1_58_0 ] ; then wget http://downloads.sourceforge.net/project/boost/boost/1.58.0/boost_1_58_0.tar.bz2 ; tar xjf boost_1_58_0.tar.bz2 ; fi
         - pushd _deps ; if [ ! -f swig-3.0.8/bin/swig ] ; then rm -fr swig-3.0.8* ; mkdir dl ; pushd dl ; wget http://downloads.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz ; tar xzf swig-3.0.8.tar.gz ; pushd swig-3.0.8 ; ./configure --prefix $(readlink -f ../../swig-3.0.8) ; make ; make install ; fi
         - pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore.git
         - pushd _deps ; git clone https://github.com/PacificBiosciences/ConsensusCore2.git
+        - pushd _deps ; git clone https://github.com/PacificBiosciences/PacBioTestData.git
         - pip install --upgrade pip
         - pip install numpy
         - pip install cython
@@ -24,6 +27,7 @@ dependencies:
     override:
         - pushd _deps/ConsensusCore ; python setup.py install --boost=$(readlink -f ../boost_1_58_0) --swig=$(readlink -f ../swig-3.0.8/bin/swig)
         - pushd _deps/ConsensusCore2 ; CC=gcc-4.8 CXX=g++-4.8 CMAKE_COMMAND=$(readlink -f ../cmake-3.3.0-Linux-x86_64/bin/cmake) Boost_INCLUDE_DIRS=$(readlink -f ../boost_1_58_0) SWIG_COMMAND=$(readlink -f ../swig-3.0.8/bin/swig) pip install --verbose --upgrade --no-deps .
+        - pushd _deps/PacBioTestData ; git lfs pull && make python
 test:
     pre:
         - pip install --verbose .
diff --git a/doc/HowTo.rst b/doc/HowTo.rst
index 538f707..0466d52 100644
--- a/doc/HowTo.rst
+++ b/doc/HowTo.rst
@@ -36,9 +36,14 @@ release of SMRTanalysis 3.0 or build from GitHub sources.*
 Running a large-scale resequencing/polishing job in SMRTanalysis 2.3
 --------------------------------------------------------------------
 
+We do not recommend attempting to construct a single giant cmp.h5 file and
+then processing it on a single node.  This is inefficient, and users attempting
+it have run into many problems with the instability of the HDF5 library
+(which PacBio is moving away from in favor of BAM_).
+
 To run a large-scale resequencing job (>50 megabase genome @ 50x
 coverage, nominally), you want to spread the computation load across
-multiple nodes in your computing cluster.
+multiple nodes in your computing cluster.  
 
 The `smrtpipe` workflow engine in SMRTanalysis 2.3 provides a
 convenient workflow automating this---it will automatically spread the
@@ -126,3 +131,4 @@ Please consult the `FAQ document`_.
 .. _pitchfork : https://github.com/PacificBiosciences/pitchfork
 .. _`params.xml template`: ./params-template.xml
 .. _`SMRTpipe reference manual`: http://www.pacb.com/wp-content/uploads/2015/09/SMRT-Pipe-Reference-Guide.pdf
+.. _`BAM`: http://pacbiofileformats.readthedocs.io/en/3.0/BAM.html
diff --git a/tests/cram/bad_input.t b/tests/cram/bad_input.t
index 30d59e9..74e49bb 100644
--- a/tests/cram/bad_input.t
+++ b/tests/cram/bad_input.t
@@ -7,5 +7,5 @@ Test for sane behavior in the presence of bogus arguments.
 Test that it doesn't crash when one BAM file in an otherwise valid AlignmentSet is empty.
 
   $ DATA=$TESTDIR/../data/sanity
-  $ REF="`python -c 'import pbcore.data ; print pbcore.data.getLambdaFasta()'`"
+  $ REF="`pbdata get lambda-fasta`"
   $ arrow --reference $REF -o contig.fasta $DATA/mixed.alignmentset.xml
diff --git a/tests/cram/extra/arrow-evidence.t b/tests/cram/extra/arrow-evidence.t
new file mode 100644
index 0000000..3c42cd1
--- /dev/null
+++ b/tests/cram/extra/arrow-evidence.t
@@ -0,0 +1,28 @@
+Get the arrow evidence dump and make sure it can be grokked.
+
+  $ export DATA=$TESTDIR/../../data
+  $ export INPUT=$DATA/all4mer/out.aligned_subreads.bam
+  $ export REFERENCE=$DATA/all4mer/All4mer.V2.01_Insert.fa
+
+Run arrow w/ evidence dump
+
+  $ arrow --dumpEvidence=all $INPUT -r $REFERENCE -o v.gff -o css.fa -o css.fq
+
+Inspect the output...
+
+  $ find evidence_dump | sort
+  evidence_dump
+  evidence_dump/All4mer.V2.01_Insert
+  evidence_dump/All4mer.V2.01_Insert/0-260
+  evidence_dump/All4mer.V2.01_Insert/0-260/arrow-scores.h5
+  evidence_dump/All4mer.V2.01_Insert/0-260/consensus.fa
+
+
+Try to load it up using the API...
+
+  $ python << EOF
+  > from GenomicConsensus.arrow.evidence import ArrowEvidence
+  > ev = ArrowEvidence.load("evidence_dump/All4mer.V2.01_Insert/0-260")
+  > assert 8*len(ev.consensus)==len(ev.colNames)==2080
+  > assert ev.delta.shape == (len(ev.rowNames), len(ev.colNames))==(95,2080)
+  > EOF
diff --git a/tests/cram/internal/alignment_summary_scaling.t b/tests/cram/internal/alignment_summary_scaling.t
index e20e454..cbc3a1a 100644
--- a/tests/cram/internal/alignment_summary_scaling.t
+++ b/tests/cram/internal/alignment_summary_scaling.t
@@ -12,7 +12,7 @@ unless an O(N^2) loop is used.
   $ GFFREF=$TESTDATA/alignment_summary_variants.gff
   $ OUTPUT=alignment_summary_variants_test.gff
   $ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY
-  $ diff -I "##source-commandline" $OUTPUT $GFFREF
+  $ diff -I "##source" $OUTPUT $GFFREF
 
 Second test has 20000 regions in a single contig, and 10000 variants.  This
 will also take several minutes.
@@ -22,4 +22,4 @@ will also take several minutes.
   $ GFFREF=$TESTDATA/alignment_summary_variants_big_chr.gff
   $ OUTPUT=alignment_summary_variants_big_chr_test.gff
   $ summarizeConsensus --variantsGff $VARIANTS --output $OUTPUT $SUMMARY
-  $ diff -I "##source-commandline" $OUTPUT $GFFREF
+  $ diff -I "##source" $OUTPUT $GFFREF
diff --git a/tests/cram/internal/arrow-staph.t b/tests/cram/internal/arrow-staph.t
index 99eb11c..433675d 100644
--- a/tests/cram/internal/arrow-staph.t
+++ b/tests/cram/internal/arrow-staph.t
@@ -14,19 +14,9 @@ Quiver does a good job here---no errors.
   $ fastacomposition quiver-css.fasta
   quiver-css.fasta A 960233 C 470725 G 470271 T 971458
 
-Arrow, on the other hand, makes a number of errors.
+Since the SNR capping fix, Arrow also makes no errors.
 
   $ gffsubtract.pl arrow-variants.gff $MASK | grep -v "#" | sed 's/\t/ /g'
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 42450 42450 . . . reference=A;variantSeq=.;coverage=100;confidence=68
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 442861 442861 . . . reference=T;variantSeq=.;coverage=100;confidence=64
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 562107 562107 . . . reference=T;variantSeq=.;coverage=100;confidence=52
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 690814 690814 . . . reference=T;variantSeq=.;coverage=100;confidence=46
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1041877 1041877 . . . reference=A;variantSeq=.;coverage=100;confidence=45
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1801427 1801427 . . . reference=T;variantSeq=.;coverage=100;confidence=51
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1840635 1840635 . . . reference=A;variantSeq=.;coverage=100;confidence=88
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 1850379 1850379 . . . reference=T;variantSeq=.;coverage=100;confidence=40
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2147449 2147449 . . . reference=T;variantSeq=.;coverage=100;confidence=93
-  Staphylococcus_aureus_subsp_aureus_USA300_TCH1516 . deletion 2231043 2231043 . . . reference=A;variantSeq=.;coverage=100;confidence=80
 
   $ fastacomposition arrow-css.fasta
-  arrow-css.fasta A 960033 C 470641 G 470190 T 971286 a 178 c 84 g 80 t 158
+  arrow-css.fasta A 960232 C 470725 G 470271 T 971457
diff --git a/tests/cram/version.t b/tests/cram/version.t
index 7df81c8..2ec1942 100644
--- a/tests/cram/version.t
+++ b/tests/cram/version.t
@@ -2,7 +2,7 @@ This actually failed once because of a missing import, so we might as
 well test it.
 
   $ variantCaller --version
-  2.0.0
+  2.1.0
 
 This will break if the parser setup is messed up.
 
diff --git a/tests/unit/test_algorithm_selection.py b/tests/unit/test_algorithm_selection.py
index 65e8e89..8ffaf6b 100644
--- a/tests/unit/test_algorithm_selection.py
+++ b/tests/unit/test_algorithm_selection.py
@@ -10,5 +10,5 @@ def test_algorithm_selection():
     EQ("arrow",  bestAlgorithm_(["P6-C4", "S/P1-C1/beta"]))
     EQ(None,     bestAlgorithm_(["P6-C4", "unknown"]))
     EQ("arrow",  bestAlgorithm_(["S/P1-C1"]))
-    EQ("arrow",  bestAlgorithm_(["P6-C4", "S/P1-C1"]))
-    EQ("arrow",  bestAlgorithm_(["P5-C3", "S/P1-C1"])) # (Arrow pres. no training for P5.  But it will tell us that)
+    EQ("arrow",  bestAlgorithm_(["P6-C4", "S/P1-C1.1"]))
+    EQ("arrow",  bestAlgorithm_(["P5-C3", "S/P1-C1.1"])) # (Arrow pres. no training for P5.  But it will tell us that)
diff --git a/tests/unit/test_summarize_consensus.py b/tests/unit/test_summarize_consensus.py
index b1d2718..e335120 100644
--- a/tests/unit/test_summarize_consensus.py
+++ b/tests/unit/test_summarize_consensus.py
@@ -31,7 +31,7 @@ chr2\t.\tregion\t10000\t23469\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1
 chr3\t.\tregion\t1\t7000\t0.00\t+\t.\tcov=0,48,89;cov2=47.303,12.036;gaps=1,24"""
 
 EXPECTED = """\
-##source GenomicConsensus 2.0.0
+##source GenomicConsensus 2.1.0
 ##pacbio-alignment-summary-version 0.6
 ##source-commandline this line will be skipped in the comparison
 chr1\t.\tregion\t1\t5000\t0.00\t+\t.\tcov=4,23,28;cov2=20.162,5.851;gaps=0,0;cQv=20,20,20;del=0;ins=0;sub=1
diff --git a/tests/unit/test_tool_contract.py b/tests/unit/test_tool_contract.py
index 357ada3..33de349 100644
--- a/tests/unit/test_tool_contract.py
+++ b/tests/unit/test_tool_contract.py
@@ -3,43 +3,46 @@ import unittest
 import os.path
 
 from pbcore.io import openDataSet, ContigSet
-import pbcore.data
 import pbcommand.testkit
 
-# XXX local data directory, absolutely required
+import pbtestdata
+
 DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
 assert os.path.isdir(DATA_DIR)
 
-# optional (but required for TestSummarizeConsensus)
-DATA_DIR_2 = "/mnt/secondary/Share/Quiver/TestData/tinyLambda/"
-
-class TestVariantCaller(pbcommand.testkit.PbTestApp):
-    DRIVER_BASE = "variantCaller "
-    DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
-    DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract "
-    REQUIRES_PBCORE = True
-    INPUT_FILES = [
-        pbcore.data.getBamAndCmpH5()[0],
-#        "/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/2372215/0007_tiny/Alignment_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.alignmentset.xml",
-        pbcore.data.getLambdaFasta()
-    ]
-    TASK_OPTIONS = {
-      "genomic_consensus.task_options.min_coverage": 0,
-      "genomic_consensus.task_options.min_confidence": 0,
-      "genomic_consensus.task_options.algorithm": "quiver",
-      "genomic_consensus.task_options.diploid": False,
-    }
-
-    def run_after(self, rtc, output_dir):
-        contigs_file = rtc.task.output_files[1]
-        with openDataSet(contigs_file, strict=True) as ds:
-            self.assertTrue(isinstance(ds, ContigSet))
-
-
-class TestVariantCallerArrow(TestVariantCaller):
-    TASK_OPTIONS = {
-      "genomic_consensus.task_options.algorithm": "arrow",
-    }
+# These tests seem to cause some logging failure at shutdown;
+# disabling them pending upstream fix.  See:
+# https://bugzilla.nanofluidics.com/show_bug.cgi?id=33699
+
+# class TestVariantCaller(pbcommand.testkit.PbTestApp):
+#     DRIVER_BASE = "variantCaller "
+#     DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
+#     DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract "
+#     REQUIRES_PBCORE = True
+#     INPUT_FILES = [
+#         pbtestdata.get_file("aligned-xml"), pbtestdata.get_file("lambdaNEB")
+#     ]
+#     TASK_OPTIONS = {
+#       "genomic_consensus.task_options.min_coverage": 0,
+#       "genomic_consensus.task_options.min_confidence": 0,
+#       "genomic_consensus.task_options.algorithm": "quiver",
+#       "genomic_consensus.task_options.diploid": False,
+#     }
+
+#     def test_run_e2e(self):
+#         import ipdb; ipdb.set_trace()
+#         super(TestVariantCaller, self).test_run_e2e()
+
+#     def run_after(self, rtc, output_dir):
+#         contigs_file = rtc.task.output_files[1]
+#         with openDataSet(contigs_file, strict=True) as ds:
+#             self.assertTrue(isinstance(ds, ContigSet))
+
+
+# class TestVariantCallerArrow(TestVariantCaller):
+#     TASK_OPTIONS = {
+#       "genomic_consensus.task_options.algorithm": "arrow",
+#     }
 
 
 class TestGffToBed(pbcommand.testkit.PbTestApp):
@@ -70,15 +73,14 @@ class TestGffToVcf(pbcommand.testkit.PbTestApp):
     }
 
 
- at unittest.skipUnless(os.path.isdir(DATA_DIR_2), "Missing %s" % DATA_DIR_2)
 class TestSummarizeConsensus(pbcommand.testkit.PbTestApp):
     DRIVER_BASE = "summarizeConsensus"
     DRIVER_EMIT = DRIVER_BASE + " --emit-tool-contract "
     DRIVER_RESOLVE = DRIVER_BASE + " --resolved-tool-contract "
     REQUIRES_PBCORE = True
     INPUT_FILES = [
-        os.path.join(DATA_DIR_2, "alignment_summary.gff"),
-        os.path.join(DATA_DIR_2, "variants.gff.gz")
+        pbtestdata.get_file("alignment-summary-gff"),
+        pbtestdata.get_file("variants-gff")
     ]
     TASK_OPTIONS = {}
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbgenomicconsensus.git


