[med-svn] [python-pbcore] 01/12: Imported Upstream version 1.2.9+dfsg

Afif Elghraoui afif at moszumanska.debian.org
Tue Apr 12 06:31:33 UTC 2016


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository python-pbcore.

commit 60c0774d1ab9947ee237aa4d355ac5c5fb54f1a9
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Mon Apr 11 21:09:23 2016 -0700

    Imported Upstream version 1.2.9+dfsg
---
 CHANGELOG.org                                      |   14 +-
 bin/dataset.py                                     |   64 +-
 doc/pbcore.io.rst                                  |   24 +-
 pbcore/__init__.py                                 |    2 +-
 pbcore/chemistry/resources/mapping.xml             |    8 +-
 pbcore/data/datasets/__init__.py                   |    6 +-
 pbcore/data/datasets/empty_lambda.aligned.bam      |  Bin 0 -> 700 bytes
 pbcore/data/datasets/empty_lambda.aligned.bam.bai  |  Bin 0 -> 24 bytes
 pbcore/data/datasets/empty_lambda.aligned.bam.pbi  |  Bin 0 -> 67 bytes
 ...69412550000001823090301191423_s1_p0.ccs.bam.pbi |  Bin 0 -> 297 bytes
 pbcore/io/BasH5IO.py                               |  145 +--
 pbcore/io/FastaIO.py                               |   20 +-
 pbcore/io/align/BamAlignment.py                    |   45 +-
 pbcore/io/align/BamIO.py                           |   45 +-
 pbcore/io/align/CmpH5IO.py                         |   99 +-
 pbcore/io/align/_BamSupport.py                     |    7 +-
 pbcore/io/dataset/DataSetIO.py                     | 1272 +++++++++++++-------
 pbcore/io/dataset/DataSetMembers.py                |  338 ++++--
 pbcore/io/dataset/DataSetReader.py                 |   49 +-
 pbcore/io/dataset/DataSetWriter.py                 |   10 +
 pbcore/io/dataset/EntryPoints.py                   |  246 ----
 pbcore/io/dataset/utils.py                         |   20 +-
 pbcore/io/rangeQueries.py                          |    4 +-
 pbcore/model/__init__.py                           |    2 +
 pbcore/model/baseRegions.py                        |  192 +++
 requirements-dev.txt                               |    1 +
 requirements.txt                                   |    2 +-
 tests/test_pbcore_io_BasH5Reader.py                |    3 +-
 tests/test_pbcore_io_unaligned_bam.py              |    2 +-
 tests/test_pbdataset.py                            | 1139 ++++++++++++++++--
 tests/test_pbdataset_subtypes.py                   |  319 ++++-
 31 files changed, 2810 insertions(+), 1268 deletions(-)

diff --git a/CHANGELOG.org b/CHANGELOG.org
index 4eaeb5a..bf187aa 100644
--- a/CHANGELOG.org
+++ b/CHANGELOG.org
@@ -1,4 +1,16 @@
-* Current progress
+* Version 1.2.9
+  - Rename pulseFeature accessors to "baseFeature...", since they
+    reflect features accompanying basecalls as opposed to pulse calls;
+    add "pulseFeature..." accessors that actually get the pulse
+    features (from an internal-mode BAM)
+  - Improve FastaWriter performance with IndexedFastaRecords
+  - Improve DataSet chunking, filtration, consolidation
+
+* Version 1.2.8
+  - Dataset improvements
+  - Dataset CLI moved to pbcoretools
+  - Support for context flag filtering
+  - Misc fixes
 
 * Version 1.2.7...
   - Improved IPython tab completion
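
A minimal sketch of the renamed accessors described in the 1.2.9 entry, assuming an aligned, pbi-indexed PacBio BAM; the filename is illustrative, and the pulse-level accessors are inferred from the changelog wording rather than shown in the hunks below:

    from pbcore.io import IndexedBamReader

    with IndexedBamReader("aligned.bam") as bam:   # illustrative path
        aln = bam[0]
        ipd = aln.baseFeature("Ipd")               # formerly aln.pulseFeature("Ipd")
        iqv = aln.InsertionQV()                    # named accessors now wrap baseFeature
        # Internal-mode BAMs additionally carry true pulse-level features,
        # exposed (per the changelog) through the new pulseFeature accessors.
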
diff --git a/bin/dataset.py b/bin/dataset.py
index e5fcef8..ebf100f 100755
--- a/bin/dataset.py
+++ b/bin/dataset.py
@@ -1,67 +1,13 @@
 #!/usr/bin/env python
 
 import sys
-import argparse
-import logging
-import time
-from pbcore.io.dataset import EntryPoints
-from pbcore import __VERSION__
-
-
-LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
-
-def _setup_logging():
-    log = logging.getLogger()
-    logging.Formatter.converter = time.gmtime
-    if not log.handlers:
-        logging.basicConfig(level=logging.WARN, format=LOG_FORMAT)
-    return log
-
-def get_subparsers():
-    sps = [('create', EntryPoints.create_options),
-           ('filter', EntryPoints.filter_options),
-           ('merge', EntryPoints.merge_options),
-           ('split', EntryPoints.split_options),
-           ('validate', EntryPoints.validate_options),
-           ('loadstats', EntryPoints.loadStatsXml_options),
-           ('summarize', EntryPoints.summarize_options),
-           ('consolidate', EntryPoints.consolidate_options)]
-    return sps
-
-def add_subparsers(parser, sps):
-    subparsers = parser.add_subparsers(
-        title='DataSet sub-commands', dest='subparser_name',
-        help="Type {command} -h for a command's options")
-    for command_name, func in sps:
-        subparser = subparsers.add_parser(command_name)
-        subparser = func(subparser)
-    return parser
-
-def get_parser():
-    description = 'Run dataset.py by specifying a command.'
-    parser = argparse.ArgumentParser(version=__VERSION__,
-            description=description)
-    parser.add_argument("--debug", default=False, action='store_true',
-                        help="Turn on debug level logging")
-    parser.add_argument("--strict", default=False, action='store_true',
-                        help="Turn on strict tests, raise all errors")
-    parser.add_argument("--skipCounts", default=False, action='store_true',
-                        help="Turn on strict tests, raise all errors")
-    subparser_list = get_subparsers()
-    parser = add_subparsers(parser, subparser_list)
-    return parser
 
 def main(argv=sys.argv):
-    """Main point of Entry"""
-    log = _setup_logging()
-    log.info("Starting {f} version {v} dataset manipulator".format(
-        f=__file__, v=__VERSION__))
-    parser = get_parser()
-    args = parser.parse_args()
-    if args.debug:
-        log.setLevel(logging.DEBUG)
-    return args.func(args)
-    #return main_runner_default(argv[1:], get_parser(), log)
+    """This file and its use in setup.py should be removed after 1.2.8 is cut"""
+    print ("WARNING: dataset.py is no longer part of pbcore. It has moved to "
+           "pbcoretools (https://github.com/PacificBiosciences/pbcoretools) "
+           "and is now just 'dataset'")
+    exit(1)
 
 if __name__ == '__main__':
     sys.exit(main())
diff --git a/doc/pbcore.io.rst b/doc/pbcore.io.rst
index 34a9744..4d481ee 100644
--- a/doc/pbcore.io.rst
+++ b/doc/pbcore.io.rst
@@ -12,9 +12,28 @@ The classes within ``pbcore.io`` adhere to a few conventions, in order
 to provide a uniform API:
 
   - Each data file type is thought of as a container of a `Record`
-    type; all `Reader` classes support streaming access, and
-    `CmpH5Reader` and `BasH5Reader` additionally provide random-access
+    type; all `Reader` classes support streaming access by iterating over
+    the reader object, and
+    `CmpH5Reader`, `BasH5Reader`, and `IndexedBamReader` additionally
+    provide random access
     to alignments/reads.
+    
+    For example::
+    
+      from pbcore.io import *
+      with IndexedBamReader(filename) as f:
+        for r in f:
+            process(r)
+    
+    To make scripts a bit more user-friendly, a progress bar can be
+    easily added using the `tqdm` third-party package::
+    
+      from pbcore.io import *
+      from tqdm import tqdm
+      with IndexedBamReader(filename) as f:
+        for r in tqdm(f):
+            process(r)
+    
 
   - The constructor argument needed to instantiate `Reader` and
     `Writer` objects can be either a filename (which can be suffixed
@@ -31,7 +50,6 @@ to provide a uniform API:
     the `CmpH5Reader` object will be automatically closed after the
     block within the "with" statement is executed.
 
-
 BAM/cmp.h5 compatibility: quick start
 -------------------------------------
 
diff --git a/pbcore/__init__.py b/pbcore/__init__.py
index 5edce9f..bfe805f 100644
--- a/pbcore/__init__.py
+++ b/pbcore/__init__.py
@@ -28,4 +28,4 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #################################################################################
 
-__VERSION__ = "1.2.7"
+__VERSION__ = "1.2.9"
diff --git a/pbcore/chemistry/resources/mapping.xml b/pbcore/chemistry/resources/mapping.xml
index 2f625d8..c40b10a 100644
--- a/pbcore/chemistry/resources/mapping.xml
+++ b/pbcore/chemistry/resources/mapping.xml
@@ -176,9 +176,15 @@
     <SoftwareVersion>2.3</SoftwareVersion>
   </Mapping>
   <Mapping>
-    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <SequencingChemistry>S/P1-C1/beta</SequencingChemistry>
     <BindingKit>100-619-300</BindingKit>
     <SequencingKit>100-620-000</SequencingKit>
     <SoftwareVersion>3.0</SoftwareVersion>
   </Mapping>
+  <Mapping>
+    <SequencingChemistry>S/P1-C1</SequencingChemistry>
+    <BindingKit>100-619-300</BindingKit>
+    <SequencingKit>100-620-000</SequencingKit>
+    <SoftwareVersion>3.1</SoftwareVersion>
+  </Mapping>
 </MappingTable>
diff --git a/pbcore/data/datasets/__init__.py b/pbcore/data/datasets/__init__.py
index 2d4378a..b3d17c8 100755
--- a/pbcore/data/datasets/__init__.py
+++ b/pbcore/data/datasets/__init__.py
@@ -27,7 +27,8 @@ XML_FILES = ["alignment.dataset.xml", #0
             ]
 BAM_FILES = ["pbalchemysim0.pbalign.bam",
              "pbalchemysim1.pbalign.bam",
-             os.path.join('..', 'bam_mapping.bam')]
+             os.path.join('..', 'bam_mapping.bam'),
+             "empty_lambda.aligned.bam",]
 STATS_FILES = [
     "m150430_142051_Mon_p1_b25.sts.xml",
     "m150616_053259_ethan_c100710482550000001823136404221563_s1_p0.sts.xml"]
@@ -46,6 +47,9 @@ def getXmlWithStats():
 def getBam(no=0):
     return _getAbsPath(BAM_FILES[no])
 
+def getEmptyAlignedBam():
+    return _getAbsPath(BAM_FILES[3])
+
 def getStats(no=0):
     return _getAbsPath(STATS_FILES[no])
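
A sketch of how the new empty-alignment fixture can be exercised through the helper added above, assuming AlignmentSet handles zero-record inputs (presumably what the empty_lambda files are there to test):

    import pbcore.data.datasets as data
    from pbcore.io import AlignmentSet

    bam = data.getEmptyAlignedBam()       # path to the zero-record aligned BAM
    aset = AlignmentSet(bam)
    n = len(aset)                         # expected to report zero records
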
 
diff --git a/pbcore/data/datasets/empty_lambda.aligned.bam b/pbcore/data/datasets/empty_lambda.aligned.bam
new file mode 100644
index 0000000..edd878c
Binary files /dev/null and b/pbcore/data/datasets/empty_lambda.aligned.bam differ
diff --git a/pbcore/data/datasets/empty_lambda.aligned.bam.bai b/pbcore/data/datasets/empty_lambda.aligned.bam.bai
new file mode 100644
index 0000000..13df110
Binary files /dev/null and b/pbcore/data/datasets/empty_lambda.aligned.bam.bai differ
diff --git a/pbcore/data/datasets/empty_lambda.aligned.bam.pbi b/pbcore/data/datasets/empty_lambda.aligned.bam.pbi
new file mode 100644
index 0000000..e398d79
Binary files /dev/null and b/pbcore/data/datasets/empty_lambda.aligned.bam.pbi differ
diff --git a/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam.pbi b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam.pbi
new file mode 100644
index 0000000..1d37ca5
Binary files /dev/null and b/pbcore/data/m130727_114215_42211_c100569412550000001823090301191423_s1_p0.ccs.bam.pbi differ
diff --git a/pbcore/io/BasH5IO.py b/pbcore/io/BasH5IO.py
index 86e50a3..9d3055d 100644
--- a/pbcore/io/BasH5IO.py
+++ b/pbcore/io/BasH5IO.py
@@ -44,30 +44,13 @@ from pbcore.io.FofnIO import readFofn
 from pbcore.chemistry import (decodeTriple,
                               tripleFromMetadataXML,
                               ChemistryLookupError)
+from pbcore.model import ExtraBaseRegionsMixin, HQ_REGION
 from ._utils import arrayFromDataset, CommonEqualityMixin
 
 
-def intersectRanges(r1, r2):
-    b1, e1 = r1
-    b2, e2 = r2
-    b, e = max(b1, b2), min(e1, e2)
-    return (b, e) if (b < e) else None
-
-def rangeLength(r):
-    b, e = r
-    return e - b
-
-def removeNones(lst):
-    return filter(lambda x: x!=None, lst)
-
 # ZMW hole Types
 SEQUENCING_ZMW = 0
 
-# Region types
-ADAPTER_REGION = 0
-INSERT_REGION  = 1
-HQ_REGION      = 2
-
 # This seems to be the magic incantation to get a RecArray that can be
 # indexed to yield a record that can then be accessed using dot
 # notation.
@@ -85,7 +68,7 @@ def _makeQvAccessor(featureName):
         return self.qv(featureName)
     return f
 
-class Zmw(CommonEqualityMixin):
+class Zmw(CommonEqualityMixin, ExtraBaseRegionsMixin):
     """
     A Zmw represents all data from a ZMW (zero-mode waveguide) hole
     within a bas.h5 movie file.  Accessor methods provide convenient
@@ -111,65 +94,6 @@ class Zmw(CommonEqualityMixin):
             return toRecArray(REGION_TABLE_DTYPE,
                               [ (self.holeNumber, HQ_REGION, 0, 0, 0) ])
 
-    #
-    # The following "region" calls return one or more intervals ((int, int)).
-    #  - The default implementations perform clipping to the hqRegion.
-    #  - The "unclipped" implementations entail no clipping
-    #
-    @property
-    def adapterRegionsNoQC(self):
-        """
-        Get adapter regions as intervals, without clipping to the HQ
-        region.  Don't use this unless you know what you're doing.
-        """
-        return [ (region.regionStart, region.regionEnd)
-                 for region in self.regionTable
-                 if region.regionType == ADAPTER_REGION ]
-
-    @property
-    def adapterRegions(self):
-        """
-        Get adapter regions as intervals, performing clipping to the HQ region
-        """
-        hqRegion = self.hqRegion
-        return removeNones([ intersectRanges(hqRegion, region)
-                             for region in self.adapterRegionsNoQC ])
-
-    @property
-    def insertRegionsNoQC(self):
-        """
-        Get insert regions as intervals, without clipping to the HQ
-        region.  Don't use this unless you know what you're doing.
-        """
-        return [ (region.regionStart, region.regionEnd)
-                 for region in self.regionTable
-                 if region.regionType == INSERT_REGION ]
-
-    @property
-    def insertRegions(self):
-        """
-        Get insert regions as intervals, clipped to the HQ region
-        """
-        hqRegion = self.hqRegion
-        return removeNones([ intersectRanges(hqRegion, region)
-                             for region in self.insertRegionsNoQC ])
-    @property
-    def hqRegion(self):
-        """
-        Return the HQ region interval.
-
-        The HQ region is an interval of basecalls where the basecaller has
-        inferred that a single sequencing reaction is taking place.
-        Secondary analysis should only use subreads within the HQ
-        region.  Methods in this class, with the exception of the
-        "NoQC" methods, return data appropriately clipped/filtered to
-        the HQ region.
-        """
-        rt = self.regionTable
-        hqRows = rt[rt.regionType == HQ_REGION]
-        assert len(hqRows) == 1
-        hqRow = hqRows[0]
-        return hqRow.regionStart, hqRow.regionEnd
 
     @property
     def readScore(self):
@@ -242,7 +166,6 @@ class Zmw(CommonEqualityMixin):
         readEnd   = hqEnd if readEnd is None else readEnd
         return ZmwRead(self.baxH5, self.holeNumber, readStart, readEnd)
 
-
     def readNoQC(self, readStart=None, readEnd=None):
         """
         Given no arguments, returns the entire polymerase read, *not
@@ -255,71 +178,12 @@ class Zmw(CommonEqualityMixin):
             as we make no guarantees about what happens outside of the
             HQ region.
         """
-        if not self.baxH5.hasRawBasecalls:
-            raise ValueError, "No raw reads in this file"
         polymeraseBegin = 0
         polymeraseEnd = self.numEvents
         readStart = polymeraseBegin if readStart is None else readStart
         readEnd   = polymeraseEnd   if readEnd   is None else readEnd
         return ZmwRead(self.baxH5, self.holeNumber, readStart, readEnd)
 
-    @property
-    def subreadsNoQC(self):
-        """
-        Get the subreads, including data beyond the bounds of the HQ region.
-
-        .. warning::
-
-            It is not recommended that production code use this method
-            as we make no guarantees about what happens outside of the
-            HQ region.
-        """
-        if not self.baxH5.hasRawBasecalls:
-            raise ValueError, "No raw reads in this file"
-        return [ self.read(readStart, readEnd)
-                 for (readStart, readEnd) in self.insertRegionsNoQC ]
-
-    @property
-    def subreads(self):
-        """
-        Get the subreads as a list of ZmwRead objects.  Restricts focus,
-        and clips to, the HQ region.  This method can be used by
-        production code.
-        """
-        if not self.baxH5.hasRawBasecalls:
-            raise ValueError, "No raw reads in this file"
-        return [ self.read(readStart, readEnd)
-                 for (readStart, readEnd) in self.insertRegions ]
-
-
-    @property
-    def adapters(self):
-        """
-        Get the adapter hits as a list of ZmwRead objects.  Restricts
-        focus, and clips to, the HQ region.  This method can be used
-        by production code.
-        """
-        if not self.baxH5.hasRawBasecalls:
-            raise ValueError, "No raw reads in this file"
-        return [ self.read(readStart, readEnd)
-                 for (readStart, readEnd) in self.adapterRegions ]
-
-    @property
-    def adaptersNoQC(self):
-        """
-        Get the adapters, including data beyond the bounds of the HQ
-        region.
-
-        .. warning::
-
-            It is not recommended that production code use this method
-            as we make no guarantees about what happens outside of the
-            HQ region.
-        """
-        if not self.baxH5.hasRawBasecalls:
-            raise ValueError, "No raw reads in this file"
-        return [ self.read(readStart, readEnd)
-                 for (readStart, readEnd) in self.adapterRegionsNoQC ]
 
     @property
     def ccsRead(self):
@@ -429,8 +293,9 @@ class CCSZmwRead(ZmwRead):
 def _makeOffsetsDataStructure(h5Group):
     numEvent   = h5Group["ZMW/NumEvent"].value
     holeNumber = h5Group["ZMW/HoleNumber"].value
-    endOffset = np.cumsum(numEvent)
-    beginOffset = np.hstack(([0], endOffset[0:-1]))
+    endOffset = np.cumsum(numEvent, dtype=np.uint32)
+    singleZero = np.array([0], dtype=np.uint32)
+    beginOffset = np.hstack((singleZero, endOffset[0:-1]))
     offsets = zip(beginOffset, endOffset)
     return dict(zip(holeNumber, offsets))
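
The region accessors removed here (hqRegion, subreads, adapterRegions, and the NoQC variants) now come from pbcore.model.ExtraBaseRegionsMixin, so Zmw keeps the same surface; a sketch against a bas.h5 file, with the path illustrative:

    from pbcore.io import BasH5Reader

    with BasH5Reader("movie.bas.h5") as bas:      # illustrative path
        zmw = bas[bas.sequencingZmws[0]]
        hqStart, hqEnd = zmw.hqRegion             # still available via the mixin
        subreads = zmw.subreads                   # ZmwReads clipped to the HQ region
        adapters = zmw.adapterRegions             # list of (start, end) intervals
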
 
diff --git a/pbcore/io/FastaIO.py b/pbcore/io/FastaIO.py
index 79867fc..6735636 100644
--- a/pbcore/io/FastaIO.py
+++ b/pbcore/io/FastaIO.py
@@ -252,12 +252,9 @@ class FastaWriter(WriterBase):
             raise ValueError
         if len(args) == 1:
             record = args[0]
-            if isinstance(record, IndexedFastaRecord):
-                record = FastaRecord(record.header, record.sequence)
-            assert isinstance(record, FastaRecord)
         else:
             header, sequence = args
-            record = FastaRecord(header, sequence)
+            record = FastaRecord(header, str(sequence))
         self.file.write(str(record))
         self.file.write("\n")
 
@@ -350,7 +347,14 @@ class MmappedFastaSequence(Sequence):
         return (isinstance(other, MmappedFastaSequence) and
                 self[:] == other[:])
 
+    def __str__(self):
+        return str(self[:])
+
+
 class IndexedFastaRecord(object):
+
+    COLUMNS   = 60
+
     def __init__(self, view, faiRecord):
         self.view = view
         self.faiRecord = faiRecord
@@ -391,6 +395,14 @@ class IndexedFastaRecord(object):
                 self.header == other.header and
                 self.sequence == other.sequence)
 
+    def __str__(self):
+        """
+        Output a string representation of this FASTA record, observing
+        standard conventions about sequence wrapping.
+        """
+        return (">%s\n" % self.header) + \
+            wrap(self.sequence, self.COLUMNS)
+
 class IndexedFastaReader(ReaderBase, Sequence):
     """
     Random-access FASTA file reader.
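
The __str__ added to IndexedFastaRecord is what lets FastaWriter emit indexed records without first copying them into FastaRecord objects (the performance improvement noted in the changelog); a sketch with illustrative filenames:

    from pbcore.io import IndexedFastaReader, FastaWriter

    writer = FastaWriter("copy.fasta")            # illustrative output path
    with IndexedFastaReader("ref.fasta") as reader:
        for rec in reader:
            writer.writeRecord(rec)               # wraps at 60 columns via __str__
    writer.close()
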
diff --git a/pbcore/io/align/BamAlignment.py b/pbcore/io/align/BamAlignment.py
index 2bdba22..8428d0f 100644
--- a/pbcore/io/align/BamAlignment.py
+++ b/pbcore/io/align/BamAlignment.py
@@ -58,12 +58,11 @@ def _unrollCigar(cigar, exciseSoftClips=False):
     else:
         return ops
 
-def _makePulseFeatureAccessor(featureName):
+def _makeBaseFeatureAccessor(featureName):
     def f(self, aligned=True, orientation="native"):
-        return self.pulseFeature(featureName, aligned, orientation)
+        return self.baseFeature(featureName, aligned, orientation)
     return f
 
-
 def requiresReference(method):
     @wraps(method)
     def f(bamAln, *args, **kwargs):
@@ -260,11 +259,27 @@ class BamAlignment(AlignmentRecordMixin):
         return self.readGroupInfo.ReadType
 
     @property
+    def scrapType(self):
+        if self.readType != "SCRAP":
+            raise ValueError, "scrapType not meaningful for non-scrap reads"
+        else:
+            return self.peer.opt("sc")
+
+    @property
     def sequencingChemistry(self):
         return self.readGroupInfo.SequencingChemistry
 
     @property
     def hqRegionSnr(self):
+        """
+        Return the per-channel SNR averaged over the HQ region.
+
+        .. note::
+
+            This capability was not available in `cmp.h5` files, so
+            use of this property can result in code that won't work on
+            legacy data.
+        """
         return self.peer.opt("sn")
 
     @property
@@ -431,9 +446,9 @@ class BamAlignment(AlignmentRecordMixin):
             return pos[ucOriented != BAM_CINS]
 
 
-    def pulseFeature(self, featureName, aligned=True, orientation="native"):
+    def baseFeature(self, featureName, aligned=True, orientation="native"):
         """
-        Retrieve the pulse feature as indicated.
+        Retrieve the base feature as indicated.
         - `aligned`    : whether gaps should be inserted to reflect the alignment
         - `orientation`: "native" or "genomic"
 
@@ -450,10 +465,10 @@ class BamAlignment(AlignmentRecordMixin):
 
         # 0. Get the "concrete" feature name.  (Example: Ipd could be
         # Ipd:Frames or Ipd:CodecV1)
-        concreteFeatureName = self.bam._featureNameMappings[self.qId][featureName]
+        concreteFeatureName = self.bam._baseFeatureNameMappings[self.qId][featureName]
 
         # 1. Extract in native orientation
-        tag, kind_, dtype_ = PULSE_FEATURE_TAGS[concreteFeatureName]
+        tag, kind_, dtype_ = BASE_FEATURE_TAGS[concreteFeatureName]
         data_ = self.peer.opt(tag)
 
         if isinstance(data_, str):
@@ -518,14 +533,14 @@ class BamAlignment(AlignmentRecordMixin):
         alnData[~gapMask] = data
         return alnData
 
-    IPD            = _makePulseFeatureAccessor("Ipd")
-    PulseWidth     = _makePulseFeatureAccessor("PulseWidth")
-    #QualityValue   = _makePulseFeatureAccessor("QualityValue")
-    InsertionQV    = _makePulseFeatureAccessor("InsertionQV")
-    DeletionQV     = _makePulseFeatureAccessor("DeletionQV")
-    DeletionTag    = _makePulseFeatureAccessor("DeletionTag")
-    MergeQV        = _makePulseFeatureAccessor("MergeQV")
-    SubstitutionQV = _makePulseFeatureAccessor("SubstitutionQV")
+    IPD            = _makeBaseFeatureAccessor("Ipd")
+    PulseWidth     = _makeBaseFeatureAccessor("PulseWidth")
+    #QualityValue   = _makeBaseFeatureAccessor("QualityValue")
+    InsertionQV    = _makeBaseFeatureAccessor("InsertionQV")
+    DeletionQV     = _makeBaseFeatureAccessor("DeletionQV")
+    DeletionTag    = _makeBaseFeatureAccessor("DeletionTag")
+    MergeQV        = _makeBaseFeatureAccessor("MergeQV")
+    SubstitutionQV = _makeBaseFeatureAccessor("SubstitutionQV")
 
     def read(self, aligned=True, orientation="native"):
         if not (orientation == "native" or orientation == "genomic"):
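
The renamed baseFeature keeps the aligned/orientation signature of the old pulseFeature, and the named accessors (IPD, InsertionQV, ...) remain thin wrappers over it; a sketch, with the filename illustrative:

    from pbcore.io import IndexedBamReader

    with IndexedBamReader("aligned.bam") as bam:
        aln = bam[0]
        ipdNative = aln.baseFeature("Ipd")                  # gapped, read orientation
        ipdGenomic = aln.baseFeature("Ipd", aligned=False,
                                     orientation="genomic") # ungapped, reference strand
        dq = aln.DeletionQV(aligned=False)                  # named accessor, same options
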
diff --git a/pbcore/io/align/BamIO.py b/pbcore/io/align/BamIO.py
index f731b14..a9e7fa7 100644
--- a/pbcore/io/align/BamIO.py
+++ b/pbcore/io/align/BamIO.py
@@ -98,7 +98,10 @@ class _BamReaderBase(ReaderBase):
     def _loadReadGroupInfo(self):
         rgs = self.peer.header["RG"]
         readGroupTable_ = []
-        self._featureNameMappings = {}  # RGID -> ("abstract feature name" -> actual feature name)
+
+        # RGID -> ("abstract feature name" -> actual feature name)
+        self._baseFeatureNameMappings = {}
+        self._pulseFeatureNameMappings = {}
 
         for rg in rgs:
             rgID = rgAsInt(rg["ID"])
@@ -110,9 +113,7 @@ class _BamReaderBase(ReaderBase):
             triple = ds["BINDINGKIT"], ds["SEQUENCINGKIT"], basecallerVersion
             rgChem = decodeTriple(*triple)
             rgReadType = ds["READTYPE"]
-            # TODO(dalexander): need FRAMERATEHZ in RG::DS!
-            #rgFrameRate = ds["FRAMERATEHZ"]
-            rgFrameRate = 75.0
+            rgFrameRate = ds["FRAMERATEHZ"]
             readGroupTable_.append((rgID, rgName, rgReadType, rgChem, rgFrameRate))
 
             # Look for the features manifest entries within the DS tag,
@@ -120,10 +121,14 @@ class _BamReaderBase(ReaderBase):
             # "Ipd"  to "Ipd:Frames"
             # (This is a bit messy.  Can we separate the manifest from
             # the rest of the DS content?)
-            featureNameMapping = { key.split(":")[0] : key
-                                   for key in ds.keys()
-                                   if key in PULSE_FEATURE_TAGS }
-            self._featureNameMappings[rgID] = featureNameMapping
+            baseFeatureNameMapping  = { key.split(":")[0] : key
+                                        for key in ds.keys()
+                                        if key in BASE_FEATURE_TAGS }
+            pulseFeatureNameMapping = { key.split(":")[0] : key
+                                        for key in ds.keys()
+                                        if key in PULSE_FEATURE_TAGS }
+            self._baseFeatureNameMappings[rgID]  = baseFeatureNameMapping
+            self._pulseFeatureNameMappings[rgID] = pulseFeatureNameMapping
 
         self._readGroupTable = np.rec.fromrecords(
             readGroupTable_,
@@ -138,10 +143,12 @@ class _BamReaderBase(ReaderBase):
         self._readGroupDict = { rg.ID : rg
                                 for rg in self._readGroupTable }
 
-        # The pulse features "available" to clients of this file are the intersection
-        # of pulse features available from each read group.
+        # The base/pulse features "available" to clients of this file are the intersection
+        # of features available from each read group.
+        self._baseFeaturesAvailable = set.intersection(
+            *[set(mapping.keys()) for mapping in self._baseFeatureNameMappings.values()])
         self._pulseFeaturesAvailable = set.intersection(
-            *[set(mapping.keys()) for mapping in self._featureNameMappings.values()])
+            *[set(mapping.keys()) for mapping in self._pulseFeatureNameMappings.values()])
 
     def _loadProgramInfo(self):
         pgRecords = [ (pg["ID"], pg.get("VN", None), pg.get("CL", None))
@@ -288,12 +295,25 @@ class _BamReaderBase(ReaderBase):
         self.peer.seek(offset)
         return BamAlignment(self, next(self.peer))
 
+    def hasBaseFeature(self, featureName):
+        return featureName in self._baseFeaturesAvailable
+
+    def baseFeaturesAvailable(self):
+        return self._baseFeaturesAvailable
+
     def hasPulseFeature(self, featureName):
         return featureName in self._pulseFeaturesAvailable
 
     def pulseFeaturesAvailable(self):
         return self._pulseFeaturesAvailable
 
+    def hasPulseFeatures(self):
+        """
+        Is this BAM file a product of running analysis with the
+        PacBio-internal analysis mode enabled?
+        """
+        return self.hasPulseFeature("PulseCall")
+
     @property
     def barcode(self):
         raise Unimplemented()
@@ -369,7 +389,8 @@ class IndexedBamReader(_BamReaderBase, IndexedAlignmentReaderMixin):
             if exists(pbiFname):
                 self.pbi = PacBioBamIndex(pbiFname)
             else:
-                raise IOError, "IndexedBamReader requires bam.pbi index file"
+                raise IOError("IndexedBamReader requires bam.pbi index file "+
+                              "to read {f}".format(f=fname))
         else:
             self.pbi = sharedIndex
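
A sketch of the split feature-discovery API added here (base features describe basecalls; pulse features are present only in internal-mode BAMs); the filename is illustrative:

    from pbcore.io import IndexedBamReader

    with IndexedBamReader("aligned.bam") as bam:
        baseFeats = sorted(bam.baseFeaturesAvailable())      # e.g. Ipd, PulseWidth, QVs
        hasIpd = bam.hasBaseFeature("Ipd")                   # guard before baseFeature("Ipd")
        if bam.hasPulseFeatures():                           # internal-mode output only
            pulseFeats = sorted(bam.pulseFeaturesAvailable())
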
 
diff --git a/pbcore/io/align/CmpH5IO.py b/pbcore/io/align/CmpH5IO.py
index 61eb81a..aa5e83b 100755
--- a/pbcore/io/align/CmpH5IO.py
+++ b/pbcore/io/align/CmpH5IO.py
@@ -187,9 +187,9 @@ OFFSET_TABLE_DTYPE = [ ("ID",       np.uint32),
                        ("EndRow",   np.uint32) ]
 
 
-def _makePulseFeatureAccessor(featureName):
+def _makeBaseFeatureAccessor(featureName):
     def f(self, aligned=True, orientation="native"):
-        return self.pulseFeature(featureName, aligned, orientation)
+        return self.baseFeature(featureName, aligned, orientation)
     return f
 
 class CmpH5Alignment(AlignmentRecordMixin):
@@ -546,9 +546,9 @@ class CmpH5Alignment(AlignmentRecordMixin):
         else:
             return self.rStart + np.hstack([0, np.cumsum(readNonGapMask[:-1])])
 
-    def pulseFeature(self, featureName, aligned=True, orientation="native"):
+    def baseFeature(self, featureName, aligned=True, orientation="native"):
         """
-        Access a pulse feature by name.
+        Access a base feature by name.
         """
         pulseDataset = self._alignmentGroup[featureName]
         pulseArray = arrayFromDataset(pulseDataset, self.Offset_begin, self.Offset_end)
@@ -561,14 +561,14 @@ class CmpH5Alignment(AlignmentRecordMixin):
         else:
             return ungappedPulseArray(alignedPulseArray)
 
-    IPD            = _makePulseFeatureAccessor("IPD")
-    PulseWidth     = _makePulseFeatureAccessor("PulseWidth")
-    QualityValue   = _makePulseFeatureAccessor("QualityValue")
-    InsertionQV    = _makePulseFeatureAccessor("InsertionQV")
-    DeletionQV     = _makePulseFeatureAccessor("DeletionQV")
-    DeletionTag    = _makePulseFeatureAccessor("DeletionTag")
-    MergeQV        = _makePulseFeatureAccessor("MergeQV")
-    SubstitutionQV = _makePulseFeatureAccessor("SubstitutionQV")
+    IPD            = _makeBaseFeatureAccessor("IPD")
+    PulseWidth     = _makeBaseFeatureAccessor("PulseWidth")
+    QualityValue   = _makeBaseFeatureAccessor("QualityValue")
+    InsertionQV    = _makeBaseFeatureAccessor("InsertionQV")
+    DeletionQV     = _makeBaseFeatureAccessor("DeletionQV")
+    DeletionTag    = _makeBaseFeatureAccessor("DeletionTag")
+    MergeQV        = _makeBaseFeatureAccessor("MergeQV")
+    SubstitutionQV = _makeBaseFeatureAccessor("SubstitutionQV")
 
     def __getattr__(self, key):
         return self.cmpH5.alignmentIndex[self.rowNumber][key]
@@ -755,8 +755,8 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
         # dataset_name -> dataset }.  This way we avoid B-tree
         # scanning in basic data access.
         self._alignmentGroupById = {}
-        for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"],
-                                              self.file["/AlnGroup/Path"]):
+        for (alnGroupId, alnGroupPath) in zip(self.file["/AlnGroup/ID"][:],
+                                              self.file["/AlnGroup/Path"][:]):
             alnGroup = self.file[alnGroupPath]
             self._alignmentGroupById[alnGroupId] = dict(alnGroup.items())
 
@@ -807,18 +807,18 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
 
     def _loadReferenceInfo(self):
         _referenceGroupTbl = np.rec.fromrecords(
-            zip(self.file["/RefGroup/ID"],
-                self.file["/RefGroup/RefInfoID"],
+            zip(self.file["/RefGroup/ID"][:],
+                self.file["/RefGroup/RefInfoID"][:],
                 [path[1:] for path in self.file["/RefGroup/Path"]]),
             dtype=[("ID"       , int),
                    ("RefInfoID", int),
                    ("Name"     , object)])
 
         _referenceInfoTbl = np.rec.fromrecords(
-            zip(self.file["/RefInfo/ID"],
-                self.file["/RefInfo/FullName"],
-                self.file["/RefInfo/Length"],
-                self.file["/RefInfo/MD5"]) ,
+            zip(self.file["/RefInfo/ID"][:],
+                self.file["/RefInfo/FullName"][:],
+                self.file["/RefInfo/Length"][:],
+                self.file["/RefInfo/MD5"][:]) ,
             dtype=[("RefInfoID", int),
                    ("FullName" , object),
                    ("Length"   , int),
@@ -838,26 +838,37 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
                                                 jointype="inner")
         self._referenceDict = {}
         self._readLocatorByKey = {}
-        for record in self._referenceInfoTable:
-            if record.ID != -1:
-                assert record.ID != record.Name
+
+        # For cmp.h5 files with large numbers of references, accessing
+        # the recarray fields in the inner loop was terribly slow.
+        # This makes things faster, though the code is less
+        # straightforward.  (One of the tradeoffs we have to make
+        # without a compiler to help us...)
+        recordID       = self._referenceInfoTable.ID
+        recordName     = self._referenceInfoTable.Name
+        recordFullName = self._referenceInfoTable.FullName
+        recordMD5      = self._referenceInfoTable.MD5
+
+        for i, record in enumerate(self._referenceInfoTable):
+            if recordID[i] != -1:
+                assert recordID[i] != record.Name
                 shortName = splitFastaHeader(record.FullName)[0]
-                if (shortName       in self._referenceDict or
-                    record.ID       in self._referenceDict or
-                    record.Name     in self._referenceDict or
-                    record.FullName in self._referenceDict or
-                    record.MD5      in self._referenceDict):
+                if (shortName         in self._referenceDict or
+                    recordID[i]       in self._referenceDict or
+                    recordName[i]     in self._referenceDict or
+                    recordFullName[i] in self._referenceDict or
+                    recordMD5[i]      in self._referenceDict):
                     raise ValueError, "Duplicate reference contig sequence or identifier"
                 else:
-                    self._referenceDict[shortName]       = record
-                    self._referenceDict[record.ID]       = record
-                    self._referenceDict[record.Name]     = record
-                    self._referenceDict[record.FullName] = record
-                    self._referenceDict[record.MD5]      = record
+                    self._referenceDict[shortName]         = record
+                    self._referenceDict[recordID[i]]       = record
+                    self._referenceDict[recordName[i]]     = record
+                    self._referenceDict[recordFullName[i]] = record
+                    self._referenceDict[recordMD5[i]]      = record
 
                 if self.isSorted:
-                    readLocator = makeReadLocator(self, record.ID)
-                    self._readLocatorByKey[record.ID] = readLocator
+                    readLocator = makeReadLocator(self, recordID[i])
+                    self._readLocatorByKey[recordID[i]] = readLocator
                     self._readLocatorByKey[shortName] = readLocator
 
     def _loadMiscInfo(self):
@@ -1169,7 +1180,7 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
         else:
             return self[rowNumbers]
 
-    def hasPulseFeature(self, featureName):
+    def hasBaseFeature(self, featureName):
         """
         Are the datasets for pulse feature `featureName` loaded in
         this file?  Specifically, is it loaded for all movies within
@@ -1177,30 +1188,30 @@ class CmpH5Reader(ReaderBase, IndexedAlignmentReaderMixin):
 
         .. doctest::
 
-            >>> c.hasPulseFeature("InsertionQV")
+            >>> c.hasBaseFeature("InsertionQV")
             True
-            >>> c.hasPulseFeature("MergeQV")
+            >>> c.hasBaseFeature("MergeQV")
             False
 
         """
         return all(featureName in alnGroup.keys()
                    for alnGroup in self._alignmentGroupById.values())
 
-    def pulseFeaturesAvailable(self):
+    def baseFeaturesAvailable(self):
         """
         What pulse features are available in this cmp.h5 file?
 
         .. doctest::
 
-            >>> c.pulseFeaturesAvailable()
+            >>> c.baseFeaturesAvailable()
             [u'QualityValue', u'IPD', u'PulseWidth', u'InsertionQV', u'DeletionQV']
 
         """
-        pulseFeaturesByMovie = [ alnGroup.keys()
+        baseFeaturesByMovie = [ alnGroup.keys()
                                  for alnGroup in self._alignmentGroupById.values() ]
-        pulseFeaturesAvailableAsSet = set.intersection(*map(set, pulseFeaturesByMovie))
-        pulseFeaturesAvailableAsSet.discard("AlnArray")
-        return list(pulseFeaturesAvailableAsSet)
+        baseFeaturesAvailableAsSet = set.intersection(*map(set, baseFeaturesByMovie))
+        baseFeaturesAvailableAsSet.discard("AlnArray")
+        return list(baseFeaturesAvailableAsSet)
 
     @property
     def barcode(self):
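
The comment in the _loadReferenceInfo hunk above explains the speedup: the recarray columns are pulled out once and plain arrays are indexed inside the loop, instead of doing per-record attribute access. A generic numpy illustration of that pattern (not pbcore code):

    import numpy as np

    table = np.rec.fromrecords([(1, "chr1"), (2, "chr2")],
                               dtype=[("ID", int), ("Name", object)])

    # Slow pattern: attribute access on each record inside the loop.
    idsSlow = [rec.ID for rec in table]

    # Faster pattern used above: hoist the column lookups out of the loop.
    recordID = table.ID
    recordName = table.Name
    idsFast = [recordID[i] for i in range(len(table))]
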
diff --git a/pbcore/io/align/_BamSupport.py b/pbcore/io/align/_BamSupport.py
index 4f8f92a..fd12f62 100644
--- a/pbcore/io/align/_BamSupport.py
+++ b/pbcore/io/align/_BamSupport.py
@@ -38,7 +38,7 @@ class ReferenceMismatch(Exception):  pass
 class IncompatibleFile(Exception):   pass
 
 
-PULSE_FEATURE_TAGS = { "InsertionQV"        : ("iq", "qv",      np.uint8),
+BASE_FEATURE_TAGS =  { "InsertionQV"        : ("iq", "qv",      np.uint8),
                        "DeletionQV"         : ("dq", "qv",      np.uint8),
                        "DeletionTag"        : ("dt", "base",    np.int8 ),
                        "SubstitutionQV"     : ("sq", "qv",      np.uint8),
@@ -48,6 +48,11 @@ PULSE_FEATURE_TAGS = { "InsertionQV"        : ("iq", "qv",      np.uint8),
                        "PulseWidth:Frames"  : ("pw", "frames",  np.uint16),
                        "PulseWidth:CodecV1" : ("pw", "codecV1", np.uint8) }
 
+PULSE_FEATURE_TAGS = { "PulseCall"          : ("pc", "pulse",    np.uint8),
+                       "StartFrame"         : ("sf", "frames32", np.uint32),
+                       "PkMid"              : ("pm", "photons",  np.uint16),
+                       "PkMean"             : ("pa", "photons",  np.uint16) }
+
 ASCII_COMPLEMENT_MAP = { ord("A") : ord("T"),
                          ord("T") : ord("A"),
                          ord("C") : ord("G"),
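
A small sketch of how these tables are consumed (cf. baseFeature in BamAlignment.py above): the abstract feature name is resolved to a concrete name via the read group's DS manifest, and the concrete name is looked up here to find the two-letter BAM tag; importing the private module is for illustration only:

    from pbcore.io.align._BamSupport import BASE_FEATURE_TAGS, PULSE_FEATURE_TAGS

    tag, kind, dtype_ = BASE_FEATURE_TAGS["PulseWidth:Frames"]  # ("pw", "frames", np.uint16)
    pcTag = PULSE_FEATURE_TAGS["PulseCall"][0]                  # "pc"
    # The raw values are then fetched from the record, e.g. alignment.peer.opt(tag).
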
diff --git a/pbcore/io/dataset/DataSetIO.py b/pbcore/io/dataset/DataSetIO.py
index d80191b..a5edaef 100755
--- a/pbcore/io/dataset/DataSetIO.py
+++ b/pbcore/io/dataset/DataSetIO.py
@@ -2,21 +2,22 @@
 Classes representing DataSets of various types.
 """
 
-from collections import defaultdict, Counter
 import hashlib
 import datetime
-import math
 import copy
-import os
+import os, sys
+import re
 import errno
 import logging
 import itertools
 import xml.dom.minidom
 import tempfile
-from functools import wraps
 import numpy as np
 from urlparse import urlparse
+from functools import wraps, partial
+from collections import defaultdict, Counter
 from pbcore.util.Process import backticks
+from pbcore.chemistry.chemistry import ChemistryLookupError
 from pbcore.io.align.PacBioBamIndex import PBI_FLAGS_BARCODE
 from pbcore.io.FastaIO import splitFastaHeader, FastaWriter
 from pbcore.io.FastqIO import FastqReader, FastqWriter, qvsFromAscii
@@ -55,6 +56,13 @@ def filtered(generator):
                     yield read
     return wrapper
 
+def _getTimeStampedName(mType):
+    """Generate a timestamped name using the given metatype 'mType' and the
+    current UTC time"""
+    mType = mType.lower()
+    mType = '_'.join(mType.split('.'))
+    time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
+    return "{m}-{t}".format(m=mType, t=time)
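
This module-level helper replaces the DataSet._getTimeStampedName method removed further down; the output format, for illustration:

    from pbcore.io.dataset.DataSetIO import _getTimeStampedName

    # MetaType is lower-cased, dots become underscores, and a UTC timestamp with
    # millisecond precision is appended, e.g. "pacbio_dataset_subreadset-160412_063133123"
    name = _getTimeStampedName("PacBio.DataSet.SubreadSet")
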
 
 def _toDsId(name):
     """Translate a class name into a MetaType/ID"""
@@ -100,15 +108,16 @@ def isDataSet(xmlfile):
     try:
         _typeDataSet(xmlfile)
         return True
-    except:
+    except Exception:
         return False
 
 def openDataSet(*files, **kwargs):
     """Factory function for DataSet types as suggested by the first file"""
-    if not isDataSet(files[0]):
+    try:
+        tbrType = _typeDataSet(files[0])
+        return tbrType(*files, **kwargs)
+    except Exception:
         raise TypeError("openDataSet requires that the first file is a DS")
-    tbrType = _typeDataSet(files[0])
-    return tbrType(*files, **kwargs)
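
The factory now lets _typeDataSet and construction raise inside a try block instead of pre-checking with isDataSet; usage is unchanged. A sketch, with the XML filename illustrative:

    from pbcore.io import openDataSet

    # Returns the concrete type named in the XML (e.g. a SubreadSet or AlignmentSet);
    # raises TypeError if the first file is not a DataSet XML.
    ds = openDataSet("mapped.alignmentset.xml")
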
 
 def openDataFile(*files, **kwargs):
     """Factory function for DataSet types determined by the first data file"""
@@ -158,12 +167,59 @@ def _stackRecArrays(recArrays):
     tbr = tbr.view(np.recarray)
     return tbr
 
+def _uniqueRecords(recArray):
+    """Remove duplicate records"""
+    unique = set()
+    unique_i = []
+    for i, rec in enumerate(recArray):
+        rect = tuple(rec)
+        if rect not in unique:
+            unique.add(rect)
+            unique_i.append(i)
+    return recArray[unique_i]
+
+def _fieldsView(recArray, fields):
+    vdtype = np.dtype({fi:recArray.dtype.fields[fi] for fi in fields})
+    return np.ndarray(recArray.shape, vdtype, recArray, 0, recArray.strides)
+
+def _renameField(recArray, current, new):
+    ind = recArray.dtype.names.index(current)
+    names = list(recArray.dtype.names)
+    names[ind] = new
+    recArray.dtype.names = names
+
 def _flatten(lol, times=1):
     """ This wont do well with mixed nesting"""
     for _ in range(times):
         lol = np.concatenate(lol)
     return lol
 
+def divideKeys(keys, chunks):
+    if chunks < 1:
+        return []
+    if chunks > len(keys):
+        chunks = len(keys)
+    chunksize = len(keys)/chunks
+    key_chunks = [keys[(i * chunksize):((i + 1) * chunksize)] for i in
+                  range(chunks-1)]
+    key_chunks.append(keys[((chunks - 1) * chunksize):])
+    return key_chunks
+
+def splitKeys(keys, chunks):
+    if chunks < 1:
+        return []
+    if chunks > len(keys):
+        chunks = len(keys)
+    chunksize = len(keys)/chunks
+    key_chunks = [(keys[i * chunksize], keys[(i + 1) * chunksize - 1]) for i in
+                  range(chunks-1)]
+    key_chunks.append((keys[(chunks - 1) * chunksize], keys[-1]))
+    return key_chunks
+
+def keysToRanges(keys):
+    key_ranges = [[min(k), max(k)] for k in keys]
+    return key_ranges
+
 class DataSetMetaTypes(object):
     """
     This mirrors the PacBioSecondaryDataModel.xsd definitions and be used
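
divideKeys returns the key chunks themselves while splitKeys returns only (first, last) boundary pairs, with the remainder folded into the final chunk; a worked illustration on a toy key list:

    from pbcore.io.dataset.DataSetIO import divideKeys, splitKeys

    keys = range(10)                # e.g. sorted ZMW hole numbers
    divideKeys(keys, 3)             # [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]
    splitKeys(keys, 3)              # [(0, 2), (3, 5), (6, 9)]
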
@@ -199,6 +255,32 @@ def _fileType(fname):
     ftype = ftype.strip('.')
     return ftype
 
+def _fileExists(fname):
+    """Assert that a file exists with a useful failure mode"""
+    if not isinstance(fname, str):
+        fname = fname.resourceId
+    if not os.path.isfile(fname):
+        raise InvalidDataSetIOError("Resource {f} not found".format(f=fname))
+    return True
+
+def checkAndResolve(fname, possibleRelStart=None):
+    """Try and skip resolveLocation if possible"""
+    tbr = fname
+    if not fname.startswith(os.path.sep):
+        log.debug('Unable to assume path is already absolute')
+        tbr = resolveLocation(fname, possibleRelStart)
+    return tbr
+
+def _pathChanger(osPathFunc, metaTypeFunc, resource):
+    """Apply these two functions to the resource or ResourceId"""
+    resId = resource.resourceId
+    currentPath = urlparse(resId)
+    if currentPath.scheme == 'file' or not currentPath.scheme:
+        currentPath = currentPath.path
+        currentPath = osPathFunc(currentPath)
+        resource.resourceId = currentPath
+        metaTypeFunc(resource)
+
 
 class DataSet(object):
     """The record containing the DataSet information, with possible type
@@ -248,7 +330,8 @@ class DataSet(object):
             >>> ds1.numExternalResources
             2
             >>> # Constructors should be used directly
-            >>> SubreadSet(data.getSubreadSet()) # doctest:+ELLIPSIS
+            >>> SubreadSet(data.getSubreadSet(),
+            ...            skipMissing=True) # doctest:+ELLIPSIS
             <SubreadSet...
             >>> # Even with untyped inputs
             >>> AlignmentSet(data.getBam()) # doctest:+ELLIPSIS
@@ -266,7 +349,9 @@ class DataSet(object):
             True
 
         """
+        files = [str(fn) for fn in files]
         self._strict = kwargs.get('strict', False)
+        skipMissing = kwargs.get('skipMissing', False)
         self._skipCounts = kwargs.get('skipCounts', False)
         _induceIndices = kwargs.get('generateIndices', False)
 
@@ -288,9 +373,10 @@ class DataSet(object):
         self.fileNames = files
 
         # parse files
-        log.debug('Containers specified')
         populateDataSet(self, files)
-        log.debug('Done populating')
+
+        if not skipMissing:
+            self._modResources(_fileExists)
 
         # DataSet base class shouldn't really be used. It is ok-ish for just
         # basic xml mainpulation. May start warning at some point, but
@@ -311,8 +397,8 @@ class DataSet(object):
         else:
             dsType = self.objMetadata.setdefault("MetaType",
                                                  _toDsId('DataSet'))
-        if not self.objMetadata.get("TimeStampedName", ""):
-            self.objMetadata["TimeStampedName"] = self._getTimeStampedName(
+        if not "TimeStampedName" in self.objMetadata:
+            self.objMetadata["TimeStampedName"] = _getTimeStampedName(
                 self.objMetadata["MetaType"])
         self.objMetadata.setdefault("Name",
                                     self.objMetadata["TimeStampedName"])
@@ -351,8 +437,12 @@ class DataSet(object):
         self._referenceInfoTable = None
         self._referenceDict = {}
         self._indexMap = None
-        self._stackedReferenceInfoTable = False
+        self._referenceInfoTableIsStacked = None
+        self._readGroupTableIsRemapped = False
         self._index = None
+        # only to be used against incorrect counts from the XML, not for
+        # internal accounting:
+        self._countsUpdated = False
 
         # update counts
         if files:
@@ -360,12 +450,17 @@ class DataSet(object):
                 self.updateCounts()
             elif self.totalLength <= 0 or self.numRecords <= 0:
                 self.updateCounts()
+            elif len(files) > 1:
+                self.updateCounts()
 
         # generate indices if requested and needed
         if _induceIndices:
             self.induceIndices()
 
     def induceIndices(self):
+        """Generate indices for ExternalResources.
+
+        Not compatible with DataSet base type"""
         raise NotImplementedError()
 
     def __repr__(self):
@@ -428,7 +523,7 @@ class DataSet(object):
             # Block on filters?
             if (not firstIn and
                     not self.filters.testCompatibility(other.filters)):
-                log.warning("Filter incompatibility has blocked the addition "
+                log.warning("Filter incompatibility has blocked the merging "
                             "of two datasets")
                 return None
             else:
@@ -526,21 +621,34 @@ class DataSet(object):
                              "closed properly.")
                 else:
                     raise
+            del reader
         self._openReaders = []
 
     def __exit__(self, *exec_info):
         self.close()
 
     def __len__(self):
-        if self.numRecords == -1:
+        """Return the number of records in this DataSet"""
+        if self.numRecords <= 0:
             if self._filters:
-                log.warn("Base class DataSet length cannot be calculate when "
-                         "filters present")
+                if isinstance(self.datasetType, tuple):
+                    log.debug("Base class DataSet length cannot be calculated "
+                              "when filters are present")
+                else:
+                    self.updateCounts()
             else:
-                count = 0
-                for reader in self.resourceReaders():
-                    count += len(reader)
-                self.numRecords = count
+                try:
+                    # a little cheaper:
+                    count = 0
+                    for reader in self.resourceReaders():
+                        count += len(reader)
+                    self.numRecords = count
+                except UnavailableFeature:
+                    # UnavailableFeature: no .bai
+                    self.updateCounts()
+        elif not self._countsUpdated:
+            # isn't that expensive, avoids crashes due to incorrect numRecords:
+            self.updateCounts()
         return self.numRecords
 
     def newUuid(self, setter=True):
@@ -619,8 +727,7 @@ class DataSet(object):
             ...                    ds1.subdatasets for ds2d in
             ...                    ds2.subdatasets])
             >>> # But types are maintained:
-            >>> # TODO: turn strict back on once sim sets are indexable
-            >>> ds1 = SubreadSet(data.getXml(no=10), strict=False)
+            >>> ds1 = SubreadSet(data.getXml(no=10), strict=True)
             >>> ds1.metadata # doctest:+ELLIPSIS
             <SubreadSetMetadata...
             >>> ds2 = ds1.copy()
@@ -650,8 +757,6 @@ class DataSet(object):
                                 "{t}".format(s=type(self).__name__,
                                              t=asType))
             tbr.merge(self)
-            # update the metatypes: (TODO: abstract out 'iterate over all
-            # resources and modify the element that contains them')
             tbr.makePathsAbsolute()
             return tbr
         result = copy.deepcopy(self)
@@ -738,19 +843,25 @@ class DataSet(object):
             True
             >>> # unmerge:
             >>> ds1, ds2 = sorted(
-            ... dss.split(2, ignoreSubDatasets=False), key=lambda x: x.totalLength, reverse=True)
+            ... dss.split(2, ignoreSubDatasets=False),
+            ... key=lambda x: x.totalLength, reverse=True)
             >>> ds1.totalLength == ds1tl
             True
             >>> ds2.totalLength == ds2tl
             True
 
         """
+        # File must have pbi index to be splittable:
+        if len(self) == 0:
+            return [self.copy()]
         if contigs:
             return self._split_contigs(chunks, maxChunks, breakContigs,
                                        targetSize=targetSize,
                                        byRecords=byRecords,
                                        updateCounts=updateCounts)
         elif zmws:
+            if chunks == 0:
+                chunks = maxChunks
             return self._split_zmws(chunks, targetSize=targetSize)
         elif barcodes:
             return self._split_barcodes(chunks)
@@ -863,9 +974,9 @@ class DataSet(object):
             >>> import tempfile, os
             >>> outdir = tempfile.mkdtemp(suffix="dataset-doctest")
             >>> outfile = os.path.join(outdir, 'tempfile.xml')
-            >>> ds1 = DataSet(data.getXml())
+            >>> ds1 = DataSet(data.getXml(), skipMissing=True)
             >>> ds1.write(outfile, validate=False)
-            >>> ds2 = DataSet(outfile)
+            >>> ds2 = DataSet(outfile, skipMissing=True)
             >>> ds1 == ds2
             True
         """
@@ -879,8 +990,14 @@ class DataSet(object):
         if pretty:
             xml_string = xml.dom.minidom.parseString(xml_string).toprettyxml(
                 encoding="UTF-8")
+
+        # not useful yet as a list, but nice to keep the options open:
+        validation_errors = []
         if validate:
-            validateString(xml_string, relTo=os.path.dirname(outFile))
+            try:
+                validateString(xml_string, relTo=os.path.dirname(outFile))
+            except Exception as e:
+                validation_errors.append(e)
         fileName = urlparse(outFile).path.strip()
         if self._strict and not isinstance(self.datasetType, tuple):
             if not fileName.endswith(_dsIdToSuffix(self.datasetType)):
@@ -892,6 +1009,10 @@ class DataSet(object):
         with open(fileName, 'w') as outFile:
             outFile.writelines(xml_string)
 
+        for e in validation_errors:
+            log.error("Invalid file produced: {f}".format(f=fileName))
+            raise e
+
     def loadStats(self, filename):
         """Load pipeline statistics from a <moviename>.sts.xml file. The subset
         of these data that are defined in the DataSet XSD become available
@@ -917,8 +1038,8 @@ class DataSet(object):
             [3152, 1802, 798, 0]
 
         """
-        if isinstance(filename, str):
-            statsMetadata = parseStats(filename)
+        if isinstance(filename, basestring):
+            statsMetadata = parseStats(str(filename))
         else:
             statsMetadata = filename
         if self.metadata.summaryStats:
@@ -953,16 +1074,10 @@ class DataSet(object):
         return filters
 
     def _filterType(self):
+        """A key that maps to a set of filtration keywords specific to this
+        DataSet's ExternalResource type"""
         raise NotImplementedError()
 
-    def checkAndResolve(self, fname, possibleRelStart=None):
-        """Try and skip resolveLocation if possible"""
-        if fname.startswith(os.path.sep):
-            return fname
-        else:
-            log.debug('Unable to assume path is already absolute')
-            return resolveLocation(fname, possibleRelStart)
-
     def makePathsAbsolute(self, curStart="."):
         """As part of the validation process, make all ResourceIds absolute
         URIs rather than relative paths. Generally not called by API users.
@@ -972,7 +1087,7 @@ class DataSet(object):
         """
         log.debug("Making paths absolute")
         self._changePaths(
-            lambda x, s=curStart: self.checkAndResolve(x, s))
+            lambda x, s=curStart: checkAndResolve(x, s))
 
     def makePathsRelative(self, outDir=False):
         """Make things easier for writing test cases: make all
@@ -990,6 +1105,8 @@ class DataSet(object):
             self._changePaths(os.path.relpath)
 
     def _modResources(self, func):
+        """Execute some function 'func' on each external resource in the
+        dataset and each subdataset"""
         # check all ExternalResources
         stack = list(self.externalResources)
         while stack:
@@ -1000,6 +1117,7 @@ class DataSet(object):
             func(item)
             try:
                 stack.extend(list(item.indices))
+                stack.extend(list(item.externalResources))
             except AttributeError:
                 # Some things just don't have indices
                 pass
@@ -1016,34 +1134,13 @@ class DataSet(object):
             :osPathFunc: A function for modifying paths (e.g. os.path.abspath)
             :checkMetaType: Update the metatype of externalResources if needed
         """
-        # check all ExternalResources
-        stack = list(self.externalResources)
-        while stack:
-            item = stack.pop()
-            resId = item.resourceId
-            if not resId:
-                continue
-            currentPath = urlparse(resId)
-            if currentPath.scheme == 'file' or not currentPath.scheme:
-                currentPath = currentPath.path
-                currentPath = osPathFunc(currentPath)
-                item.resourceId = currentPath
-                if checkMetaType:
-                    self._updateMetaType(item)
-            try:
-                stack.extend(list(item.indices))
-                stack.extend(list(item.externalResources))
-            except AttributeError:
-                # Some things just don't have indices or subresources
-                pass
-
-        # check all DataSetMetadata
-
-        # check all SubDatasets
-        for dataset in self.subdatasets:
-            dataset._changePaths(osPathFunc)
+        metaTypeFunc = self._updateMetaType if checkMetaType else lambda x: x
+        resFunc = partial(_pathChanger, osPathFunc, metaTypeFunc)
+        self._modResources(resFunc)
 
     def _populateMetaTypes(self):
+        """Add metatypes to those ExternalResources that currently are
+        without"""
         self._modResources(self._updateMetaType)
 
     def _updateMetaType(self, resource):
@@ -1054,15 +1151,9 @@ class DataSet(object):
             resource.metaType = self._metaTypeMapping().get(file_type, "")
         if not resource.timeStampedName:
             mtype = resource.metaType
-            tsName = self._getTimeStampedName(mtype)
+            tsName = _getTimeStampedName(mtype)
             resource.timeStampedName = tsName
 
-    def _getTimeStampedName(self, mType):
-        mType = mType.lower()
-        mType = '_'.join(mType.split('.'))
-        time = datetime.datetime.utcnow().strftime("%y%m%d_%H%M%S%f")[:-3]
-        return "{m}-{t}".format(m=mType, t=time)
-
     @staticmethod
     def _metaTypeMapping():
         """The available mappings between file extension and MetaType (informed
@@ -1071,6 +1162,8 @@ class DataSet(object):
         return {}
 
     def copyFiles(self, outdir):
+        """Copy all of the top level ExternalResources to an output
+        directory 'outdir'"""
         backticks('cp {i} {o}'.format(i=' '.join(self.toExternalFiles()),
                                       o=outdir))
 
@@ -1078,6 +1171,7 @@ class DataSet(object):
         """Disable read filtering for this object"""
         self.reFilter()
         self.noFiltering = True
+        # a dummy filter:
         self._cachedFilters = [lambda x: True]
 
     def enableFilters(self):
@@ -1095,9 +1189,9 @@ class DataSet(object):
 
         Doctest:
             >>> import pbcore.data.datasets as data
-            >>> from pbcore.io import DataSet
+            >>> from pbcore.io import SubreadSet
             >>> from pbcore.io.dataset.DataSetMembers import Filters
-            >>> ds1 = DataSet()
+            >>> ds1 = SubreadSet()
             >>> filt = Filters()
             >>> filt.addRequirement(rq=[('>', '0.85')])
             >>> ds1.addFilters(filt)
@@ -1120,12 +1214,25 @@ class DataSet(object):
             :newMetadata: The object metadata of a DataSet being considered for
                           merging
         """
-        if self.objMetadata.get('Version'):
-            if newMetadata.get('Version') > self.objMetadata.get('Version'):
+        # If there is no objMetadata, this is a new dataset being populated
+        if self.objMetadata:
+            # if either side lacks a Version, merging will eventually fail
+            if 'Version' in self.objMetadata and 'Version' in newMetadata:
+                if newMetadata['Version'] == self.objMetadata['Version']:
+                    return True
+                # We'll make an exception for now: major version number passes
+                elif (newMetadata['Version'].split('.')[0] ==
+                      self.objMetadata['Version'].split('.')[0]):
+                    log.warn("Future warning: merging datasets that don't "
+                             "share a version number will fail.")
+                    return True
                 raise ValueError("Wrong dataset version for merging {v1} vs "
                                  "{v2}".format(
                                      v1=newMetadata.get('Version'),
                                      v2=self.objMetadata.get('Version')))
+            log.warn("Future warning: merging will require Version "
+                     "numbers for both DataSets")
+        return True
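As a toy illustration of the version check above (the version strings here are made up, not taken from a real dataset):

    old, new = '3.0.1', '3.0.2'
    if new == old:
        compatible = True
    elif new.split('.')[0] == old.split('.')[0]:
        compatible = True     # same major version: merge allowed with a warning
    else:
        compatible = False    # the check above raises ValueError in this case
    print(compatible)         # True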
 
 
     def addMetadata(self, newMetadata, **kwargs):
@@ -1187,8 +1294,16 @@ class DataSet(object):
             self.metadata.addMetadata(key, value)
 
     def updateCounts(self):
+        """Update the TotalLength and NumRecords for this DataSet.
+
+        Not supported by the base DataSet class, which cannot open its
+        ExternalResources. -1 is used as a sentinel value for a failed size
+        determination and should never be written out to XML in regular use.
+
+        """
         self.metadata.totalLength = -1
         self.metadata.numRecords = -1
+        self._countsUpdated = True
 
     def addExternalResources(self, newExtResources, updateCount=True):
         """Add additional ExternalResource objects, ensuring no duplicate
@@ -1256,9 +1371,12 @@ class DataSet(object):
             self.subdatasets.append(otherDataSet)
 
     def _openFiles(self):
+        """Open the top level ExternalResources"""
         raise RuntimeError("Not defined for this type of DataSet")
 
     def resourceReaders(self):
+        """Return a list of open pbcore Reader objects for the
+        top level ExternalResources in this DataSet"""
         raise RuntimeError("Not defined for this type of DataSet")
 
     @property
@@ -1284,9 +1402,18 @@ class DataSet(object):
                 yield record
 
     def __iter__(self):
-        """ Iterate over the records. (sorted for AlignmentSet objects)"""
-        for record in self.records:
-            yield record
+        """Iterate over the records.
+
+        The order of yielded reads is determined by the order of the
+        ExternalResources and record order within each file"""
+        if self.isIndexed:
+            # this uses the index to respect filters
+            for i in xrange(len(self)):
+                yield self[i]
+        else:
+            # this uses post-filtering to respect filters
+            for record in self.records:
+                yield record
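A hedged usage sketch, in the spirit of the doctests elsewhere in this module (data.getBam() is the example BAM bundled with pbcore.data.datasets):

    import pbcore.data.datasets as data
    from pbcore.io import AlignmentSet

    ds = AlignmentSet(data.getBam())
    for record in ds:     # indexed path: order follows the pbi, filters apply
        print(record.holeNumber)
        break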
 
     @property
     def subSetNames(self):
@@ -1296,8 +1423,10 @@ class DataSet(object):
             subNames.extend(sds.name)
         return subNames
 
-    def readsInSubDatasets(self, subNames=[]):
+    def readsInSubDatasets(self, subNames=None):
         """To be used in conjunction with self.subSetNames"""
+        if subNames is None:
+            subNames = []
         if self.subdatasets:
             for sds in self.subdatasets:
                 if subNames and sds.name not in subNames:
@@ -1372,7 +1501,8 @@ class DataSet(object):
 
         Doctest:
             >>> from pbcore.io import DataSet
-            >>> DataSet("bam1.bam", "bam2.bam", strict=False).toFofn(uri=False)
+            >>> DataSet("bam1.bam", "bam2.bam", strict=False,
+            ...         skipMissing=True).toFofn(uri=False)
             ['bam1.bam', 'bam2.bam']
         """
         lines = [er.resourceId for er in self.externalResources]
@@ -1397,6 +1527,7 @@ class DataSet(object):
 
     @property
     def _castableDataSetTypes(self):
+        """Tuple of DataSet types to which this DataSet can be cast"""
         if isinstance(self.datasetType, tuple):
             return self.datasetType
         else:
@@ -1444,7 +1575,7 @@ class DataSet(object):
     @property
     def filters(self):
         """Limit setting to ensure cache hygiene and filter compatibility"""
-        self._filters.registerCallback(lambda x=self: x.reFilter())
+        self._filters.registerCallback(self._wipeCaches)
         return self._filters
 
     @filters.setter
@@ -1453,7 +1584,9 @@ class DataSet(object):
         if value is None:
             self._filters = Filters()
         else:
+            value.clearCallbacks()
             self._filters = value
+        self.reFilter()
 
     def reFilter(self, light=True):
         """
@@ -1468,6 +1601,15 @@ class DataSet(object):
             self.metadata.numRecords = -1
             if self.metadata.summaryStats:
                 self.metadata.removeChildren('SummaryStats')
+            self.updateCounts()
+
+    def _wipeCaches(self):
+        self.reFilter(False)
+
+    @property
+    def createdAt(self):
+        """Return the DataSet CreatedAt timestamp"""
+        return self.objMetadata.get('CreatedAt')
 
     @property
     def numRecords(self):
@@ -1505,6 +1647,16 @@ class DataSet(object):
         self.objMetadata['Name'] = value
 
     @property
+    def description(self):
+        """The description of this DataSet"""
+        return self.objMetadata.get('Description', '')
+
+    @description.setter
+    def description(self, value):
+        """The description of this DataSet"""
+        self.objMetadata['Description'] = value
+
+    @property
     def numExternalResources(self):
         """The number of ExternalResources in this DataSet"""
         return len(self.externalResources)
@@ -1524,24 +1676,24 @@ class DataSet(object):
                     raise ResourceMismatchError(responses)
         return responses[0]
 
-    def hasPulseFeature(self, featureName):
+    def hasBaseFeature(self, featureName):
         responses = self._pollResources(
-            lambda x: x.hasPulseFeature(featureName))
+            lambda x: x.hasBaseFeature(featureName))
         return self._unifyResponses(responses)
 
-    def pulseFeaturesAvailable(self):
-        responses = self._pollResources(lambda x: x.pulseFeaturesAvailable())
+    def baseFeaturesAvailable(self):
+        responses = self._pollResources(lambda x: x.baseFeaturesAvailable())
         return self._unifyResponses(responses)
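A sketch of how the renamed accessors might be used; the BAM path is hypothetical and feature availability depends entirely on what tags the file carries ('Ipd' is just an example):

    from pbcore.io import SubreadSet

    ds = SubreadSet('subreads.bam')      # hypothetical path
    if ds.hasBaseFeature('Ipd'):         # e.g. inter-pulse duration
        print(ds.baseFeaturesAvailable())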
 
     @property
     def sequencingChemistry(self):
-        key = 'sequencingChemistry'
-        responses = self._pollResources(lambda x: getattr(x, key))
+        responses = self._pollResources(lambda x: x.sequencingChemistry)
         return list(_flatten(responses))
 
     @property
     def isEmpty(self):
-        return self._checkIdentical('isEmpty')
+        responses = self._pollResources(lambda x: getattr(x, 'isEmpty'))
+        return all(responses)
 
     @property
     def readType(self):
@@ -1583,27 +1735,19 @@ class DataSet(object):
     def _indexRecords(self):
         raise NotImplementedError()
 
+    def isIndexed(self):
+        raise NotImplementedError()
+
     def assertIndexed(self):
         raise NotImplementedError()
 
     def _assertIndexed(self, acceptableTypes):
         if not self._openReaders:
-            try:
-                tmp = self._strict
-                self._strict = True
-                self._openFiles()
-            except Exception:
-                # Catch everything to recover the strictness status, then raise
-                # whatever error was found.
-                self._strict = tmp
-                raise
-            finally:
-                self._strict = tmp
-        else:
-            for fname, reader in zip(self.toExternalFiles(),
-                                     self.resourceReaders()):
-                if not isinstance(reader, acceptableTypes):
-                    raise IOError(errno.EIO, "File not indexed", fname)
+            self._openFiles()
+        for fname, reader in zip(self.toExternalFiles(),
+                                 self.resourceReaders()):
+            if not isinstance(reader, acceptableTypes):
+                raise IOError(errno.EIO, "File not indexed", fname)
         return True
 
     def __getitem__(self, index):
@@ -1632,7 +1776,7 @@ class DataSet(object):
 
 
 class InvalidDataSetIOError(Exception):
-    """The base class for all DataSetIO related custom exceptions (hopefully)
+    """The base class for all DataSetIO related custom exceptions
     """
 
 
@@ -1680,11 +1824,16 @@ class ReadSet(DataSet):
             return True
         return False
 
-    def assertBarcoded(self):
-        """Test whether all resources are barcoded files"""
+    @property
+    def isBarcoded(self):
+        """Determine whether all resources are barcoded files"""
         res = self._pollResources(
             lambda x: x.index.pbiFlags & PBI_FLAGS_BARCODE)
-        if not self._unifyResponses(res):
+        return self._unifyResponses(res)
+
+    def assertBarcoded(self):
+        """Test whether all resources are barcoded files"""
+        if not self.isBarcoded:
             raise RuntimeError("File not barcoded")
 
     def _openFiles(self):
@@ -1695,17 +1844,22 @@ class ReadSet(DataSet):
             log.debug("Closing old readers...")
             self.close()
         log.debug("Opening ReadSet resources")
+        sharedRefs = {}
+        infotables = []
+        infodicts = []
         for extRes in self.externalResources:
             refFile = extRes.reference
             if refFile:
-                log.debug("Using reference: {r}".format(r=refFile))
+                if not refFile in sharedRefs:
+                    log.debug("Using reference: {r}".format(r=refFile))
+                    sharedRefs[refFile] = IndexedFastaReader(refFile)
             location = urlparse(extRes.resourceId).path
             resource = None
             try:
                 if extRes.resourceId.endswith('bam'):
-                    resource = IndexedBamReader(
-                        location,
-                        referenceFastaFname=refFile)
+                    resource = IndexedBamReader(location)
+                    if refFile:
+                        resource.referenceFasta = sharedRefs[refFile]
                 else:
                     resource = CmpH5Reader(location)
             except (IOError, ValueError):
@@ -1713,23 +1867,43 @@ class ReadSet(DataSet):
                     log.warn("pbi file missing for {f}, operating with "
                              "reduced speed and functionality".format(
                                  f=location))
-                    resource = BamReader(
-                        location, referenceFastaFname=refFile)
+                    resource = BamReader(location)
+                    if refFile:
+                        resource.referenceFasta = sharedRefs[refFile]
                 else:
                     raise
+            # Consolidate referenceDicts
+            # This gets huge when there are ~90k references. If you have ~28
+            # chunks, each with 28 BamReaders, each with 100MB referenceDicts,
+            # you end up storing tens of gigs of just these (often identical)
+            # dicts
+            if not len(infotables):
+                infotables.append(resource._referenceInfoTable)
+                infodicts.append(resource._referenceDict)
+            else:
+                for ri, rd in zip(infotables, infodicts):
+                    if np.array_equal(resource._referenceInfoTable, ri):
+                        del resource._referenceInfoTable
+                        del resource._referenceDict
+                        resource._referenceInfoTable = ri
+                        resource._referenceDict = rd
+                        break
+                    infotables.append(resource._referenceInfoTable)
+                    infodicts.append(resource._referenceDict)
             self._openReaders.append(resource)
             try:
                 if resource.isEmpty:
-                    log.warn("{f} contains no reads!".format(
+                    log.debug("{f} contains no reads!".format(
                         f=extRes.resourceId))
             except UnavailableFeature: # isEmpty requires bai
                 if not list(itertools.islice(resource, 1)):
-                    log.warn("{f} contains no reads!".format(
+                    log.debug("{f} contains no reads!".format(
                         f=extRes.resourceId))
         if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
             raise IOError("No files were openable")
         log.debug("Done opening resources")
 
+
     def _filterType(self):
         return 'bam'
 
@@ -1742,7 +1916,7 @@ class ReadSet(DataSet):
             return self._unifyResponses(res)
         except ResourceMismatchError:
             if not self._strict:
-                log.error("Resources inconsistently indexed")
+                log.warn("Resources inconsistently indexed")
                 return False
             else:
                 raise
@@ -1762,7 +1936,7 @@ class ReadSet(DataSet):
         try:
             self.assertBarcoded()
         except RuntimeError:
-            log.warn("No barcodes found in BAM file, skipping split")
+            log.info("No barcodes found in BAM file, skipping split")
             return [self.copy()]
         barcodes = defaultdict(int)
         for bcTuple in itertools.izip(self.index.bcForward,
@@ -1789,6 +1963,7 @@ class ReadSet(DataSet):
         log.debug("Done chunking")
         log.debug("Modifying filters or resources")
         for result, chunk in zip(results, chunks):
+            result.filters.removeRequirement('bc')
             result.filters.addRequirement(
                 bc=[('=', list(c[0])) for c in chunk])
 
@@ -1806,83 +1981,121 @@ class ReadSet(DataSet):
         return results
 
     def _split_zmws(self, chunks, targetSize=None):
-        files_to_movies = defaultdict(list)
-        n_bam = 0
-        for bam in self.resourceReaders():
-            n_bam += 1
-            if len(bam.readGroupTable) > 1:
-                raise RuntimeError("Multiple read groups in single .bam")
-        if chunks < n_bam:
-            return self.split(chunks=chunks)
-        target_nchunks = self.numRecords/targetSize
-        # turn on to provide more reasonable nchunks on small datasets:
-        #chunks = min(chunks, target_nchunks)
-        n_chunks_per_bam = max(1, int(math.floor(float(chunks) / n_bam)))
-        if n_chunks_per_bam < 2:
-            log.warn("%d ZMW chunks requested but there are %d files" %
-                     (chunks, n_bam))
-        n_chunks = n_bam * n_chunks_per_bam
-        active_holenumbers = 0
-        for reader in self.resourceReaders():
-            active_holenumbers += len(set(reader.holeNumber))
-        n_chunks = min(active_holenumbers, n_chunks)
+        """Holenumbers must be unique within each movie"""
+
+        if chunks == 1:
+            return [self.copy()]
+        # make sure we can pull out the movie name:
+        rgIdMovieNameMap = {rg[0]: rg[1] for rg in self.readGroupTable}
+
+        # find atoms:
+        active_holenumbers = self.index
+        n_chunks = min(len(active_holenumbers), chunks)
+
+        # if we have a target size and can have two or more chunks:
+        if (not targetSize is None and len(active_holenumbers) > 1 and
+                chunks > 1):
+            n_chunks = min(n_chunks, len(active_holenumbers)/targetSize)
+            # we want at least two if we can swing it:
+            n_chunks = max(n_chunks, 2)
+
+        # make sure there aren't too few atoms
         if n_chunks != chunks:
             log.info("Adjusted number of chunks to %d" % n_chunks)
-        log.debug("Making copies")
-        results = [self.copy() for _ in range(n_chunks)]
-        j_chunk = 0
-        for bam in self.resourceReaders():
-            rg = bam.readGroupTable[0]
-            n_reads = len(bam.holeNumber)
-            chunk_size = int(math.ceil(float(n_reads / n_chunks_per_bam)))
-            i_read = 0
-            for i_chunk in range(n_chunks_per_bam):
-                if i_read >= n_reads:
-                    break
-                result = results[j_chunk]
-                j_chunk += 1
-                zmw_start = bam.holeNumber[i_read]
-                if i_chunk == n_chunks_per_bam - 1:
-                    zmw_end = bam.holeNumber.max()
-                else:
-                    i_read += chunk_size
-                    zmw_end = bam.holeNumber[min(i_read, n_reads-1)]
-                    while i_read < n_reads-1:
-                        if zmw_end == bam.holeNumber[i_read+1]:
-                            i_read += 1
-                            zmw_end = bam.holeNumber[i_read]
-                        else:
-                            break
-                result.filters.addRequirement(
-                    movie=[('=', rg.MovieName)],
-                    zm=[('<', zmw_end+1)])
-                result.filters.addRequirement(
-                    zm=[('>', zmw_start-1)])
-                i_read += 1
 
-        # UniqueId was regenerated when the ExternalResource list was
-        # whole, therefore we need to regenerate it again here
-        log.debug("Generating new UUID")
-        for result in results:
-            result.reFilter()
-            result.newUuid()
-            result.updateCounts()
+        # sort atoms and group into chunks:
+        active_holenumbers.sort(order=['qId', 'holeNumber'])
+        view = _fieldsView(self.index, ['qId', 'holeNumber'])
+        keys = np.unique(view)
+        ranges = splitKeys(keys, n_chunks)
+
+        # The above ranges can include hidden, unrepresented movie names that
+        # are sandwiched between those in the range. In order to capture those,
+        # we need to find the indices of the range bounds, then pull out the
+        # chunks.
+        hn_chunks = []
+        for zmw_range in ranges:
+            if zmw_range[0][0] == zmw_range[1][0]:
+                hn_chunks.append(active_holenumbers[
+                    (active_holenumbers['qId'] == zmw_range[0][0]) &
+                    (active_holenumbers['holeNumber'] >= zmw_range[0][1]) &
+                    (active_holenumbers['holeNumber'] <= zmw_range[1][1])])
+            else:
+                start = np.flatnonzero(
+                    (active_holenumbers['qId'] == zmw_range[0][0]) &
+                    (active_holenumbers['holeNumber'] == zmw_range[0][1]))[0]
+                end = np.flatnonzero(
+                    (active_holenumbers['qId'] == zmw_range[1][0]) &
+                    (active_holenumbers['holeNumber'] == zmw_range[1][1]))[-1]
+                hn_chunks.append(active_holenumbers[start:(end + 1)])
+
+        results = []
+        log.debug("Making copies")
+        tmp_results = [self.copy() for _ in range(n_chunks)]
+
+        # add filters
+        for chunk, res in zip(hn_chunks, tmp_results):
+            # check if multiple movies:
+            if chunk[0]['qId'] == chunk[-1]['qId']:
+                movieName = rgIdMovieNameMap[chunk[0]['qId']]
+                zmwStart = chunk[0]['holeNumber']
+                zmwEnd = chunk[-1]['holeNumber']
+                res._filters.clearCallbacks()
+                res._filters.addRequirement(
+                    movie=[('=', movieName)],
+                    zm=[('<', zmwEnd+1)])
+                res._filters.addRequirement(
+                    zm=[('>', zmwStart-1)])
+            else:
+                movieNames = []
+                zmwStarts = []
+                zmwEnds = []
+                for mov in np.unique(chunk['qId']):
+                    movieNames.append(rgIdMovieNameMap[mov])
+                    inds = np.flatnonzero(chunk['qId'] == mov)
+                    zmwStarts.append(chunk[inds[0]]['holeNumber'])
+                    zmwEnds.append(chunk[inds[-1]]['holeNumber'])
+                res._filters.clearCallbacks()
+                res._filters.addRequirement(
+                    movie=[('=', mn) for mn in movieNames],
+                    zm=[('<', ze + 1) for ze in zmwEnds])
+                res._filters.mapRequirement(
+                    zm=[('>', zs - 1) for zs in zmwStarts])
+            res.numRecords = len(chunk)
+            res.totalLength = sum(chunk['qEnd'] - chunk['qStart'])
+            res.newUuid()
+            results.append(res)
+
+        # we changed the sort order above, so this is dirty:
+        self._index = None
+        self._indexMap = None
 
         # Update the basic metadata for the new DataSets from external
         # resources, or at least mark as dirty
         # TODO
         return results
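A rough sketch of the key-range chunking idea on a toy (qId, holeNumber) array; splitKeys is the pbcore helper used above, so a simple stand-in is shown here:

    import numpy as np

    keys = np.array([(0, 10), (0, 11), (0, 55), (1, 3), (1, 9)],
                    dtype=[('qId', int), ('holeNumber', int)])
    n_chunks = 2
    pieces = np.array_split(np.arange(len(keys)), n_chunks)
    ranges = [(keys[p[0]], keys[p[-1]]) for p in pieces if len(p)]
    # each (first, last) pair later becomes movie/zm filter requirements
    print(ranges)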
 
+
     @property
     def readGroupTable(self):
         """Combine the readGroupTables of each external resource"""
         responses = self._pollResources(lambda x: x.readGroupTable)
         if len(responses) > 1:
-            tbr = reduce(np.append, responses)
+            # append the read groups, but eliminate duplicates.
+            tbr = _uniqueRecords(reduce(np.append, responses))
+            # reassign qIds if dupes:
+            if len(set(tbr['ID'])) < len(tbr):
+                self._readGroupTableIsRemapped = True
+                tbr['ID'] = range(len(tbr))
             return tbr
         else:
             return responses[0]
 
+    @property
+    def movieIds(self):
+        """A dict of movieName: movieId for the joined readGroupTable"""
+        return {rg.MovieName: rg.ID for rg in self.readGroupTable}
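A toy version of the de-duplication and qId re-numbering applied to the merged read group table (the real code uses a _uniqueRecords helper; np.unique stands in for it here):

    import numpy as np

    rgs = np.array([(0, 'movieA'), (0, 'movieA'), (0, 'movieB')],
                   dtype=[('ID', int), ('MovieName', 'S10')])
    merged = np.unique(rgs)                  # drop exact duplicates
    if len(set(merged['ID'])) < len(merged):
        merged['ID'] = range(len(merged))    # reassign colliding IDs
    print(dict(zip(merged['MovieName'], merged['ID'])))   # movieIds-style map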
+
     def assertIndexed(self):
         self._assertIndexed((IndexedBamReader, CmpH5Reader))
 
@@ -1892,6 +2105,33 @@ class ReadSet(DataSet):
         res = self._pollResources(lambda x: isinstance(x, CmpH5Reader))
         return self._unifyResponses(res)
 
+    def _fixQIds(self, indices, resourceReader):
+        qId_acc = lambda x: x.MovieID
+        if not self.isCmpH5:
+            qId_acc = lambda x: x.qId
+
+        rr = resourceReader
+        try:
+            # this would populate the _readGroupTableIsRemapped member, but
+            # for whatever reason a lot of cmp.h5's are broken
+            _ = self.readGroupTable
+        except ChemistryLookupError:
+            # this should be an error, but that would mess up Quiver cram
+            # tests. If anyone tries to access the readGroupTable in a
+            # dataset it will still fail, at least
+            log.info("Chemistry information could not be found in "
+                     "cmp.h5, cannot fix the readGroupTable or "
+                     "MovieID field.")
+        if self._readGroupTableIsRemapped:
+            log.debug("Must correct index qId's")
+            qIdMap = dict(zip(rr.readGroupTable.ID,
+                              rr.readGroupTable.MovieName))
+            nameMap = self.movieIds
+            for qId in qIdMap.keys():
+                qId_acc(indices)[qId_acc(indices) == qId] = nameMap[
+                    qIdMap[qId]]
+
+
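The qId correction (and the tId correction further below) is essentially an in-place remap of one integer column through two small dicts; a self-contained numpy sketch with made-up names and IDs:

    import numpy as np

    qIds = np.array([0, 0, 1, 1, 1])
    local_names = {0: 'movieA', 1: 'movieB'}    # per-reader ID -> movie name
    global_ids = {'movieA': 7, 'movieB': 8}     # dataset-wide name -> new ID
    for old in list(local_names):
        qIds[qIds == old] = global_ids[local_names[old]]
    print(qIds)                                  # [7 7 8 8 8]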
     def _indexRecords(self):
         """Returns index recarray summarizing all of the records in all of
         the resources that conform to those filters addressing parameters
@@ -1899,13 +2139,17 @@ class ReadSet(DataSet):
 
         """
         recArrays = []
-        self._indexMap = []
+        _indexMap = []
         for rrNum, rr in enumerate(self.resourceReaders()):
             indices = rr.index
+            if len(indices) == 0:
+                continue
+
+            self._fixQIds(indices, rr)
 
             if not self._filters or self.noFiltering:
                 recArrays.append(indices._tbl)
-                self._indexMap.extend([(rrNum, i) for i in
+                _indexMap.extend([(rrNum, i) for i in
                                        range(len(indices._tbl))])
             else:
                 # Filtration will be necessary:
@@ -1914,14 +2158,17 @@ class ReadSet(DataSet):
                     nameMap = {name: n
                                for n, name in enumerate(
                                    rr.referenceInfoTable['Name'])}
-
                 passes = self._filters.filterIndexRecords(indices._tbl,
-                                                          nameMap)
+                                                          nameMap,
+                                                          self.movieIds)
                 newInds = indices._tbl[passes]
                 recArrays.append(newInds)
-                self._indexMap.extend([(rrNum, i) for i in
-                                       np.nonzero(passes)[0]])
-        self._indexMap = np.array(self._indexMap)
+                _indexMap.extend([(rrNum, i) for i in
+                                       np.flatnonzero(passes)])
+        self._indexMap = np.array(_indexMap, dtype=[('reader', 'uint64'),
+                                                    ('index', 'uint64')])
+        if recArrays == []:
+            return recArrays
         return _stackRecArrays(recArrays)
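The index map built above is a small structured array pairing a reader number with a row number in that reader's index; for example:

    import numpy as np

    imap = np.array([(0, 0), (0, 3), (1, 1)],
                    dtype=[('reader', 'uint64'), ('index', 'uint64')])
    reader_no, row = imap[1]      # record 1 came from reader 0, row 3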
 
     def resourceReaders(self):
@@ -1944,7 +2191,9 @@ class ReadSet(DataSet):
 
         """
         count = len(self.index)
-        length = sum(self.index.qEnd - self.index.qStart)
+        length = 0
+        if count:
+            length = sum(self.index.qEnd - self.index.qStart)
         return count, length
 
     def _resourceSizes(self):
@@ -2005,20 +2254,14 @@ class ReadSet(DataSet):
             log.debug("Replacing resources")
             self.externalResources = ExternalResources()
             self.addExternalResources([dataFile])
+        # reset the indexmap especially, as it is out of date:
+        self._index = None
+        self._indexMap = None
+        self._openReaders = []
         self._populateMetaTypes()
 
-    def __len__(self):
-        if self.numRecords == -1:
-            if self._filters:
-                self.updateCounts()
-            else:
-                count = 0
-                for reader in self.resourceReaders():
-                    count += len(reader)
-                self.numRecords = count
-        return self.numRecords
-
     def updateCounts(self):
+        self._countsUpdated = True
         if self._skipCounts:
             log.debug("SkipCounts is true, skipping updateCounts()")
             self.metadata.totalLength = -1
@@ -2044,13 +2287,12 @@ class HdfSubreadSet(ReadSet):
     datasetType = DataSetMetaTypes.HDF_SUBREAD
 
     def __init__(self, *files, **kwargs):
-        log.debug("Opening HdfSubreadSet")
         super(HdfSubreadSet, self).__init__(*files, **kwargs)
 
         # The metatype for this dataset type is inconsistent, plaster over it
         # here:
         self.objMetadata["MetaType"] = "PacBio.DataSet.HdfSubreadSet"
-        self.objMetadata["TimeStampedName"] = self._getTimeStampedName(
+        self.objMetadata["TimeStampedName"] = _getTimeStampedName(
             self.objMetadata["MetaType"])
 
     def induceIndices(self):
@@ -2058,7 +2300,7 @@ class HdfSubreadSet(ReadSet):
 
     @property
     def isIndexed(self):
-        return True
+        return False
 
     def _openFiles(self):
         """Open the files (assert they exist, assert they are of the proper
@@ -2089,6 +2331,7 @@ class HdfSubreadSet(ReadSet):
 
     def updateCounts(self):
         """Overriding here so we don't have to assertIndexed"""
+        self._countsUpdated = True
         if self._skipCounts:
             log.debug("SkipCounts is true, skipping updateCounts()")
             self.metadata.totalLength = -1
@@ -2124,8 +2367,8 @@ class SubreadSet(ReadSet):
         >>> from pbcore.io import SubreadSet
         >>> from pbcore.io.dataset.DataSetMembers import ExternalResources
         >>> import pbcore.data.datasets as data
-        >>> ds1 = SubreadSet(data.getXml(no=5))
-        >>> ds2 = SubreadSet(data.getXml(no=5))
+        >>> ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
+        >>> ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
         >>> # So they don't conflict:
         >>> ds2.externalResources = ExternalResources()
         >>> ds1 # doctest:+ELLIPSIS
@@ -2143,7 +2386,7 @@ class SubreadSet(ReadSet):
         >>> ds3 = ds1 + ds2
         >>> len(ds3.metadata.collections)
         2
-        >>> ds4 = SubreadSet(data.getSubreadSet())
+        >>> ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
         >>> ds4 # doctest:+ELLIPSIS
         <SubreadSet...
         >>> ds4._metadata # doctest:+ELLIPSIS
@@ -2155,7 +2398,6 @@ class SubreadSet(ReadSet):
     datasetType = DataSetMetaTypes.SUBREAD
 
     def __init__(self, *files, **kwargs):
-        log.debug("Opening SubreadSet")
         super(SubreadSet, self).__init__(*files, **kwargs)
 
     @staticmethod
@@ -2184,11 +2426,37 @@ class AlignmentSet(ReadSet):
             :strict=False: see base class
             :skipCounts=False: see base class
         """
-        log.debug("Opening AlignmentSet with {f}".format(f=files))
         super(AlignmentSet, self).__init__(*files, **kwargs)
         fname = kwargs.get('referenceFastaFname', None)
         if fname:
             self.addReference(fname)
+        self.__referenceIdMap = None
+
+    @property
+    @filtered
+    def records(self):
+        """A generator of (usually) BamAlignment objects for the
+        records in one or more Bam files pointed to by the
+        ExternalResources in this DataSet.
+
+        Yields:
+            A BamAlignment object
+
+        Doctest:
+            >>> import pbcore.data.datasets as data
+            >>> from pbcore.io import AlignmentSet
+            >>> ds = AlignmentSet(data.getBam())
+            >>> for record in ds.records:
+            ...     print 'hn: %i' % record.holeNumber # doctest:+ELLIPSIS
+            hn: ...
+        """
+        if self.isIndexed:
+            for i in range(len(self.index)):
+                yield self[i]
+        else:
+            for resource in self.resourceReaders():
+                for record in resource:
+                    yield record
 
     def consolidate(self, *args, **kwargs):
         if self.isCmpH5:
@@ -2226,6 +2494,21 @@ class AlignmentSet(ReadSet):
                 res.reference = reference[0]
             self._openFiles()
 
+    def _fixTIds(self, indices, rr, correctIds=True):
+        tId_acc = lambda x: x.RefGroupID
+        rName = lambda x: x['FullName']
+        if not self.isCmpH5:
+            tId_acc = lambda x: x.tId
+            rName = lambda x: x['Name']
+        if correctIds and self._stackedReferenceInfoTable:
+            log.debug("Must correct index tId's")
+            tIdMap = dict(zip(rr.referenceInfoTable['ID'],
+                              rName(rr.referenceInfoTable)))
+            nameMap = self.refIds
+            for tId in tIdMap.keys():
+                tId_acc(indices)[tId_acc(indices) == tId] = nameMap[
+                    tIdMap[tId]]
+
     def _indexRecords(self, correctIds=True):
         """Returns index records summarizing all of the records in all of
         the resources that conform to those filters addressing parameters
@@ -2234,47 +2517,57 @@ class AlignmentSet(ReadSet):
         """
         recArrays = []
         log.debug("Processing resource pbis")
-        self._indexMap = []
+        _indexMap = []
         for rrNum, rr in enumerate(self.resourceReaders()):
             indices = rr.index
-            tId = lambda x: x.tId
+            # pbi files lack e.g. mapping cols when the bam is empty; ignore
+            if len(indices) == 0:
+                continue
+            # TODO(mdsmith)(2016-01-19) rename the fields instead of branching:
+            #if self.isCmpH5:
+            #    _renameField(indices, 'MovieID', 'qId')
+            #    _renameField(indices, 'RefGroupID', 'tId')
             if not self.isCmpH5:
                 indices = indices._tbl
-                tId = lambda x: x.RefGroupID
 
-            if correctIds and self._stackedReferenceInfoTable:
-                log.debug("Must correct index tId's")
-                tIdMap = {n: name
-                          for n, name in enumerate(
-                              rr.referenceInfoTable['Name'])}
-                nameMap = self.refIds
+            # Correct tId field
+            self._fixTIds(indices, rr, correctIds)
 
+            # Correct qId field
+            self._fixQIds(indices, rr)
+
+            # filter
             if not self._filters or self.noFiltering:
-                if correctIds and self._stackedReferenceInfoTable:
-                    for i in range(len(indices)):
-                        tId(indices)[i] = nameMap[
-                            tIdMap[tId(indices)[i]]]
                 recArrays.append(indices)
-                self._indexMap.extend([(rrNum, i) for i in
-                                       range(len(indices))])
+                _indexMap.extend([(rrNum, i) for i in
+                                  range(len(indices))])
             else:
-                # Filtration will be necessary:
-                nameMap = {name: n
-                           for n, name in enumerate(
-                               rr.referenceInfoTable['Name'])}
-
-                passes = self._filters.filterIndexRecords(indices,
-                                                          nameMap)
-                if correctIds and self._stackedReferenceInfoTable:
-                    for i in range(len(indices)):
-                        tId(indices)[i] = nameMap[
-                            tIdMap[tId(indices)[i]]]
+                passes = self._filters.filterIndexRecords(indices, self.refIds,
+                                                          self.movieIds)
                 newInds = indices[passes]
                 recArrays.append(newInds)
-                self._indexMap.extend([(rrNum, i) for i in
-                                       np.nonzero(passes)[0]])
-        self._indexMap = np.array(self._indexMap)
-        return _stackRecArrays(recArrays)
+                _indexMap.extend([(rrNum, i) for i in
+                                  np.flatnonzero(passes)])
+        self._indexMap = np.array(_indexMap, dtype=[('reader', 'uint64'),
+                                                    ('index', 'uint64')])
+        if recArrays == []:
+            return recArrays
+        tbr = _stackRecArrays(recArrays)
+
+        # sort if cmp.h5 so we can rectify RowStart/End, maybe someday bam
+        if self.isCmpH5:
+            sort_order = np.argsort(tbr, order=('RefGroupID', 'tStart',
+                                                'tEnd',))
+            tbr = tbr[sort_order]
+            self._indexMap = self._indexMap[sort_order]
+            for ref in self.referenceInfoTable:
+                hits = np.flatnonzero(tbr.RefGroupID == ref.ID)
+                if len(hits) != 0:
+                    ref.StartRow = hits[0]
+                    ref.EndRow = hits[-1]
+                # and fix the naming scheme while we're at it
+                ref.Name = self._cleanCmpName(ref.FullName)
+        return tbr
 
     def resourceReaders(self, refName=False):
         """A generator of Indexed*Reader objects for the ExternalResources
@@ -2315,10 +2608,7 @@ class AlignmentSet(ReadSet):
     @property
     def refNames(self):
         """A list of reference names (id)."""
-        if self.isCmpH5:
-            return [self._cleanCmpName(name) for _, name in
-                    self.refInfo('FullName')]
-        return sorted([name for _, name in self.refInfo('Name')])
+        return np.sort(self.referenceInfoTable["Name"])
 
     def _indexReadsInReference(self, refName):
         # This can probably be deprecated for all but the official reads in
@@ -2326,7 +2616,7 @@ class AlignmentSet(ReadSet):
         refName = self.guaranteeName(refName)
 
         desiredTid = self.refIds[refName]
-        tIds = self.index.tId
+        tIds = self.tId
         passes = tIds == desiredTid
         return self.index[passes]
 
@@ -2336,34 +2626,12 @@ class AlignmentSet(ReadSet):
             sizes.append((len(rr.index), sum(rr.index.aEnd - rr.index.aStart)))
         return sizes
 
-    def _countMappedReads(self):
-        """It is too slow for large datasets to use _indexReadsInReference"""
-        counts = {rId: 0 for _, rId in self.refIds.items()}
-        for ind in self.index:
-            counts[ind["tId"]] += 1
-        tbr = {}
-        idMap = {rId: name for name, rId in self.refIds.items()}
-        for key, value in counts.iteritems():
-            tbr[idMap[key]] = value
-        return tbr
-
-    def _getMappedReads(self):
-        """It is too slow for large datasets to use _indexReadsInReference for
-        each reference"""
-        counts = {rId: 0 for _, rId in self.refIds.items()}
-        for ind in self.index:
-            counts[ind["tId"]].append(ind)
-        tbr = {}
-        idMap = {rId: name for name, rId in self.refIds.items()}
-        for key, value in counts.iteritems():
-            tbr[idMap[key]] = value
-        return tbr
-
     @property
     def refWindows(self):
         """Going to be tricky unless the filters are really focused on
         windowing the reference. Much nesting or duplication and the correct
         results are really not guaranteed"""
+        log.debug("Fetching reference windows...")
         windowTuples = []
         nameIDs = self.refInfo('Name')
         refLens = None
@@ -2396,6 +2664,7 @@ class AlignmentSet(ReadSet):
                     refLens = self.refLengths
                 refLen = refLens[name]
                 windowTuples.append((refId, 0, refLen))
+        log.debug("Done fetching reference windows")
         return sorted(windowTuples)
 
     def countRecords(self, rname=None, winStart=None, winEnd=None):
@@ -2429,30 +2698,15 @@ class AlignmentSet(ReadSet):
 
         """
 
-        if isinstance(refName, np.int64):
-            refName = str(refName)
-        if refName.isdigit():
-            if (not refName in self.refNames
-                    and not refName in self.fullRefNames):
-                try:
-                    refName = self._idToRname(int(refName))
-                except AttributeError:
-                    raise StopIteration
-
-        # I would love to use readsInRange(refName, None, None), but
-        # IndexedBamReader breaks this (works for regular BamReader).
-        # So I have to do a little hacking...
-        refLen = 0
-        for resource in self.resourceReaders():
-            if (refName in resource.referenceInfoTable['Name'] or
-                    refName in resource.referenceInfoTable['FullName']):
-                refLen = resource.referenceInfo(refName).Length
-                break
-        if refLen:
-            for read in self.readsInRange(refName, 0, refLen):
-                yield read
+        try:
+            refName = self.guaranteeName(refName)
+            refLen = self.refLengths[refName]
+        except (KeyError, AttributeError):
+            raise StopIteration
+        for read in self.readsInRange(refName, 0, refLen):
+            yield read
 
-    def _intervalContour(self, rname):
+    def intervalContour(self, rname, tStart=0, tEnd=None):
         """Take a set of index records and build a pileup of intervals, or
         "contour" describing coverage over the contig
 
@@ -2468,15 +2722,27 @@ class AlignmentSet(ReadSet):
         """
         log.debug("Generating coverage summary")
         index = self._indexReadsInReference(rname)
-        # indexing issue. Doesn't really matter (just for split). Shifted:
-        coverage = [0] * (self.refLengths[rname] + 1)
+        if tEnd is None:
+            tEnd = self.refLengths[rname]
+        coverage = [0] * (tEnd - tStart)
         starts = sorted(index.tStart)
         for i in starts:
-            coverage[i] += 1
+            # ends are exclusive
+            if i >= tEnd:
+                continue
+            if i >= tStart:
+                coverage[i - tStart] += 1
+            else:
+                coverage[0] += 1
         del starts
         ends = sorted(index.tEnd)
         for i in ends:
-            coverage[i] -= 1
+            # ends are exclusive
+            if i <= tStart:
+                continue
+            # ends are exclusive
+            if i < tEnd:
+                coverage[i - tStart] -= 1
         del ends
         curCov = 0
         for i, delta in enumerate(coverage):
@@ -2484,7 +2750,7 @@ class AlignmentSet(ReadSet):
             coverage[i] = curCov
         return coverage
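The contour is a standard start/end delta pile-up followed by a running sum; a self-contained illustration over a 10 bp window (the intervals are made up):

    intervals = [(0, 4), (2, 7), (5, 10)]   # (tStart, tEnd), ends exclusive
    window = 10
    coverage = [0] * window
    for s, e in intervals:
        coverage[s] += 1
        if e < window:
            coverage[e] -= 1
    running = 0
    for i, delta in enumerate(coverage):
        running += delta
        coverage[i] = running
    print(coverage)                          # [1, 1, 2, 2, 1, 2, 2, 1, 1, 1]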
 
-    def _splitContour(self, contour, splits):
+    def splitContour(self, contour, splits):
         """Take a contour and a number of splits, return the location of each
         coverage mediated split with the first at 0"""
         log.debug("Splitting coverage summary")
@@ -2511,8 +2777,8 @@ class AlignmentSet(ReadSet):
             rnames[atom[0]].append(atom)
         for rname, rAtoms in rnames.iteritems():
             if len(rAtoms) > 1:
-                contour = self._intervalContour(rname)
-                splits = self._splitContour(contour, len(rAtoms))
+                contour = self.intervalContour(rname)
+                splits = self.splitContour(contour, len(rAtoms))
                 ends = splits[1:] + [self.refLengths[rname]]
                 for start, end in zip(splits, ends):
                     newAtom = (rname, start, end)
@@ -2545,37 +2811,24 @@ class AlignmentSet(ReadSet):
         log.debug("{i} references found".format(i=len(refNames)))
 
         log.debug("Finding contigs")
-        # FIXME: this mess:
-        if len(refNames) < 100 and len(refNames) > 1:
-            if byRecords:
-                log.debug("Counting records...")
-                atoms = [(rn, 0, 0, self.countRecords(rn))
-                         for rn in refNames
-                         if len(self._indexReadsInReference(rn)) != 0]
-            else:
-                atoms = [(rn, 0, refLens[rn]) for rn in refNames if
-                         len(self._indexReadsInReference(rn)) != 0]
-        else:
-            log.debug("Skipping records for each reference check")
-            atoms = [(rn, 0, refLens[rn]) for rn in refNames]
-            if byRecords:
-                log.debug("Counting records...")
-                # This is getting out of hand, but the number of references
-                # determines the best read counting algorithm:
-                if len(refNames) < 100:
-                    atoms = [(rn, 0, refLens[rn], self.countRecords(rn))
-                             for rn in refNames]
-                else:
-                    counts = self._countMappedReads()
-                    atoms = [(rn, 0, refLens[rn], counts[rn])
-                             for rn in refNames]
-        log.debug("{i} contigs found".format(i=len(atoms)))
-
         if byRecords:
+            log.debug("Counting records...")
+            atoms = [(rn, 0, 0, count)
+                     for rn, count in zip(refNames, map(self.countRecords,
+                                                        refNames))
+                     if count != 0]
             balanceKey = lambda x: self.countRecords(*x)
         else:
-            # The window length is used for balancing
+            # with very many references (>= 100), reads are probably
+            # distributed fairly evenly across them, and checking the
+            # per-reference counts would be very expensive
+            if len(refNames) < 100:
+                atoms = [(rn, 0, refLens[rn]) for rn in refNames if
+                         self.countRecords(rn) != 0]
+            else:
+                atoms = [(rn, 0, refLens[rn]) for rn in refNames]
             balanceKey = lambda x: x[2] - x[1]
+        log.debug("{i} contigs found".format(i=len(atoms)))
 
         # By providing maxChunks and not chunks, this combination will set
         # chunks down to < len(atoms) < maxChunks
@@ -2675,13 +2928,16 @@ class AlignmentSet(ReadSet):
         log.debug("Done chunking")
         log.debug("Modifying filters or resources")
         for result, chunk in zip(results, chunks):
+            # we don't want to updateCounts or anything right now, so we'll
+            # block that functionality:
+            result._filters.clearCallbacks()
             if atoms[0][2]:
-                result.filters.addRequirement(
+                result._filters.addRequirement(
                     rname=[('=', c[0]) for c in chunk],
                     tStart=[('<', c[2]) for c in chunk],
                     tEnd=[('>', c[1]) for c in chunk])
             else:
-                result.filters.addRequirement(
+                result._filters.addRequirement(
                     rname=[('=', c[0]) for c in chunk])
 
         # UniqueId was regenerated when the ExternalResource list was
@@ -2691,10 +2947,23 @@ class AlignmentSet(ReadSet):
         # here:
         for result in results:
             result.newUuid()
-            if updateCounts:
+            # If there are so many filters that it will be really expensive, we
+            # will use an approximation for the number of records and bases.
+            # This is probably not too far off, if there are that many chunks
+            # to distribute. We'll still round to indicate that it is an
+            # abstraction.
+            if len(result._filters) > 100:
+                meanNum = self.numRecords/len(chunks)
+                result.numRecords = long(round(meanNum,
+                                               (-1 * len(str(meanNum))) + 3))
+                meanLen = self.totalLength/len(chunks)
+                result.totalLength = long(round(meanLen,
+                                                (-1 * len(str(meanLen))) + 3))
+            elif updateCounts:
                 result._openReaders = self._openReaders
                 passes = result._filters.filterIndexRecords(self.index,
-                                                            self.refIds)
+                                                            self.refIds,
+                                                            self.movieIds)
                 result._index = self.index[passes]
                 result.updateCounts()
                 del result._index
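The rounding above keeps roughly three significant figures of the per-chunk estimate; for instance:

    meanNum = 1234567
    ndigits = (-1 * len(str(meanNum))) + 3   # -4 for a 7-digit value
    print(int(round(meanNum, ndigits)))      # 1230000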
@@ -2987,6 +3256,18 @@ class AlignmentSet(ReadSet):
                     yield read
 
     @property
+    def tId(self):
+        if self.isCmpH5:
+            return self.index.RefGroupID
+        return self.index.tId
+
+    @property
+    def mapQV(self):
+        if self.isCmpH5:
+            return self.index.MapQV
+        return self.index.mapQV
+
+    @property
     def isSorted(self):
         return self._checkIdentical('isSorted')
 
@@ -3012,20 +3293,22 @@ class AlignmentSet(ReadSet):
 
         """
         count = len(self.index)
-        if self.isCmpH5:
-            length = sum(self.index.rEnd - self.index.rStart)
-        else:
-            try:
-                length = sum(self.index.aEnd - self.index.aStart)
-            except AttributeError:
-                # If the bam is empty or the file is not actually aligned, this
-                # field wont be populated
-                if self.isMapped:
-                    log.warn(".pbi mapping columns missing from mapped bam, "
-                             "bam may be empty")
-                else:
-                    log.warn("File not actually mapped!")
-                length = 0
+        length = 0
+        if count:
+            if self.isCmpH5:
+                length = sum(self.index.rEnd - self.index.rStart)
+            else:
+                try:
+                    length = sum(self.index.aEnd - self.index.aStart)
+                except AttributeError:
+                    # If the bam is empty or the file is not actually aligned,
+                    # this field won't be populated
+                    if self.isMapped:
+                        log.debug(".pbi mapping columns missing from mapped "
+                                  "bam, bam may be empty")
+                    else:
+                        log.warn("File not actually mapped!")
+                    length = 0
         return count, length
 
     @property
@@ -3046,16 +3329,23 @@ class AlignmentSet(ReadSet):
 
     def referenceInfo(self, refName):
         """Select a row from the DataSet.referenceInfoTable using the reference
-        name as a unique key"""
-        # TODO: upgrade to use _referenceDict if needed
-        if not self.isCmpH5:
-            for row in self.referenceInfoTable:
-                if row.Name == refName:
-                    return row
+        name as a unique key (or ID, if you really have to)"""
+
+        # Convert it to a name if you have to:
+        if not isinstance(refName, str):
+            refName = str(refName)
+        if refName.isdigit():
+            if not refName in self.refNames:
+                refName = self._idToRname(int(refName))
+
+        tbr = self.referenceInfoTable[
+            self.referenceInfoTable['Name'] == refName]
+        if len(tbr) > 1:
+            log.info("Duplicate reference names detected")
+        elif len(tbr) == 1:
+            return tbr[0]
         else:
-            for row in self.referenceInfoTable:
-                if row.FullName.startswith(refName):
-                    return row
+            log.debug("No reference found by that Name or ID")
 
     @property
     def referenceInfoTable(self):
@@ -3064,15 +3354,28 @@ class AlignmentSet(ReadSet):
         is preferred). Record.Names are remapped for cmp.h5 files to be
         consistent with bam files.
 
+        .. note:: Reference names are assumed to be unique
+
         """
         if self._referenceInfoTable is None:
-            # This isn't really right for cmp.h5 files (rowStart, rowEnd, for
-            # instance). Use the resource readers directly instead.
-            responses = self._pollResources(lambda x: x.referenceInfoTable)
+            self._referenceInfoTableIsStacked = False
+            responses = []
+
+            # allow for merge here:
+            for res in self._pollResources(lambda x: x.referenceInfoTable):
+                if not res is None:
+                    if self.isCmpH5:
+                        for rec in res:
+                            rec.StartRow = 0
+                            rec.EndRow = 0
+                    responses.append(res)
             table = []
             if len(responses) > 1:
-                assert not self.isCmpH5 # see above
+                # perhaps this can be removed someday so that cmpH5 references
+                # are 0-indexed even if only one exists
                 try:
+                    # this works even for cmp's, because we overwrite the
+                    # highly variable fields above
                     table = self._unifyResponses(
                         responses,
                         eqFunc=np.array_equal)
@@ -3081,18 +3384,19 @@ class AlignmentSet(ReadSet):
                     table = np.unique(table)
                     for i, rec in enumerate(table):
                         rec.ID = i
-                    self._stackedReferenceInfoTable = True
-            else:
+                        rec.RefInfoID = i
+                    self._referenceInfoTableIsStacked = True
+            elif len(responses) == 1:
                 table = responses[0]
-                if self.isCmpH5:
-                    for rec in table:
-                        rec.Name = self._cleanCmpName(rec.FullName)
+            else:
+                raise InvalidDataSetIOError("No reference tables found, "
+                                            "are these input files aligned?")
+            if self.isCmpH5:
+                for rec in table:
+                    rec.Name = self._cleanCmpName(rec.FullName)
             log.debug("Filtering reference entries")
             if not self.noFiltering and self._filters:
-                passes = []
-                for i, reference in enumerate(table):
-                    if self._filters.testParam('rname', reference.Name):
-                        passes.append(i)
+                passes = self._filters.testField('rname', table['Name'])
                 table = table[passes]
             self._referenceInfoTable = table
             #TODO: Turn on when needed (expensive)
@@ -3101,26 +3405,21 @@ class AlignmentSet(ReadSet):
         return self._referenceInfoTable
 
     @property
+    def _stackedReferenceInfoTable(self):
+        if self._referenceInfoTableIsStacked is None:
+            _ = self.referenceInfoTable
+        return self._referenceInfoTableIsStacked
+
+    @property
     def _referenceIdMap(self):
         """Map the dataset shifted refIds to the [resource, refId] they came
         from.
 
         """
-        # This isn't really possible for cmp.h5 files (rowStart, rowEnd, for
-        # instance). Use the resource readers directly instead.
-        responses = self._pollResources(lambda x: x.referenceInfoTable)
-        if len(responses) > 1:
-            assert not self.isCmpH5 # see above
-            tbr = reduce(np.append, responses)
-            tbrMeta = []
-            for i, res in enumerate(responses):
-                for j in res['ID']:
-                    tbrMeta.append([i, j])
-            _, indices = np.unique(tbr, return_index=True)
-            tbrMeta = list(tbrMeta[i] for i in indices)
-            return {i: meta for i, meta in enumerate(tbrMeta)}
-        else:
-            return {i: [0, i] for i in responses[0]['ID']}
+        if self.__referenceIdMap is None:
+            self.__referenceIdMap = {ref.ID: ref.Name
+                                     for ref in self.referenceInfoTable}
+        return self.__referenceIdMap
 
     def _cleanCmpName(self, name):
         return splitFastaHeader(name)[0]
@@ -3147,24 +3446,12 @@ class AlignmentSet(ReadSet):
         return [name for _, name in self.refInfo('FullName')]
 
     def refInfo(self, key):
-        """The reference names present in the referenceInfoTable of the
-        ExtResources.
-
-        Args:
-            :key: a key for the referenceInfoTable of each resource
-        Returns:
-            A list of tuples of refrence name, key_result pairs
+        """Return a column in the referenceInfoTable, tupled with the reference
+        name. TODO(mdsmith)(2016-01-27): pick a better name for this method...
 
         """
-        #log.debug("Sampling references")
-        names = self.referenceInfoTable['Name']
-        infos = self.referenceInfoTable[key]
-
-        #log.debug("Removing duplicate reference entries")
-        sampled = zip(names, infos)
-        sampled = set(sampled)
-
-        return sampled
+        return zip(self.referenceInfoTable['Name'],
+                   self.referenceInfoTable[key])
 
     def _idToRname(self, rId):
         """Map the DataSet.referenceInfoTable.ID to the superior unique
@@ -3177,17 +3464,7 @@ class AlignmentSet(ReadSet):
             The referenceInfoTable.Name corresponding to rId
 
         """
-        resNo, rId = self._referenceIdMap[rId]
-        if self.isCmpH5:
-            rId -= 1
-        if self.isCmpH5:
-            # This is what CmpIO recognizes as the 'shortname'
-            refName = self.resourceReaders()[
-                resNo].referenceInfoTable[rId]['FullName']
-        else:
-            refName = self.resourceReaders()[
-                resNo].referenceInfoTable[rId]['Name']
-        return refName
+        return self._referenceIdMap[rId]
 
     @staticmethod
     def _metaTypeMapping():
@@ -3207,7 +3484,8 @@ class ConsensusReadSet(ReadSet):
     Doctest:
         >>> import pbcore.data.datasets as data
         >>> from pbcore.io import ConsensusReadSet
-        >>> ds2 = ConsensusReadSet(data.getXml(2), strict=False)
+        >>> ds2 = ConsensusReadSet(data.getXml(2), strict=False,
+        ...                        skipMissing=True)
         >>> ds2 # doctest:+ELLIPSIS
         <ConsensusReadSet...
         >>> ds2._metadata # doctest:+ELLIPSIS
@@ -3247,11 +3525,29 @@ class ContigSet(DataSet):
     datasetType = DataSetMetaTypes.CONTIG
 
     def __init__(self, *files, **kwargs):
-        log.debug("Opening ContigSet")
-        super(ContigSet, self).__init__(*files, **kwargs)
-        self._metadata = ContigSetMetadata(self._metadata)
-        self._updateMetadata()
         self._fastq = False
+        super(ContigSet, self).__init__(*files, **kwargs)
+        # weaken by permitting failure to allow BarcodeSet to have its own
+        # Metadata type
+        try:
+            self._metadata = ContigSetMetadata(self._metadata)
+            self._updateMetadata()
+        except TypeError:
+            pass
+
+    def split(self, nchunks):
+        log.debug("Getting and dividing contig id's")
+        keys = self.index.id
+        chunks = divideKeys(keys, nchunks)
+        log.debug("Creating copies of the dataset")
+        results = [self.copy() for _ in range(nchunks)]
+        log.debug("Applying filters and updating counts")
+        for chunk, res in zip(chunks, results):
+            res._filters.addRequirement(id=[('=', n) for n in chunk])
+            res.newUuid()
+            log.debug("Updating new res counts:")
+            res.updateCounts()
+        return results
 
     def consolidate(self, outfn=None, numFiles=1, useTmp=False):
         """Consolidation should be implemented for window text in names and
@@ -3286,6 +3582,8 @@ class ContigSet(DataSet):
         # consolidate multiple files into one
         if len(self.toExternalFiles()) > 1:
             writeTemp = True
+        if self._filters and not self.noFiltering:
+            writeTemp = True
         writeMatches = {}
         writeComments = {}
         writeQualities = {}
@@ -3330,28 +3628,40 @@ class ContigSet(DataSet):
             if not outfn:
                 log.debug("Writing to a temp directory as no path given")
                 outdir = tempfile.mkdtemp(suffix="consolidated-contigset")
-                outfn = os.path.join(outdir,
-                                     'consolidated_contigs.fasta')
+                if self._fastq:
+                    outfn = os.path.join(outdir,
+                                         'consolidated_contigs.fastq')
+                else:
+                    outfn = os.path.join(outdir,
+                                         'consolidated_contigs.fasta')
             with self._writer(outfn) as outfile:
                 log.debug("Writing new resource {o}".format(o=outfn))
                 for name, seq in writeMatches.items():
+                    name_key = name
                     if writeComments[name]:
                         name = ' '.join([name, writeComments[name]])
                     if self._fastq:
-                        outfile.writeRecord(name, seq,
-                                            qvsFromAscii(writeQualities[name]))
-                        continue
-                    outfile.writeRecord(name, seq)
+                        outfile.writeRecord(
+                            name, seq, qvsFromAscii(writeQualities[name_key]))
+                    else:
+                        outfile.writeRecord(name, seq)
             if not self._fastq:
                 _indexFasta(outfn)
             # replace resources
             log.debug("Replacing resources")
             self.externalResources = ExternalResources()
             self.addExternalResources([outfn])
+            self._index = None
+            self._indexMap = None
+            self._openReaders = []
+            self._populateMetaTypes()
+            self.updateCounts()
+
             # replace contig info
-            log.debug("Replacing metadata")
-            self._metadata.contigs = []
-            self._populateContigMetadata()
+            if not self._fastq:
+                log.debug("Replacing metadata")
+                self._metadata.contigs = []
+                self._populateContigMetadata()
         self._populateMetaTypes()
 
     @property
@@ -3364,7 +3674,7 @@ class ContigSet(DataSet):
         """Chunking and quivering adds suffixes to contig names, after the
         normal ID and window. This complicates our dewindowing and
         consolidation, so we'll remove them for now"""
-        observedSuffixes = ['|quiver', '|plurality', '|arrow']
+        observedSuffixes = ['|quiver', '|plurality', '|arrow', '|poa']
         for suff in observedSuffixes:
             if name.endswith(suff):
                 log.debug("Suffix found: {s}".format(s=suff))
@@ -3392,6 +3702,8 @@ class ContigSet(DataSet):
         """Chunking and quivering appends a window to the contig ID, which
         allows us to consolidate the contig chunks."""
         name, _ = self._popSuffix(name)
+        if re.search(r"_\d+_\d+$", name) is None:
+            return None
         possibilities = name.split('_')[-2:]
         for pos in possibilities:
             if not pos.isdigit():
@@ -3413,20 +3725,22 @@ class ContigSet(DataSet):
             self._metadata.addContig(contig)
 
     def updateCounts(self):
+        self._countsUpdated = True
         if self._skipCounts:
             if not self.metadata.totalLength:
                 self.metadata.totalLength = -1
             if not self.metadata.numRecords:
                 self.metadata.numRecords = -1
             return
-        try:
-            log.debug('Updating counts')
+        if not self.isIndexed:
+            log.info("Cannot updateCounts without an index file")
             self.metadata.totalLength = 0
             self.metadata.numRecords = 0
-            for res in self.resourceReaders():
-                self.metadata.numRecords += len(res)
-                for index in res.fai:
-                    self.metadata.totalLength += index.length
+            return
+        try:
+            log.debug('Updating counts')
+            self.metadata.totalLength = sum(self.index.length)
+            self.metadata.numRecords = len(self.index)
         except (IOError, UnavailableFeature, TypeError):
             # IOError for missing files
             # UnavailableFeature for files without companion files
@@ -3477,16 +3791,26 @@ class ContigSet(DataSet):
         type before accessing any file)
         """
         if self._openReaders:
-            log.debug("Closing old readers...")
+            log.debug("Closing old {t} readers".format(
+                t=self.__class__.__name__))
             self.close()
-        log.debug("Opening resources")
+        log.debug("Opening {t} resources".format(
+            t=self.__class__.__name__))
         for extRes in self.externalResources:
-            location = urlparse(extRes.resourceId).path
-            if location.endswith("fastq"):
-                self._fastq = True
-                self._openReaders.append(FastqReader(location))
-                continue
-            resource = None
+            resource = self._openFile(urlparse(extRes.resourceId).path)
+            if resource is not None:
+                self._openReaders.append(resource)
+        if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
+            raise IOError("No files were openable")
+        log.debug("Done opening {t} resources".format(
+            t=self.__class__.__name__))
+
+    def _openFile(self, location):
+        resource = None
+        if location.endswith("fastq"):
+            self._fastq = True
+            resource = FastqReader(location)
+        else:
             try:
                 resource = IndexedFastaReader(location)
             except IOError:
@@ -3498,18 +3822,14 @@ class ContigSet(DataSet):
                 else:
                     raise
             except ValueError:
-                log.warn("Fasta file is empty")
+                log.debug("Fasta file is empty")
                 # this seems to work for an empty fasta, interesting:
                 resource = FastaReader(location)
                 # we know this file is empty
                 self._skipCounts = True
                 self.metadata.totalLength = 0
                 self.metadata.numRecords = 0
-            if resource is not None:
-                self._openReaders.append(resource)
-        if len(self._openReaders) == 0 and len(self.toExternalFiles()) != 0:
-            raise IOError("No files were openable")
-        log.debug("Done opening resources")
+        return resource
 
     def resourceReaders(self, refName=None):
         """A generator of fastaReader objects for the ExternalResources in this
@@ -3521,7 +3841,12 @@ class ContigSet(DataSet):
         """
         if refName:
             log.error("Specifying a contig name not yet implemented")
-        self._openFiles()
+        if not self._openReaders:
+            self._openFiles()
+        else:
+            for reader in self._openReaders:
+                if isinstance(reader, (FastaReader, FastqReader)):
+                    reader.file = open(reader.filename, "r")
         return self._openReaders
 
     @property
@@ -3546,11 +3871,12 @@ class ContigSet(DataSet):
                 return contig
 
     def assertIndexed(self):
-        # I would prefer to use _assertIndexed, but want to pass up the
-        # original error
-        if not self.isIndexed:
-            self._strict = True
-            self._openFiles()
+        try:
+            self._assertIndexed(IndexedFastaReader)
+        except IOError:
+            raise IOError("Companion FASTA index (.fai) file not found or "
+                          "malformatted! Use 'samtools faidx' to generate "
+                          "FASTA index.")
         return True
 
     @property
@@ -3561,13 +3887,13 @@ class ContigSet(DataSet):
             return self._unifyResponses(res)
         except ResourceMismatchError:
             if not self._strict:
-                log.error("Not all resource are equally indexed.")
+                log.info("Not all resource are equally indexed.")
                 return False
             else:
                 raise
         except IndexError:
             if not self._strict:
-                log.error("No resource readers!")
+                log.info("No resource readers!")
                 return False
             else:
                 raise InvalidDataSetIOError("No openable resources!")
@@ -3576,6 +3902,8 @@ class ContigSet(DataSet):
     def contigNames(self):
         """The names assigned to the External Resources, or contigs if no name
         assigned."""
+        # TODO{mdsmith}{3/10/2016} Make this faster by using (optionally) the
+        # index file
         names = []
         for contig in self.contigs:
             if (self.noFiltering
@@ -3602,9 +3930,12 @@ class ContigSet(DataSet):
 
         """
         recArrays = []
-        self._indexMap = []
+        _indexMap = []
         for rrNum, rr in enumerate(self.resourceReaders()):
             indices = rr.fai
+            if len(indices) == 0:
+                continue
+            # have to manually specify the dtype to make it work like a pbi
             indices = np.rec.fromrecords(
                 indices,
                 dtype=[('id', 'O'), ('comment', 'O'), ('header', 'O'),
@@ -3613,19 +3944,27 @@ class ContigSet(DataSet):
 
             if not self._filters or self.noFiltering:
                 recArrays.append(indices)
-                self._indexMap.extend([(rrNum, i) for i in
-                                       range(len(indices))])
+                _indexMap.extend([(rrNum, i) for i in
+                                  range(len(indices))])
             else:
                 # Filtration will be necessary:
                 # dummy map, the id is the name in fasta space
                 nameMap = {name: name for name in indices.id}
 
-                passes = self._filters.filterIndexRecords(indices, nameMap)
+                passes = self._filters.filterIndexRecords(indices, nameMap, {},
+                                                          readType='fasta')
                 newInds = indices[passes]
                 recArrays.append(newInds)
-                self._indexMap.extend([(rrNum, i) for i in
-                                       np.nonzero(passes)[0]])
-        self._indexMap = np.array(self._indexMap)
+                _indexMap.extend([(rrNum, i) for i in
+                                  np.flatnonzero(passes)])
+        self._indexMap = np.array(_indexMap, dtype=[('reader', 'uint64'),
+                                                    ('index', 'uint64')])
+        if len(recArrays) == 0:
+            recArrays = [np.array(
+                [],
+                dtype=[('id', 'O'), ('comment', 'O'), ('header', 'O'),
+                       ('length', '<i8'), ('offset', '<i8'),
+                       ('lineWidth', '<i8'), ('stride', '<i8')])]
         return _stackRecArrays(recArrays)
 
     def _filterType(self):
@@ -3638,7 +3977,6 @@ class ReferenceSet(ContigSet):
     datasetType = DataSetMetaTypes.REFERENCE
 
     def __init__(self, *files, **kwargs):
-        log.debug("Opening ReferenceSet with {f}".format(f=files))
         super(ReferenceSet, self).__init__(*files, **kwargs)
 
     @property
@@ -3659,15 +3997,15 @@ class ReferenceSet(ContigSet):
                }
 
 
-class BarcodeSet(DataSet):
+class BarcodeSet(ContigSet):
     """DataSet type specific to Barcodes"""
 
     datasetType = DataSetMetaTypes.BARCODE
 
     def __init__(self, *files, **kwargs):
-        log.debug("Opening BarcodeSet")
         super(BarcodeSet, self).__init__(*files, **kwargs)
-        self._metadata = BarcodeSetMetadata(self._metadata)
+        self._metadata = BarcodeSetMetadata(self._metadata.record)
+        self._updateMetadata()
 
     def addMetadata(self, newMetadata, **kwargs):
         """Add metadata specific to this subtype, while leaning on the
@@ -3690,6 +4028,24 @@ class BarcodeSet(DataSet):
         # Pull subtype specific values where important
         # -> No type specific merging necessary, for now
 
+    @property
+    def metadata(self):
+        if not isinstance(self._metadata, BarcodeSetMetadata):
+            self._metadata = BarcodeSetMetadata(self._metadata)
+        return self._metadata
+
+    @metadata.setter
+    def metadata(self, value):
+        if not isinstance(value, BarcodeSetMetadata):
+            value = BarcodeSetMetadata(value)
+        self._metadata = value
+
+    def _updateMetadata(self):
+        # update barcode specific metadata:
+        if not self._metadata.barcodeConstruction:
+            self._metadata.barcodeConstruction = ''
+
+
     @staticmethod
     def _metaTypeMapping():
         return {'fasta':'PacBio.BarcodeFile.BarcodeFastaFile',
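
The new ContigSet.split above chunks on contig id: each copy of the dataset
gets an 'id' filter naming its share of contigs, a fresh UUID, and recomputed
counts. A minimal usage sketch (the input path is hypothetical; any indexed
FASTA or ContigSet XML should behave the same way):

    from pbcore.io import ContigSet

    cset = ContigSet('contigs.fasta')        # hypothetical input
    chunks = cset.split(4)
    for i, chunk in enumerate(chunks):
        # each chunk carries an 'id' filter, its own UUID and updated counts
        print i, chunk.uuid, chunk.numRecords
        chunk.write('contigs.chunk{i}.contigset.xml'.format(i=i))
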
diff --git a/pbcore/io/dataset/DataSetMembers.py b/pbcore/io/dataset/DataSetMembers.py
index 6e3837c..05303a7 100755
--- a/pbcore/io/dataset/DataSetMembers.py
+++ b/pbcore/io/dataset/DataSetMembers.py
@@ -65,6 +65,11 @@ from pbcore.io.dataset.DataSetWriter import namespaces
 
 log = logging.getLogger(__name__)
 
+DISTLIST = ["ProdDist", "ReadTypeDist", "ReadLenDist", "ReadQualDist",
+            "MedianInsertDist", "InsertReadQualDist",
+            "InsertReadLenDist", "ControlReadQualDist",
+            "ControlReadLenDist"]
+
 def newUuid(record):
     # At some point the uuid may need to be a digest
     #newId = str(hashlib.md5(str(record)).hexdigest())
@@ -83,6 +88,47 @@ def getTime():
 def getTimeStampedName(mType):
     return "{m}-{t}".format(m=mType, t=getTime())
 
+OPMAP = {'==': OP.eq,
+         '=': OP.eq,
+         'eq': OP.eq,
+         '!=': OP.ne,
+         'ne': OP.ne,
+         '>=': OP.ge,
+         '>=': OP.ge,
+         'gte': OP.ge,
+         '<=': OP.le,
+         '<=': OP.le,
+         'lte': OP.le,
+         '>': OP.gt,
+         '>': OP.gt,
+         'gt': OP.gt,
+         '<': OP.lt,
+         '<': OP.lt,
+         'lt': OP.lt,
+         '&': lambda x, y: OP.and_(x, y).view(np.bool_),
+         '~': lambda x, y: np.logical_not(OP.and_(x, y).view(np.bool_)),
+        }
+
+def mapOp(op):
+    return OPMAP[op]
+
+class PbiFlags(object):
+    NO_LOCAL_CONTEXT = 0
+    ADAPTER_BEFORE = 1
+    ADAPTER_AFTER = 2
+    BARCODE_BEFORE = 4
+    BARCODE_AFTER = 8
+    FORWARD_PASS = 16
+    REVERSE_PASS = 32
+
+    @classmethod
+    def flagMap(cls, flag):
+        if flag.isdigit():
+            return int(flag)
+        return reduce(OP.or_,
+                      (getattr(cls, fl.strip()) for fl in flag.split('|')))
+
+
 class RecordWrapper(object):
     """The base functionality of a metadata element.
 
@@ -292,10 +338,14 @@ class RecordWrapper(object):
 
     def removeChildren(self, tag):
         keepers = []
+        removed = []
         for child in self.record['children']:
             if child['tag'] != tag:
                 keepers.append(child)
+            else:
+                removed.append(child)
         self.record['children'] = keepers
+        return removed
 
     def pruneChildrenTo(self, whitelist):
         newChildren = []
@@ -350,6 +400,9 @@ class Filters(RecordWrapper):
         if func not in self._callbacks:
             self._callbacks.append(func)
 
+    def clearCallbacks(self):
+        self._callbacks = []
+
     def _runCallbacks(self):
         for func in self._callbacks:
             func()
@@ -412,31 +465,23 @@ class Filters(RecordWrapper):
         for i, filt in enumerate(self):
             for req in filt:
                 if req.name == param:
-                    if not self.opMap(oper)(testType(req.value),
+                    if not mapOp(oper)(testType(req.value),
                                             testType(value)):
                         options[i] = False
         return any(options)
 
-    def opMap(self, op):
-        ops = {'==': OP.eq,
-               '=': OP.eq,
-               'eq': OP.eq,
-               '!=': OP.ne,
-               'ne': OP.ne,
-               '>=': OP.ge,
-               '>=': OP.ge,
-               'gte': OP.ge,
-               '<=': OP.le,
-               '<=': OP.le,
-               'lte': OP.le,
-               '>': OP.gt,
-               '>': OP.gt,
-               'gt': OP.gt,
-               '<': OP.lt,
-               '<': OP.lt,
-               'lt': OP.lt,
-              }
-        return ops[op]
+    def testField(self, param, values, testType=str, oper='='):
+        passes = np.zeros(len(values), dtype=np.bool_)
+        tested = False
+        for i, filt in enumerate(self):
+            for req in filt:
+                if req.name == param:
+                    tested = True
+                    passes |= mapOp(oper)(testType(req.value),
+                                          values)
+        if not tested:
+            return np.ones(len(values), dtype=np.bool_)
+        return passes
 
     @property
     def _bamAccMap(self):
@@ -457,11 +502,13 @@ class Filters(RecordWrapper):
                 'readstart': (lambda x: int(x.aStart)),
                 'tstart': (lambda x: int(x.tStart)),
                 'tend': (lambda x: int(x.tEnd)),
+                'n_subreads': (lambda x: len(np.flatnonzero(
+                                            x.reader.holeNumber ==
+                                            x.HoleNumber))),
                }
 
-    def _pbiAccMap(self, tIdMap):
-        return {'rname': (lambda x, m=tIdMap: m[x.tId]),
-                'length': (lambda x: int(x.aEnd)-int(x.aStart)),
+    def _pbiAccMap(self):
+        return {'length': (lambda x: int(x.aEnd)-int(x.aStart)),
                 'qname': (lambda x: x.qId),
                 'zm': (lambda x: int(x.holeNumber)),
                 'pos': (lambda x: int(x.tStart)),
@@ -470,29 +517,40 @@ class Filters(RecordWrapper):
                 'tend': (lambda x: int(x.tEnd)),
                }
 
-    def _pbiMappedVecAccMap(self, tIdMap):
-        plus = {'rname': (lambda x, m=tIdMap: m[x.tId]),
+    def _pbiMappedVecAccMap(self):
+        plus = {'rname': (lambda x: x.tId),
                 'length': (lambda x: x.aEnd - x.aStart),
                 'pos': (lambda x: x.tStart),
                 'readstart': (lambda x: x.aStart),
                 'tstart': (lambda x: x.tStart),
                 'tend': (lambda x: x.tEnd),
+                'accuracy': (
+                    lambda x: (np.ones(len(x.nMM), dtype='f4') -
+                               (x.nMM + x.nIns + x.nDel).astype(np.float32)/
+                               (x.aEnd - x.aStart + x.tEnd - x.tStart -
+                                x.nM - x.nMM)))
                }
-        base = self._pbiVecAccMap(tIdMap)
+        base = self._pbiVecAccMap()
         base.update(plus)
         return base
 
-    def _pbiVecAccMap(self, tIdMap):
+    def _pbiVecAccMap(self):
         return {'length': (lambda x: x.qEnd - x.qStart),
                 'qstart': (lambda x: x.qStart),
                 'qend': (lambda x: x.qEnd),
                 'qname': (lambda x: x.qId),
+                'movie': (lambda x: x.qId),
                 'zm': (lambda x: x.holeNumber),
                 'rq': (lambda x: x.readQual),
                 'bcf': (lambda x: x.bcForward),
                'bcr': (lambda x: x.bcReverse),
                 'bcq': (lambda x: x.bcQual),
-                'bc': (lambda x: np.array(zip(x.bcForward, x.bcReverse))),
+                'bq': (lambda x: x.bcQual),
+                'bc': (lambda x: x['bcForward', 'bcReverse']),
+                'cx': (lambda x: x.contextFlag),
+                'n_subreads': (lambda x: np.array(
+                    [len(np.flatnonzero(x.holeNumber == hn))
+                     for hn in x.holeNumber])),
                }
 
     @property
@@ -503,13 +561,19 @@ class Filters(RecordWrapper):
                 'movie': str,
                 'zm': int,
                 'bc': str,
+                'bcr': int,
+                'bcf': int,
+                'bcq': int,
+                'bq': int,
                 'qs': int,
-                'rq': float,
+                'rq': np.float32,
                 'pos': int,
                 'tstart': int,
                 'tend': int,
-                'accuracy': float,
+                'accuracy': np.float32,
                 'readstart': int,
+                'cx': PbiFlags.flagMap,
+                'n_subreads': int,
                }
 
     def tests(self, readType="bam", tIdMap=None):
@@ -522,13 +586,13 @@ class Filters(RecordWrapper):
             typeMap = self._bamTypeMap
         elif readType.lower() == "fasta":
             accMap = {'id': (lambda x: x.id),
-                      'length': (lambda x: int(x.length)),
+                      'length': (lambda x: int(len(x))),
                      }
             typeMap = {'id': str,
                        'length': int,
                       }
         elif readType.lower() == "pbi":
-            accMap = self._pbiAccMap(tIdMap)
+            accMap = self._pbiAccMap()
             typeMap = self._bamTypeMap
         else:
             raise TypeError("Read type not properly specified")
@@ -538,58 +602,72 @@ class Filters(RecordWrapper):
             for req in filt:
                 param = req.name
                 value = typeMap[param](req.value)
-                operator = self.opMap(req.operator)
+                operator = mapOp(req.operator)
                 reqTests.append(P(filter_read, accMap[param], operator, value))
             tests.append(
                 lambda x, reqTests=reqTests: all([f(x) for f in reqTests]))
         return tests
 
-    def filterIndexRecords(self, indexRecords, nameMap):
-        typeMap = self._bamTypeMap
-        accMap = self._pbiVecAccMap({})
-        if 'aEnd' in indexRecords.dtype.names:
-            accMap = self._pbiMappedVecAccMap({})
-            accMap['rname'] = lambda x: x.tId
-        filterLastResult = None
+    def filterIndexRecords(self, indexRecords, nameMap, movieMap,
+                           readType='bam'):
+        if readType == 'bam':
+            typeMap = self._bamTypeMap
+            accMap = self._pbiVecAccMap()
+            if 'tStart' in indexRecords.dtype.names:
+                accMap = self._pbiMappedVecAccMap()
+                if 'RefGroupID' in indexRecords.dtype.names:
+                    accMap['rname'] = (lambda x: x.RefGroupID)
+            if 'MovieID' in indexRecords.dtype.names:
+                # TODO(mdsmith)(2016-01-29) remove these once the fields are
+                # renamed:
+                accMap['movie'] = (lambda x: x.MovieID)
+                accMap['qname'] = (lambda x: x.MovieID)
+        elif readType == 'fasta':
+            accMap = {'id': (lambda x: x.id),
+                      'length': (lambda x: x.length.astype(int)),
+                     }
+            typeMap = {'id': str,
+                       'length': int,
+                      }
+
+        filterLastResult = np.zeros(len(indexRecords), dtype=np.bool_)
         for filt in self:
-            lastResult = None
+            lastResult = np.ones(len(indexRecords), dtype=np.bool_)
             for req in filt:
                 param = req.name
                 if param in accMap.keys():
                     value = typeMap[param](req.value)
                     if param == 'rname':
                         value = nameMap[value]
+                    if param == 'movie':
+                        value = movieMap[value]
                     if param == 'bc':
                         # convert string to list:
                         values = ast.literal_eval(value)
                         param = 'bcf'
                         value = values[0]
-                        operator = self.opMap(req.operator)
+                        operator = mapOp(req.operator)
                         reqResultsForRecords = operator(
                             accMap[param](indexRecords), value)
                         param = 'bcr'
                         value = values[1]
-                        operator = self.opMap(req.operator)
+                        operator = mapOp(req.operator)
                         reqResultsForRecords &= operator(
                             accMap[param](indexRecords), value)
                     else:
-                        operator = self.opMap(req.operator)
+                        operator = mapOp(req.operator)
                         reqResultsForRecords = operator(
                             accMap[param](indexRecords), value)
-                    if lastResult is None:
-                        lastResult = reqResultsForRecords
-                    else:
-                        lastResult = np.logical_and(lastResult,
-                                                    reqResultsForRecords)
-                        del reqResultsForRecords
-            if filterLastResult is None:
-                filterLastResult = lastResult
-            else:
-                filterLastResult = np.logical_or(filterLastResult, lastResult)
-                del lastResult
+                    lastResult &= reqResultsForRecords
+                    del reqResultsForRecords
+                else:
+                    log.warn("Filter not recognized: {f}".format(f=param))
+            filterLastResult |= lastResult
+            del lastResult
         return filterLastResult
 
     def fromString(self, filterString):
+        # TODO(mdsmith)(2016-02-09) finish this
         filtDict = {}
         self._runCallbacks()
 
@@ -604,6 +682,8 @@ class Filters(RecordWrapper):
             name: The name of the requirement, e.g. 'rq'
             options: A list of (operator, value) tuples, e.g. ('>', '0.85')
         """
+        if not kwargs:
+            return
         # if there are already filters, you must copy the filters for each new
         # option and add one set of requirements to each option:
         if self.submetadata:
@@ -622,14 +702,58 @@ class Filters(RecordWrapper):
                 for i, (oper, val) in enumerate(options):
                     newFilts[i].addRequirement(name, oper, val)
             self.extend(newFilts)
+        #log.debug("Current filters: {s}".format(s=str(self)))
         self._runCallbacks()
 
+    def addFilter(self, **kwargs):
+        """Use this to add filters. Members of the list will be considered
+        requirements for fulfilling this option. Use multiple calls to add
+        multiple filters.
+
+        Args:
+            name: The name of the requirement, e.g. 'rq'
+            options: A list of (operator, value) tuples, e.g. ('>', '0.85')
+        """
+        if not kwargs:
+            return
+        newFilt = Filter()
+        for name, options in kwargs.items():
+            for oper, val in options:
+                newFilt.addRequirement(name, oper, val)
+        self.append(newFilt)
+        log.debug("Current filters: {s}".format(s=str(self)))
+        self._runCallbacks()
+
+    def removeFilter(self, index):
+        self.pop(index)
+        log.debug("Current filters: {s}".format(s=str(self)))
+        self._runCallbacks()
+
+    def mapRequirement(self, **kwargs):
+        """Add requirements to each of the existing requirements, mapped one
+        to one"""
+        # Check that all lists of values are the same length:
+        values = kwargs.values()
+        if len(values) > 1:
+            for v in values[1:]:
+                assert len(v) == len(values[0])
+
+        # Check that this length is equal to the current number of filters:
+        assert len(kwargs.values()[0]) == len(list(self))
+
+        for req, opvals in kwargs.items():
+            for filt, opval in zip(self, opvals):
+                filt.addRequirement(req, opval[0], opval[1])
+
     def removeRequirement(self, req):
         log.debug("Removing requirement {r}".format(r=req))
+        to_remove = []
         for i, filt in enumerate(self):
             empty = filt.removeRequirement(req)
             if empty:
-                self.pop(i)
+                to_remove.append(i)
+        for i in sorted(to_remove, reverse=True):
+            self.pop(i)
         self._runCallbacks()
 
 
@@ -664,9 +788,12 @@ class Filter(RecordWrapper):
         self.plist.append(param)
 
     def removeRequirement(self, req):
+        to_remove = []
         for i, param in enumerate(self):
             if param.name == req:
-                self.pop(i)
+                to_remove.append(i)
+        for i in sorted(to_remove, reverse=True):
+            self.pop(i)
         if len(self.plist):
             return False
         else:
@@ -909,6 +1036,14 @@ class ExternalResource(RecordWrapper):
                 return index.resourceId
 
     @property
+    def sts(self):
+        return self._getSubResByMetaType('PacBio.SubreadFile.ChipStatsFile')
+
+    @sts.setter
+    def sts(self, value):
+        self._setSubResByMetaType('PacBio.SubreadFile.ChipStatsFile', value)
+
+    @property
     def scraps(self):
         return self._getSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile')
 
@@ -917,6 +1052,14 @@ class ExternalResource(RecordWrapper):
         self._setSubResByMetaType('PacBio.SubreadFile.ScrapsBamFile', value)
 
     @property
+    def barcodes(self):
+        return self._getSubResByMetaType("PacBio.DataSet.BarcodeSet")
+
+    @barcodes.setter
+    def barcodes(self, value):
+        self._setSubResByMetaType("PacBio.DataSet.BarcodeSet", value)
+
+    @property
     def reference(self):
         return self._getSubResByMetaType(
             'PacBio.ReferenceFile.ReferenceFastaFile')
@@ -979,7 +1122,7 @@ class ExternalResource(RecordWrapper):
             fileIndices = FileIndices(fileIndices[0])
         else:
             fileIndices = FileIndices()
-        for index in indices:
+        for index in list(indices):
             temp = FileIndex()
             temp.resourceId = index
             fileIndices.append(temp)
@@ -1093,6 +1236,12 @@ class DataSetMetadata(RecordWrapper):
         except ValueError:
             return None
 
+    @summaryStats.setter
+    def summaryStats(self, value):
+        self.removeChildren('SummaryStats')
+        if value:
+            self.append(value)
+
     @property
     def provenance(self):
         try:
@@ -1119,8 +1268,14 @@ class SubreadSetMetadata(DataSetMetadata):
 
     def merge(self, other):
         super(self.__class__, self).merge(other)
-        self.collections.merge(other.collections)
-        self.bioSamples.merge(other.bioSamples)
+        if other.collections and not self.collections:
+            self.append(other.collections)
+        else:
+            self.collections.merge(other.collections)
+        if other.bioSamples and not self.bioSamples:
+            self.append(other.bioSamples)
+        else:
+            self.bioSamples.merge(other.bioSamples)
 
     @property
     def collections(self):
@@ -1129,6 +1284,12 @@ class SubreadSetMetadata(DataSetMetadata):
         return CollectionsMetadata(self.getV(tag='Collections',
                                              container='children'))
 
+    @collections.setter
+    def collections(self, value):
+        self.removeChildren('Collections')
+        if value:
+            self.append(value)
+
     @property
     def bioSamples(self):
         """Return a list of wrappers around BioSamples elements of the Metadata
@@ -1220,7 +1381,14 @@ class BarcodeSetMetadata(DataSetMetadata):
 
     @property
     def barcodeConstruction(self):
-        return self.getMemberV('BarcodeConstruction')
+        try:
+            return self.getMemberV('BarcodeConstruction')
+        except ValueError:
+            return None
+
+    @barcodeConstruction.setter
+    def barcodeConstruction(self, value):
+        self.setMemberV('BarcodeConstruction', value)
 
 
 class ContigsMetadata(RecordWrapper):
@@ -1395,20 +1563,14 @@ class StatsMetadata(RecordWrapper):
                                          self.prodDist.bins[1]
                                          + other.prodDist.bins[1])
         self.numSequencingZmws += other.numSequencingZmws
-        toHandle = [
-            (self.prodDist, other.prodDist),
-            (self.readTypeDist, other.readTypeDist),
-            (self.readLenDist, other.readLenDist),
-            (self.readQualDist, other.readQualDist),
-            (self.medianInsertDist, other.medianInsertDist),
-            (self.insertReadQualDist, other.insertReadQualDist),
-            (self.insertReadLenDist, other.insertReadLenDist),
-            (self.controlReadQualDist, other.controlReadQualDist),
-            (self.controlReadLenDist, other.controlReadLenDist),
-            ]
-        for selfDist, otherDist in toHandle:
+        for dist in DISTLIST:
+            selfDist = getattr(self, dist[0].lower() + dist[1:])
+            otherDist = getattr(other, dist[0].lower() + dist[1:])
             try:
                 selfDist.merge(otherDist)
+            except ZeroBinWidthError as e:
+                removed = self.removeChildren(dist)
+                self.append(otherDist)
             except BinMismatchError:
                 self.append(otherDist)
             except ValueError:
@@ -1519,6 +1681,10 @@ def _staggeredZip(binWidth, start1, start2, bins1, bins2):
 class ContinuousDistribution(RecordWrapper):
 
     def merge(self, other):
+        if other.binWidth == 0:
+            return
+        if self.binWidth == 0:
+            raise ZeroBinWidthError(self.binWidth, other.binWidth)
         if self.binWidth != other.binWidth:
             raise BinWidthMismatchError(self.binWidth, other.binWidth)
         if (self.minBinValue % self.binWidth
@@ -1533,6 +1699,10 @@ class ContinuousDistribution(RecordWrapper):
     def numBins(self):
         return int(self.getMemberV('NumBins'))
 
+    @numBins.setter
+    def numBins(self, value):
+        self.setMemberV('NumBins', str(value))
+
     @property
     def sampleSize(self):
         return int(self.getMemberV('SampleSize'))
@@ -1581,6 +1751,10 @@ class ContinuousDistribution(RecordWrapper):
     def maxBinValue(self):
         return float(self.getMemberV('MaxBinValue'))
 
+    @maxBinValue.setter
+    def maxBinValue(self, value):
+        self.setMemberV('MaxBinValue', str(value))
+
     @property
     def description(self):
         return self.getMemberV('MetricDescription')
@@ -1608,6 +1782,16 @@ class ContinuousDistribution(RecordWrapper):
         return [self.minBinValue + i * self.binWidth for i in
                 range(len(self.bins))]
 
+class ZeroBinWidthError(Exception):
+
+    def __init__(self, width1, width2):
+        self.width1 = width1
+        self.width2 = width2
+
+    def __str__(self):
+        return "Zero bin width: {w1}, {w2}".format(w1=self.width1,
+                                                   w2=self.width2)
+
 class BinMismatchError(Exception):
     pass
 
@@ -1748,7 +1932,7 @@ class PrimaryMetadata(RecordWrapper):
         >>> import os, tempfile
         >>> from pbcore.io import SubreadSet
         >>> import pbcore.data.datasets as data
-        >>> ds1 = SubreadSet(data.getXml(5))
+        >>> ds1 = SubreadSet(data.getXml(5), skipMissing=True)
         >>> ds1.metadata.collections[0].primary.outputOptions.resultsFolder
         'Analysis_Results'
         >>> ds1.metadata.collections[0].primary.outputOptions.resultsFolder = (
@@ -1758,7 +1942,7 @@ class PrimaryMetadata(RecordWrapper):
         >>> outdir = tempfile.mkdtemp(suffix="dataset-doctest")
         >>> outXml = 'xml:' + os.path.join(outdir, 'tempfile.xml')
         >>> ds1.write(outXml, validate=False)
-        >>> ds2 = SubreadSet(outXml)
+        >>> ds2 = SubreadSet(outXml, skipMissing=True)
         >>> ds2.metadata.collections[0].primary.outputOptions.resultsFolder
         'BetterAnalysis_Results'
     """
@@ -1810,7 +1994,7 @@ class BioSamplesMetadata(RecordWrapper):
         Doctest:
             >>> from pbcore.io import SubreadSet
             >>> import pbcore.data.datasets as data
-            >>> ds = SubreadSet(data.getSubreadSet())
+            >>> ds = SubreadSet(data.getSubreadSet(), skipMissing=True)
             >>> ds.metadata.bioSamples[0].name
             'consectetur purus'
             >>> for bs in ds.metadata.bioSamples:
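
The Filters rework above moves the operator table to module level (OPMAP and
mapOp), vectorizes index filtering (testField, filterIndexRecords), and adds
context-flag filtering through PbiFlags. A rough sketch of the resulting API,
assuming an AlignmentSet backed by a .pbi index (the path is hypothetical):

    from pbcore.io import AlignmentSet

    aset = AlignmentSet('aligned.subreads.bam')   # hypothetical path
    # requirements passed in one addFilter call are ANDed into a single
    # filter; separate calls add alternative (ORed) filters
    aset.filters.addFilter(rq=[('>', '0.85')], length=[('>=', '500')])
    aset.filters.addFilter(cx=[('&', 'ADAPTER_BEFORE | ADAPTER_AFTER')])
    aset.updateCounts()
    print aset.numRecords
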
diff --git a/pbcore/io/dataset/DataSetReader.py b/pbcore/io/dataset/DataSetReader.py
index e2a8a58..c33ddb1 100755
--- a/pbcore/io/dataset/DataSetReader.py
+++ b/pbcore/io/dataset/DataSetReader.py
@@ -9,7 +9,7 @@ from pbcore.io.dataset.DataSetMembers import (ExternalResource,
                                               ExternalResources,
                                               DataSetMetadata,
                                               Filters, AutomationParameters,
-                                              StatsMetadata)
+                                              StatsMetadata, DISTLIST)
 from pbcore.io.dataset.DataSetWriter import _eleFromDictList
 
 log = logging.getLogger(__name__)
@@ -58,6 +58,14 @@ def _addXmlFile(dset, path):
     root = tree.getroot()
     tmp = _parseXml(type(dset), root)
     tmp.makePathsAbsolute(curStart=os.path.dirname(path))
+    if not tmp.metadata.summaryStats:
+        for extres in tmp.externalResources:
+            if extres.sts:
+                try:
+                    tmp.loadStats(extres.sts)
+                except IOError as e:
+                    log.info("Sts.xml file {f} "
+                              "unopenable".format(f=extres.sts))
     # copyOnMerge must be false, you're merging in a tmp and maintaining dset
     dset.merge(tmp, copyOnMerge=False)
 
@@ -122,12 +130,6 @@ def wrapNewResource(path):
 def _parseXml(dsetType, element):
     """Parse an XML DataSet tag, or the root tag of a DataSet XML file (they
     should be equivalent)
-
-    Doctest:
-        >>> import pbcore.data.datasets as data
-        >>> ds = _openXmlFile(data.getXml(no=8).split(':')[-1])
-        >>> type(ds).__name__
-        'SubreadSet'
     """
     result = dsetType()
     result.objMetadata = element.attrib
@@ -154,6 +156,8 @@ def _parseXmls(dsetType, element):
     return result
 
 def _updateStats(element):
+    """This is ugly, hackish and prone to failure. Delete as soon as pre 3.0.16
+    sts.xml files can go unsupported"""
     namer = functools.partial(
         _namespaceTag,
         "http://pacificbiosciences.com/PipelineStats/PipeStats.xsd")
@@ -174,28 +178,28 @@ def _updateStats(element):
         ]
     for tag in binCounts:
         if element.findall(tag):
-            log.error("Outdated stats XML received")
+            log.info("Outdated stats XML received")
             finds = element.findall(tag)
             parent = tag.split('Dist')[0] + 'Dist'
             parent = element.find(parent)
             for sub_ele in finds:
                 parent.remove(sub_ele)
-            binCounts = ET.Element(namer('BinCounts'))
+            bce = ET.Element(namer('BinCounts'))
             for sub_ele in finds:
-                binCounts.append(sub_ele)
-            parent.append(binCounts)
+                bce.append(sub_ele)
+            parent.append(bce)
     for tag in binLabels:
         if element.findall(tag):
-            log.error("Outdated stats XML received")
+            log.info("Outdated stats XML received")
             finds = element.findall(tag)
             parent = tag.split('Dist')[0] + 'Dist'
             parent = element.find(parent)
             for sub_ele in finds:
                 parent.remove(sub_ele)
-            binCounts = ET.Element(namer('BinLabels'))
+            bce = ET.Element(namer('BinLabels'))
             for sub_ele in finds:
-                binCounts.append(sub_ele)
-            parent.append(binCounts)
+                bce.append(sub_ele)
+            parent.append(bce)
     return element
 
 def _namespaceTag(xmlns, tagName):
@@ -233,21 +237,11 @@ def _parseXmlFilters(element):
     """Pull filters from XML file, put them in a list of dictionaries, where
     each dictionary contains a representation of a Filter tag: key, value pairs
     with parameter names and value expressions.
-
-    Doctest:
-        >>> import xml.etree.ElementTree as ET
-        >>> import pbcore.data.datasets as data
-        >>> tree = ET.parse(data.getXml(no=8).split(':')[1])
-        >>> root = tree.getroot()
-        >>> filters = root[1]
-        >>> str(_parseXmlFilters(filters))
-        '( rq > 0.75 ) OR ( qname == 100/0/0_100 )'
     """
     return Filters(_eleToDictList(element))
 
 def parseStats(filename):
     url = urlparse(filename)
-    fileType = url.scheme
     fileLocation = url.path.strip()
     if url.netloc:
         fileLocation = url.netloc
@@ -257,10 +251,7 @@ def parseStats(filename):
     stats = StatsMetadata(_eleToDictList(root))
     stats.record['tag'] = 'SummaryStats'
     whitelist = ['ShortInsertFraction', 'AdapterDimerFraction',
-                 'MedianInsertDist', 'ProdDist', 'ReadTypeDist',
-                 'ReadLenDist', 'ReadQualDist', 'InsertReadQualDist',
-                 'InsertReadLenDist', 'ControlReadQualDist',
-                 'ControlReadLenDist', 'NumSequencingZmws']
+                 'NumSequencingZmws'] + DISTLIST
     stats.pruneChildrenTo(whitelist)
     return stats
 
diff --git a/pbcore/io/dataset/DataSetWriter.py b/pbcore/io/dataset/DataSetWriter.py
index 1bdaff2..e8989d4 100755
--- a/pbcore/io/dataset/DataSetWriter.py
+++ b/pbcore/io/dataset/DataSetWriter.py
@@ -2,6 +2,9 @@
 
 import copy, time
 import xml.etree.ElementTree as ET
+import logging
+
+log = logging.getLogger(__name__)
 
 # TODO add in other namespaces
 XMLNS = 'http://pacificbiosciences.com/PacBioDatasets.xsd'
@@ -218,7 +221,14 @@ def _addDataSetMetadataElement(dataSet, root):
         root: The root ElementTree object. Extended here using SubElement
     """
     if dataSet.metadata:
+        # hide the stats:
+        stats = None
+        if dataSet.metadata.summaryStats:
+            stats = dataSet.metadata.summaryStats
+            dataSet.metadata.summaryStats = None
         root.append(_eleFromDictList(dataSet.metadata.record))
+        if stats:
+            dataSet.metadata.summaryStats = stats
         # Metadata order matters....
         #tl = dsmd.find('TotalLength')
         #tl = dsmd.remove(tl)
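
The writer change above detaches metadata.summaryStats for the duration of
serialization, so stats loaded from an sts.xml stay on the in-memory object
but are not written into the DataSet XML itself. A sketch of the observable
behaviour, with hypothetical paths:

    from pbcore.io import SubreadSet

    sset = SubreadSet('movie.subreadset.xml')     # hypothetical path
    sset.loadStats('movie.sts.xml')               # hypothetical path
    sset.write('with_stats.subreadset.xml', validate=False)
    # the in-memory stats survive the write...
    assert sset.metadata.summaryStats is not None
    # ...but the emitted XML contains no SummaryStats element
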
diff --git a/pbcore/io/dataset/EntryPoints.py b/pbcore/io/dataset/EntryPoints.py
deleted file mode 100755
index c56495e..0000000
--- a/pbcore/io/dataset/EntryPoints.py
+++ /dev/null
@@ -1,246 +0,0 @@
-#/usr/bin/env python
-
-import os
-import argparse
-from collections import defaultdict
-from pbcore.io import DataSet, ContigSet, openDataSet
-from pbcore.io.dataset.DataSetMembers import Filters
-from pbcore.io.dataset.DataSetValidator import validateFile
-import logging
-
-log = logging.getLogger(__name__)
-
-def summarizeXml(args):
-    dset = openDataSet(args.infile, strict=args.strict)
-    for fname in dset.toExternalFiles():
-        print fname
-    print "Number of records: {r}".format(r=dset.numRecords)
-    print "Total number of bases: {r}".format(r=dset.totalLength)
-
-def summarize_options(parser):
-    parser.description = "Summarize a DataSet XML file"
-    parser.add_argument("infile", type=str,
-                        help="The xml file to summarize")
-    parser.set_defaults(func=summarizeXml)
-
-def createXml(args):
-    dsTypes = DataSet.castableTypes()
-    dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
-                                skipCounts=args.skipCounts,
-                                generateIndices=args.generateIndices)
-    if args.dsName != '':
-        dset.name = args.dsName
-    if args.generateIndices:
-        # we generated the indices with the last open, lets capture them with
-        # this one:
-        dset = dsTypes[args.dsType](*args.infile, strict=args.strict,
-                                    skipCounts=args.skipCounts)
-    log.debug("Dataset created")
-    dset.write(args.outfile, validate=args.novalidate, modPaths=True,
-               relPaths=args.relative)
-    log.debug("Dataset written")
-
-
-def create_options(parser):
-    parser.description = ('Create an XML file from a fofn or bam. Possible '
-                          'types: SubreadSet, AlignmentSet, ReferenceSet, '
-                          'HdfSubreadSet, BarcodeSet, ConsensusAlignmentSet, '
-                          'ConsensusReadSet, ContigSet')
-    parser.add_argument("outfile", type=str, help="The XML to create")
-    #parser.add_argument("infile", type=validate_file, nargs='+',
-    parser.add_argument("infile", type=str, nargs='+',
-                        help="The fofn or BAM file(s) to make into an XML")
-    parser.add_argument("--type", type=str, default='DataSet',
-                        dest='dsType', help="The type of XML to create")
-    parser.add_argument("--name", type=str, default='',
-                        dest='dsName', help="The name of the new DataSet")
-    parser.add_argument("--generateIndices", action='store_true',
-                        default=False, help="The type of XML to create")
-    parser.add_argument("--novalidate", action='store_false', default=True,
-                        help=("Don't validate the resulting XML, don't touch "
-                              "paths"))
-    parser.add_argument("--relative", action='store_true', default=False,
-                        help=("Make the included paths relative instead of "
-                              "absolute (not compatible with --novalidate)"))
-    parser.set_defaults(func=createXml)
-
-def filterXml(args):
-    if args.infile.endswith('xml'):
-        dataSet = openDataSet(args.infile, strict=args.strict)
-        filters = defaultdict(list)
-        separators = ['<=', '>=', '!=', '==', '>', '<', '=']
-        for filt in args.filters:
-            for sep in separators:
-                if sep in filt:
-                    param, condition = filt.split(sep)
-                    condition = (sep, condition)
-                    filters[param].append(condition)
-                    break
-        dataSet.filters.addRequirement(**filters)
-        dataSet.updateCounts()
-        log.info("{i} filters added".format(i=len(filters)))
-        dataSet.write(args.outfile)
-    else:
-        raise IOError("No files found/found to be compatible")
-
-def filter_options(parser):
-    pbiFilterOptions = set(Filters()._pbiMappedVecAccMap({}).keys())
-    bamFilterOptions = set(Filters()._bamAccMap.keys())
-    parser.description = ('Add filters to an XML file. Suggested fields: '
-                          '{f}. More expensive fields: {b}'.format(
-        f=sorted(list(pbiFilterOptions)),
-        b=sorted(list(bamFilterOptions - pbiFilterOptions))))
-    #parser.add_argument("infile", type=validate_file,
-    parser.add_argument("infile", type=str,
-                        help="The xml file to filter")
-    parser.add_argument("outfile", type=str, help="The resulting xml file")
-    parser.add_argument("filters", type=str, nargs='+',
-                        help="The values and thresholds to filter (e.g. "
-                        "'rq>0.85')")
-    parser.set_defaults(func=filterXml)
-
-def splitXml(args):
-    log.debug("Starting split")
-    dataSet = openDataSet(args.infile, strict=args.strict)
-    chunks = len(args.outfiles)
-    if args.chunks:
-        chunks = args.chunks
-    dss = dataSet.split(chunks=chunks,
-                        ignoreSubDatasets=(not args.subdatasets),
-                        contigs=args.contigs,
-                        maxChunks=args.maxChunks,
-                        breakContigs=args.breakContigs,
-                        targetSize=args.targetSize,
-                        zmws=args.zmws,
-                        barcodes=args.barcodes,
-                        byRecords=(not args.byRefLength),
-                        updateCounts=(not args.noCounts))
-    log.debug("Splitting into {i} chunks".format(i=len(dss)))
-    infix = 'chunk{i}'
-    if args.contigs:
-        infix += 'contigs'
-    if not args.outfiles:
-        if not args.outdir:
-            args.outfiles = ['.'.join(args.infile.split('.')[:-1] +
-                                      [infix.format(i=chNum), 'xml'])
-                             for chNum in range(len(dss))]
-        else:
-            args.outfiles = ['.'.join(args.infile.split('.')[:-1] +
-                                      [infix.format(i=chNum), 'xml'])
-                             for chNum in range(len(dss))]
-            args.outfiles = [os.path.join(args.outdir,
-                                          os.path.basename(outfn))
-                             for outfn in args.outfiles]
-            num = len(dss)
-            end = ''
-            if num > 5:
-                num = 5
-                end = '...'
-            log.debug("Emitting {f} {e}".format(
-                f=', '.join(args.outfiles[:num]),
-                e=end))
-    log.debug("Finished splitting, now writing")
-    for out_fn, dset in zip(args.outfiles, dss):
-        dset.write(out_fn)
-    log.debug("Done writing files")
-
-def split_options(parser):
-    parser.description = "Split the dataset"
-    parser.add_argument("infile", type=str,
-                        help="The xml file to split")
-    parser.add_argument("--contigs", default=False, action='store_true',
-                        help="Split on contigs")
-    parser.add_argument("--barcodes", default=False, action='store_true',
-                        help="Split on barcodes")
-    parser.add_argument("--zmws", default=False, action='store_true',
-                        help="Split on zmws")
-    parser.add_argument("--byRefLength", default=False, action='store_true',
-                        help="Split contigs by contig length")
-    parser.add_argument("--noCounts", default=False, action='store_true',
-                        help="Update dataset counts after split")
-    parser.add_argument("--chunks", default=0, type=int,
-                        help="Split contigs into <chunks> total windows")
-    parser.add_argument("--maxChunks", default=0, type=int,
-                        help="Split contig list into at most <chunks> groups")
-    parser.add_argument("--targetSize", default=5000, type=int,
-                        help="Target number of records per chunk")
-    parser.add_argument("--breakContigs", default=False, action='store_true',
-                        help="Break contigs to get closer to maxCounts")
-    parser.add_argument("--subdatasets", default=False, action='store_true',
-                        help="Split on subdatasets")
-    parser.add_argument("--outdir", default=False, type=str,
-                        help="Specify an output directory")
-    parser.add_argument("outfiles", nargs=argparse.REMAINDER,
-                        type=str, help="The resulting xml files")
-    parser.set_defaults(func=splitXml)
-
-def mergeXml(args):
-    dss = []
-    for infn in args.infiles:
-        dss.append(openDataSet(infn, strict=args.strict))
-    reduce(lambda ds1, ds2: ds1 + ds2, dss).write(args.outfile)
-
-def merge_options(parser):
-    parser.description = 'Combine XML (and BAM) files'
-    parser.add_argument("outfile", type=str,
-                        help="The resulting XML file")
-    #parser.add_argument("infiles", type=validate_file, nargs='+',
-    parser.add_argument("infiles", type=str, nargs='+',
-                        help="The XML files to merge")
-    parser.set_defaults(func=mergeXml)
-
-def loadStatsXml(args):
-    dset = openDataSet(args.infile, strict=args.strict)
-    dset.loadStats(args.statsfile)
-    if args.outfile:
-        dset.write(args.outfile, validate=False)
-    else:
-        dset.write(args.infile, validate=False)
-
-def loadStatsXml_options(parser):
-    parser.description = 'Load an sts.xml file into a DataSet XML file'
-    parser.add_argument("infile", type=str,
-                        help="The XML file to modify")
-    parser.add_argument("statsfile", type=str,
-                        help="The .sts.xml file to load")
-    parser.add_argument("--outfile", type=str, default=None,
-                        help="The XML file to output")
-    parser.set_defaults(func=loadStatsXml)
-
-def validateXml(args):
-    validateFile(args.infile, args.skipFiles)
-    print("{f} is valid DataSet XML with valid ResourceId "
-          "references".format(f=args.infile))
-
-def validate_options(parser):
-    parser.description = 'Validate XML and ResourceId files'
-    parser.add_argument("infile", type=str,
-                        help="The XML file to validate")
-    parser.add_argument("--skipFiles",
-                        default=False, action='store_true',
-                        help="Skip validating external resources")
-    parser.set_defaults(func=validateXml)
-
-def consolidateXml(args):
-    """Combine BAMs and apply the filters described in the XML file, producing
-    one consolidated XML"""
-    dset = openDataSet(args.infile)
-    dset.consolidate(args.datafile, numFiles=args.numFiles, useTmp=(not
-                     args.noTmp))
-    dset.write(args.xmlfile)
-
-def consolidate_options(parser):
-    parser.description = 'Consolidate the XML files'
-    #parser.add_argument("infile", type=validate_file,
-    parser.add_argument("--numFiles", type=int, default=1,
-                        help="The number of data files to produce (1)")
-    parser.add_argument("--noTmp", default=False, action='store_true',
-                        help="Don't copy to a tmp location to ensure local"
-                             " disk use")
-    parser.add_argument("infile", type=str,
-                        help="The XML file to consolidate")
-    parser.add_argument("datafile", type=str,
-                        help="The resulting data file")
-    parser.add_argument("xmlfile", type=str,
-                        help="The resulting XML file")
-    parser.set_defaults(func=consolidateXml)
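
The subcommands removed above (merge, loadStatsXml, validate, consolidate)
were thin wrappers over the DataSet API, which remains available from
pbcore.io. A minimal sketch of the equivalent library calls, assuming local
DataSet XML and sts.xml inputs (all file names below are placeholders):

    # Sketch of the library calls the removed subcommands wrapped; every
    # file name here is a placeholder.
    from pbcore.io import openDataSet
    from pbcore.io.dataset.DataSetValidator import validateFile

    # merge: combine several DataSet XMLs into one
    dsets = [openDataSet(fn) for fn in ("a.alignmentset.xml",
                                        "b.alignmentset.xml")]
    merged = reduce(lambda ds1, ds2: ds1 + ds2, dsets)
    merged.write("merged.alignmentset.xml")

    # loadStatsXml: attach an sts.xml to a DataSet and rewrite the XML
    dset = openDataSet("merged.alignmentset.xml")
    dset.loadStats("movie.sts.xml")
    dset.write("merged.alignmentset.xml", validate=False)

    # validate: check the XML and its ResourceId references
    validateFile("merged.alignmentset.xml", False)

    # consolidate: combine the underlying BAMs, applying the DataSet's filters
    dset.consolidate("consolidated.bam", numFiles=1, useTmp=True)
    dset.write("consolidated.alignmentset.xml")
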
diff --git a/pbcore/io/dataset/utils.py b/pbcore/io/dataset/utils.py
index a99690f..6881555 100755
--- a/pbcore/io/dataset/utils.py
+++ b/pbcore/io/dataset/utils.py
@@ -15,26 +15,28 @@ def consolidateBams(inFiles, outFile, filterDset=None, useTmp=True):
     """Take a list of infiles, an outFile to produce, and optionally a dataset
     (filters) to provide the definition and content of filtrations."""
     # check space available
+    final_free_space = disk_free(os.path.split(outFile)[0])
+    projected_size = sum(file_size(infn) for infn in inFiles)
+    log.debug("Projected size:            {p}".format(p=projected_size))
+    log.debug("In place free space:       {f}".format(f=final_free_space))
+    # give it a 5% buffer
+    if final_free_space < (projected_size * 1.05):
+        raise RuntimeError("No space available to consolidate")
     if useTmp:
         tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
-        local_free_space = disk_free(os.path.split(outFile)[0])
         tmp_free_space = disk_free(tmpout)
-        projected_size = sum(file_size(infn) for infn in inFiles)
-        log.debug("Projected size: {p}".format(p=projected_size))
-        log.debug("In place free space: {f}".format(f=local_free_space))
-        log.debug("Tmp free space: {f}".format(f=tmp_free_space))
-        if tmp_free_space > projected_size:
+        log.debug("Tmp free space (need ~2x): {f}".format(f=tmp_free_space))
+        # need 2x for tmp in and out, plus 10% buffer
+        if tmp_free_space > (projected_size * 2.1):
             log.debug("Using tmp storage: " + tmpout)
             tmpInFiles = _tmpFiles(inFiles, tmpout)
             origOutFile = outFile
             origInFiles = inFiles[:]
             inFiles = tmpInFiles
             outFile = os.path.join(tmpout, "outfile.bam")
-        elif local_free_space > projected_size:
+        else:
             log.debug("Using in place storage")
             useTmp = False
-        else:
-            raise RuntimeError("No space available to consolidate")
 
     if filterDset and filterDset.filters:
         finalOutfile = outFile
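
The revised check above always requires the destination volume to hold the
summed input size plus a 5% buffer, and only routes through tmp storage when
the tmp volume can hold roughly twice that size (copied inputs plus the
consolidated output, with a 10% buffer); otherwise it falls back to in-place
consolidation. A condensed sketch of that decision, reusing the module's
disk_free and file_size helpers as called above:

    # Condensed sketch of the space-check decision; disk_free and file_size
    # are the helpers used in the hunk above.
    import os
    import tempfile

    def _choose_workspace(inFiles, outFile):
        projected = sum(file_size(fn) for fn in inFiles)
        if disk_free(os.path.split(outFile)[0]) < projected * 1.05:
            raise RuntimeError("No space available to consolidate")
        tmpout = tempfile.mkdtemp(suffix="consolidation-filtration")
        # tmp must hold both the copied inputs and the consolidated output
        if disk_free(tmpout) > projected * 2.1:
            return tmpout
        return None  # consolidate in place
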
diff --git a/pbcore/io/rangeQueries.py b/pbcore/io/rangeQueries.py
index 0832696..814ecd3 100644
--- a/pbcore/io/rangeQueries.py
+++ b/pbcore/io/rangeQueries.py
@@ -128,8 +128,8 @@ def makeReadLocator(cmpH5, refSeq):
     quickly.
     """
     if not cmpH5.isSorted: raise Exception, "CmpH5 is not sorted"
-    offsets = cmpH5.file["/RefGroup/OffsetTable"].value
-    offStart, offEnd = offsets[offsets[:,0] == refSeq, 1:3].ravel()
+    refInfo = cmpH5.referenceInfo(refSeq)
+    offStart, offEnd = refInfo.StartRow, refInfo.EndRow
 
     if (offEnd - offStart > 0):
         refAlignIdx = cmpH5.alignmentIndex[offStart:offEnd, ]
diff --git a/pbcore/model/__init__.py b/pbcore/model/__init__.py
index 6861c47..b6fb7ab 100644
--- a/pbcore/model/__init__.py
+++ b/pbcore/model/__init__.py
@@ -27,3 +27,5 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 #################################################################################
+
+from baseRegions import *
diff --git a/pbcore/model/baseRegions.py b/pbcore/model/baseRegions.py
new file mode 100644
index 0000000..61dac5a
--- /dev/null
+++ b/pbcore/model/baseRegions.py
@@ -0,0 +1,192 @@
+#################################################################################
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Pacific Biosciences nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+# THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
+# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#################################################################################
+
+__all__ = [ "BaseRegionsMixin",
+            "ExtraBaseRegionsMixin",
+            "ADAPTER_REGION",
+            "INSERT_REGION",
+            "HQ_REGION" ]
+
+# Region types
+ADAPTER_REGION = 0
+INSERT_REGION  = 1
+HQ_REGION      = 2
+
+
+# Interval arithmetic
+def intersectRanges(r1, r2):
+    b1, e1 = r1
+    b2, e2 = r2
+    b, e = max(b1, b2), min(e1, e2)
+    return (b, e) if (b < e) else None
+
+def removeNones(lst):
+    return filter(lambda x: x is not None, lst)
+
+
+class BaseRegionsMixin(object):
+    """
+    Mixin class for "ZMW" client classes providing access to base
+    regions and reads sliced to those regions.
+
+    Requires the client subclass to provide 'regionTable' and 'read'
+    (subrange slicing) methods with the correct semantics.
+
+    The "...Region[s]" accessors here return one or more intervals ((int, int)).
+
+    *All accessors in this mixin class implicitly clip regions to the
+     HQ region.*
+    """
+    @property
+    def hqRegion(self):
+        """
+        Return the HQ region interval.
+
+        The HQ region is an interval of basecalls where the basecaller has
+        inferred that a single sequencing reaction is taking place.
+        Secondary analysis should only use subreads within the HQ
+        region.  Methods in this class, with the exception of the
+        "NoQC" methods, return data appropriately clipped/filtered to
+        the HQ region.
+        """
+        rt = self.regionTable
+        hqRows = rt[rt.regionType == HQ_REGION]
+        assert len(hqRows) == 1
+        hqRow = hqRows[0]
+        return hqRow.regionStart, hqRow.regionEnd
+
+    def _unclippedInsertRegions(self):
+        return [ (region.regionStart, region.regionEnd)
+                 for region in self.regionTable
+                 if region.regionType == INSERT_REGION ]
+
+    @property
+    def insertRegions(self):
+        """
+        Get insert regions as intervals, clipped to the HQ region
+        """
+        hqRegion = self.hqRegion
+        return removeNones([ intersectRanges(hqRegion, region)
+                             for region in self._unclippedInsertRegions() ])
+
+    @property
+    def subreads(self):
+        """
+        Get the subreads as a list of ZmwRead objects.  Restricts focus
+        to, and clips to, the HQ region.  This method can be used by
+        production code.
+        """
+        return [ self.read(readStart, readEnd)
+                 for (readStart, readEnd) in self.insertRegions ]
+
+    def _unclippedAdapterRegions(self):
+        return [ (region.regionStart, region.regionEnd)
+                 for region in self.regionTable
+                 if region.regionType == ADAPTER_REGION ]
+
+    @property
+    def adapterRegions(self):
+        """
+        Get adapter regions as intervals, performing clipping to the HQ region
+        """
+        hqRegion = self.hqRegion
+        return removeNones([ intersectRanges(hqRegion, region)
+                             for region in self._unclippedAdapterRegions() ])
+
+
+    @property
+    def adapters(self):
+        """
+        Get the adapter hits as a list of ZmwRead objects.  Restricts
+        focus to, and clips to, the HQ region.  This method can be used
+        by production code.
+        """
+        return [ self.read(readStart, readEnd)
+                 for (readStart, readEnd) in self.adapterRegions ]
+
+
+
+class ExtraBaseRegionsMixin(BaseRegionsMixin):
+    """
+    Mixin class for a "ZMW" class providing the basic region
+    accessors, and additional region accessors that do not clip to the
+    HQ region.
+
+    These accessors only make sense for bas.h5 ZMW objects, where we
+    have access to region information extending beyond the HQ region.
+    In the BAM world, "regions" are all inherently clipped to HQ by
+    design of PPA.
+
+    Requires 'readNoQC' accessor
+    """
+    @property
+    def adapterRegionsNoQC(self):
+        """
+        Get adapter regions as intervals, without clipping to the HQ
+        region.  Don't use this unless you know what you're doing.
+        """
+        return self._unclippedAdapterRegions()
+
+    @property
+    def adaptersNoQC(self):
+        """
+        Get the adapters, including data beyond the bounds of the HQ
+        region.
+
+        .. warning::
+
+            It is not recommended that production code use this method
+            as we make no guarantees about what happens outside of the
+            HQ region.
+        """
+        return [ self.read(readStart, readEnd)
+                 for (readStart, readEnd) in self.adapterRegionsNoQC ]
+
+    @property
+    def insertRegionsNoQC(self):
+        """
+        Get insert regions as intervals, without clipping to the HQ
+        region.  Don't use this unless you know what you're doing.
+        """
+        return self._unclippedInsertRegions()
+
+    @property
+    def subreadsNoQC(self):
+        """
+        Get the subreads, including data beyond the bounds of the HQ region.
+
+        .. warning::
+
+            It is not recommended that production code use this method
+            as we make no guarantees about what happens outside of the
+            HQ region.
+        """
+        return [ self.read(readStart, readEnd)
+                 for (readStart, readEnd) in self.insertRegionsNoQC ]
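
A minimal usage sketch of BaseRegionsMixin with a toy client supplying the
required 'regionTable' (a record array with regionType/regionStart/regionEnd
fields) and 'read' accessors; ToyZmw and its inputs are illustrative, not
part of pbcore:

    # Illustrative client of BaseRegionsMixin; ToyZmw and its region rows
    # are stand-ins, not pbcore classes.
    import numpy as np
    from pbcore.model import BaseRegionsMixin, HQ_REGION, INSERT_REGION

    class ToyZmw(BaseRegionsMixin):
        def __init__(self, regionRecords, basecalls):
            self._regions = np.rec.fromrecords(
                regionRecords, names="regionType, regionStart, regionEnd")
            self._basecalls = basecalls

        @property
        def regionTable(self):
            return self._regions

        def read(self, start, end):
            return self._basecalls[start:end]

    zmw = ToyZmw([(HQ_REGION, 10, 90),
                  (INSERT_REGION, 0, 40),
                  (INSERT_REGION, 50, 120)],
                 "A" * 120)
    print(zmw.hqRegion)       # (10, 90)
    print(zmw.insertRegions)  # [(10, 40), (50, 90)] -- clipped to the HQ region
    print(zmw.subreads)       # basecall slices for the clipped insert regions
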
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 0af0380..266b3e6 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +1,3 @@
 sphinx
 nose
+pyxb
diff --git a/requirements.txt b/requirements.txt
index 22e02eb..357619b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 cython
 numpy >= 1.7.1
 h5py >= 2.0.1
-pysam >= 0.8.4
+pysam == 0.8.4
diff --git a/tests/test_pbcore_io_BasH5Reader.py b/tests/test_pbcore_io_BasH5Reader.py
index 43cdfeb..caee1e2 100644
--- a/tests/test_pbcore_io_BasH5Reader.py
+++ b/tests/test_pbcore_io_BasH5Reader.py
@@ -484,9 +484,8 @@ class TestBasH5Reader_CCS(ReadIteratorTests):
 
             nose.tools.assert_equal(len(reader[zmw].insertRegions), 0)
             nose.tools.assert_equal(len(reader[zmw].adapterRegions), 0)
+            nose.tools.assert_equal(len(reader[zmw].subreads), 0)
 
-            with nose.tools.assert_raises(ValueError):
-                reader[zmw].subreads
 
             with nose.tools.assert_raises(ValueError):
                 reader[zmw].read()
diff --git a/tests/test_pbcore_io_unaligned_bam.py b/tests/test_pbcore_io_unaligned_bam.py
index 5e2f8ae..3d33ae3 100644
--- a/tests/test_pbcore_io_unaligned_bam.py
+++ b/tests/test_pbcore_io_unaligned_bam.py
@@ -69,7 +69,7 @@ class TestUnalignedBam(object):
 
     def testIpd(self):
         """Check that 'Ipd' feature is recognized correctly."""
-        pfa = self.bam.pulseFeaturesAvailable()
+        pfa = self.bam.baseFeaturesAvailable()
         EQ(pfa, frozenset(['Ipd', 'DeletionTag', 'MergeQV', 'SubstitutionQV',
                            'InsertionQV', 'DeletionQV']))
         ipd = self.bamRead0.IPD(aligned=False, orientation="native")
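
The rename above (pulseFeaturesAvailable -> baseFeaturesAvailable) reflects
that these tags describe per-basecall features. A quick sketch of checking a
BAM for a feature before pulling it, assuming the reader exposes
baseFeaturesAvailable() as this test exercises (the path is a placeholder):

    # Sketch: query per-basecall features on an unaligned BAM; the path is
    # a placeholder and the available features depend on the file.
    from pbcore.io import IndexedBamReader

    bam = IndexedBamReader("movie.subreads.bam")
    if "Ipd" in bam.baseFeaturesAvailable():
        first = next(iter(bam))
        ipd = first.IPD(aligned=False, orientation="native")
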
diff --git a/tests/test_pbdataset.py b/tests/test_pbdataset.py
index f571fb7..13ea083 100644
--- a/tests/test_pbdataset.py
+++ b/tests/test_pbdataset.py
@@ -8,6 +8,7 @@ import tempfile
 import numpy as np
 import unittest
 import shutil
+from random import shuffle
 from unittest.case import SkipTest
 
 from pbcore.io import PacBioBamIndex, IndexedBamReader
@@ -15,10 +16,10 @@ from pbcore.io import openIndexedAlignmentFile
 from pbcore.io.dataset.utils import BamtoolsVersion
 from pbcore.io import (DataSet, SubreadSet, ReferenceSet, AlignmentSet,
                        openDataSet, DataSetMetaTypes, HdfSubreadSet,
-                       ConsensusReadSet, ConsensusAlignmentSet)
-from pbcore.io.dataset.DataSetIO import _dsIdToSuffix
+                       ConsensusReadSet, ConsensusAlignmentSet, openDataFile,
+                       divideKeys, keysToRanges)
+from pbcore.io.dataset.DataSetIO import _dsIdToSuffix, InvalidDataSetIOError
 from pbcore.io.dataset.DataSetMembers import ExternalResource, Filters
-from pbcore.io.dataset.DataSetWriter import toXml
 from pbcore.io.dataset.DataSetValidator import validateFile
 from pbcore.util.Process import backticks
 import pbcore.data.datasets as data
@@ -27,11 +28,6 @@ import pbcore.data as upstreamdata
 log = logging.getLogger(__name__)
 
 def _check_constools():
-    cmd = "dataset.py"
-    o, r, m = backticks(cmd)
-    if r != 2:
-        return False
-
     if not BamtoolsVersion().good:
         log.warn("Bamtools not found or out of date")
         return False
@@ -85,7 +81,8 @@ class TestDataSet(unittest.TestCase):
         ds1 = DataSet(data.getFofn())
         self.assertEquals(ds1.numExternalResources, 2)
         # New! Use the correct constructor:
-        self.assertEquals(type(SubreadSet(data.getSubreadSet())).__name__,
+        self.assertEquals(type(SubreadSet(data.getSubreadSet(),
+                                          skipMissing=True)).__name__,
                           'SubreadSet')
         # Even with untyped inputs
         self.assertTrue(str(SubreadSet(data.getBam())).startswith(
@@ -113,45 +110,6 @@ class TestDataSet(unittest.TestCase):
             ds.externalResources[-1].indices[0].resourceId ==
             "IdontExist.bam.pbi")
 
-    @unittest.skipIf(not _check_constools(),
-                     "bamtools or pbindex not found, skipping")
-    def test_split_cli(self):
-        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
-        cmd = "dataset.py split --outdir {o} --contigs --chunks 2 {d}".format(
-            o=outdir,
-            d=data.getXml(8))
-        log.debug(cmd)
-        o, r, m = backticks(cmd)
-        self.assertEqual(r, 0)
-        self.assertTrue(os.path.exists(
-            os.path.join(outdir, os.path.basename(data.getXml(15)))))
-        self.assertTrue(os.path.exists(
-            os.path.join(outdir, os.path.basename(data.getXml(16)))))
-
-    @unittest.skipIf(not _check_constools(),
-                     "bamtools or pbindex not found, skipping")
-    def test_filter_cli(self):
-        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
-        outfn = os.path.join(outdir, "filtered8.xml")
-        log.debug(outfn)
-        cmd = "dataset.py filter {i} {o} {f}".format(
-            i=data.getXml(8),
-            o=outfn,
-            f="rname=E.faecalis.1")
-        log.debug(cmd)
-        o, r, m = backticks(cmd)
-        if r != 0:
-            log.debug(m)
-        self.assertEqual(r, 0)
-        self.assertTrue(os.path.exists(outfn))
-        aln = AlignmentSet(data.getXml(8))
-        aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
-        aln.updateCounts()
-        dset = AlignmentSet(outfn)
-        self.assertEqual(str(aln.filters), str(dset.filters))
-        self.assertEqual(aln.totalLength, dset.totalLength)
-        self.assertEqual(aln.numRecords, dset.numRecords)
-
     def test_merge_subdatasets(self):
         # from data file
         ds1 = AlignmentSet(data.getBam(0))
@@ -191,45 +149,6 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(merged.subdatasets[1].toExternalFiles(),
                          AlignmentSet(data.getXml(11)).toExternalFiles())
 
-    @unittest.skipIf(not _check_constools(),
-                     "bamtools or pbindex not found, skipping")
-    def test_create_cli(self):
-        log.debug("Absolute")
-        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
-        cmd = "dataset.py create --type AlignmentSet {o} {i1} {i2}".format(
-            o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
-            i1=data.getXml(8), i2=data.getXml(11))
-        log.debug(cmd)
-        o, r, m = backticks(cmd)
-        self.assertEqual(r, 0)
-        self.assertTrue(os.path.exists(
-            os.path.join(outdir, os.path.basename(data.getXml(12)))))
-
-        log.debug("Relative")
-        cmd = ("dataset.py create --relative --type AlignmentSet "
-               "{o} {i1} {i2}".format(
-                   o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
-                   i1=data.getXml(8),
-                   i2=data.getXml(11)))
-        log.debug(cmd)
-        o, r, m = backticks(cmd)
-        self.assertEqual(r, 0)
-        self.assertTrue(os.path.exists(
-            os.path.join(outdir, os.path.basename(data.getXml(12)))))
-
-        log.debug("Emtpy ContigSet")
-        emptyfastafile = tempfile.NamedTemporaryFile(suffix=".fasta")
-        emptyfasta = emptyfastafile.name
-        emptycontigset = os.path.join(outdir, 'empty.contigset.xml')
-        cmd = ("dataset.py create --relative --type ContigSet "
-               "{o} {i}".format(
-                   o=emptycontigset,
-                   i=emptyfasta))
-        log.debug(cmd)
-        o, r, m = backticks(cmd)
-        self.assertEqual(r, 0)
-        self.assertTrue(os.path.exists(emptycontigset))
-
     def test_empty_metatype(self):
         inBam = data.getBam()
         d = DataSet(inBam)
@@ -255,6 +174,24 @@ class TestDataSet(unittest.TestCase):
         dset.updateCounts()
         dset.index
         self.assertEqual(len(dset.resourceReaders()), 1)
+        self.assertEqual(
+            len(dset.split(zmws=True, maxChunks=12)),
+            1)
+
+        # empty and full:
+        full_bam = SubreadSet(data.getXml(10)).toExternalFiles()[0]
+        dset = SubreadSet(upstreamdata.getEmptyBam(), full_bam)
+        self.assertEqual(dset.numRecords, 92)
+        self.assertEqual(dset.totalLength, 124093)
+        self.assertEqual(len(list(dset)), 92)
+        dset.updateCounts()
+        self.assertNotEqual(list(dset.index), [])
+        self.assertEqual(len(dset.resourceReaders()), 2)
+        # there are 9 reads in this set, < the minimum chunk size
+        self.assertEqual(
+            len(dset.split(zmws=True, maxChunks=12)),
+            2)
+
 
         dset = AlignmentSet(upstreamdata.getEmptyBam())
         self.assertEqual(dset.numRecords, 0)
@@ -263,6 +200,23 @@ class TestDataSet(unittest.TestCase):
         dset.updateCounts()
         dset.index
         self.assertEqual(len(dset.resourceReaders()), 1)
+        # there is a minimum chunk size here:
+        self.assertEqual(
+            len(dset.split(contigs=True, maxChunks=12, breakContigs=True)),
+            1)
+
+        # empty and full:
+        dset = AlignmentSet(upstreamdata.getEmptyBam(), data.getBam())
+        self.assertEqual(dset.numRecords, 92)
+        self.assertEqual(dset.totalLength, 123588)
+        self.assertEqual(len(list(dset)), 92)
+        dset.updateCounts()
+        self.assertNotEqual(list(dset.index), [])
+        self.assertEqual(len(dset.resourceReaders()), 2)
+        # there are 9 reads in this set, < the minimum chunk size
+        self.assertEqual(
+            len(dset.split(zmws=True, maxChunks=12)),
+            2)
 
         dset = ConsensusReadSet(upstreamdata.getEmptyBam())
         self.assertEqual(dset.numRecords, 0)
@@ -291,6 +245,25 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(len(list(dset)), 0)
         dset.updateCounts()
         self.assertEqual(len(dset.resourceReaders()), 1)
+        self.assertEqual(
+            len(dset.split(zmws=True, maxChunks=12)),
+            1)
+
+        # empty and full:
+        full_bam = SubreadSet(data.getXml(10)).toExternalFiles()[0]
+        dset = SubreadSet(outpath, full_bam)
+        self.assertEqual(len(dset.resourceReaders()), 2)
+        dset.updateCounts()
+        # without a pbi, updating counts is broken
+        self.assertEqual(dset.numRecords, 0)
+        self.assertEqual(dset.totalLength, 0)
+        self.assertEqual(len(list(dset)), 92)
+        with self.assertRaises(IOError):
+            self.assertNotEqual(list(dset.index), [])
+        self.assertEqual(
+            len(dset.split(zmws=True, maxChunks=12)),
+            1)
+
 
         dset = AlignmentSet(outpath)
         self.assertEqual(dset.numRecords, 0)
@@ -298,6 +271,23 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(len(list(dset)), 0)
         dset.updateCounts()
         self.assertEqual(len(dset.resourceReaders()), 1)
+        self.assertEqual(
+            len(dset.split(contigs=True, maxChunks=12, breakContigs=True)),
+            1)
+
+        # empty and full:
+        dset = AlignmentSet(outpath, data.getBam())
+        # without a pbi, updating counts is broken
+        self.assertEqual(dset.numRecords, 0)
+        self.assertEqual(dset.totalLength, 0)
+        self.assertEqual(len(list(dset)), 92)
+        dset.updateCounts()
+        with self.assertRaises(IOError):
+            self.assertNotEqual(list(dset.index), [])
+        self.assertEqual(len(dset.resourceReaders()), 2)
+        self.assertEqual(
+            len(dset.split(zmws=True, maxChunks=12)),
+            1)
 
         dset = ConsensusReadSet(outpath)
         self.assertEqual(dset.numRecords, 0)
@@ -320,6 +310,45 @@ class TestDataSet(unittest.TestCase):
         dset.updateCounts()
         self.assertEqual(len(dset.resourceReaders()), 1)
 
+    def test_read_ranges(self):
+        # This models the old and new ways by which Genomic Consensus generates
+        # lists of paired tStarts and tEnds.
+
+        full_bam = upstreamdata.getBamAndCmpH5()[0]
+        empty_bam = data.getEmptyAlignedBam()
+        file_lists = [[full_bam, empty_bam],
+                      [empty_bam, full_bam]]
+        refId_list = ['lambda_NEB3011', 0]
+        minMapQV = 30
+
+        for file_list in file_lists:
+            for refId in refId_list:
+                alnFile = AlignmentSet(*file_list)
+
+                # old GC:
+                # TODO(mdsmith)(2016-01-27): Remove assertRaises when tId
+                # accession is fixed for empty aligned bam+pbi files
+                with self.assertRaises(AttributeError):
+                    for reader in alnFile.resourceReaders(refId):
+                        reader.index.tId
+                        refLen = reader.referenceInfo(refId).Length
+                        rows = reader.readsInRange(refId, 0, refLen,
+                                                   justIndices=True)
+
+                # new GC (just testing that it doesn't raise exceptions):
+                rows = alnFile.index[
+                    ((alnFile.tId == alnFile.referenceInfo(refId).ID) &
+                     (alnFile.mapQV >= minMapQV))]
+
+                unsorted_tStart = rows.tStart
+                unsorted_tEnd = rows.tEnd
+
+                # Sort (expected by CoveredIntervals)
+                sort_order = np.lexsort((unsorted_tEnd, unsorted_tStart))
+                tStart = unsorted_tStart[sort_order].tolist()
+                tEnd = unsorted_tEnd[sort_order].tolist()
+
+
 
     def test_loading_reference(self):
         log.info('Opening Reference')
@@ -354,6 +383,11 @@ class TestDataSet(unittest.TestCase):
                 ds = openDataSet(infn, strict=False)
             self.assertEqual(type(ds), exp)
 
+    def test_openDataSet_unicode(self):
+        # Test that we can open a unicode filename
+        fn = data.getXml(8)
+        aln = openDataSet(unicode(fn))
+
     def test_type_checking(self):
         bam = data.getBam()
         fasta = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
@@ -490,14 +524,19 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(expLen, accLen)
         self.assertEqual(expNum, accNum)
 
-    # TODO: turn this back on once a bam is found with a different
-    # referenceInfoTable
-    @SkipTest
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
     def test_referenceInfoTableMerging(self):
         log.info("Testing refIds, etc. after merging")
-        ds = DataSet(data.getXml(17))
-        also_lambda = ds.toExternalFiles()[0]
-        aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
+        bam1 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+                "2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c100713652400000001823152"
+                "404301534_s1_p0.1.aligned.bam")
+        bam2 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+                "2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c100713652400000001823152"
+                "404301534_s1_p0.3.aligned.bam")
+        aln = AlignmentSet(bam1, bam2)
         readers = aln.resourceReaders()
 
         ids = sorted([i for _, i in aln.refInfo('ID')])
@@ -570,24 +609,121 @@ class TestDataSet(unittest.TestCase):
         self.assertTrue(ds1.totalLength == ds1tl)
         self.assertTrue(ds2.totalLength == ds2tl)
 
+    def test_split_zmws(self):
+        N_RECORDS = 117
+        test_file = upstreamdata.getUnalignedBam()
+        ds1 = openDataFile(test_file)
+        self.assertEqual(len([r for r in ds1]), N_RECORDS)
+        self.assertEqual(len(ds1), N_RECORDS)
+        dss = ds1.split(chunks=1, zmws=True)
+        self.assertEqual(len(dss), 1)
+        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+
+        # We have a lower limit on the number of zmws, now
+        dss = ds1.split(chunks=12, zmws=True)
+        self.assertEqual(len(dss), 2)
+        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(
+            dss[0].zmwRanges,
+            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+              1650, 32328)])
+        self.assertEqual(
+            dss[-1].zmwRanges,
+            [('m140905_042212_sidney_c100564852550000001823085912221377_s1_X0',
+              32560, 54396)])
+        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+        interspans = []
+        last = None
+        for rg in ranges:
+            if not last is None:
+                interspans.append((last, rg[0]))
+                self.assertFalse(last == rg[0])
+            last = rg[1]
+        for rg in interspans:
+            self.assertEqual(len(np.nonzero(np.logical_and(
+                ds1.index.holeNumber < rg[1],
+                ds1.index.holeNumber > rg[0]))[0]), 0)
+
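
As the assertions above show, split(zmws=True) caps the number of chunks by a
minimum ZMW count per chunk and reports each chunk's extent via zmwRanges as
(movie, first hole number, last hole number) tuples. A minimal sketch over
the bundled unaligned BAM:

    # Sketch: chunk a dataset by ZMW ranges (bundled test data).
    from pbcore.io import openDataFile
    import pbcore.data as upstreamdata

    ds = openDataFile(upstreamdata.getUnalignedBam())
    for chunk in ds.split(chunks=12, zmws=True):
        for movie, first, last in chunk.zmwRanges:
            print("%s: holes %d-%d, %d records" % (movie, first, last,
                                                   len(chunk)))
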
     @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
                          "Missing testadata directory")
-    def test_split_zmws(self):
-        test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/2372215/"
-                     "0007_micro/Analysis_Results/m150404_101626_42267_c"
-                     "100807920800000001823174110291514_s1_p0.all."
-                     "subreadset.xml")
-        ds1 = openDataSet(test_file)
-        self.assertEqual(len([r for r in ds1]), 1220)
+    def test_large_split_zmws(self):
+        N_RECORDS = 959539
+        test_file = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                     "2372215/0007/Analysis_Results/m150404_101626_42"
+                     "267_c100807920800000001823174110291514_s1_p0.al"
+                     "l.subreadset.xml")
+        ds1 = openDataFile(test_file)
+        self.assertEqual(len(ds1), N_RECORDS)
+        dss = ds1.split(chunks=1, zmws=True)
+        self.assertEqual(len(dss), 1)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+        dss = ds1.split(chunks=12, zmws=True)
+        self.assertEqual(len(dss), 12)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+        self.assertEqual(
+            dss[0].zmwRanges,
+            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+              7, 14007)])
+        self.assertEqual(
+            dss[-1].zmwRanges,
+            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
+              149876, 163475)])
+        ranges = sorted([c.zmwRanges[0][1:] for c in dss])
+        interspans = []
+        last = None
+        for rg in ranges:
+            if not last is None:
+                interspans.append((last, rg[0]))
+                self.assertFalse(last == rg[0])
+            last = rg[1]
+        for rg in interspans:
+            self.assertEqual(len(np.nonzero(np.logical_and(
+                ds1.index.holeNumber < rg[1],
+                ds1.index.holeNumber > rg[0]))[0]), 0)
+
+
+    @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
+                         "Missing testdata directory")
+    def test_multi_movie_split_zmws(self):
+        N_RECORDS = 1745161
+        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                       "2372215/0007/Analysis_Results/m150404_101626_42"
+                       "267_c100807920800000001823174110291514_s1_p0.al"
+                       "l.subreadset.xml")
+        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
+                       "2590980/0008/Analysis_Results/m141115_075238_et"
+                       "han_c100699872550000001823139203261572_s1_p0.al"
+                       "l.subreadset.xml")
+        ds1 = SubreadSet(test_file_1, test_file_2)
+        # used to get total:
+        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
+        self.assertEqual(len(ds1), N_RECORDS)
         dss = ds1.split(chunks=1, zmws=True)
-        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]), 1220)
-        dss = ds1.split(chunks=9, zmws=True)
-        self.assertEqual(sum([len([r for r in ds_]) for ds_ in dss]), 1220)
+        self.assertEqual(len(dss), 1)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
+
+        dss = ds1.split(chunks=12, zmws=True)
+        self.assertEqual(len(dss), 12)
+        self.assertEqual(sum([len(ds_) for ds_ in dss]),
+                         N_RECORDS)
         self.assertEqual(
             dss[0].zmwRanges,
             [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
-              55, 1815)])
-        #55, 2865)])
+              7, 22098)])
+        self.assertEqual(
+            dss[-1].zmwRanges,
+            [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
+              127814, 163468)])
+
 
     @unittest.skipUnless(os.path.isdir("/pbi/dept/secondary/siv/testdata"),
                          "Missing testadata directory")
@@ -662,6 +798,38 @@ class TestDataSet(unittest.TestCase):
         self.assertTrue(str(ds2.filters).startswith(
             '( rname = E.faecalis'))
 
+    def test_setFilters(self):
+        ds1 = DataSet()
+        filt = Filters()
+        filt.addRequirement(rq=[('>', '0.85')])
+        ds1.addFilters(filt)
+        self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
+        # Or added from a source XML
+        ds2 = DataSet()
+        ds2.filters = ds1.filters
+        self.assertEquals(str(ds2.filters), '( rq > 0.85 )')
+
+
+    def test_add_double_bound_filters(self):
+        ds1 = AlignmentSet(data.getXml(8))
+        ds1.filters.addRequirement(rq=[('>', '0.85'),
+                                       ('<', '0.99')])
+        self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')
+
+        ds1 = AlignmentSet(data.getXml(8))
+        self.assertEquals(str(ds1.filters), '')
+        ds1.filters.addFilter(rq=[('>', '0.85'),
+                                  ('<', '0.99')])
+        self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')
+
+        ds1.filters.addFilter(length=[('>', '1000')])
+        self.assertEquals(str(ds1.filters),
+                          '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')
+
+        ds1.filters.removeFilter(0)
+        self.assertEquals(str(ds1.filters),
+                          '( length > 1000 )')
+
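
The distinction exercised above: addRequirement splits multiple criteria for
the same field into alternative (OR) filters, addFilter appends a single
filter whose criteria are AND-ed together, and removeFilter drops a filter by
position. A compact sketch against the bundled alignment set:

    # Sketch: AND within a filter, OR between filters (bundled test data).
    from pbcore.io import AlignmentSet
    import pbcore.data.datasets as data

    ds = AlignmentSet(data.getXml(8))
    ds.filters.addFilter(rq=[('>', '0.85'), ('<', '0.99')])
    ds.filters.addFilter(length=[('>', '1000')])
    print(str(ds.filters))  # ( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )
    ds.filters.removeFilter(0)
    print(str(ds.filters))  # ( length > 1000 )
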
     def test_addMetadata(self):
         ds = DataSet()
         ds.addMetadata(None, Name='LongReadsRock')
@@ -712,7 +880,7 @@ class TestDataSet(unittest.TestCase):
 
     def test_toFofn(self):
         self.assertEquals(DataSet("bam1.bam", "bam2.bam",
-                                  strict=False).toFofn(),
+                                  strict=False, skipMissing=True).toFofn(),
                           ['bam1.bam', 'bam2.bam'])
         realDS = DataSet(data.getXml(8))
         files = realDS.toFofn()
@@ -725,12 +893,14 @@ class TestDataSet(unittest.TestCase):
         self.assertFalse(os.path.isabs(files[0]))
 
     def test_toExternalFiles(self):
-        bogusDS = DataSet("bam1.bam", "bam2.bam", strict=False)
+        bogusDS = DataSet("bam1.bam", "bam2.bam", strict=False,
+                          skipMissing=True)
         self.assertEqual(['bam1.bam', 'bam2.bam'],
                          bogusDS.externalResources.resourceIds)
-        self.assertEquals(DataSet("bam1.bam", "bam2.bam",
-                                  strict=False).toExternalFiles(),
-                          ['bam1.bam', 'bam2.bam'])
+        self.assertEquals(
+            DataSet("bam1.bam", "bam2.bam", strict=False,
+                    skipMissing=True).toExternalFiles(),
+                    ['bam1.bam', 'bam2.bam'])
         realDS = DataSet(data.getXml(8))
         files = realDS.toExternalFiles()
         self.assertEqual(len(files), 1)
@@ -886,6 +1056,12 @@ class TestDataSet(unittest.TestCase):
         for r1, r2 in itertools.izip(reads1, reads2):
             self.assertEqual(r1, r2)
 
+    def test_n_subreads_filter(self):
+        ds2 = AlignmentSet(data.getXml(8))
+        ds2.filters.addRequirement(n_subreads=[('>', '4')])
+        self.assertEqual(len(list(ds2.records)), 87)
+        self.assertEqual(len(ds2), 87)
+
     def test_filter(self):
         ds2 = AlignmentSet(data.getXml(8))
         ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
@@ -895,6 +1071,111 @@ class TestDataSet(unittest.TestCase):
         ds2.enableFilters()
         self.assertEqual(len(list(ds2.records)), 20)
 
+    def test_context_filters(self):
+        ss = SubreadSet(upstreamdata.getUnalignedBam())
+        self.assertEqual(set(ss.index.contextFlag), {0, 1, 2, 3})
+        self.assertEqual(
+            [len(np.flatnonzero(ss.index.contextFlag == cx))
+             for cx in sorted(set(ss.index.contextFlag))],
+            [15, 33, 32, 37])
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapters/barcodes
+        ss.filters.addRequirement(cx=[('=', 0)])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapters/barcodes
+        ss.filters.addRequirement(cx=[('=', 'NO_LOCAL_CONTEXT')])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # some adapters/barcodes
+        ss.filters.addRequirement(cx=[('!=', 0)])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before
+        ss.filters.addRequirement(cx=[('&', 1)])
+        self.assertEqual(len(ss.index), 70)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before
+        ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE')])
+        self.assertEqual(len(ss.index), 70)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter after
+        ss.filters.addRequirement(cx=[('&', 2)])
+        self.assertEqual(len(ss.index), 69)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after
+        ss.filters.addRequirement(cx=[('&', 3)])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after
+        ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE | ADAPTER_AFTER')])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after but not both
+        ss.filters.addRequirement(cx=[('!=', 0)])
+        ss.filters.addRequirement(cx=[('~', 1),
+                                      ('~', 2)])
+        self.assertEqual(len(ss.index), 65)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before or after
+        ss.filters.addRequirement(cx=[('&', 1),
+                                      ('&', 2)])
+        self.assertEqual(len(ss.index), 102)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before and after
+        ss.filters.addRequirement(cx=[('&', 1)])
+        ss.filters.addRequirement(cx=[('&', 2)])
+        self.assertEqual(len(ss.index), 37)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # adapter before but not after
+        ss.filters.addRequirement(cx=[('&', 1)])
+        ss.filters.addRequirement(cx=[('~', 2)])
+        self.assertEqual(len(ss.index), 33)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapter before
+        ss.filters.addRequirement(cx=[('~', 1)])
+        self.assertEqual(len(ss.index), 47)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapter before or after
+        ss.filters.addRequirement(cx=[('~', 1)])
+        ss.filters.addRequirement(cx=[('~', 2)])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
+        # no adapter before or after
+        ss.filters.addRequirement(cx=[('~', 3)])
+        self.assertEqual(len(ss.index), 15)
+        ss.filters.removeRequirement('cx')
+        self.assertEqual(len(ss.index), 117)
+
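
As the counts above suggest, the cx filter treats a subread's local context
as a bitmask (ADAPTER_BEFORE and ADAPTER_AFTER, also accepted by name), with
'&' keeping records that have the given bits set and '~' keeping records that
have them clear; criteria added in separate addRequirement calls must all
hold. A minimal sketch over the bundled unaligned BAM:

    # Sketch: bitwise context-flag filtering (bundled test data).
    from pbcore.io import SubreadSet
    import pbcore.data as upstreamdata

    ss = SubreadSet(upstreamdata.getUnalignedBam())
    ss.filters.addRequirement(cx=[('&', 'ADAPTER_BEFORE')])  # adapter before...
    ss.filters.addRequirement(cx=[('~', 2)])                 # ...but not after
    n_left_flanked_only = len(ss.index)
    ss.filters.removeRequirement('cx')                       # back to all records
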
     @SkipTest
     def test_split_by_contigs_presplit(self):
         # Consumes too much memory for Jenkins
@@ -1195,6 +1476,76 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
                                          ('E.faecalis.2', 100, 299)])
 
+    def test_intervalContour(self):
+        ds = AlignmentSet(data.getBam(0))
+        coverage = ds.intervalContour('E.faecalis.1')
+        ds.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
+        # regular totalLength uses aEnd/aStart, which includes insertions
+        totalTargetLength = sum(ds.index.tEnd - ds.index.tStart)
+        self.assertEqual(totalTargetLength, sum(coverage))
+
+        # partial interval
+        ds = AlignmentSet(data.getBam(0))
+        coverage = ds.intervalContour('E.faecalis.1', tStart=100, tEnd=500)
+        ds.filters.addRequirement(rname=[('=', 'E.faecalis.1')],
+                                  tStart=[('<', '500')],
+                                  tEnd=[('>', '100')])
+        # regular totalLength uses aEnd/aStart, which includes insertions
+        ends = ds.index.tEnd
+        post = ends > 500
+        ends[post] = 500
+        starts = ds.index.tStart
+        pre = starts < 100
+        starts[pre] = 100
+        totalTargetLength = sum(ends - starts)
+        self.assertEqual(totalTargetLength, sum(coverage))
+
+        # test a second reference in this set
+        ds.filters.removeRequirement('rname')
+        coverage = ds.intervalContour('E.faecalis.2')
+        ds.filters.addRequirement(rname=[('=', 'E.faecalis.2')])
+        totalTargetLength = sum(ds.index.tEnd - ds.index.tStart)
+        self.assertEqual(totalTargetLength, sum(coverage))
+
+        # partial interval
+        ds = AlignmentSet(data.getBam(0))
+        coverage = ds.intervalContour('E.faecalis.2', tStart=100, tEnd=500)
+        ds.filters.addRequirement(rname=[('=', 'E.faecalis.2')],
+                                  tStart=[('<', '500')],
+                                  tEnd=[('>', '100')])
+        # regular totalLength uses aEnd/aStart, which includes insertions
+        ends = ds.index.tEnd
+        post = ends > 500
+        ends[post] = 500
+        starts = ds.index.tStart
+        pre = starts < 100
+        starts[pre] = 100
+        totalTargetLength = sum(ends - starts)
+        self.assertEqual(totalTargetLength, sum(coverage))
+
+
+        # test a cmp.h5 alignmentset
+        ds = AlignmentSet(upstreamdata.getBamAndCmpH5()[1])
+        coverage = ds.intervalContour('lambda_NEB3011')
+        totalTargetLength = sum(ds.index.tEnd - ds.index.tStart)
+        self.assertEqual(totalTargetLength, sum(coverage))
+
+        # partial interval
+        ds = AlignmentSet(upstreamdata.getBamAndCmpH5()[1])
+        coverage = ds.intervalContour('lambda_NEB3011', tStart=100, tEnd=500)
+        ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011')],
+                                  tStart=[('<', '500')],
+                                  tEnd=[('>', '100')])
+        # regular totalLength uses aEnd/aStart, which includes insertions
+        starts = ds.index.tStart
+        ends = ds.index.tEnd
+        post = ends > 500
+        ends[post] = 500
+        pre = starts < 100
+        starts[pre] = 100
+        totalTargetLength = sum(ends - starts)
+        self.assertEqual(totalTargetLength, sum(coverage))
+
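
intervalContour returns per-base coverage over a reference (optionally
restricted to a window), and the checks above equate its sum with the
alignments' tStart/tEnd spans clipped to that window. A minimal sketch over
the bundled data:

    # Sketch: summed coverage over a window equals the clipped alignment spans.
    from pbcore.io import AlignmentSet
    import pbcore.data.datasets as data

    ds = AlignmentSet(data.getBam(0))
    coverage = ds.intervalContour('E.faecalis.1', tStart=100, tEnd=500)
    total_aligned_bases = sum(coverage)
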
 
     def test_refLengths(self):
         ds = AlignmentSet(data.getBam(0))
@@ -1420,6 +1771,10 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(len(readers[2].readGroupTable), 1)
         self.assertEqual(len(aln.readGroupTable), 3)
 
+    def test_missing_file(self):
+        with self.assertRaises(InvalidDataSetIOError):
+            aln = AlignmentSet("NOPE")
+
     def test_repr(self):
         ds = DataSet(data.getBam())
         rep = str(ds)
@@ -1428,6 +1783,133 @@ class TestDataSet(unittest.TestCase):
                                   rep))
         self.assertTrue(re.search('pbalchemysim0.pbalign.bam', rep))
 
+    def test_stats_metadata_zero_binwidth(self):
+        # both zero
+        ds1 = DataSet(data.getXml(8))
+        ds1.loadStats(data.getStats())
+        ds2 = DataSet(data.getXml(11))
+        ds2.loadStats(data.getStats())
+        ds1.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds1.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds1.metadata.summaryStats.readLenDist.binWidth = 0
+        ds2.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds2.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds2.metadata.summaryStats.readLenDist.binWidth = 0
+        ds3 = ds1 + ds2
+        self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+
+        # one zero
+        ds1 = DataSet(data.getXml(8))
+        ds1.loadStats(data.getStats())
+        ds2 = DataSet(data.getXml(11))
+        ds2.loadStats(data.getStats())
+        ds1.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds1.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds1.metadata.summaryStats.readLenDist.binWidth = 0
+        ds2.metadata.summaryStats.readLenDist.bins = (
+            [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+        ds2.metadata.summaryStats.readLenDist.minBinValue = 20
+        ds2.metadata.summaryStats.readLenDist.binWidth = 10
+        ds3 = ds1 + ds2
+        self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+
+        # other zero
+        ds1 = DataSet(data.getXml(8))
+        ds1.loadStats(data.getStats())
+        ds2 = DataSet(data.getXml(11))
+        ds2.loadStats(data.getStats())
+        ds1.metadata.summaryStats.readLenDist.bins = (
+            [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+        ds1.metadata.summaryStats.readLenDist.minBinValue = 10
+        ds1.metadata.summaryStats.readLenDist.binWidth = 10
+        ds2.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds2.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds2.metadata.summaryStats.readLenDist.binWidth = 0
+        ds3 = ds1 + ds2
+        self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+
+        # one zero more zero
+        ds1 = DataSet(data.getXml(8))
+        ds1.loadStats(data.getStats())
+        ds2 = DataSet(data.getXml(11))
+        ds2.loadStats(data.getStats())
+        ds3 = DataSet(data.getXml(11))
+        ds3.loadStats(data.getStats())
+        ds1.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds1.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds1.metadata.summaryStats.readLenDist.binWidth = 0
+        ds2.metadata.summaryStats.readLenDist.bins = (
+            [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
+        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+                         [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
+        ds2.metadata.summaryStats.readLenDist.minBinValue = 20
+        ds2.metadata.summaryStats.readLenDist.binWidth = 10
+        ds3.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds3.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds3.metadata.summaryStats.readLenDist.binWidth = 0
+        ds4 = ds1 + ds2 + ds3
+        self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+        self.assertEqual(ds4.metadata.summaryStats.readLenDist.bins,
+                         [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
+
+        # other zero
+        ds1 = DataSet(data.getXml(8))
+        ds1.loadStats(data.getStats())
+        ds2 = DataSet(data.getXml(11))
+        ds2.loadStats(data.getStats())
+        ds3 = DataSet(data.getXml(11))
+        ds3.loadStats(data.getStats())
+        ds1.metadata.summaryStats.readLenDist.bins = (
+            [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+        self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
+                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+        ds1.metadata.summaryStats.readLenDist.minBinValue = 10
+        ds1.metadata.summaryStats.readLenDist.binWidth = 10
+        ds2.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds2.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds2.metadata.summaryStats.readLenDist.binWidth = 0
+        ds3.metadata.summaryStats.readLenDist.bins = (
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
+                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+        ds3.metadata.summaryStats.readLenDist.minBinValue = 0
+        ds3.metadata.summaryStats.readLenDist.binWidth = 0
+        ds4 = ds1 + ds2 + ds3
+        self.assertEqual(len(ds3.metadata.summaryStats.readLenDists), 1)
+        self.assertEqual(ds4.metadata.summaryStats.readLenDist.bins,
+                         [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
+
     def test_stats_metadata(self):
         ds = DataSet(data.getBam())
         ds.loadStats(data.getStats())
@@ -1527,4 +2009,439 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(
             2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
         self.assertEqual(
-            150292.0, ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
+            150292.0,
+            ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_merged_cmp(self):
+        cmp1 = ("/pbi/dept/secondary/siv/testdata/pbreports"
+                "-unittest/data/sat/aligned_reads.cmp.h5")
+        cmp2 = upstreamdata.getBamAndCmpH5()[1]
+        aln0 = AlignmentSet(cmp1)
+        self.assertEqual(aln0.referenceInfoTable['EndRow'][0], 338001)
+        self.assertEqual(len(aln0), 338002)
+        aln1 = AlignmentSet(cmp2)
+        self.assertEqual(aln1.referenceInfoTable['EndRow'][0], 111)
+        self.assertEqual(len(aln1), 112)
+        aln = AlignmentSet(cmp1, cmp2)
+        refnames = aln.refNames
+        self.assertEqual(refnames, ['lambda_NEB3011'])
+        self.assertEqual(aln.refIds[aln.refNames[0]], 1)
+        self.assertEqual(aln.referenceInfoTable['EndRow'][0], 338113)
+        self.assertEqual(len(aln), 338114)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_two_cmpH5(self):
+        cmp1 = ("/pbi/dept/secondary/siv/testdata/pbreports"
+                "-unittest/data/sat/aligned_reads.cmp.h5")
+        cmp2 = upstreamdata.getBamAndCmpH5()[1]
+        len1 = len(AlignmentSet(cmp1))
+        len2 = len(AlignmentSet(cmp2))
+        aln = AlignmentSet(cmp1, cmp2)
+        len3 = len(aln)
+        self.assertEqual(len1 + len2, len3)
+        self.assertEqual(len3, 338114)
+        self.assertEqual(
+            str(aln.referenceInfoTable),
+            ("[ (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', "
+             "48502, 'a1319ff90e994c8190a4fe6569d0822a', 0L, 338113L)]"))
+        self.assertEqual(set(aln.tId), {1})
+        # + 1, because bounds are inclusive, rather than exclusive
+        self.assertEqual(len3, (aln.referenceInfoTable[0].EndRow -
+                                aln.referenceInfoTable[0].StartRow) + 1)
+        self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+                         aln.referenceInfo(1))
+        # ask for the wrong one:
+        self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+                         None)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_two_ref_cmpH5(self):
+        cmp1 = upstreamdata.getBamAndCmpH5()[1]
+        cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+                "genomic_consensus-unittest/bam_c4p6_tests/"
+                "ecoli_c4p6.cmp.h5")
+        len1 = len(AlignmentSet(cmp1))
+        len2 = len(AlignmentSet(cmp2))
+        aln = AlignmentSet(cmp1, cmp2)
+        len3 = len(aln)
+        self.assertEqual(len1 + len2, len3)
+        self.assertEqual(len3, 57147)
+        self.assertEqual(
+            str(aln.referenceInfoTable),
+            ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+             "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 57034L)\n"
+             " (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', 48502, "
+             "'a1319ff90e994c8190a4fe6569d0822a', 57035L, 57146L)]"))
+        self.assertEqual(set(aln.tId), {0, 1})
+        # + 1, because bounds are inclusive, rather than exclusive
+        self.assertEqual(len1, (aln.referenceInfoTable[1].EndRow -
+                                aln.referenceInfoTable[1].StartRow) + 1)
+        self.assertEqual(len2, (aln.referenceInfoTable[0].EndRow -
+                                aln.referenceInfoTable[0].StartRow) + 1)
+        self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+                         aln.referenceInfo(0))
+        self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+                         aln.referenceInfo(1))
+
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_two_bam(self):
+        cmp1 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+                "2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c100713652400000001823152"
+                "404301534_s1_p0.1.aligned.bam")
+        cmp2 = ("/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/"
+                "2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c100713652400000001823152"
+                "404301534_s1_p0.2.aligned.bam")
+        len1 = len(AlignmentSet(cmp1))
+        len2 = len(AlignmentSet(cmp2))
+        aln = AlignmentSet(cmp1, cmp2)
+        len3 = len(aln)
+        self.assertEqual(len1 + len2, len3)
+        self.assertEqual(len3, 65346)
+        self.assertEqual(
+            str(aln.referenceInfoTable),
+            ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+             "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)]"))
+        self.assertEqual(set(aln.tId), {0})
+        self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+                         aln.referenceInfo(0))
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_two_xml(self):
+        cmp1 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c1007136524000000018231"
+                "52404301534_s1_p0.all.alignmentset.xml")
+        cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+                "m140913_222218_42240_c1006999524000000018231"
+                "39203261564_s1_p0.all.alignmentset.xml")
+        len1 = len(AlignmentSet(cmp1))
+        len2 = len(AlignmentSet(cmp2))
+        aln = AlignmentSet(cmp1, cmp2)
+        len3 = len(aln)
+        self.assertEqual(len1 + len2, len3)
+        self.assertEqual(len3, 160264)
+        self.assertEqual(
+            str(aln.referenceInfoTable),
+            ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+             "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)]"))
+        self.assertEqual(set(aln.tId), {0})
+        self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+                         aln.referenceInfo(0))
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_two_ref_bam(self):
+        cmp1 = upstreamdata.getBamAndCmpH5()[0]
+        # this is supposedly the same data as above:
+        cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+                "m140913_222218_42240_c1006999524000000018231"
+                "39203261564_s1_p0.all.alignmentset.xml")
+        len1 = len(AlignmentSet(cmp1))
+        len2 = len(AlignmentSet(cmp2))
+        aln = AlignmentSet(cmp1, cmp2)
+        len3 = len(aln)
+        self.assertEqual(len1 + len2, len3)
+        self.assertEqual(len3, 57344)
+        # TODO(mdsmith)(2016-01-25) I would like to be able to use the StartRow
+        # and EndRow fields for BAMs someday...
+        self.assertEqual(
+            str(aln.referenceInfoTable),
+            ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+             "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)\n"
+             " (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', 48502, "
+             "'a1319ff90e994c8190a4fe6569d0822a', 0L, 0L)]"))
+        self.assertEqual(set(aln.tId), {0, 1})
+        self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+                         aln.referenceInfo(0))
+        self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+                         aln.referenceInfo(1))
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_two_ref_three_bam(self):
+        # Here we test whether duplicate references remain duplicates or are
+        # collapsed when the inputs do not share an identical reference set
+        cmp1 = upstreamdata.getBamAndCmpH5()[0]
+        # this is supposedly the same data as above:
+        cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+                "m140913_222218_42240_c1006999524000000018231"
+                "39203261564_s1_p0.all.alignmentset.xml")
+        cmp3 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c1007136524000000018231"
+                "52404301534_s1_p0.all.alignmentset.xml")
+        len1 = len(AlignmentSet(cmp1))
+        len2 = len(AlignmentSet(cmp2))
+        len3 = len(AlignmentSet(cmp3))
+        aln = AlignmentSet(cmp1, cmp2, cmp3)
+        len4 = len(aln)
+        self.assertEqual(len1 + len2 + len3, len4)
+        self.assertEqual(len4, 160376)
+        self.assertEqual(
+            str(aln.referenceInfoTable),
+            ("[ (0, 0, 'ecoliK12_pbi_March2013', 'ecoliK12_pbi_March2013', "
+             "4642522, '52cd7c5fa92877152fa487906ae484c5', 0L, 0L)\n"
+             " (1, 1, 'lambda_NEB3011', 'lambda_NEB3011', 48502, "
+             "'a1319ff90e994c8190a4fe6569d0822a', 0L, 0L)]"))
+        self.assertEqual(set(aln.tId), {0, 1})
+        self.assertEqual(aln.referenceInfo('ecoliK12_pbi_March2013'),
+                         aln.referenceInfo(0))
+        self.assertEqual(aln.referenceInfo('lambda_NEB3011'),
+                         aln.referenceInfo(1))
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_movie_filter(self):
+        # unaligned bam
+        bam0 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590956/0003/"
+                "Analysis_Results/m140913_222218_42240_c10069"
+                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
+        bam1 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/"
+                "Analysis_Results/m140913_005018_42139_c10071"
+                "3652400000001823152404301534_s1_p0.all.subreadset.xml")
+        aln = SubreadSet(bam0, bam1)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(aln.readGroupTable['ID']))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(set(aln.index.qId)))
+        self.assertEqual(len(aln), 178570)
+        aln.filters.addRequirement(movie=[(
+            '=',
+            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+        self.assertEqual(len(SubreadSet(bam1)), len(aln))
+
+        # aligned bam
+        #bam0 = ("/pbi/dept/secondary/siv/testdata/"
+        #        "SA3-DS/ecoli/2590956/0003/Alignment_Results/"
+        #        "m140913_222218_42240_c1006999524000000018231"
+        #        "39203261564_s1_p0.all.alignmentset.xml")
+        bam0 = upstreamdata.getBamAndCmpH5()[0]
+        bam1 = ("/pbi/dept/secondary/siv/testdata/"
+                "SA3-DS/ecoli/2590953/0001/Alignment_Results/"
+                "m140913_005018_42139_c1007136524000000018231"
+                "52404301534_s1_p0.all.alignmentset.xml")
+        aln = AlignmentSet(bam0, bam1)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(aln.readGroupTable['ID']))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(set(aln.index.qId)))
+        self.assertEqual(len(aln), 103144)
+        aln.filters.addRequirement(movie=[(
+            '=',
+            'm140913_005018_42139_c100713652400000001823152404301534_s1_p0')])
+        self.assertEqual(len(AlignmentSet(bam1)), len(aln))
+
+        # cmpH5
+        cmp1 = upstreamdata.getBamAndCmpH5()[1]
+        cmp2 = ("/pbi/dept/secondary/siv/testdata/"
+                "genomic_consensus-unittest/bam_c4p6_tests/"
+                "ecoli_c4p6.cmp.h5")
+        aln = AlignmentSet(cmp1, cmp2)
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(aln.readGroupTable['ID']))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])),
+                         len(set(aln.index.MovieID)))
+        self.assertEqual(len(set(aln.readGroupTable['ID'])), 2)
+        self.assertEqual(len(aln), 57147)
+        aln.filters.addRequirement(movie=[(
+            '=',
+            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0')])
+        len1 = len(AlignmentSet(cmp1))
+        self.assertEqual(len1, len(aln))
+
+        aln.filters.removeRequirement('movie')
+        self.assertEqual(len(aln), 57147)
+
+    def test_exceptions(self):
+        try:
+            raise InvalidDataSetIOError("Wrong!")
+        except InvalidDataSetIOError as e:
+            self.assertEqual(e.message, "Wrong!")
+
+    def test_createdAt(self):
+        aln = AlignmentSet(data.getXml(8))
+        self.assertEqual(aln.createdAt, '2015-08-05T10:25:18')
+
+    def test_divideKeys_keysToRanges(self):
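+        # A sketch of the helpers exercised below, inferred from the
+        # assertions rather than their docs: divideKeys(keys, n) appears to
+        # partition a sorted key list into at most n contiguous chunks, and
+        # keysToRanges collapses each chunk to its [first, last] bounds, e.g.
+        # divideKeys([0, 1, 2, 3, 5, 8, 50], 2) -> [[0, 1, 2], [3, 5, 8, 50]]
+        # and keysToRanges of that -> [[0, 2], [3, 50]].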
+        keys = [0, 1, 2, 3, 5, 8, 50]
+        res = divideKeys(keys, 0)
+        self.assertEqual(res, [])
+        res = keysToRanges(res)
+        self.assertEqual(res, [])
+
+        res = divideKeys(keys, 1)
+        self.assertEqual(res, [[0, 1, 2, 3, 5, 8, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 50]])
+
+        res = divideKeys(keys, 2)
+        self.assertEqual(res, [[0, 1, 2], [3, 5, 8, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 2], [3, 50]])
+
+        res = divideKeys(keys, 3)
+        self.assertEqual(res, [[0, 1], [2, 3], [5, 8, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 1], [2, 3], [5, 50]])
+
+        res = divideKeys(keys, 7)
+        self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50]])
+
+        res = divideKeys(keys, 8)
+        self.assertEqual(res, [[0], [1], [2], [3], [5], [8], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50]])
+
+
+        keys = [0, 1, 2, 2, 3, 5, 8, 50, 50]
+        res = divideKeys(keys, 0)
+        self.assertEqual(res, [])
+        res = keysToRanges(res)
+        self.assertEqual(res, [])
+
+        res = divideKeys(keys, 1)
+        self.assertEqual(res, [[0, 1, 2, 2, 3, 5, 8, 50, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 50]])
+
+        res = divideKeys(keys, 2)
+        self.assertEqual(res, [[0, 1, 2, 2], [3, 5, 8, 50, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 2], [3, 50]])
+
+        res = divideKeys(keys, 3)
+        self.assertEqual(res, [[0, 1, 2], [2, 3, 5], [8, 50, 50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 2], [2, 5], [8, 50]])
+
+        res = divideKeys(keys, 9)
+        self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50], [50, 50]])
+
+        res = divideKeys(keys, 10)
+        self.assertEqual(res, [[0], [1], [2], [2], [3], [5], [8], [50], [50]])
+        res = keysToRanges(res)
+        self.assertEqual(res, [[0, 0], [1, 1], [2, 2], [2, 2], [3, 3],
+                               [5, 5], [8, 8], [50, 50], [50, 50]])
+
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_load_sts_from_extres(self):
+        # we don't have a subreadset.xml with a loaded sts.xml in testdata,
+        # so fabricate one here:
+        ss = SubreadSet(data.getXml(10))
+        ss.externalResources[0].sts = ('/pbi/dept/secondary/siv/testdata/'
+                                         'SA3-Sequel/lambda/roche_SAT/'
+                                         'm54013_151205_032353.sts.xml')
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outXml = os.path.join(outdir, 'tempfile.xml')
+        ss.write(outXml)
+        ss = SubreadSet(outXml)
+        self.assertTrue(ss.metadata.summaryStats)
+        # test validation on write with loaded stats:
+        outXml = os.path.join(outdir, 'tempfileWithStats.xml')
+        ss.write(outXml, validate=False)
+        ss.write(outXml)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_fixed_bin_sts(self):
+        # we don't have a subreadset.xml with a loaded sts.xml in testdata,
+        # so fabricate one here:
+        ss = SubreadSet(data.getXml(10))
+        ss.externalResources[0].sts = ('/pbi/dept/secondary/siv/testdata/'
+                                       'pbreports-unittest/data/sts_xml/'
+                                       '3120134-r54009_20160323_173308-'
+                                       '1_A01-Bug30772/m54009_160323_'
+                                       '173323.sts.xml')
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outXml = os.path.join(outdir, 'tempfile.xml')
+        outXml2 = os.path.join(outdir, 'tempfile2.xml')
+        ss.write(outXml)
+        ss.write(outXml2)
+        ss = SubreadSet(outXml)
+        ss2 = SubreadSet(outXml2)
+        ss3 = ss + ss2
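+        # merging the two (identical) datasets should sum their fixed-bin
+        # readLenDist histograms element-wise, as checked below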
+        self.assertEqual(ss3.metadata.summaryStats.readLenDist.bins,
+                         [b1 + b2 for b1, b2 in
+                          zip(ss.metadata.summaryStats.readLenDist.bins,
+                              ss2.metadata.summaryStats.readLenDist.bins)])
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not available")
+    def test_missing_extres(self):
+        # copy a file with relative paths, then rescue the ResourceIds
+        test_file = ('/pbi/dept/secondary/siv/testdata/'
+                     'SA3-Sequel/lambda/roche_SAT/'
+                     'm54013_151205_032353.subreadset.xml')
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outXml = os.path.join(outdir, 'tempfile.xml')
+        resXml = os.path.join(outdir, 'tempfile.rescued.xml')
+        sset = SubreadSet(test_file)
+
+        # record the original paths:
+        path_map = {}
+        recorder = lambda x, m=path_map: m.setdefault(os.path.split(x)[1], x)
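+        # (the m=path_map default argument binds the dict at definition time;
+        # _changePaths appears to apply the callable to every ResourceId, so
+        # this records a basename -> original absolute path mapping)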
+        sset._changePaths(recorder)
+
+        # make the paths relative and write out dataset with all missing:
+        sset.makePathsRelative(os.path.dirname(test_file))
+        sset.write(outXml, validate=False)
+
+        # check that it is broken:
+        with self.assertRaises(InvalidDataSetIOError):
+            sset = SubreadSet(outXml)
+
+        # check that rescuing fixes it:
+        replacer = lambda x, m=path_map: m[x]
+        sset = SubreadSet(outXml, skipMissing=True)
+        sset._changePaths(replacer)
+        sset.write(resXml, validate=False)
+        sset = SubreadSet(resXml)
+
+        # check that removing any one breaks it:
+        for key in path_map.keys():
+            mod_pmap = path_map.copy()
+
+            # remove a resourceId from the map:
+            mod_pmap.pop(key)
+            log.debug(key)
+            # use dict.get to maintain the breakage:
+            replacer = lambda x, m=mod_pmap: m.get(x, x)
+            sset = SubreadSet(outXml, skipMissing=True)
+            sset._changePaths(replacer)
+            sset.write(resXml, validate=False)
+            with self.assertRaises(InvalidDataSetIOError):
+                sset = SubreadSet(resXml)
+
+
+    @unittest.skip("Too expensive")
+    def test_huge_zmw_split(self):
+        human = ('/pbi/dept/secondary/siv/testdata/SA3-DS/'
+                 'human/JCV_85x_v030/jcv_85x_v030.subreadset.xml')
+        sset = SubreadSet(human)
+        ssets = sset.split(zmws=True, maxChunks=5)
+
diff --git a/tests/test_pbdataset_subtypes.py b/tests/test_pbdataset_subtypes.py
index bf8637e..7741b36 100644
--- a/tests/test_pbdataset_subtypes.py
+++ b/tests/test_pbdataset_subtypes.py
@@ -5,15 +5,18 @@ import unittest
 import tempfile
 import os
 import itertools
+import numpy as np
+
+import pysam
 
 from pbcore.util.Process import backticks
 from pbcore.io.dataset.utils import (consolidateBams, _infixFname,
                                     BamtoolsVersion)
 from pbcore.io import (DataSet, SubreadSet, ConsensusReadSet,
-                       ReferenceSet, ContigSet, AlignmentSet,
+                       ReferenceSet, ContigSet, AlignmentSet, BarcodeSet,
                        FastaReader, FastaWriter, IndexedFastaReader,
                        HdfSubreadSet, ConsensusAlignmentSet,
-                       openDataFile, FastaWriter)
+                       openDataFile, FastaWriter, FastqReader)
 import pbcore.data as upstreamData
 import pbcore.data.datasets as data
 from pbcore.io.dataset.DataSetValidator import validateXml
@@ -22,11 +25,6 @@ import xml.etree.ElementTree as ET
 log = logging.getLogger(__name__)
 
 def _check_constools():
-    cmd = "dataset.py"
-    o, r, m = backticks(cmd)
-    if r != 2:
-        return False
-
     if not BamtoolsVersion().good:
         log.warn("Bamtools not found or out of date")
         return False
@@ -53,8 +51,8 @@ class TestDataSet(unittest.TestCase):
 
 
     def test_subread_build(self):
-        ds1 = SubreadSet(data.getXml(no=5))
-        ds2 = SubreadSet(data.getXml(no=5))
+        ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
+        ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
         self.assertEquals(type(ds1).__name__, 'SubreadSet')
         self.assertEquals(ds1._metadata.__class__.__name__,
                           'SubreadSetMetadata')
@@ -64,7 +62,7 @@ class TestDataSet(unittest.TestCase):
         self.assertEquals(len(ds2.metadata.collections), 1)
         ds3 = ds1 + ds2
         self.assertEquals(len(ds3.metadata.collections), 2)
-        ds4 = SubreadSet(data.getSubreadSet())
+        ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
         self.assertEquals(type(ds4).__name__, 'SubreadSet')
         self.assertEquals(type(ds4._metadata).__name__, 'SubreadSetMetadata')
         self.assertEquals(len(ds4.metadata.collections), 1)
@@ -136,11 +134,33 @@ class TestDataSet(unittest.TestCase):
         for name in names:
             self.assertTrue(ds[name].id == name)
 
+    def test_contigset_split(self):
+        ref = ReferenceSet(data.getXml(9))
+        exp_n_contigs = len(ref)
+        refs = ref.split(10)
+        self.assertEqual(len(refs), 10)
+        obs_n_contigs = 0
+        for r in refs:
+            obs_n_contigs += sum(1 for _ in r)
+        self.assertEqual(obs_n_contigs, exp_n_contigs)
+
+
+    def test_contigset_len(self):
+        ref = ReferenceSet(data.getXml(9))
+        exp_n_contigs = len(ref)
+        refs = ref.split(10)
+        self.assertEqual(len(refs), 10)
+        obs_n_contigs = 0
+        for r in refs:
+            obs_n_contigs += len(r)
+        self.assertEqual(obs_n_contigs, exp_n_contigs)
+
+
     def test_ccsread_build(self):
-        ds1 = ConsensusReadSet(data.getXml(2), strict=False)
+        ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
         self.assertEquals(type(ds1).__name__, 'ConsensusReadSet')
         self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
-        ds2 = ConsensusReadSet(data.getXml(2), strict=False)
+        ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
         self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
         self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
 
@@ -161,7 +181,8 @@ class TestDataSet(unittest.TestCase):
         ds1.write(fn)
 
     def test_ccsalignment_build(self):
-        ds1 = ConsensusAlignmentSet(data.getXml(20), strict=False)
+        ds1 = ConsensusAlignmentSet(data.getXml(20), strict=False,
+                                    skipMissing=True)
         self.assertEquals(type(ds1).__name__, 'ConsensusAlignmentSet')
         self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
         # XXX strict=True requires actual existing .bam files
@@ -223,10 +244,10 @@ class TestDataSet(unittest.TestCase):
 
 
     def test_contigset_build(self):
-        ds1 = ContigSet(data.getXml(3))
+        ds1 = ContigSet(data.getXml(3), skipMissing=True)
         self.assertEquals(type(ds1).__name__, 'ContigSet')
         self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
-        ds2 = ContigSet(data.getXml(3))
+        ds2 = ContigSet(data.getXml(3), skipMissing=True)
         self.assertEquals(type(ds2).__name__, 'ContigSet')
         self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
         for contigmd in ds2.metadata.contigs:
@@ -240,7 +261,22 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(len(aln.toExternalFiles()), 2)
         outdir = tempfile.mkdtemp(suffix="dataset-unittest")
         outfn = os.path.join(outdir, 'merged.bam')
-        consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln)
+        consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln,
+                        useTmp=False)
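+        # useTmp presumably controls whether the merge is staged in a
+        # temporary scratch directory; both code paths are exercised here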
+        self.assertTrue(os.path.exists(outfn))
+        consAln = AlignmentSet(outfn)
+        self.assertEqual(len(consAln.toExternalFiles()), 1)
+        for read1, read2 in zip(sorted(list(aln)), sorted(list(consAln))):
+            self.assertEqual(read1, read2)
+        self.assertEqual(len(aln), len(consAln))
+
+        log.debug("Test methods directly in tmp")
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(aln.toExternalFiles()), 2)
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+        outfn = os.path.join(outdir, 'merged.bam')
+        consolidateBams(aln.toExternalFiles(), outfn, filterDset=aln,
+                        useTmp=True)
         self.assertTrue(os.path.exists(outfn))
         consAln = AlignmentSet(outfn)
         self.assertEqual(len(consAln.toExternalFiles()), 1)
@@ -326,16 +362,17 @@ class TestDataSet(unittest.TestCase):
             self.assertEqual(read1, read2)
         self.assertEqual(len(list(aln)), len(list(nonCons)))
 
-        log.debug("Test cli")
-        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
-        datafile = os.path.join(outdir, "merged.bam")
-        xmlfile = os.path.join(outdir, "merged.xml")
-        cmd = "dataset.py consolidate {i} {d} {x}".format(i=data.getXml(12),
-                                                          d=datafile,
-                                                          x=xmlfile)
-        log.debug(cmd)
-        o, r, m = backticks(cmd)
-        self.assertEqual(r, 0)
+    def test_accuracy_filter(self):
+        aln = AlignmentSet(data.getXml(12))
+        self.assertEqual(len(list(aln)), 177)
+        aln.filters.addRequirement(accuracy=[('>', '.85')])
+        self.assertEqual(len(list(aln)), 174)
+
+    def test_contigset_filter(self):
+        ref = ReferenceSet(data.getXml(9))
+        self.assertEqual(len(list(ref)), 59)
+        ref.filters.addRequirement(length=[('>', '1450')])
+        self.assertEqual(len(list(ref)), 34)
 
     @unittest.skipIf(not _check_constools() or not _internal_data(),
                      "bamtools, pbindex or data not found, skipping")
@@ -360,15 +397,6 @@ class TestDataSet(unittest.TestCase):
             self.assertEqual(read1, read2)
         self.assertEqual(len(aln), len(nonCons))
 
-        log.debug("Test cli")
-        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
-        datafile = os.path.join(outdir, "merged.bam")
-        xmlfile = os.path.join(outdir, "merged.xml")
-        cmd = "dataset.py consolidate --numFiles 2 {i} {d} {x}".format(
-            i=testFile, d=datafile, x=xmlfile)
-        log.debug(cmd)
-        o, r, m = backticks(cmd)
-        self.assertEqual(r, 0)
 
     @unittest.skipIf(not _check_constools(),
                      "bamtools or pbindex not found, skipping")
@@ -435,7 +463,10 @@ class TestDataSet(unittest.TestCase):
         exp_single_seqs = [rec.sequence for rec in exp_singles]
 
         acc_file = ContigSet(outFas1, outFas2)
+        acc_file.induceIndices()
         log.debug(acc_file.toExternalFiles())
+        self.assertEqual(len(acc_file), 4)
+        self.assertEqual(len(list(acc_file)), 4)
         acc_file.consolidate()
         log.debug(acc_file.toExternalFiles())
 
@@ -445,6 +476,47 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(acc_file.get_contig(double).sequence[:],
                          exp_double_seq)
 
+        self.assertEqual(len(acc_file._openReaders), 1)
+        self.assertEqual(len(acc_file.index), 3)
+        self.assertEqual(len(acc_file._indexMap), 3)
+        self.assertEqual(len(acc_file), 3)
+        self.assertEqual(len(list(acc_file)), 3)
+
+    def test_contigset_consolidate_int_names(self):
+        # build a set to merge
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+
+        inFas = os.path.join(outdir, 'infile.fasta')
+        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
+        outFas2 = os.path.join(outdir, 'tempfile2.fasta')
+
+        # copy fasta reference to hide fai and ensure FastaReader is used
+        backticks('cp {i} {o}'.format(
+                      i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
+                      o=inFas))
+        rs1 = ContigSet(inFas)
+
+        double = 'B.cereus.1'
+        exp_double = rs1.get_contig(double)
+
+        # write the same sequence under integer-style contig names:
+        with FastaWriter(outFas1) as writer:
+            writer.writeRecord('5141', exp_double.sequence)
+        with FastaWriter(outFas2) as writer:
+            writer.writeRecord('5142', exp_double.sequence)
+
+        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
+        exp_names = ['5141', '5142']
+
+        obs_file = ContigSet(outFas1, outFas2)
+        log.debug(obs_file.toExternalFiles())
+        obs_file.consolidate()
+        log.debug(obs_file.toExternalFiles())
+
+        # open obs and compare to exp
+        for name, seq in zip(exp_names, exp_double_seqs):
+            self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
+
     def test_contigset_consolidate_genomic_consensus(self):
         """
         Verify that the contigs output by GenomicConsensus (e.g. quiver) can
@@ -459,20 +531,20 @@ class TestDataSet(unittest.TestCase):
         files = []
         for i, (header, seq) in enumerate([FASTA1, FASTA2, FASTA3]):
             _files = []
-            for suffix in ["", "|quiver", "|plurality", "|arrow"]:
+            for suffix in ["", "|quiver", "|plurality", "|arrow", "|poa"]:
                 tmpfile = tempfile.NamedTemporaryFile(suffix=".fasta").name
                 with open(tmpfile, "w") as f:
                     f.write(">{h}{s}\n{q}".format(h=header, s=suffix, q=seq))
                 _files.append(tmpfile)
             files.append(_files)
         for i in range(3):
-            ds = ContigSet(*[ f[i] for f in files ])
+            ds = ContigSet(*[f[i] for f in files])
             out1 = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
             fa1 = tempfile.NamedTemporaryFile(suffix=".fasta").name
             ds.consolidate(fa1)
             ds.write(out1)
             with ContigSet(out1) as ds_new:
-                self.assertEqual(len([ rec for rec in ds_new ]), 1,
+                self.assertEqual(len([rec for rec in ds_new]), 1,
                                  "failed on %d" % i)
 
     def test_split_hdfsubreadset(self):
@@ -483,6 +555,52 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(len(hdfdss[0].toExternalFiles()), 2)
         self.assertEqual(len(hdfdss[1].toExternalFiles()), 1)
 
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not found, skipping")
+    def test_len_fastq(self):
+        fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
+              'lambda/2590980/0008/Analysis_Results/'
+              'm141115_075238_ethan_c100699872550000001'
+              '823139203261572_s1_p0.1.subreads.fastq')
+        fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+        with open(fq_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                for line in itertools.islice(fih, 24):
+                    fqh.write(line)
+        cset = ContigSet(fq_out)
+        self.assertFalse(cset.isIndexed)
+        self.assertTrue(isinstance(cset.resourceReaders()[0], FastqReader))
+        self.assertEqual(sum(1 for _ in cset),
+                         sum(1 for _ in FastqReader(fq_out)))
+        self.assertEqual(sum(1 for _ in cset), 6)
+        # XXX not possible, fastq files can't be indexed:
+        #self.assertEqual(len(cset), sum(1 for _ in cset))
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not found, skipping")
+    def test_fastq_consolidate(self):
+        fn = ('/pbi/dept/secondary/siv/testdata/SA3-RS/'
+              'lambda/2590980/0008/Analysis_Results/'
+              'm141115_075238_ethan_c100699872550000001'
+              '823139203261572_s1_p0.1.subreads.fastq')
+        fq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+        cfq_out = tempfile.NamedTemporaryFile(suffix=".fastq").name
+        with open(fq_out, 'w') as fqh:
+            with open(fn, 'r') as fih:
+                for line in itertools.islice(fih, 240):
+                    fqh.write(line)
+        cset = ContigSet(fq_out)
+        cset_l = sum(1 for _ in cset)
+        self.assertEqual(cset_l, 60)
+        cset.filters.addRequirement(length=[('>', 1000)])
+        cset_l = sum(1 for _ in cset)
+        self.assertEqual(cset_l, 23)
+        cset.consolidate(cfq_out)
+        cset_l = sum(1 for _ in cset)
+        cfq = FastqReader(cfq_out)
+        self.assertEqual(cset_l, 23)
+        self.assertEqual(cset_l, sum(1 for _ in cfq))
+
 
     def test_len(self):
         # AlignmentSet
@@ -498,6 +616,8 @@ class TestDataSet(unittest.TestCase):
         aln.updateCounts()
         self.assertEqual(aln.totalLength, 123588)
         self.assertEqual(aln.numRecords, 92)
+        self.assertEqual(sum(1 for _ in aln), 92)
+        self.assertEqual(sum(len(rec) for rec in aln), 123588)
 
         # AlignmentSet with filters
         aln = AlignmentSet(data.getXml(15), strict=True)
@@ -514,18 +634,18 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(aln.numRecords, 40)
 
         # AlignmentSet with cmp.h5
-        aln = AlignmentSet(upstreamData.getCmpH5(), strict=True)
-        self.assertEqual(len(aln), 84)
-        self.assertEqual(aln._length, (84, 26103))
-        self.assertEqual(aln.totalLength, 26103)
-        self.assertEqual(aln.numRecords, 84)
+        aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
+        self.assertEqual(len(aln), 112)
+        self.assertEqual(aln._length, (112, 59970))
+        self.assertEqual(aln.totalLength, 59970)
+        self.assertEqual(aln.numRecords, 112)
         aln.totalLength = -1
         aln.numRecords = -1
         self.assertEqual(aln.totalLength, -1)
         self.assertEqual(aln.numRecords, -1)
         aln.updateCounts()
-        self.assertEqual(aln.totalLength, 26103)
-        self.assertEqual(aln.numRecords, 84)
+        self.assertEqual(aln.totalLength, 59970)
+        self.assertEqual(aln.numRecords, 112)
 
 
         # SubreadSet
@@ -541,6 +661,8 @@ class TestDataSet(unittest.TestCase):
         sset.updateCounts()
         self.assertEqual(sset.totalLength, 124093)
         self.assertEqual(sset.numRecords, 92)
+        self.assertEqual(sum(1 for _ in sset), 92)
+        self.assertEqual(sum(len(rec) for rec in sset), 124093)
 
         # HdfSubreadSet
         # len means something else in bax/bas land. These numbers may actually
@@ -602,7 +724,7 @@ class TestDataSet(unittest.TestCase):
 
     def test_nested_external_resources(self):
         log.debug("Testing nested externalResources in AlignmentSets")
-        aln = AlignmentSet(data.getXml(0))
+        aln = AlignmentSet(data.getXml(0), skipMissing=True)
         self.assertTrue(aln.externalResources[0].pbi)
         self.assertTrue(aln.externalResources[0].reference)
         self.assertEqual(
@@ -611,7 +733,7 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(aln.externalResources[0].scraps, None)
 
         log.debug("Testing nested externalResources in SubreadSets")
-        subs = SubreadSet(data.getXml(5))
+        subs = SubreadSet(data.getXml(5), skipMissing=True)
         self.assertTrue(subs.externalResources[0].scraps)
         self.assertEqual(
             subs.externalResources[0].externalResources[0].metaType,
@@ -626,6 +748,11 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(
             subs.externalResources[0].externalResources[0].metaType,
             'PacBio.SubreadFile.ScrapsBamFile')
+        subs.externalResources[0].barcodes = 'bc.fasta'
+        self.assertTrue(subs.externalResources[0].barcodes)
+        self.assertEqual(
+            subs.externalResources[0].externalResources[1].metaType,
+            "PacBio.DataSet.BarcodeSet")
 
         log.debug("Testing adding nested externalResources to AlignmetnSet "
                   "manually")
@@ -670,8 +797,100 @@ class TestDataSet(unittest.TestCase):
         self.assertEqual(ds[0].name, "lambda_NEB3011")
 
     def test_alignmentset_index(self):
-        aln = AlignmentSet(upstreamData.getCmpH5(), strict=True)
+        aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
         reads = aln.readsInRange(aln.refNames[0], 0, 1000)
         self.assertEqual(len(list(reads)), 2)
-        self.assertEqual(len(list(aln)), 84)
-        self.assertEqual(len(aln.index), 84)
+        self.assertEqual(len(list(aln)), 112)
+        self.assertEqual(len(aln.index), 112)
+
+    def test_barcodeset(self):
+        fa_out = tempfile.NamedTemporaryFile(suffix=".fasta").name
+        with open(fa_out, "w") as f:
+            f.write(">bc1\nAAAAAAAAAAAAAAAA\n>bc2\nCCCCCCCCCCCCCCCC")
+        ds = BarcodeSet(fa_out)
+        ds.induceIndices()
+        self.assertEqual([r.id for r in ds], ["bc1","bc2"])
+        ds_out = tempfile.NamedTemporaryFile(suffix=".barcodeset.xml").name
+        ds.write(ds_out)
+
+    @unittest.skipIf(not _internal_data(),
+                     "Internal data not found, skipping")
+    def test_barcode_split_cornercases(self):
+        fn = ('/pbi/dept/secondary/siv/testdata/'
+              'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
+              '.tiny.subreadset.xml')
+        sset = SubreadSet(fn)
+        ssets = sset.split(chunks=3, barcodes=True)
+        self.assertEqual([str(ss.filters) for ss in ssets],
+                         ["( bc = [0, 0] )",
+                          "( bc = [1, 1] )",
+                          "( bc = [2, 2] )"])
+        sset = SubreadSet(fn)
+        self.assertEqual(len(sset), 15133)
+        sset.filters = None
+        self.assertEqual(str(sset.filters), "")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 2667562)
+
+        sset.filters.addRequirement(bc=[('=', '[2, 2]')])
+        self.assertEqual(str(sset.filters), "( bc = [2, 2] )")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 4710)
+
+        sset.filters = None
+        self.assertEqual(str(sset.filters), "")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 2667562)
+
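+        # the same barcode-pair filter written without whitespace should
+        # behave identically: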
+        sset.filters.addRequirement(bc=[('=', '[2,2]')])
+        self.assertEqual(str(sset.filters), "( bc = [2,2] )")
+        sset.updateCounts()
+        self.assertEqual(len(sset), 4710)
+
+    def test_merged_contigset(self):
+        fn = tempfile.NamedTemporaryFile(suffix=".contigset.xml").name
+        with ContigSet(upstreamData.getLambdaFasta(),
+                upstreamData.getFasta()) as cset:
+            self.assertEqual(len(list(cset)), 49)
+            self.assertEqual(len(cset), 49)
+            cset.consolidate()
+            cset.write(fn)
+            log.debug("Writing to {f}".format(f=fn))
+            self.assertEqual(len(list(cset)), 49)
+            self.assertEqual(len(cset), 49)
+        with ContigSet(fn) as cset:
+            self.assertEqual(len(list(cset)), 49)
+            self.assertEqual(len(cset), 49)
+
+    def test_incorrect_len_getitem(self):
+        types = [AlignmentSet(data.getXml(8)),
+                 ReferenceSet(data.getXml(9)),
+                 SubreadSet(data.getXml(10)),
+                 HdfSubreadSet(data.getXml(19))]
+        fn = tempfile.NamedTemporaryFile(suffix=".xml").name
+        for ds in types:
+            explen = -2
+            with openDataFile(ds.toExternalFiles()[0]) as mystery:
+                # try to avoid crashes...
+                explen = len(mystery)
+                mystery.numRecords = 1000000000
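+                # the inflated numRecords is persisted in the written XML;
+                # reopening and iterating below should still yield the true
+                # record count captured in explen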
+                mystery.write(fn)
+            with openDataFile(fn) as mystery:
+                self.assertEqual(len(list(mystery)), explen)
+
+    def test_missing_fai_error_message(self):
+        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
+
+        inFas = os.path.join(outdir, 'infile.fasta')
+
+        # copy fasta reference to hide fai and ensure FastaReader is used
+        backticks('cp {i} {o}'.format(
+            i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
+            o=inFas))
+        rs1 = ContigSet(inFas)
+        with self.assertRaises(IOError) as cm:
+            rs1.assertIndexed()
+        self.assertEqual(
+            str(cm.exception),
+            ( "Companion FASTA index (.fai) file not found or malformatted! "
+             "Use 'samtools faidx' to generate FASTA index."))

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pbcore.git


